From 6c8a3c099bc12bb00fa3899753e41183f05fe9fc Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Fri, 19 Sep 2025 13:10:44 +0800 Subject: [PATCH 001/185] [Docs] Fix griffe warnings in vllm/multimodal (#25216) Signed-off-by: windsonsea --- vllm/multimodal/inputs.py | 6 +++--- vllm/multimodal/utils.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 240e34e139cfe..e00c10fb66eeb 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -569,8 +569,8 @@ class MultiModalFieldConfig: Args: modality: The modality of the multi-modal item that uses this keyword argument. - slices: For each multi-modal item, the size of the slice that - is used to extract the data corresponding to it. + size_per_item: For each multi-modal item, the size of the slice + that is used to extract the data corresponding to it. dim: The dimension to slice, default to 0. Example: @@ -590,7 +590,7 @@ class MultiModalFieldConfig: ``` Given: - slices: [3, 4, 2] + size_per_item: [3, 4, 2] dim: 1 Input: diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index b308366fca282..f4e2ed72e2d7d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -395,7 +395,9 @@ def group_mm_kwargs_by_modality( modality together into the same `MultiModalKwargs` instance. Args: - mm_inputs: List of `MultiModalKwargsItem`. + mm_kwargs: List of `MultiModalKwargsItem`. + device: The device to place the grouped tensors on. + pin_memory: Whether to pin memory for faster host-to-device transfer. Yields: A tuple `(modality, num_items, grouped_kwargs)`. From a6149aa587d6582545b7878a2dffed3a2419605d Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 19 Sep 2025 00:41:53 -0500 Subject: [PATCH 002/185] [OOT] Support sync_model_loading for OOT (#25126) Signed-off-by: Chendi Xue --- vllm/model_executor/parameter.py | 6 +++--- vllm/model_executor/utils.py | 17 +++-------------- vllm/platforms/interface.py | 23 +++++++++++++++++++++++ vllm/platforms/tpu.py | 4 ++++ 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 221712ba9a338..03e5e5809b678 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -12,7 +12,6 @@ from torch.nn import Parameter from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.logger import init_logger -from vllm.model_executor.utils import _make_synced_weight_loader __all__ = [ "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter", @@ -53,8 +52,9 @@ class BasevLLMParameter(Parameter): # This sometimes causes OOM errors during model loading. To avoid this, # we sync the param tensor after its weight loader is called. from vllm.platforms import current_platform - if current_platform.is_tpu(): - weight_loader = _make_synced_weight_loader(weight_loader) + if current_platform.use_sync_weight_loader(): + weight_loader = current_platform.make_synced_weight_loader( + weight_loader) self._weight_loader = weight_loader self.tp_rank = get_tensor_model_parallel_rank() diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 65436786f82ac..543918418953b 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -44,23 +44,12 @@ def set_weight_attrs( # TODO(woosuk): Remove this hack once we have a better solution. 
from vllm.platforms import current_platform - if current_platform.is_tpu() and key == "weight_loader": - value = _make_synced_weight_loader(value) + if current_platform.use_sync_weight_loader( + ) and key == "weight_loader": + value = current_platform.make_synced_weight_loader(value) setattr(weight, key, value) -def _make_synced_weight_loader(original_weight_loader): - - def _synced_weight_loader(param, *args, **kwargs): - out = original_weight_loader(param, *args, **kwargs) - # torch._sync doesn't support, is not needed for CPU tensors. - if param.device != torch.device("cpu"): - torch._sync(param) - return out - - return _synced_weight_loader - - def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: parent_map = getattr(model, "packed_modules_mapping", None) parent_map = copy.deepcopy(parent_map) if parent_map is not None else {} diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 054d08c3a85be..53fc762dce540 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -594,6 +594,29 @@ class Platform: """ return False + @classmethod + def use_sync_weight_loader(cls) -> bool: + """ + Returns if the current platform needs to sync weight loader. + """ + return False + + @classmethod + def make_synced_weight_loader(cls, original_weight_loader): + """ + Wrap the original weight loader to make it synced. + """ + if not cls.use_sync_weight_loader(): + return original_weight_loader + + def _synced_weight_loader(param, *args, **kwargs): + out = original_weight_loader(param, *args, **kwargs) + if param.device != torch.device("cpu"): + torch._sync(param) + return out + + return _synced_weight_loader + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 6a061956d8141..4e4db116abca0 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -226,6 +226,10 @@ class TpuPlatform(Platform): torch.ops.xla.dynamo_set_buffer_donor_(src_cache, True) dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu() + @classmethod + def use_sync_weight_loader(cls) -> bool: + return True + try: from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform From 486c5599e3ab7d721c94dd01e89c87742c01e1ac Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 19 Sep 2025 02:27:17 -0400 Subject: [PATCH 003/185] [Build] Update Xgrammar to 0.1.24 to get a CVE fix (#25188) Signed-off-by: Russell Bryant --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index b8665104bd09a..7973da080c37d 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -24,7 +24,7 @@ outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" +xgrammar == 0.1.24; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs From 8c1d4acbfe70a1dffdb9a3db57b4d12329350295 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Fri, 19 Sep 2025 15:27:22 +0800 Subject: [PATCH 004/185] [CPU] Disable oneDNN linear on non-x86 platforms (#25166) Signed-off-by: jiang1.li --- vllm/model_executor/layers/utils.py | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index d2b135c1e4d4e..a1675ffbaa950 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -7,7 +7,7 @@ import torch from vllm import _custom_ops as ops from vllm import envs -from vllm.platforms import current_platform +from vllm.platforms import CpuArchEnum, current_platform from vllm.utils import direct_register_custom_op @@ -167,7 +167,8 @@ def dispatch_cpu_unquantized_gemm( if remove_weight: layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) - elif ops._supports_onednn: + elif (ops._supports_onednn + and current_platform.get_cpu_architecture() == CpuArchEnum.X86): origin_weight = layer.weight if remove_weight: layer.weight = torch.nn.Parameter(torch.empty(0), From 825fdb11add30237e7f592f1a132d3913cd632ec Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Fri, 19 Sep 2025 15:41:12 +0800 Subject: [PATCH 005/185] [Bugfix][CPU] Add placeholder to avoid import errors when using fused_moe ops on platforms without triton (#25137) Signed-off-by: jiang1.li --- vllm/model_executor/layers/fused_moe/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 6730f051e3d71..75f56cd01a4ea 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -78,3 +78,12 @@ if HAS_TRITON: "TritonOrDeepGemmExperts", "BatchedTritonOrDeepGemmExperts", ] +else: + # Some model classes directly use the custom ops. Add placeholders + # to avoid import errors. + def _raise_exception(method: str): + raise NotImplementedError( + f"{method} is not implemented as lack of triton.") + + fused_topk = lambda *args, **kwargs: _raise_exception("fused_topk") + fused_experts = lambda *args, **kwargs: _raise_exception("fused_experts") From f2718d2948e83319d83dbbade1883fef2302357e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 19 Sep 2025 15:44:56 +0800 Subject: [PATCH 006/185] [Misc] Cleanup test conftest for deprecated encoder-decoder models (#25231) Signed-off-by: Isotr0py --- tests/conftest.py | 138 ---------------------------------------------- 1 file changed, 138 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0440e859fe02d..9d433dedbf479 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -244,39 +244,6 @@ class DecoderPromptType(Enum): EMPTY_STR = 3 -@pytest.fixture -def example_encoder_decoder_prompts( -) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]: - ''' - Returns an encoder prompt list and a decoder prompt list, wherein each pair - of same-index entries in both lists corresponds to an (encoder prompt, - decoder prompt) tuple. 
- - Returns: - - * Encoder prompt list - * Decoder prompt list (reverse of encoder prompt list) - ''' - - encoder_prompts = [] - for filename in _TEST_PROMPTS: - encoder_prompts += _read_prompts(filename) - - custom_decoder_prompts = encoder_prompts[::-1] - empty_str_decoder_prompts = [""] * len(encoder_prompts) - none_decoder_prompts = [None] * len(encoder_prompts) - - # NONE decoder prompt type - return { - DecoderPromptType.NONE: - zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts), - DecoderPromptType.EMPTY_STR: - zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts), - DecoderPromptType.CUSTOM: - zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts), - } - - @pytest.fixture def example_long_prompts() -> list[str]: prompts = [] @@ -690,68 +657,6 @@ class HfRunner: return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] - def generate_encoder_decoder_greedy_logprobs_limit( - self, - encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - max_tokens: int, - num_logprobs: Optional[int], - images: Optional[PromptImageInput] = None, - **kwargs: Any, - ) -> list[TokensTextLogprobs]: - ''' - Greedy logprobs generation for vLLM encoder/decoder models - ''' - - all_logprobs: list[list[dict[int, float]]] = [] - all_output_ids: list[list[int]] = [] - all_output_strs: list[str] = [] - - for i, (encoder_prompt, decoder_prompt) in enumerate( - to_enc_dec_tuple_list(encoder_decoder_prompts)): - processor_kwargs: dict[str, Any] = { - "text": encoder_prompt, - "return_tensors": "pt", - } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - - encoder_inputs = self.processor(**processor_kwargs) - encoder_inputs = self.wrap_device(encoder_inputs) - - if decoder_prompt is None: - decoder_input_ids = None - else: - decoder_inputs = self.tokenizer(decoder_prompt, - return_tensors="pt") - decoder_input_ids = self.wrap_device(decoder_inputs.input_ids) - - output = self.model.generate( - decoder_input_ids=decoder_input_ids, - use_cache=True, - do_sample=False, - max_new_tokens=max_tokens, - output_hidden_states=True, - return_dict_in_generate=True, - **encoder_inputs, - **kwargs, - ) - - ( - seq_logprobs_lst, - output_len, - ) = self._hidden_states_to_logprobs(output.decoder_hidden_states, - num_logprobs) - - all_logprobs.append(seq_logprobs_lst) - seq_ids = output.sequences[0] - output_ids = seq_ids[-output_len:] - all_output_ids.append(output_ids.tolist()) - all_output_strs.append(self.tokenizer.decode(output_ids)) - - outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] - def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]: return self.model.encode(prompts, *args, **kwargs) @@ -940,26 +845,6 @@ class VllmRunner: if sampling_params.prompt_logprobs is None else toks_str_logsprobs_prompt_logprobs) - def generate_encoder_decoder_w_logprobs( - self, - encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - sampling_params: SamplingParams, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: - ''' - Logprobs generation for vLLM encoder/decoder models - ''' - - assert sampling_params.logprobs is not None - req_outputs = self.llm.generate(encoder_decoder_prompts, - sampling_params=sampling_params) - toks_str_logsprobs_prompt_logprobs = ( - self._final_steps_generate_w_logprobs(req_outputs)) - # 
Omit prompt logprobs if not required by sampling params - return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.prompt_logprobs is None else - toks_str_logsprobs_prompt_logprobs) - def generate_greedy( self, prompts: Union[list[str], list[torch.Tensor]], @@ -1037,29 +922,6 @@ class VllmRunner: return perplexities - def generate_encoder_decoder_greedy_logprobs( - self, - encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - max_tokens: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int] = None, - skip_special_tokens: bool = True, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: - greedy_logprobs_params = SamplingParams( - temperature=0.0, - max_tokens=max_tokens, - logprobs=num_logprobs, - prompt_logprobs=(num_prompt_logprobs), - skip_special_tokens=skip_special_tokens, - ) - ''' - Greedy logprobs generation for vLLM encoder/decoder models - ''' - - return self.generate_encoder_decoder_w_logprobs( - encoder_decoder_prompts, greedy_logprobs_params) - def generate_beam_search( self, prompts: list[str], From a684c0124cb8ac04984b6fd621d99e1463016eac Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Fri, 19 Sep 2025 16:45:06 +0800 Subject: [PATCH 007/185] [bugfix] fix MHA for models like OpenGVLab/InternVL3_5-38B (#25146) Signed-off-by: Yan Ma Co-authored-by: Isotr0py --- vllm/attention/layer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 15c0ce33e9659..8d5ebd93e063d 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -430,9 +430,11 @@ class MultiHeadAttention(nn.Module): key: torch.Tensor, value: torch.Tensor, ) -> torch.Tensor: - """Input shape: batch_size x seq_len x hidden_size""" - # TODO(Isotr0py): Use existing backend implementations and support FA3 - bsz, q_len, _ = query.size() + """Input shape: + (batch_size x seq_len x hidden_size) or + (batch_size x seq_len x num_heads x head_size) + """ + bsz, q_len = query.size()[:2] kv_len = key.size(1) query = query.view(bsz, q_len, self.num_heads, self.head_size) From cea91a32f2364d19d5e708026e84ce21a450c53d Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 19 Sep 2025 18:27:49 +0800 Subject: [PATCH 008/185] [Kernel][Performance] Add Triton kernel for Qwen3-VL interleaved MRoPE (#25055) Signed-off-by: Isotr0py --- tests/kernels/core/test_mrope.py | 92 +++++++++++++------ .../layers/rotary_embedding/mrope.py | 36 +++++--- 2 files changed, 85 insertions(+), 43 deletions(-) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 3f2f330f6dc3b..5a903438f5e99 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import NamedTuple import pytest import torch +from packaging.version import Version from transformers import AutoConfig +from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform @@ -15,6 +18,7 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int, head_size: int, max_position_embeddings: int, dtype: torch.dtype, device: torch.device): """Generate test data for given configuration.""" + current_platform.seed_everything(42) # Create 2D positions (3, num_tokens) for multimodal case positions = torch.randint(0, 
max_position_embeddings // 4, (3, num_tokens), @@ -33,22 +37,37 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int, return positions, query, key -def unroll_model_tp_dict(model_tp_dict): - return [(model_name, tp_size) - for model_name, tp_sizes in model_tp_dict.items() - for tp_size in tp_sizes] +class MRoPETestInfo(NamedTuple): + model_name: str + # https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 + atol: float = 1e-2 + rtol: float = 1.6e-2 + marks: list[pytest.MarkDecorator] = [] -model_tp_dict = { - "Qwen/Qwen2-VL-7B-Instruct": [1, 2], - "Qwen/Qwen2-VL-72B-Instruct": [1, 2], - "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2], - "zai-org/GLM-4.1V-9B-Thinking": [1, 2], -} +TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version -# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 -dtype_atol_rtol_list = [ - [torch.bfloat16, 1e-2, 1.6e-2], +MODELS_TO_TEST = [ + MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"), + MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"), + MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"), + MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"), + MRoPETestInfo( + model_name="Qwen/Qwen3-VL-4B-Instruct", + marks=[ + pytest.mark.skipif( + Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"), + reason="Qwen3-VL only available after Transformers v4.57", + ) + ]), + MRoPETestInfo( + model_name="Qwen/Qwen3-VL-30B-A3B-Instruct", + marks=[ + pytest.mark.skipif( + Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"), + reason="Qwen3-VL only available after Transformers v4.57", + ) + ]), ] num_tokens_list = [11, 8192] @@ -56,20 +75,29 @@ num_tokens_list = [11, 8192] @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests.") -@pytest.mark.parametrize("model_name, tp_size", - unroll_model_tp_dict(model_tp_dict)) -@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) +@pytest.mark.parametrize("model_info, model_name", [ + pytest.param(test_config, test_config.model_name, marks=test_config.marks) + for test_config in MODELS_TO_TEST +]) +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("num_tokens", num_tokens_list) -def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): +def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int, + dtype: torch.dtype, num_tokens: int): + + atol = model_info.atol + rtol = model_info.rtol config = AutoConfig.from_pretrained(model_name) + config = config.get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads total_num_heads = config.num_attention_heads num_heads = total_num_heads // tp_size num_kv_heads = max(1, total_num_kv_heads // tp_size) - head_dim = config.hidden_size // total_num_heads + head_dim = (config.head_dim if hasattr(config, "head_dim") else + config.hidden_size // total_num_heads) is_neox_style = True rope_theta = config.rope_theta @@ -111,24 +139,30 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests.") -@pytest.mark.parametrize( - "model_name, tp_size", - unroll_model_tp_dict({ - "Qwen/Qwen2-VL-7B-Instruct": [1, 2], - "zai-org/GLM-4.1V-9B-Thinking": [1, 2] - })) -@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) -@pytest.mark.parametrize("num_tokens", [4]) -def 
test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, - num_tokens): +@pytest.mark.parametrize("model_info, model_name", [ + pytest.param(test_config, test_config.model_name, marks=test_config.marks) + for test_config in MODELS_TO_TEST +]) +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("num_tokens", num_tokens_list) +def test_mrope_torch_compile_tracing(model_name: str, + model_info: MRoPETestInfo, tp_size: int, + dtype: torch.dtype, num_tokens: int): + + atol = model_info.atol + rtol = model_info.rtol + config = AutoConfig.from_pretrained(model_name) + config = config.get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads total_num_heads = config.num_attention_heads num_heads = total_num_heads // tp_size num_kv_heads = max(1, total_num_kv_heads // tp_size) - head_dim = config.hidden_size // total_num_heads + head_dim = (config.head_dim if hasattr(config, "head_dim") else + config.hidden_size // total_num_heads) is_neox_style = True rope_theta = config.rope_theta max_position = config.max_position_embeddings diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index ef61dbc1a5ab1..ccc59bbbe233f 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -15,7 +15,7 @@ from .common import apply_rotary_emb_dispatch @triton.jit -def _triton_qwen2vl_mrope_forward( +def _triton_mrope_forward( q_ptr, k_ptr, cos, @@ -30,12 +30,14 @@ def _triton_qwen2vl_mrope_forward( pad_hd: tl.constexpr, mrope_section_t: tl.constexpr, mrope_section_h: tl.constexpr, + mrope_section_w: tl.constexpr, + is_interleaved: tl.constexpr, ): # Adapted from # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py # This version supports flatten input tensors from vllm # and supports cos and sin cache with shape (3, num_tokens, head_dim // 2) - # instead of (3, bsz, seq_len, head_dim) + # instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary pid = tl.program_id(0) # locate start address q_ptr = q_ptr + pid * (n_qh * hd) @@ -47,9 +49,6 @@ def _triton_qwen2vl_mrope_forward( # #################################################################### # Note: cos and sin now have shape (3, num_tokens, head_dim // 2) - t_end = mrope_section_t - h_end = t_end + mrope_section_h - # Updated stride calculation for half head_dim half_rd = rd // 2 t_cos = cos + pid * half_rd @@ -61,9 +60,18 @@ def _triton_qwen2vl_mrope_forward( # Updated offsets for half head_dim cos_offsets = tl.arange(0, pad_hd // 2) - t_mask = cos_offsets < t_end - h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) - w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) + if is_interleaved: + h_mask = (((cos_offsets % 3) == 1) & + (cos_offsets <= 3 * mrope_section_h)) + w_mask = (((cos_offsets % 3) == 2) & + (cos_offsets <= 3 * mrope_section_w)) + t_mask = ~(h_mask | w_mask) + else: + t_end = mrope_section_t + h_end = t_end + mrope_section_h + t_mask = cos_offsets < mrope_section_t + h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) + w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) @@ -131,6 +139,7 @@ def triton_mrope( mrope_section: list[int], head_size: int, rotary_dim: int, + mrope_interleaved: bool, ) -> 
tuple[torch.Tensor, torch.Tensor]: """Qwen2VL mrope kernel. @@ -158,7 +167,7 @@ def triton_mrope( cos = cos.contiguous() sin = sin.contiguous() - _triton_qwen2vl_mrope_forward[(n_row, )]( + _triton_mrope_forward[(n_row, )]( q, k, cos, @@ -173,6 +182,8 @@ def triton_mrope( pad_hd, mrope_section[0], mrope_section[1], + mrope_section[2], + mrope_interleaved, ) return q, k @@ -201,7 +212,7 @@ class MRotaryEmbedding(RotaryEmbedding): is_neox_style: bool, dtype: torch.dtype, mrope_section: Optional[list[int]] = None, - mrope_interleaved: Optional[bool] = False, + mrope_interleaved: bool = False, ) -> None: # In Qwen2.5-VL, the maximum index value is related to the duration of # the input video. We enlarge max_position_embeddings to 4 times to get @@ -282,10 +293,6 @@ class MRotaryEmbedding(RotaryEmbedding): assert positions.ndim == 1 or positions.ndim == 2 assert key is not None - if self.mrope_interleaved: - # TODO: add triton implementation to support mrope-interleaved - return self.forward_native(positions, query, key) - num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) @@ -302,6 +309,7 @@ class MRotaryEmbedding(RotaryEmbedding): self.mrope_section, self.head_size, self.rotary_dim, + self.mrope_interleaved, ) return q.reshape(query_shape), k.reshape(key_shape) From 1dfea5f4a95df8d14b46433a479a28d56e60494c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 19 Sep 2025 03:46:16 -0700 Subject: [PATCH 009/185] [Bugfix][Perf] Misc fixes for Qwen3 VL (#25238) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_vl.py | 23 ++++++++++------------ vllm/model_executor/models/qwen3_vl_moe.py | 2 ++ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 2c36dfbce7f67..c224b78e2c27c 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1075,6 +1075,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, config.text_config.hidden_size) for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None + self.visual_dim = config.vision_config.out_hidden_size + self.multiscale_dim = self.visual_dim * self.deepstack_num_level def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors: @@ -1313,12 +1315,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ] multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0) - visual_dim = multimodal_embeddings_cat.shape[-1] // ( - self.deepstack_num_level + 1) - - main_dim, multi_dim = visual_dim, visual_dim * self.deepstack_num_level multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501 - multimodal_embeddings_cat, [main_dim, multi_dim], + multimodal_embeddings_cat, [self.visual_dim, self.multiscale_dim], dim=-1) multimodal_embeddings = torch.split(multimodal_embeddings_main, @@ -1340,10 +1338,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ], ) deepstack_input_embeds = deepstack_input_embeds.view( - inputs_embeds.shape[0], self.deepstack_num_level, - visual_dim).contiguous() - deepstack_input_embeds = deepstack_input_embeds.permute( - 1, 0, 2).contiguous() + inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim) + deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2) return deepstack_input_embeds, multimodal_embeddings def get_input_embeddings( @@ -1353,9 +1349,10 @@ class 
Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> torch.Tensor: deepstack_input_embeds = None inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None and self.use_deepstack: - deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501 - input_ids, inputs_embeds, multimodal_embeddings) + if multimodal_embeddings is not None: + if self.use_deepstack: + deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501 + input_ids, inputs_embeds, multimodal_embeddings) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [self.config.image_token_id, self.config.video_token_id]) @@ -1531,4 +1528,4 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, language_model="language_model", connector="model.visual.merger", tower_model="model.visual.", - ) + ) \ No newline at end of file diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index d25bc71dcb59b..625f94cf7ad77 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -344,3 +344,5 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): config.text_config.hidden_size) for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None + self.visual_dim = config.vision_config.out_hidden_size + self.multiscale_dim = self.visual_dim * self.deepstack_num_level \ No newline at end of file From 058525b9973cabfe27b7ab34dad6dbcbb6859f74 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 12:02:55 +0100 Subject: [PATCH 010/185] Move `PoolerConfig` from `config/__init__.py` to `config/pooler.py` (#25181) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/pooling_models.md | 4 +- docs/models/supported_models.md | 4 +- .../openai_embedding_long_text/README.md | 2 +- .../openai_embedding_long_text/client.py | 4 +- .../openai_embedding_long_text/service.sh | 2 +- .../openai/test_embedding_long_text.py | 2 +- .../models/language/pooling/test_embedding.py | 2 +- .../pooling/test_mm_classifier_conversion.py | 3 +- ...y => test_pooler_config_init_behaviour.py} | 62 ++++--- tests/test_config.py | 18 +- vllm/config/__init__.py | 155 ++++-------------- vllm/config/pooler.py | 97 +++++++++++ vllm/engine/arg_utils.py | 7 +- vllm/entrypoints/llm.py | 10 +- 14 files changed, 193 insertions(+), 179 deletions(-) rename tests/models/language/pooling/{test_override_pooler_config.py => test_pooler_config_init_behaviour.py} (74%) create mode 100644 vllm/config/pooler.py diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 0521a22c07029..50982d3d0d0f3 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -59,7 +59,7 @@ enabling the corresponding APIs: #### Predefined models If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`, -you can override some of its attributes via the `--override-pooler-config` option. +you can override some of its attributes via the `--pooler-config` option. 
#### Converted models @@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default: When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. -You can further customize this via the `--override-pooler-config` option, +You can further customize this via the `--pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index b67ebcbe3c81a..3a6738a27be09 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -457,7 +457,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A !!! note `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. - You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. + You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`. !!! note For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. @@ -552,7 +552,7 @@ If your model is not in the above list, we will try to automatically convert the !!! important For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, - e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. + e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. #### Token Classification diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/online_serving/openai_embedding_long_text/README.md index 04edc4680ea0b..00d3ded3e41c1 100644 --- a/examples/online_serving/openai_embedding_long_text/README.md +++ b/examples/online_serving/openai_embedding_long_text/README.md @@ -42,7 +42,7 @@ python client.py ### Server Configuration -The key parameters for chunked processing are in the `--override-pooler-config`: +The key parameters for chunked processing are in the `--pooler-config`: ```json { diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/online_serving/openai_embedding_long_text/client.py index 6e9838ac6d8db..4a3674bb3f2a8 100644 --- a/examples/online_serving/openai_embedding_long_text/client.py +++ b/examples/online_serving/openai_embedding_long_text/client.py @@ -13,7 +13,7 @@ Prerequisites: # MEAN pooling (processes all chunks, recommended for complete coverage) vllm serve intfloat/multilingual-e5-large \ - --override-pooler-config \ + --pooler-config \ '{"pooling_type": "MEAN", "normalize": true, ' \ '"enable_chunked_processing": true, "max_embed_len": 3072000}' \ --served-model-name multilingual-e5-large \ @@ -23,7 +23,7 @@ Prerequisites: # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) vllm serve BAAI/bge-large-en-v1.5 \ - --override-pooler-config \ + --pooler-config \ '{"pooling_type": "CLS", "normalize": true, ' \ '"enable_chunked_processing": true, "max_embed_len": 1048576}' \ --served-model-name bge-large-en-v1.5 \ diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh index 56888c8aa0e4c..1577de85f7ff2 100644 --- 
a/examples/online_serving/openai_embedding_long_text/service.sh +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab vllm serve "$MODEL_NAME" \ --tensor-parallel-size "$GPU_COUNT" \ --enforce-eager \ - --override-pooler-config "$POOLER_CONFIG" \ + --pooler-config "$POOLER_CONFIG" \ --served-model-name ${MODEL_CODE} \ --api-key "$API_KEY" \ --trust-remote-code \ diff --git a/tests/entrypoints/pooling/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py index 2d3da238d245e..ab5f765c28ed6 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_long_text.py +++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py @@ -216,7 +216,7 @@ def server_with_chunked_processing(): "--enforce-eager", "--max-model-len", "512", # Set smaller max_model_len to trigger chunking mechanism - '--override-pooler-config', + '--pooler-config', ('{"pooling_type": "MEAN", "normalize": true, ' '"enable_chunked_processing": true, "max_embed_len": 10000}'), "--gpu-memory-utilization", diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index d61ac08475e3c..17513d1bb20d7 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -58,7 +58,7 @@ def test_models( vllm_extra_kwargs = {} if model == "ssmits/Qwen2-7B-Instruct-embed-base": - vllm_extra_kwargs["override_pooler_config"] = \ + vllm_extra_kwargs["pooler_config"] = \ PoolerConfig(pooling_type="MEAN", normalize=False) max_model_len: Optional[int] = 512 diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py index 166b953de43e7..9814cad48a807 100644 --- a/tests/models/language/pooling/test_mm_classifier_conversion.py +++ b/tests/models/language/pooling/test_mm_classifier_conversion.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.config.pooler import PoolerConfig from vllm.platforms import current_platform @@ -99,7 +100,7 @@ def test_gemma_multimodal( convert="classify", load_format="auto", hf_overrides=update_config, - override_pooler_config={"pooling_type": "LAST"}, + pooler_config=PoolerConfig(pooling_type="LAST"), max_model_len=512, enforce_eager=True, tensor_parallel_size=1, diff --git a/tests/models/language/pooling/test_override_pooler_config.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py similarity index 74% rename from tests/models/language/pooling/test_override_pooler_config.py rename to tests/models/language/pooling/test_pooler_config_init_behaviour.py index 2b1c74652e76f..9b3fbd6a6cd09 100644 --- a/tests/models/language/pooling/test_override_pooler_config.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -24,18 +24,18 @@ def test_classify_models_using_activation( dtype: str, ) -> None: - with vllm_runner(model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig( - activation=False)) as vllm_model: + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(activation=False)) as vllm_model: wo_activation_out = vllm_model.classify(example_prompts) - with vllm_runner(model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig( - activation=True)) as vllm_model: + with 
vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(activation=True)) as vllm_model: w_activation_out = vllm_model.classify(example_prompts) for wo_activation, w_activation in zip(wo_activation_out, @@ -43,9 +43,8 @@ def test_classify_models_using_activation( wo_activation = torch.tensor(wo_activation) w_activation = torch.tensor(w_activation) - assert not torch.allclose( - wo_activation, w_activation, - atol=1e-2), "override_pooler_config is not working" + assert not torch.allclose(wo_activation, w_activation, + atol=1e-2), "pooler_config is not working" assert torch.allclose(softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2) @@ -65,23 +64,22 @@ def test_embed_models_using_normalize( dtype: str, ) -> None: - with vllm_runner(model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig( - normalize=False)) as vllm_model: - wo_normalize = torch.tensor(vllm_model.embed(example_prompts)) - with vllm_runner( model, max_model_len=512, dtype=dtype, - override_pooler_config=PoolerConfig(normalize=True)) as vllm_model: + pooler_config=PoolerConfig(normalize=False)) as vllm_model: + wo_normalize = torch.tensor(vllm_model.embed(example_prompts)) + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=True)) as vllm_model: w_normalize = torch.tensor(vllm_model.embed(example_prompts)) assert not torch.allclose( wo_normalize, w_normalize, - atol=1e-2), "override_pooler_config normalize is not working" + atol=1e-2), "pooler_config normalize is not working" assert torch.allclose( F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2), "w_normal should be close to normal(wo_normal)." @@ -102,18 +100,16 @@ def test_reward_models_using_softmax( dtype: str, ) -> None: - with vllm_runner( - model, - max_model_len=1024, - dtype=dtype, - override_pooler_config=PoolerConfig(softmax=False)) as vllm_model: + with vllm_runner(model, + max_model_len=1024, + dtype=dtype, + pooler_config=PoolerConfig(softmax=False)) as vllm_model: wo_softmax = vllm_model.encode(example_prompts) - with vllm_runner( - model, - max_model_len=1024, - dtype=dtype, - override_pooler_config=PoolerConfig(softmax=True)) as vllm_model: + with vllm_runner(model, + max_model_len=1024, + dtype=dtype, + pooler_config=PoolerConfig(softmax=True)) as vllm_model: w_softmax = vllm_model.encode(example_prompts) for wo, w in zip(wo_softmax, w_softmax): @@ -121,7 +117,7 @@ def test_reward_models_using_softmax( w = torch.tensor(w) assert not torch.allclose( - wo, w, atol=1e-2), "override_pooler_config softmax is not working" + wo, w, atol=1e-2), "pooler_config softmax is not working" assert torch.allclose( softmax(wo), w, atol=1e-2), "w_softmax should be close to softmax(wo_softmax)." 
diff --git a/tests/test_config.py b/tests/test_config.py index 6e37bdbee59eb..0796447c079b6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -207,25 +207,19 @@ def test_get_pooling_config(): model_id = "sentence-transformers/all-MiniLM-L12-v2" model_config = ModelConfig(model_id) - pooling_config = model_config._init_pooler_config() - assert pooling_config is not None - - assert pooling_config.normalize - assert pooling_config.pooling_type == PoolingType.MEAN.name + assert model_config.pooler_config is not None + assert model_config.pooler_config.normalize + assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config_from_args(): model_id = "sentence-transformers/all-MiniLM-L12-v2" - model_config = ModelConfig(model_id) + pooler_config = PoolerConfig(pooling_type="CLS", normalize=True) + model_config = ModelConfig(model_id, pooler_config=pooler_config) - override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True) - model_config.override_pooler_config = override_pooler_config - - pooling_config = model_config._init_pooler_config() - assert pooling_config is not None - assert asdict(pooling_config) == asdict(override_pooler_config) + assert asdict(model_config.pooler_config) == asdict(pooler_config) @pytest.mark.parametrize( diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 25daca00c02d9..45504e010d688 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -40,6 +40,7 @@ from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, MultiModalConfig) from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) +from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.speculative import SpeculativeConfig from vllm.config.structured_outputs import StructuredOutputsConfig @@ -406,13 +407,6 @@ class ModelConfig: hf_overrides: HfOverrides = field(default_factory=dict) """If a dictionary, contains arguments to be forwarded to the Hugging Face config. If a callable, it is called to update the HuggingFace config.""" - pooler_config: Optional["PoolerConfig"] = field(init=False) - """Pooler config which controls the behaviour of output pooling in pooling - models.""" - override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None - """Initialize non-default pooling config or override default pooling config - for the pooling model. e.g. `{"pooling_type": "mean", "normalize": false}`. - """ logits_processor_pattern: Optional[str] = None """Optional regex pattern specifying valid logits processor qualified names that can be passed with the `logits_processors` extra completion argument. @@ -448,6 +442,14 @@ class ModelConfig: io_processor_plugin: Optional[str] = None """IOProcessor plugin name to load at model startup""" + # Pooler config + pooler_config: Optional[PoolerConfig] = None + """Pooler config which controls the behaviour of output pooling in pooling + models.""" + override_pooler_config: Optional[Union[dict, PoolerConfig]] = None + """[DEPRECATED] Use `pooler_config` instead. This field will be removed in + v0.12.0 or v1.0.0, whichever is sooner.""" + # Multimodal config and init vars multimodal_config: Optional[MultiModalConfig] = None """Configuration for multimodal model. 
If `None`, this will be inferred @@ -709,7 +711,33 @@ class ModelConfig: self._architecture = arch logger.info("Resolved architecture: %s", arch) - self.pooler_config = self._init_pooler_config() + # Init pooler config if needed + if self.runner_type == "pooling": + if self.override_pooler_config is not None: + logger.warning_once( + "`override_pooler_config` is deprecated and will be " + "removed in v0.12.0 or v1.0.0, whichever is sooner. " + "Please use `pooler_config` instead.") + + if isinstance(self.override_pooler_config, dict): + self.pooler_config = PoolerConfig( + **self.override_pooler_config) + else: + self.pooler_config = self.override_pooler_config + + if self.pooler_config is None: + self.pooler_config = PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(self.pooler_config, k) is None: + setattr(self.pooler_config, k, v) + + default_pooling_type = self._model_info.default_pooling_type + if self.pooler_config.pooling_type is None: + self.pooler_config.pooling_type = default_pooling_type self.dtype: torch.dtype = _get_and_verify_dtype( self.model, @@ -869,29 +897,6 @@ class ModelConfig: return get_sentence_transformer_tokenizer_config( self.model, self.revision) - def _init_pooler_config(self) -> Optional["PoolerConfig"]: - if self.runner_type == "pooling": - if isinstance(self.override_pooler_config, dict): - self.override_pooler_config = PoolerConfig( - **self.override_pooler_config) - - pooler_config = self.override_pooler_config or PoolerConfig() - - base_config = get_pooling_config(self.model, self.revision) - if base_config is not None: - # Only set values that are not overridden by the user - for k, v in base_config.items(): - if getattr(pooler_config, k) is None: - setattr(pooler_config, k, v) - - default_pooling_type = self._model_info.default_pooling_type - if pooler_config.pooling_type is None: - pooler_config.pooling_type = default_pooling_type - - return pooler_config - - return None - def _verify_tokenizer_mode(self) -> None: tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) if tokenizer_mode not in get_args(TokenizerMode): @@ -1833,94 +1838,6 @@ class DeviceConfig: self.device = torch.device(self.device_type) -@config -@dataclass -class PoolerConfig: - """Controls the behavior of output pooling in pooling models.""" - - pooling_type: Optional[str] = None - """ - The pooling method of the pooling model. This should be a key in - [`vllm.model_executor.layers.pooler.PoolingType`][]. - """ - - ## for embeddings models - normalize: Optional[bool] = None - """ - Whether to normalize the embeddings outputs. Defaults to True. - """ - dimensions: Optional[int] = None - """ - Reduce the dimensions of embeddings if model - support matryoshka representation. Defaults to None. - """ - enable_chunked_processing: Optional[bool] = None - """ - Whether to enable chunked processing for long inputs that exceed the model's - maximum position embeddings. When enabled, long inputs will be split into - chunks, processed separately, and then aggregated using weighted averaging. - This allows embedding models to handle arbitrarily long text without CUDA - errors. Defaults to False. - """ - max_embed_len: Optional[int] = None - """ - Maximum input length allowed for embedding generation. When set, allows - inputs longer than max_embed_len to be accepted for embedding models. 
- When an input exceeds max_embed_len, it will be handled according to - the original max_model_len validation logic. - Defaults to None (i.e. set to max_model_len). - """ - - ## for classification models - activation: Optional[bool] = None - """ - Whether to apply activation function to the classification outputs. - Defaults to True. - """ - logit_bias: Optional[float] = None - """ - If provided, apply classification logit biases. Defaults to None. - """ - - ## for reward models - softmax: Optional[bool] = None - """ - Whether to apply softmax to the reward outputs. - Defaults to True. - """ - step_tag_id: Optional[int] = None - """ - If set, only the score corresponding to the ``step_tag_id`` in the - generated sentence should be returned. Otherwise, the scores for all tokens - are returned. - """ - returned_token_ids: Optional[list[int]] = None - """ - A list of indices for the vocabulary dimensions to be extracted, - such as the token IDs of ``good_token`` and ``bad_token`` in the - ``math-shepherd-mistral-7b-prm`` model. - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - _STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.float16, "float16": torch.float16, diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py new file mode 100644 index 0000000000000..85b5a1ace85f6 --- /dev/null +++ b/vllm/config/pooler.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import Any, Optional + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + + +@config +@dataclass +class PoolerConfig: + """Controls the behavior of output pooling in pooling models.""" + + pooling_type: Optional[str] = None + """ + The pooling method of the pooling model. This should be a key in + [`vllm.model_executor.layers.pooler.PoolingType`][]. + """ + + ## for embeddings models + normalize: Optional[bool] = None + """ + Whether to normalize the embeddings outputs. Defaults to True. + """ + dimensions: Optional[int] = None + """ + Reduce the dimensions of embeddings if model + support matryoshka representation. Defaults to None. + """ + enable_chunked_processing: Optional[bool] = None + """ + Whether to enable chunked processing for long inputs that exceed the model's + maximum position embeddings. When enabled, long inputs will be split into + chunks, processed separately, and then aggregated using weighted averaging. + This allows embedding models to handle arbitrarily long text without CUDA + errors. Defaults to False. + """ + max_embed_len: Optional[int] = None + """ + Maximum input length allowed for embedding generation. When set, allows + inputs longer than max_embed_len to be accepted for embedding models. + When an input exceeds max_embed_len, it will be handled according to + the original max_model_len validation logic. + Defaults to None (i.e. set to max_model_len). 
+ """ + + ## for classification models + activation: Optional[bool] = None + """ + Whether to apply activation function to the classification outputs. + Defaults to True. + """ + logit_bias: Optional[float] = None + """ + If provided, apply classification logit biases. Defaults to None. + """ + + ## for reward models + softmax: Optional[bool] = None + """ + Whether to apply softmax to the reward outputs. + Defaults to True. + """ + step_tag_id: Optional[int] = None + """ + If set, only the score corresponding to the ``step_tag_id`` in the + generated sentence should be returned. Otherwise, the scores for all tokens + are returned. + """ + returned_token_ids: Optional[list[int]] = None + """ + A list of indices for the vocabulary dimensions to be extracted, + such as the token IDs of ``good_token`` and ``bad_token`` in the + ``math-shepherd-mistral-7b-prm`` model. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 63282c4253509..27462b8fa0dad 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -441,6 +441,7 @@ class EngineArgs: scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls + pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ ModelConfig.override_pooler_config compilation_config: CompilationConfig = \ @@ -579,8 +580,11 @@ class EngineArgs: help=model_kwargs["hf_token"]["help"]) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) + model_group.add_argument("--pooler-config", + **model_kwargs["pooler_config"]) model_group.add_argument("--override-pooler-config", - **model_kwargs["override_pooler_config"]) + **model_kwargs["override_pooler_config"], + deprecated=True) model_group.add_argument("--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]) model_group.add_argument("--generation-config", @@ -1031,6 +1035,7 @@ class EngineArgs: mm_shm_cache_max_object_size_mb=self. mm_shm_cache_max_object_size_mb, mm_encoder_tp_mode=self.mm_encoder_tp_mode, + pooler_config=self.pooler_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index df6b16c73d6e7..e21bfce0ab085 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -151,9 +151,11 @@ class LLM: multi-modal processor obtained from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. - override_pooler_config: Initialize non-default pooling config or - override default pooling config for the pooling model. - e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. 
+ pooler_config: Initialize non-default pooling config for the pooling + model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. + override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This + argument is deprecated and will be removed in v0.12.0 or v1.0.0, + whichever is sooner. compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. @@ -191,6 +193,7 @@ class LLM: hf_token: Optional[Union[bool, str]] = None, hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, + pooler_config: Optional[PoolerConfig] = None, override_pooler_config: Optional[PoolerConfig] = None, structured_outputs_config: Optional[Union[dict[ str, Any], StructuredOutputsConfig]] = None, @@ -288,6 +291,7 @@ class LLM: hf_token=hf_token, hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, + pooler_config=pooler_config, override_pooler_config=override_pooler_config, structured_outputs_config=structured_outputs_instance, compilation_config=compilation_config_instance, From a3d087adecadd4f6f83b72181ade40ec2de92aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 19 Sep 2025 13:09:14 +0200 Subject: [PATCH 011/185] [P/D][Nixl] Introduce `KVTransferMetrics` and aggregation strategy (#22188) Signed-off-by: NickLucche --- .../kv_connector/unit/test_nixl_connector.py | 211 +++++++++++++++++- .../kv_transfer/kv_connector/utils.py | 21 +- .../kv_transfer/kv_connector/v1/base.py | 22 +- .../kv_transfer/kv_connector/v1/metrics.py | 100 +++++++++ .../kv_connector/v1/multi_connector.py | 68 +++++- .../kv_connector/v1/nixl_connector.py | 68 +++++- vllm/v1/core/sched/scheduler.py | 27 ++- vllm/v1/metrics/loggers.py | 8 +- vllm/v1/metrics/stats.py | 3 +- vllm/v1/outputs.py | 11 +- .../worker/kv_connector_model_runner_mixin.py | 11 +- 11 files changed, 525 insertions(+), 25 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/metrics.py diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 040b44dc5d2ca..6e58d158c3f4b 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -18,12 +18,18 @@ import torch from vllm import LLM from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) +from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import ( + MultiKVConnectorStats) from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, - NixlConnectorWorker) + NixlConnectorWorker, NixlKVConnectorStats) from vllm.forward_context import ForwardContext from vllm.sampling_params import SamplingParams from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from .utils import create_request, create_scheduler, create_vllm_config @@ -475,6 +481,209 @@ class TestNixlHandshake: # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # we put here is important. First run ray, it will clean up the resources, then # the rest of the tests. 
+@patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper) +def test_kv_connector_stats(dist_init): + """Test that KV transfer stats are properly recorded and retrieved.""" + vllm_config = create_vllm_config() + + # Test worker role in decode server. + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector.connector_worker = FakeNixlConnectorWorker(vllm_config, + connector.engine_id, + hand_shake_latency=0) + + # Verify that xfer_stats starts empty + initial_stats = connector.get_kv_connector_stats() + assert initial_stats is None + + # Create transfer metadata + request_id = "test_req_for_stats" + metadata = NixlConnectorMetadata() + metadata.add_new_req(request_id=request_id, + local_block_ids=[1, 2, 3], + kv_transfer_params={ + "remote_block_ids": [4, 5, 6], + "remote_engine_id": + FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + "remote_host": "localhost", + "remote_port": 1234, + "remote_tp_size": 1, + }) + connector.bind_connector_metadata(metadata) + + # Start the transfer + dummy_ctx = ForwardContext( + no_compile_layers={}, + attn_metadata={}, + virtual_engine=0, + ) + connector.start_load_kv(dummy_ctx) + + # Verify stats are recorded after transfer is complete + max_iterations = 2 + # Clear metadata before start_load_kv to prevent reprocessing same request + connector.bind_connector_metadata(NixlConnectorMetadata()) + for _ in range(max_iterations): + # Need to call start_load_kv to process completed handshakes + connector.start_load_kv(dummy_ctx) + _, done_recving = connector.get_finished(finished_req_ids=set()) + if len(done_recving) > 0 and request_id in done_recving: + break + time.sleep( + 0.1) # Small delay to allow background handshake to complete + else: + assert "Transfer did not complete within expected iterations" + + # Now check that stats were recorded + stats_after_transfer = connector.get_kv_connector_stats() + assert isinstance(stats_after_transfer, NixlKVConnectorStats) + + # Verify stats values are recorded + assert not stats_after_transfer.is_empty() + assert stats_after_transfer.data["num_successful_transfers"] == 1 + + # Verify stats are reset after retrieval + stats_after_reset = connector.get_kv_connector_stats() + assert stats_after_reset is None + + +def test_kv_connector_stats_aggregation(): + """ + Test KV transfer stats aggregation across TP ranks using + KVOutputAggregator (used by MultiprocExecutor). 
+ """ + + # Create KVOutputAggregator for 3 workers (simulating TP=3), same thing + # done in MultiprocExecutor.execute_model + aggregator = KVOutputAggregator(world_size=3) + + # Create stats for multiple workers with different transfer patterns + worker1_stats = NixlKVConnectorStats() + worker2_stats = NixlKVConnectorStats() + worker3_stats = NixlKVConnectorStats() + + # Record different transfers on each worker + # Worker 1: 2 transfers + worker1_stats.record_transfer() + worker1_stats.record_transfer() + + # Worker 2: 1 transfer + worker2_stats.record_transfer() + + # Worker 3: 3 transfers + worker3_stats.record_transfer() + worker3_stats.record_transfer() + worker3_stats.record_transfer() + + # Create ModelRunnerOutput instances for each worker + worker_outputs = [] + for i, worker_stats in enumerate( + [worker1_stats, worker2_stats, worker3_stats]): + output = ModelRunnerOutput( + req_ids=[f"req_{i}"], + req_id_to_index={f"req_{i}": 0}, + sampled_token_ids=[[123]], # dummy token + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=set([f"req_{i}_send"]) + if i < 2 else None, # Workers 0,1 finished sending + finished_recving=set([f"req_{i}_recv"]) + if i > 0 else None, # Workers 1,2 finished receiving + kv_connector_stats=worker_stats, + )) + worker_outputs.append(output) + + # Use the real aggregation mechanism (like MultiprocExecutor.execute_model) + aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0) + kv_connector_stats = \ + aggregated_output.kv_connector_output.kv_connector_stats + assert isinstance(kv_connector_stats, NixlKVConnectorStats) + # Number of total transfers across all workers. + assert kv_connector_stats.data["num_successful_transfers"] == 6 + + +def test_multi_kv_connector_stats_aggregation(): + """ + Test MultiKVConnectorStats aggregation across TP ranks using + KVOutputAggregator (used by MultiprocExecutor). 
+ """ + + aggregator = KVOutputAggregator(world_size=3) + + from dataclasses import dataclass + + @dataclass + class FooKVConnectorStats(KVConnectorStats): + + def reset(self): + self.data = {"num_foo_transfers": 0} + + def record_transfer(self): + if "num_foo_transfers" not in self.data: + self.data["num_foo_transfers"] = 0 + self.data["num_foo_transfers"] += 1 + + def is_empty(self) -> bool: + return self.data["num_foo_transfers"] == 0 + + def aggregate(self, + other: "FooKVConnectorStats") -> "FooKVConnectorStats": + if not other.is_empty(): + self.data["num_foo_transfers"] += other.data[ + "num_foo_transfers"] + return self + + def make_multi_stats(nixl_count: int, + foo_count: int) -> MultiKVConnectorStats: + data: dict[str, KVConnectorStats] = {} + if nixl_count > 0: + nixl_stats = NixlKVConnectorStats() + for _ in range(nixl_count): + nixl_stats.record_transfer() + data["NixlConnector"] = nixl_stats + if foo_count > 0: + foo_stats = FooKVConnectorStats() + for _ in range(foo_count): + foo_stats.record_transfer() + data["FooConnector"] = foo_stats + return MultiKVConnectorStats(data=data) + + # Create heterogeneous stats across 3 workers + worker_patterns = [(2, 1), (3, 0), (0, 5)] # (Nixl, Foo) + + worker_outputs: list[ModelRunnerOutput] = [] + for i, (nixl, foo) in enumerate(worker_patterns): + stats = make_multi_stats(nixl, foo) + output = ModelRunnerOutput( + req_ids=[f"req_{i}"], + req_id_to_index={f"req_{i}": 0}, + sampled_token_ids=[[123]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=set([f"req_{i}_send"]) if i < 2 else None, + finished_recving=set([f"req_{i}_recv"]) if i > 0 else None, + kv_connector_stats=stats, + ), + ) + worker_outputs.append(output) + + aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0) + kv_connector_stats = \ + aggregated_output.kv_connector_output.kv_connector_stats + assert isinstance(kv_connector_stats, MultiKVConnectorStats) + + # Validate per-connector totals across workers + assert kv_connector_stats["NixlConnector"].data[ + "num_successful_transfers"] == 5 + assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6 + + @pytest.mark.parametrize("distributed_executor_backend", ["ray", None]) @patch( "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index f4dc248a12794..911d77ba36fa0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -129,7 +129,7 @@ class KVOutputAggregator: def aggregate(self, outputs: list[ModelRunnerOutput], output_rank: int = 0) -> ModelRunnerOutput: - # aggregate kv_connector_output from all workers + # Aggregate kv_connector_output from all workers def update_finished_set(req_ids: Optional[set[str]], remaining_count_dict: dict[str, int], @@ -142,8 +142,9 @@ class KVOutputAggregator: finished_sending = set[str]() finished_recving = set[str]() - for output in outputs: - output = output.kv_connector_output + aggregated_kv_connector_stats = None + for model_runner_output in outputs: + output = model_runner_output.kv_connector_output if not output: continue update_finished_set(output.finished_sending, @@ -151,12 +152,26 @@ class KVOutputAggregator: update_finished_set(output.finished_recving, self._recv_remaining_count, finished_recving) + # Aggregate kv_connector_stats from all workers. 
+ if aggregated_kv_connector_stats is None: + # Use the first worker's kv_connector_stats as accumulator. + aggregated_kv_connector_stats = output.kv_connector_stats + elif kv_connector_stats := output.kv_connector_stats: + if aggregated_kv_connector_stats is None: + aggregated_kv_connector_stats = kv_connector_stats + else: + assert isinstance(aggregated_kv_connector_stats, + type(kv_connector_stats)) + aggregated_kv_connector_stats = \ + aggregated_kv_connector_stats.aggregate(kv_connector_stats) + # select output of the worker specified by output_rank output = outputs[output_rank] output.kv_connector_output = KVConnectorOutput( finished_sending=finished_sending or None, finished_recving=finished_recving or None, + kv_connector_stats=aggregated_kv_connector_stats or None, ) return output diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 70c07eac6304b..184d0a62f2c30 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -49,6 +49,8 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent + from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request @@ -235,6 +237,12 @@ class KVConnectorBase_V1(ABC): """ return None + def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]: + """ + Get the KV connector stats collected during the last interval. + """ + return None + # ============================== # Scheduler-side methods # ============================== @@ -365,4 +373,16 @@ class KVConnectorBase_V1(ABC): int: expected sending or receiving completion count. """ - return None \ No newline at end of file + return None + + @classmethod + def build_kv_connector_stats( + cls, + data: Optional[dict[str, + Any]] = None) -> Optional["KVConnectorStats"]: + """ + KVConnectorStats resolution method. This method allows dynamically + registered connectors to return their own KVConnectorStats object, + which can implement custom aggregation logic on the data dict. + """ + return None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py new file mode 100644 index 0000000000000..e40007230ba45 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass, field +from typing import Any, Optional, Union + +from vllm.config.kv_transfer import KVTransferConfig +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.distributed.kv_transfer.kv_transfer_state import ( + has_kv_transfer_group) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@dataclass +class KVConnectorStats: + """ + Base class for KV Connector Stats, a container for transfer performance + metrics or otherwise important telemetry from the connector. + All sub-classes need to be serializable as stats are sent from worker to + logger process. 
+ """ + data: dict[str, Any] = field(default_factory=dict) + + def reset(self): + """Reset the stats, clear the state.""" + raise NotImplementedError + + def aggregate(self, other: "KVConnectorStats") -> "KVConnectorStats": + """ + Aggregate stats with another `KVConnectorStats` object. + """ + raise NotImplementedError + + def reduce(self) -> dict[str, Union[int, float]]: + """ + Reduce the observations collected during a time interval to one or + more representative values (eg avg/median/sum of the series). + This is meant to be called by the logger to produce a summary of the + stats for the last time interval. + """ + raise NotImplementedError + + def is_empty(self) -> bool: + """Return True if the stats are empty.""" + raise NotImplementedError + + +class KVConnectorLogging: + + def __init__(self, kv_tranfer_config: KVTransferConfig): + # This should be called on frontend process. + assert not has_kv_transfer_group() + # Instantiate the connector's stats class. + if kv_tranfer_config and kv_tranfer_config.kv_connector: + self.connector_cls = KVConnectorFactory.get_connector_class( + kv_tranfer_config) + self.reset() + + def reset(self): + self.transfer_stats_accumulator: Optional[KVConnectorStats] = None + + def observe(self, transfer_stats_data: dict[str, Any]): + # Should not be called when a KVConnector is not configured. + assert self.connector_cls is not None + # Called periodically when connector syncs with the scheduler. + # Note that this is not the same as the logging interval. + # We expect transfer_stats_data to be aggregated across all workers and + # consist of observations from a single connector or a MultiConnector. + transfer_stats = self.connector_cls.build_kv_connector_stats( + transfer_stats_data) + if transfer_stats is None: + logger.warning_once( + "The connector %s is collecting stats but " + "does not implement the " + "`build_kv_connector_stats` method. " + "Stats will not be logged.", self.connector_cls) + return + + if self.transfer_stats_accumulator is None: + self.transfer_stats_accumulator = transfer_stats + else: + # Accumulate last interval stats. + self.transfer_stats_accumulator = \ + self.transfer_stats_accumulator.aggregate(transfer_stats) + + def log(self, log_fn=logger.info): + """Log transfer metrics periodically, similar to throughput logging""" + if (self.transfer_stats_accumulator + and not self.transfer_stats_accumulator.is_empty()): + # Produce a single cumulative stats object for the last time + # interval from the recorded observations. 
+ xfer_metrics = self.transfer_stats_accumulator.reduce() + xfer_metrics_str = ", ".join(f"{k}={v}" + for k, v in xfer_metrics.items()) + log_fn("KV Transfer metrics: %s", xfer_metrics_str) + + # Reset metrics for next interval + self.reset() \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 616d158d67670..6836a71e58d62 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -9,19 +9,21 @@ import torch from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig -from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.logger import init_logger -from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata + from vllm.distributed.kv_events import KVCacheEvent from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request logger = init_logger(__name__) @@ -33,6 +35,43 @@ class MultiKVConnectorMetadata(KVConnectorMetadata): extra_async_saves: Optional[dict[str, int]] = None +@dataclass +class MultiKVConnectorStats(KVConnectorStats): + """ + Maintain a dict of KVConnectorStats objects, one for each connector. + This is used to aggregate the stats from all connectors separately. + """ + + def aggregate(self, other: KVConnectorStats) -> KVConnectorStats: + for connector_id, stats in other.data.items(): + if connector_id not in self.data: + self[connector_id] = stats + else: + assert isinstance(stats, type(self.data[connector_id])) + self[connector_id] = self[connector_id].aggregate(stats) + return self + + def reset(self): + for stats in self.data.values(): + stats.reset() + + def reduce(self) -> dict[str, Any]: + # TODO (NickLucche) Adjust for logging on separate lines + return { + connector_id: stats.reduce() + for connector_id, stats in self.data.items() + } + + def is_empty(self) -> bool: + return all(stats.is_empty() for stats in self.data.values()) + + def __getitem__(self, connector_id: str) -> KVConnectorStats: + return self.data[connector_id] + + def __setitem__(self, connector_id: str, stats: KVConnectorStats): + self.data[connector_id] = stats + + class MultiConnector(KVConnectorBase_V1): """ A wrapper for using multiple KVConnectors at the same time. 
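A minimal sketch (not part of this patch) of the per-connector aggregation strategy added above, using only the classes introduced in this series; the dictionary key is the connector class name, as produced by `MultiConnector.get_kv_connector_stats`:

from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
    MultiKVConnectorStats)
from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
    NixlKVConnectorStats)

# Worker 0 observed one finished transfer, worker 1 observed two.
rank0 = MultiKVConnectorStats(data={"NixlConnector": NixlKVConnectorStats()})
rank0["NixlConnector"].record_transfer()

rank1 = MultiKVConnectorStats(data={"NixlConnector": NixlKVConnectorStats()})
rank1["NixlConnector"].record_transfer()
rank1["NixlConnector"].record_transfer()

# aggregate() folds the other object into the accumulator, per connector id.
merged = rank0.aggregate(rank1)
assert merged["NixlConnector"].data["num_successful_transfers"] == 3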
@@ -46,6 +85,7 @@ class MultiConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) self._connectors: list[KVConnectorBase_V1] = [] + self._ktc_kv_transfer_config = [] ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( "connectors") assert ktcs is not None @@ -57,6 +97,7 @@ class MultiConnector(KVConnectorBase_V1): **ktc, engine_id=engine_id) self._connectors.append( KVConnectorFactory.create_connector(temp_config, role)) + self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config) # A mapping from request id to the index of the connector chosen to # load the request from (if any). @@ -227,7 +268,7 @@ class MultiConnector(KVConnectorBase_V1): return async_saves > 0, kv_txfer_params - def take_events(self) -> Iterable[KVCacheEvent]: + def take_events(self) -> Iterable["KVCacheEvent"]: for c in self._connectors: yield from c.take_events() @@ -264,3 +305,24 @@ class MultiConnector(KVConnectorBase_V1): f"({', '.join(layouts) })." f"All connectors must use the same layout.") return next(iter(layouts), None) + + @classmethod + def build_kv_connector_stats( + cls, + data: Optional[dict[str, + Any]] = None) -> Optional[KVConnectorStats]: + return MultiKVConnectorStats(data=data) if data is not None \ + else MultiKVConnectorStats() + + def get_kv_connector_stats(self) -> Optional[MultiKVConnectorStats]: + # Group connector stats by connector type. + stats_by_connector: Optional[MultiKVConnectorStats] = None + for c in self._connectors: + stats = c.get_kv_connector_stats() + if stats is None: + continue + if stats_by_connector is None: + # Lazy init to allow optional return value. + stats_by_connector = MultiKVConnectorStats() + stats_by_connector[c.__class__.__name__] = stats + return stats_by_connector diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 1ff1407aeb99b..ff62f60e5a42c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import copy import logging import math import queue @@ -11,7 +12,7 @@ from collections import defaultdict from collections.abc import Iterator from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Optional, Union import msgspec import numpy as np @@ -23,6 +24,8 @@ from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) @@ -33,7 +36,6 @@ from vllm.platforms import _Backend, current_platform from vllm.utils import make_zmq_path, make_zmq_socket from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.request import RequestStatus if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -206,6 
+208,18 @@ class NixlConnector(KVConnectorBase_V1): assert self.connector_worker is not None return self.connector_worker.get_finished() + def get_kv_connector_stats(self) -> Optional[KVConnectorStats]: + assert self.connector_worker is not None + return self.connector_worker.get_kv_connector_stats() + + @classmethod + def build_kv_connector_stats( + cls, + data: Optional[dict[str, + Any]] = None) -> Optional[KVConnectorStats]: + return NixlKVConnectorStats(data=data) if data is not None \ + else NixlKVConnectorStats() + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: assert self.connector_worker is not None @@ -377,6 +391,7 @@ class NixlConnectorScheduler: Once a request is finished, determine whether request blocks should be freed now or will be sent asynchronously and freed later. """ + from vllm.v1.request import RequestStatus params = request.kv_transfer_params logger.debug( @@ -550,6 +565,7 @@ class NixlConnectorWorker: # With heterogeneous TP, P must wait for all assigned D TP workers to # finish reading before safely freeing the blocks. self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) + self.xfer_stats = NixlKVConnectorStats() def __del__(self): """Cleanup background threads on destruction.""" @@ -1097,6 +1113,8 @@ class NixlConnectorWorker: xfer_state = self.nixl_wrapper.check_xfer_state(handle) if xfer_state == "DONE": self.nixl_wrapper.release_xfer_handle(handle) + # TODO (NickLucche) Get from NIXL telemetry once integrated + self.xfer_stats.record_transfer() elif xfer_state == "PROC": in_progress = True continue @@ -1248,7 +1266,6 @@ class NixlConnectorWorker: self.nixl_wrapper.transfer(handle) # Use handle to check completion in future step(). - # TODO (NickLucche) surface xfer elapsed time self._recving_transfers[request_id].append( (handle, time.perf_counter())) @@ -1300,6 +1317,15 @@ class NixlConnectorWorker: block_len = self.block_len return block_len + def get_kv_connector_stats(self) -> Optional[KVConnectorStats]: + """ + Get the KV transfer stats for the connector. 
+ """ + # Clear stats for next iteration + if not self.xfer_stats.is_empty(): + return self.xfer_stats.clone_and_reset() + return None + @contextlib.contextmanager def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: @@ -1318,3 +1344,39 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: finally: if ctx is not None: ctx.destroy(linger=0) + + +@dataclass +class NixlKVConnectorStats(KVConnectorStats): + """Container for transfer performance metrics""" + + def __post_init__(self): + if "num_successful_transfers" not in self.data: + self.data["num_successful_transfers"] = 0 + + def reset(self): + self.data = {"num_successful_transfers": 0} + + def record_transfer(self): + # TODO: record actual transfer stats when available + self.data["num_successful_transfers"] += 1 + + def clone_and_reset(self) -> "NixlKVConnectorStats": + old = copy.copy(self) + self.reset() + return old + + def is_empty(self) -> bool: + return self.data["num_successful_transfers"] == 0 + + def aggregate(self, other: KVConnectorStats) -> KVConnectorStats: + if not other.is_empty(): + self.data["num_successful_transfers"] += other.data[ + "num_successful_transfers"] + return self + + def reduce(self) -> dict[str, Union[int, float]]: + # TODO: reduce stats to a single value, calculate latency/throughput + return { + "num_successful_transfers": self.data["num_successful_transfers"] + } \ No newline at end of file diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 85ca858ad7bd6..b08898d253cab 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -15,6 +15,8 @@ from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, @@ -869,9 +871,12 @@ class Scheduler(SchedulerInterface): num_scheduled_tokens = scheduler_output.num_scheduled_tokens pooler_outputs = model_runner_output.pooler_output num_nans_in_logits = model_runner_output.num_nans_in_logits + kv_connector_output = model_runner_output.kv_connector_output outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: Optional[SpecDecodingStats] = None + kv_connector_stats = (kv_connector_output.kv_connector_stats + if kv_connector_output else None) # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more, # the below loop can be a performance bottleneck. We should do our best @@ -1007,7 +1012,8 @@ class Scheduler(SchedulerInterface): finished_requests=finished_set) finished_req_ids.clear() - if (stats := self.make_stats(spec_decoding_stats)) is not None: + if (stats := self.make_stats(spec_decoding_stats, + kv_connector_stats)) is not None: # Return stats to only one of the front-ends. 
if (eco := next(iter(engine_core_outputs.values()), None)) is None: # We must return the stats even if there are no request @@ -1172,20 +1178,21 @@ class Scheduler(SchedulerInterface): def make_stats( self, spec_decoding_stats: Optional[SpecDecodingStats] = None, + kv_connector_stats: Optional[KVConnectorStats] = None, ) -> Optional[SchedulerStats]: if not self.log_stats: return None prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats() assert prefix_cache_stats is not None - return SchedulerStats( - num_running_reqs=len(self.running), - num_waiting_reqs=len(self.waiting), - kv_cache_usage=self.kv_cache_manager.usage, - prefix_cache_stats=prefix_cache_stats, - spec_decoding_stats=spec_decoding_stats, - num_corrupted_reqs=sum(req.is_output_corrupted - for req in self.running), - ) + return SchedulerStats(num_running_reqs=len(self.running), + num_waiting_reqs=len(self.waiting), + kv_cache_usage=self.kv_cache_manager.usage, + prefix_cache_stats=prefix_cache_stats, + spec_decoding_stats=spec_decoding_stats, + num_corrupted_reqs=sum(req.is_output_corrupted + for req in self.running), + kv_connector_stats=kv_connector_stats.data + if kv_connector_stats else None) def make_spec_decoding_stats( self, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b30036a6f8e80..f0076b2d81dbf 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,6 +9,8 @@ from typing import Callable, Optional, Union import prometheus_client from vllm.config import SupportsMetricsInfo, VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorLogging) from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason @@ -59,6 +61,8 @@ class LoggingStatLogger(StatLoggerBase): # TODO: Make the interval configurable. 
self.prefix_caching_metrics = PrefixCachingMetrics() self.spec_decoding_logging = SpecDecodingLogging() + kv_tranfer_config = self.vllm_config.kv_transfer_config + self.kv_transfer_logging = KVConnectorLogging(kv_tranfer_config) self.last_prompt_throughput: float = 0.0 self.last_generation_throughput: float = 0.0 @@ -97,7 +101,8 @@ class LoggingStatLogger(StatLoggerBase): if scheduler_stats.spec_decoding_stats is not None: self.spec_decoding_logging.observe( scheduler_stats.spec_decoding_stats) - + if kv_connector_stats := scheduler_stats.kv_connector_stats: + self.kv_transfer_logging.observe(kv_connector_stats) self.last_scheduler_stats = scheduler_stats def log(self): @@ -136,6 +141,7 @@ class LoggingStatLogger(StatLoggerBase): self.prefix_caching_metrics.hit_rate * 100, ) self.spec_decoding_logging.log(log_fn=log_fn) + self.kv_transfer_logging.log(log_fn=log_fn) def log_engine_initialized(self): if self.vllm_config.cache_config.num_gpu_blocks: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index e6c344d193df2..0eff557336bc0 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -3,7 +3,7 @@ import time from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional from vllm.v1.spec_decode.metrics import SpecDecodingStats @@ -43,6 +43,7 @@ class SchedulerStats: default_factory=PrefixCacheStats) spec_decoding_stats: Optional[SpecDecodingStats] = None + kv_connector_stats: Optional[dict[str, Any]] = None num_corrupted_reqs: int = 0 diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 1b2da8addb19e..e6cc6019b1728 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -3,10 +3,14 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import NamedTuple, Optional +from typing import TYPE_CHECKING, NamedTuple, Optional import torch +if TYPE_CHECKING: + from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) + class LogprobsLists(NamedTuple): @@ -77,6 +81,11 @@ class KVConnectorOutput: # [req_ids] finished_sending: Optional[set[str]] = None finished_recving: Optional[set[str]] = None + kv_connector_stats: Optional["KVConnectorStats"] = None + + def is_empty(self): + return (not self.finished_sending and not self.finished_recving + and not self.kv_connector_stats) # ModelRunnerOutput is serialized and sent to the scheduler process. 
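A minimal sketch (not part of this patch) of how the recorded stats reach the logger: only the serializable `data` dict is carried in `SchedulerStats.kv_connector_stats`, and the frontend rebuilds a typed object through the connector's `build_kv_connector_stats` classmethod before reducing it:

from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
    NixlConnector, NixlKVConnectorStats)

# Worker side: the connector records each completed transfer.
worker_stats = NixlKVConnectorStats()
worker_stats.record_transfer()

# Only the plain dict payload crosses the scheduler/frontend boundary.
payload = worker_stats.data  # {"num_successful_transfers": 1}

# Frontend side: rebuild a typed stats object and reduce it for logging.
rebuilt = NixlConnector.build_kv_connector_stats(payload)
assert rebuilt is not None
print(rebuilt.reduce())  # {'num_successful_transfers': 1}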
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 3eb9f26e9f5b6..016a90c196ba3 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -13,6 +13,8 @@ from vllm.distributed.kv_transfer import (ensure_kv_transfer_shutdown, get_kv_transfer_group, has_kv_transfer_group) from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, @@ -119,4 +121,11 @@ class KVConnectorModelRunnerMixin: output.finished_sending, output.finished_recving = ( kv_connector.get_finished(scheduler_output.finished_req_ids)) - kv_connector.clear_connector_metadata() + output.kv_connector_stats = KVConnectorModelRunnerMixin.\ + get_kv_connector_stats() + + @staticmethod + def get_kv_connector_stats() -> Optional[KVConnectorStats]: + if has_kv_transfer_group(): + return get_kv_transfer_group().get_kv_connector_stats() + return None From 5089fd749cbe4233a29f29ce706d56c47464c117 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 19 Sep 2025 19:10:52 +0800 Subject: [PATCH 012/185] [V0 Deprecation] Remove V0 logic from `get_input_embeddings` interface (#25242) Signed-off-by: DarkLight1337 --- .../models/hyperclovax_vision.py | 43 +++++++------------ vllm/model_executor/models/interfaces.py | 24 ----------- vllm/model_executor/models/ultravox.py | 19 ++------ vllm/model_executor/models/utils.py | 18 +------- 4 files changed, 21 insertions(+), 83 deletions(-) diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 53f0585541b1c..870addd0dcbca 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -46,7 +46,8 @@ from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel -from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix +from .utils import (AutoWeightsLoader, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) from .vision import get_vision_encoder_info EOT = "<|endofturn|>" @@ -740,33 +741,20 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - **kwargs, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if (kwargs.get("pixel_values_images") is not None - or kwargs.get("pixel_values_videos") - is not None): # v0 compatibility - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - if multimodal_embeddings is not None: - multimodal_embeddings = torch.cat(multimodal_embeddings, dim=0) - _mask_image = input_ids == self.config.image_token_id - _mask_video = input_ids == self.config.video_token_id - assert _mask_image.sum() + _mask_video.sum() == len( - multimodal_embeddings) + if multimodal_embeddings is not None \ + and len(multimodal_embeddings) != 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + placeholder_token_id=[ + self.config.image_token_id, + self.config.video_token_id, + ], 
+ ) - if multimodal_embeddings.dtype != inputs_embeds.dtype: - multimodal_embeddings = multimodal_embeddings.to( - dtype=inputs_embeds.dtype) - if multimodal_embeddings.device != inputs_embeds.device: - multimodal_embeddings = multimodal_embeddings.to( - device=inputs_embeds.device) - - if _mask_image.sum() > 0: - inputs_embeds[ - _mask_image] = multimodal_embeddings[:sum(_mask_image)] - if _mask_video.sum() > 0: - inputs_embeds[_mask_video] = multimodal_embeddings[ - -sum(_mask_video):] return inputs_embeds def forward( @@ -783,8 +771,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. elif inputs_embeds is None: - inputs_embeds = self.get_input_embeddings(input_ids=input_ids, - **kwargs) + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index e9c600e36cfa7..6be70c4b3b214 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -23,7 +23,6 @@ from vllm.utils import supports_kw from .interfaces_base import is_pooling_model if TYPE_CHECKING: - from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.models.utils import WeightsMapper from vllm.sequence import IntermediateTensors @@ -97,33 +96,10 @@ class SupportsMultiModal(Protocol): """ ... - # Only for models that support v0 chunked prefill - # TODO(ywang96): Remove this overload once v0 is deprecated - @overload def get_input_embeddings( self, input_ids: Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - attn_metadata: Optional["AttentionMetadata"] = None, - ) -> Tensor: - ... - - # TODO: Remove this overload once v0 is deprecated - @overload - def get_input_embeddings( - self, - input_ids: Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> Tensor: - ... 
- - def get_input_embeddings( - self, - input_ids: Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - # Only necessary so that the v0 overload is valid - # TODO: Remove attn_metadata once v0 is deprecated - attn_metadata: Optional["AttentionMetadata"] = None, ) -> Tensor: """ Returns the input embeddings merged from the text embeddings from diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 371ca817d5f92..f1f11c5fe8f00 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -13,9 +13,7 @@ from transformers import BatchFeature, ProcessorMixin from transformers.models.whisper import WhisperFeatureExtractor from transformers.models.whisper.modeling_whisper import WhisperEncoder -from vllm import envs from vllm.config import VllmConfig -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.model_loader import DefaultModelLoader @@ -37,8 +35,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings, - merge_multimodal_embeddings_from_map) + merge_multimodal_embeddings) _AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>" _MAX_ENCODER_BATCH_SIZE = 16 @@ -568,17 +565,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): safe_input_ids) if multimodal_embeddings is not None and len( multimodal_embeddings) > 0: - - # TODO(ywang96): remove this block after v0 is deprecated. - if not envs.VLLM_USE_V1: - attn_metadata = get_forward_context().attn_metadata - merge_multimodal_embeddings_from_map( - inputs_embeds, multimodal_embeddings, - attn_metadata.multi_modal_placeholder_index_maps["audio"]) - else: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.audio_token_index) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.audio_token_index) return inputs_embeds def forward(self, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index e716ec582baab..83e381b3b1578 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -15,7 +15,7 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors +from vllm.multimodal import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import (get_cuda_view_from_cpu_tensor, is_pin_memory_available, is_uva_available) @@ -389,22 +389,6 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: _embedding_count_expression(inner) for inner in embeddings) -def merge_multimodal_embeddings_from_map( - inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, - placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor: - """ - Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided - placeholder map . - - Note: - This updates ``inputs_embeds`` in place. 
- """ - flattened_embeddings = _flatten_embeddings(multimodal_embeddings) - inputs_embeds[placeholder_map.dest] = flattened_embeddings[ - placeholder_map.src].to(dtype=inputs_embeds.dtype) - return inputs_embeds - - def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, From 838d7116ba59db528647b29f0d000742f4af9d4b Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Fri, 19 Sep 2025 20:25:12 +0800 Subject: [PATCH 013/185] [Qwen] Remove cuda hard-code in qwen3 next (#25243) Signed-off-by: Icey <1790571317@qq.com> --- vllm/model_executor/models/qwen3_next.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 0c974ee44eee2..98749c160ba4d 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -306,7 +306,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): eps=self.layer_norm_epsilon, group_size=None, norm_before_gate=True, - device=torch.cuda.current_device(), + device=current_platform.current_device(), dtype=config.torch_dtype, ) From cf278ff3b231b4fca0232db2d1183427dbc200bb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:12:55 +0100 Subject: [PATCH 014/185] Update CODEOWNERS (#25269) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 08717cdde643a..323675993467a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -66,18 +66,25 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/models/test_transformers.py @hmellor # Docs -/docs @hmellor +/docs/mkdocs @hmellor +/docs/**/*.yml @hmellor +/requirements/docs.txt @hmellor +.readthedocs.yaml @hmellor mkdocs.yaml @hmellor +# Linting +.markdownlint.yaml @hmellor +.pre-commit-config.yaml @hmellor + # CPU -/vllm/v1/worker/^cpu @bigPYJ1151 +/vllm/v1/worker/cpu* @bigPYJ1151 /csrc/cpu @bigPYJ1151 /vllm/platforms/cpu.py @bigPYJ1151 /cmake/cpu_extension.cmake @bigPYJ1151 /docker/Dockerfile.cpu @bigPYJ1151 # Intel GPU -/vllm/v1/worker/^xpu @jikunshang +/vllm/v1/worker/xpu* @jikunshang /vllm/platforms/xpu.py @jikunshang /docker/Dockerfile.xpu @jikunshang From aed16879a9191a58adc5b8ac3973454dddefe018 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:22:33 +0100 Subject: [PATCH 015/185] Move `ModelConfig` from `config/__init__.py` to `config/model.py` (#25252) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/conftest.py | 3 +- tests/distributed/test_pipeline_parallel.py | 2 +- tests/models/test_initialization.py | 5 +- tests/v1/sample/test_logprobs.py | 9 +- vllm/config/__init__.py | 2108 +------------------ vllm/config/model.py | 2006 ++++++++++++++++++ vllm/config/scheduler.py | 8 +- vllm/config/utils.py | 100 +- vllm/engine/arg_utils.py | 15 +- vllm/model_executor/model_loader/utils.py | 7 +- vllm/model_executor/models/registry.py | 22 +- vllm/v1/sample/ops/topk_topp_sampler.py | 13 +- vllm/v1/sample/sampler.py | 11 +- 13 files changed, 2160 insertions(+), 2149 deletions(-) create mode 100644 vllm/config/model.py diff --git a/tests/conftest.py b/tests/conftest.py index 9d433dedbf479..3cd93f4ad3289 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,7 +39,8 @@ from vllm import LLM, SamplingParams from 
vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype +from vllm.config.model import (ConvertOption, RunnerOption, + _get_and_verify_dtype) from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index fcd09844c0951..073b362b64749 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional import pytest -from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption +from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption from vllm.logger import init_logger from vllm.transformers_utils.config import get_config diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 56b5d32d16536..9281579b71e74 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -7,7 +7,6 @@ from unittest.mock import patch import pytest from vllm import LLM -from vllm.config import ModelImpl from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.utils import GiB_bytes from vllm.v1.core.kv_cache_utils import get_kv_cache_configs @@ -111,8 +110,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, # these tests seem to produce leftover memory gpu_memory_utilization=0.80, load_format="dummy", - model_impl=ModelImpl.TRANSFORMERS - if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM, + model_impl="transformers" + if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm", hf_overrides=hf_overrides_fn, max_num_seqs=model_info.max_num_seqs) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 570e330208a39..71aa9e3d379cf 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -3,6 +3,7 @@ import itertools from collections.abc import Generator +from typing import get_args import pytest import torch @@ -464,7 +465,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): assert len(prompt_logprob) == vocab_size -@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) +@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode)) def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): """Test with LLM engine with different logprobs_mode. 
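A minimal sketch (not part of this patch) of why the test above now uses `get_args(LogprobsMode)`: after the move into `vllm/config/model.py`, `LogprobsMode` is a `Literal` of plain strings rather than an enum, so modes are selected and compared as strings:

from typing import get_args

from vllm.config.model import LogprobsMode  # also re-exported by vllm.config

modes = get_args(LogprobsMode)
assert "raw_logprobs" in modes and "processed_logits" in modes

# Selecting a mode is now just the string, e.g. (illustrative call):
#   LLM(model="Qwen/Qwen3-0.6B", logprobs_mode="processed_logits")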
@@ -493,14 +494,12 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode, for logprobs in output.logprobs: for token_id in logprobs: logprob = logprobs[token_id] - if logprobs_mode in (LogprobsMode.RAW_LOGPROBS, - LogprobsMode.PROCESSED_LOGPROBS): + if logprobs_mode in ("raw_logprobs", "processed_logprobs"): assert logprob.logprob <= 0 if logprob.logprob > 0: positive_values = positive_values + 1 total_token_with_logprobs = total_token_with_logprobs + 1 assert total_token_with_logprobs >= len(results[0].outputs) - if logprobs_mode in (LogprobsMode.RAW_LOGITS, - LogprobsMode.PROCESSED_LOGITS): + if logprobs_mode in ("raw_logits", "processed_logits"): assert positive_values > 0 del llm diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 45504e010d688..ddd8de4324f6f 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -4,27 +4,22 @@ # ruff: noqa: F401 import ast import copy -import enum import hashlib import inspect import json import os import textwrap -import warnings from contextlib import contextmanager -from dataclasses import InitVar, field, fields, is_dataclass, replace +from dataclasses import field, fields, is_dataclass, replace from functools import cached_property, lru_cache -from importlib.util import find_spec -from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol, - TypeVar, Union, cast, get_args) +from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar, + Union, cast) import regex as re import torch -from pydantic import (ConfigDict, SkipValidation, field_validator, - model_validator) +from pydantic import ConfigDict, SkipValidation from pydantic.dataclasses import dataclass -from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE -from typing_extensions import assert_never, runtime_checkable +from typing_extensions import runtime_checkable import vllm.envs as envs from vllm import version @@ -36,45 +31,31 @@ from vllm.config.kv_events import KVEventsConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.config.load import LoadConfig from vllm.config.lora import LoRAConfig +from vllm.config.model import (ConvertOption, HfOverrides, LogprobsMode, + ModelConfig, ModelDType, ModelImpl, + RunnerOption, TaskOption, TokenizerMode, + iter_architecture_defaults, + try_match_architecture_defaults) from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, MultiModalConfig) from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.pooler import PoolerConfig -from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy +from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy from vllm.config.speculative import SpeculativeConfig from vllm.config.structured_outputs import StructuredOutputsConfig -from vllm.config.utils import ConfigType, config +from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.platforms import current_platform -from vllm.transformers_utils.config import ( - ConfigFormat, get_config, get_hf_image_processor_config, - get_hf_text_config, get_pooling_config, - get_sentence_transformer_tokenizer_config, is_encoder_decoder, - is_interleaved, maybe_override_with_speculators_target_model, - try_get_generation_config, try_get_safetensors_metadata, - try_get_tokenizer_config, uses_mrope) -from 
vllm.transformers_utils.runai_utils import (ObjectStorageModel, - is_runai_obj_uri) -from vllm.transformers_utils.utils import maybe_model_redirect -from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, - STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, - LazyLoader, common_broadcastable_dtype, random_uuid) +from vllm.transformers_utils.runai_utils import is_runai_obj_uri +from vllm.utils import random_uuid if TYPE_CHECKING: from _typeshed import DataclassInstance from transformers.configuration_utils import PretrainedConfig - import vllm.model_executor.layers.quantization as me_quant - import vllm.model_executor.models as me_models - from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) - from vllm.v1.sample.logits_processor import LogitsProcessor - - HfOverrides = Union[dict, Callable[[type], type]] else: DataclassInstance = Any PretrainedConfig = Any @@ -82,83 +63,10 @@ else: QuantizationMethods = Any BaseModelLoader = Any LogitsProcessor = Any - HfOverrides = Union[dict[str, Any], Callable[[type], type]] - - me_quant = LazyLoader("model_executor", globals(), - "vllm.model_executor.layers.quantization") - me_models = LazyLoader("model_executor", globals(), - "vllm.model_executor.models") logger = init_logger(__name__) DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance) -TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", - "score", "reward", "transcription", "draft"] - -_ResolvedTask = Literal["generate", "transcription", "encode", "embed", - "classify", "reward", "draft"] - -RunnerOption = Literal["auto", "generate", "pooling", "draft"] - -RunnerType = Literal["generate", "pooling", "draft"] - -ConvertOption = Literal["auto", "none", "embed", "classify", "reward"] - -ConvertType = Literal["none", "embed", "classify", "reward"] - -_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { - "generate": ["generate", "transcription"], - "pooling": ["embedding", "embed", "classify", "score", "reward"], - "draft": ["draft"], -} - -_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { - "generate": [], - "pooling": ["embed", "classify", "reward"], - "draft": [], -} - -# Some model suffixes are based on auto classes from Transformers: -# https://huggingface.co/docs/transformers/en/model_doc/auto -# NOTE: Items higher on this list priority over lower ones -_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [ - ("ForCausalLM", ("generate", "none")), - ("ForConditionalGeneration", ("generate", "none")), - ("ChatModel", ("generate", "none")), - ("LMHeadModel", ("generate", "none")), - ("ForTextEncoding", ("pooling", "embed")), - ("EmbeddingModel", ("pooling", "embed")), - ("ForSequenceClassification", ("pooling", "classify")), - ("ForAudioClassification", ("pooling", "classify")), - ("ForImageClassification", ("pooling", "classify")), - ("ForVideoClassification", ("pooling", "classify")), - ("ClassificationModel", ("pooling", "classify")), - ("ForRewardModeling", ("pooling", "reward")), - ("RewardModel", ("pooling", "reward")), - # Let other `*Model`s take priority - ("Model", ("pooling", "embed")), -] - - -def iter_architecture_defaults(): - yield from _SUFFIX_TO_DEFAULTS - - -def try_match_architecture_defaults( - architecture: str, - *, - runner_type: Optional[RunnerType] = None, - convert_type: Optional[ConvertType] = None, -) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]: - for suffix, (default_runner_type, - 
default_convert_type) in iter_architecture_defaults(): - if ((runner_type is None or runner_type == default_runner_type) and - (convert_type is None or convert_type == default_convert_type) - and architecture.endswith(suffix)): - return suffix, (default_runner_type, default_convert_type) - - return None - @runtime_checkable class SupportsHash(Protocol): @@ -173,1608 +81,6 @@ class SupportsMetricsInfo(Protocol): ... -class ModelImpl(str, enum.Enum): - AUTO = "auto" - VLLM = "vllm" - TRANSFORMERS = "transformers" - TERRATORCH = "terratorch" - - -def get_attr_docs(cls: type[Any]) -> dict[str, str]: - """ - Get any docstrings placed after attribute assignments in a class body. - - https://davidism.com/mit-license/ - """ - - def pairwise(iterable): - """ - Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise - - Can be removed when Python 3.9 support is dropped. - """ - iterator = iter(iterable) - a = next(iterator, None) - - for b in iterator: - yield a, b - a = b - - try: - cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] - except (OSError, KeyError, TypeError): - # HACK: Python 3.13+ workaround - set missing __firstlineno__ - # Workaround can be removed after we upgrade to pydantic==2.12.0 - with open(inspect.getfile(cls)) as f: - for i, line in enumerate(f): - if f"class {cls.__name__}" in line and ":" in line: - cls.__firstlineno__ = i + 1 - break - cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] - - if not isinstance(cls_node, ast.ClassDef): - raise TypeError("Given object was not a class.") - - out = {} - - # Consider each pair of nodes. - for a, b in pairwise(cls_node.body): - # Must be an assignment then a constant string. - if (not isinstance(a, (ast.Assign, ast.AnnAssign)) - or not isinstance(b, ast.Expr) - or not isinstance(b.value, ast.Constant) - or not isinstance(b.value.value, str)): - continue - - doc = inspect.cleandoc(b.value.value) - - # An assignment can have multiple targets (a = b = v), but an - # annotated assignment only has one target. - targets = a.targets if isinstance(a, ast.Assign) else [a.target] - - for target in targets: - # Must be assigning to a plain name. - if not isinstance(target, ast.Name): - continue - - out[target.id] = doc - - return out - - -def is_init_field(cls: ConfigType, name: str) -> bool: - return next(f for f in fields(cls) if f.name == name).init - - -TokenizerMode = Literal["auto", "slow", "mistral", "custom"] -ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] - - -class LogprobsMode(enum.Enum): - RAW_LOGITS = "raw_logits" - RAW_LOGPROBS = "raw_logprobs" - PROCESSED_LOGITS = "processed_logits" - PROCESSED_LOGPROBS = "processed_logprobs" - - -@config -@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) -class ModelConfig: - """Configuration for the model.""" - - model: str = "Qwen/Qwen3-0.6B" - """Name or path of the Hugging Face model to use. It is also used as the - content for `model_name` tag in metrics output when `served_model_name` is - not specified.""" - runner: RunnerOption = "auto" - """The type of model runner to use. Each vLLM instance only supports one - model runner, even if the same model can be used for multiple types.""" - convert: ConvertOption = "auto" - """Convert the model using adapters defined in - [vllm.model_executor.models.adapters][]. 
The most common use case is to - adapt a text generation model to be used for pooling tasks.""" - task: Optional[TaskOption] = None - """[DEPRECATED] The task to use the model for. If the model supports more - than one model runner, this is used to select which model runner to run. - - Note that the model may support other tasks using the same model runner. - """ - tokenizer: SkipValidation[str] = None # type: ignore - """Name or path of the Hugging Face tokenizer to use. If unspecified, model - name or path will be used.""" - tokenizer_mode: TokenizerMode = "auto" - """Tokenizer mode:\n - - "auto" will use the fast tokenizer if available.\n - - "slow" will always use the slow tokenizer.\n - - "mistral" will always use the tokenizer from `mistral_common`.\n - - "custom" will use --tokenizer to select the preregistered tokenizer.""" - trust_remote_code: bool = False - """Trust remote code (e.g., from HuggingFace) when downloading the model - and tokenizer.""" - dtype: Union[ModelDType, torch.dtype] = "auto" - """Data type for model weights and activations:\n - - "auto" will use FP16 precision for FP32 and FP16 models, and BF16 - precision for BF16 models.\n - - "half" for FP16. Recommended for AWQ quantization.\n - - "float16" is the same as "half".\n - - "bfloat16" for a balance between precision and range.\n - - "float" is shorthand for FP32 precision.\n - - "float32" for FP32 precision.""" - seed: Optional[int] = None - """Random seed for reproducibility. Initialized to None in V0, but - initialized to 0 in V1.""" - hf_config_path: Optional[str] = None - """Name or path of the Hugging Face config to use. If unspecified, model - name or path will be used.""" - allowed_local_media_path: str = "" - """Allowing API requests to read local images or videos from directories - specified by the server file system. This is a security risk. Should only - be enabled in trusted environments.""" - revision: Optional[str] = None - """The specific model version to use. It can be a branch name, a tag name, - or a commit id. If unspecified, will use the default version.""" - code_revision: Optional[str] = None - """The specific revision to use for the model code on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" - rope_scaling: dict[str, Any] = field(default_factory=dict) - """RoPE scaling configuration. For example, - `{"rope_type":"dynamic","factor":2.0}`.""" - rope_theta: Optional[float] = None - """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE - theta improves the performance of the scaled model.""" - tokenizer_revision: Optional[str] = None - """The specific revision to use for the tokenizer on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" - max_model_len: SkipValidation[int] = None # type: ignore - """Model context length (prompt and output). If unspecified, will be - automatically derived from the model config. - - When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable - format. Examples:\n - - 1k -> 1000\n - - 1K -> 1024\n - - 25.6k -> 25,600""" - spec_target_max_model_len: Optional[int] = None - """Specify the maximum length for spec decoding draft models.""" - quantization: SkipValidation[Optional[QuantizationMethods]] = None - """Method used to quantize the weights. If `None`, we first check the - `quantization_config` attribute in the model config file. 
If that is - `None`, we assume the model weights are not quantized and use `dtype` to - determine the data type of the weights.""" - enforce_eager: bool = False - """Whether to always use eager-mode PyTorch. If True, we will disable CUDA - graph and always execute the model in eager mode. If False, we will use - CUDA graph and eager execution in hybrid for maximal performance and - flexibility.""" - max_seq_len_to_capture: int = 8192 - """Maximum sequence len covered by CUDA graphs. When a sequence has context - length larger than this, we fall back to eager mode. Additionally for - encoder-decoder models, if the sequence length of the encoder input is - larger than this, we fall back to the eager mode.""" - max_logprobs: int = 20 - """Maximum number of log probabilities to return when `logprobs` is - specified in `SamplingParams`. The default value comes the default for the - OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * - vocab_size) logprobs are allowed to be returned and it may cause OOM.""" - logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS - """Indicates the content returned in the logprobs and prompt_logprobs. - Supported mode: - 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. - Raw means the values before applying any logit processors, like bad words. - Processed means the values after applying all processors, including - temperature and top_k/top_p. - """ - disable_sliding_window: bool = False - """Whether to disable sliding window. If True, we will disable the sliding - window functionality of the model, capping to sliding window size. If the - model does not support sliding window, this argument is ignored.""" - disable_cascade_attn: bool = False - """Disable cascade attention for V1. While cascade attention does not - change the mathematical correctness, disabling it could be useful for - preventing potential numerical issues. Note that even if this is set to - False, cascade attention will be only used when the heuristic tells that - it's beneficial.""" - skip_tokenizer_init: bool = False - """Skip initialization of tokenizer and detokenizer. Expects valid - `prompt_token_ids` and `None` for prompt from the input. The generated - output will contain token ids.""" - enable_prompt_embeds: bool = False - """If `True`, enables passing text embeddings as inputs via the - `prompt_embeds` key. Note that enabling this will double the time required - for graph compilation.""" - served_model_name: Optional[Union[str, list[str]]] = None - """The model name(s) used in the API. If multiple names are provided, the - server will respond to any of the provided names. The model name in the - model field of a response will be the first name in this list. If not - specified, the model name will be the same as the `--model` argument. Noted - that this name(s) will also be used in `model_name` tag content of - prometheus metrics, if multiple names provided, metrics tag will take the - first one.""" - use_async_output_proc: bool = True - """Whether to use async output processor.""" - config_format: Union[str, ConfigFormat] = "auto" - """The format of the model config to load:\n - - "auto" will try to load the config in hf format if available else it - will try to load in mistral format.\n - - "hf" will load the config in hf format.\n - - "mistral" will load the config in mistral format.""" - hf_token: Optional[Union[bool, str]] = None - """The token to use as HTTP bearer authorization for remote files . 
If - `True`, will use the token generated when running `huggingface-cli login` - (stored in `~/.huggingface`).""" - hf_overrides: HfOverrides = field(default_factory=dict) - """If a dictionary, contains arguments to be forwarded to the Hugging Face - config. If a callable, it is called to update the HuggingFace config.""" - logits_processor_pattern: Optional[str] = None - """Optional regex pattern specifying valid logits processor qualified names - that can be passed with the `logits_processors` extra completion argument. - Defaults to `None`, which allows no processors.""" - generation_config: str = "auto" - """The folder path to the generation config. Defaults to `"auto"`, the - generation config will be loaded from model path. If set to `"vllm"`, no - generation config is loaded, vLLM defaults will be used. If set to a folder - path, the generation config will be loaded from the specified folder path. - If `max_new_tokens` is specified in generation config, then it sets a - server-wide limit on the number of output tokens for all requests.""" - override_generation_config: dict[str, Any] = field(default_factory=dict) - """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If - used with `--generation-config auto`, the override parameters will be - merged with the default config from the model. If used with - `--generation-config vllm`, only the override parameters are used.""" - enable_sleep_mode: bool = False - """Enable sleep mode for the engine (only cuda platform is supported).""" - model_impl: Union[str, ModelImpl] = ModelImpl.AUTO.value - """Which implementation of the model to use:\n - - "auto" will try to use the vLLM implementation, if it exists, and fall - back to the Transformers implementation if no vLLM implementation is - available.\n - - "vllm" will use the vLLM model implementation.\n - - "transformers" will use the Transformers model implementation.\n - - "terratorch" will use the TerraTorch model implementation. - """ - override_attention_dtype: Optional[str] = None - """Override dtype for attention""" - logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None - """One or more logits processors' fully-qualified class names or class - definitions""" - io_processor_plugin: Optional[str] = None - """IOProcessor plugin name to load at model startup""" - - # Pooler config - pooler_config: Optional[PoolerConfig] = None - """Pooler config which controls the behaviour of output pooling in pooling - models.""" - override_pooler_config: Optional[Union[dict, PoolerConfig]] = None - """[DEPRECATED] Use `pooler_config` instead. This field will be removed in - v0.12.0 or v1.0.0, whichever is sooner.""" - - # Multimodal config and init vars - multimodal_config: Optional[MultiModalConfig] = None - """Configuration for multimodal model. 
If `None`, this will be inferred - from the architecture of `self.model`.""" - limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None - media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None - mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None - mm_processor_cache_gb: InitVar[Optional[float]] = None - mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None - mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None - mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None - interleave_mm_strings: InitVar[Optional[bool]] = None - skip_mm_profiling: InitVar[Optional[bool]] = None - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - factors.append(self.model) - factors.append(self.dtype) - factors.append(self.quantization) - factors.append(self.revision) - factors.append(self.code_revision) - factors.append(self.max_model_len) - factors.append(self.max_logprobs) - factors.append(self.disable_sliding_window) - factors.append(self.trust_remote_code) - factors.append(self.generation_config) - factors.append(self.model_impl) - factors.append(self.override_generation_config) - factors.append(self.rope_scaling) - factors.append(self.rope_theta) - # hf_config can control how the model looks! - factors.append(self.hf_config.to_json_string()) - str_factors = str(factors) - assert_hashable(str_factors) - return hashlib.sha256(str(factors).encode()).hexdigest() - - def __post_init__( - self, - # Multimodal config init vars - limit_mm_per_prompt: Optional[dict[str, int]], - media_io_kwargs: Optional[dict[str, dict[str, Any]]], - mm_processor_kwargs: Optional[dict[str, Any]], - mm_processor_cache_gb: Optional[float], - mm_processor_cache_type: Optional[MMCacheType], - mm_shm_cache_max_object_size_mb: Optional[int], - mm_encoder_tp_mode: Optional[MMEncoderTPMode], - interleave_mm_strings: Optional[bool], - skip_mm_profiling: Optional[bool]) -> None: - # Set the default seed to 0 in V1. - # NOTE(woosuk): In V0, we set the default seed to None because the - # driver worker shares the same process as the user process, and thus - # setting a seed affects the user process as well. - # In V1, we use separate processes for workers (unless - # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here - # doesn't affect the user process. However, without a consistent seed, - # different tensor parallel workers would sample different tokens, - # leading to inconsistent results. - if envs.VLLM_USE_V1 and self.seed is None: - self.seed = 0 - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: - logger.warning( - "The global random seed is set to %d. Since " - "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " - "affect the random state of the Python process that " - "launched vLLM.", self.seed) - - # Keep set served_model_name before maybe_model_redirect(self.model) - self.served_model_name = get_served_model_name(self.model, - self.served_model_name) - self.model = maybe_model_redirect(self.model) - # The tokenizer is consistent with the model by default. 
- if self.tokenizer is None: - self.tokenizer = self.model - if self.tokenizer_revision is None: - self.tokenizer_revision = self.revision - self.tokenizer = maybe_model_redirect(self.tokenizer) - - if isinstance(self.hf_config_path, str): - self.hf_config_path = maybe_model_redirect(self.hf_config_path) - - if callable(self.hf_overrides): - hf_overrides_kw = {} - hf_overrides_fn = self.hf_overrides - else: - hf_overrides_kw = self.hf_overrides - hf_overrides_fn = None - - if self.rope_scaling: - hf_override: dict[str, Any] = {"rope_scaling": self.rope_scaling} - hf_overrides_kw.update(hf_override) - hf_overrides_str = json.dumps(hf_overrides_kw) - msg = ( - "`--rope-scaling` will be removed in a future release. " - f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") - warnings.warn(DeprecationWarning(msg), stacklevel=2) - if self.rope_theta is not None: - hf_override = {"rope_theta": self.rope_theta} - hf_overrides_kw.update(hf_override) - hf_overrides_str = json.dumps(hf_overrides_kw) - msg = ( - "`--rope-theta` will be removed in a future release. " - f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) - - if self.runner != "draft": - # If we're not running the draft model, check for speculators config - # If speculators config, set model / tokenizer to be target model - self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 - model=self.model, - tokenizer=self.tokenizer, - revision=self.revision, - trust_remote_code=self.trust_remote_code) - - if (backend := envs.VLLM_ATTENTION_BACKEND - ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: - raise ValueError( - "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " - "module was not found. 
See " - "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 - "for instructions on how to install it.") - - from vllm.platforms import current_platform - - if (self.override_attention_dtype is not None - and not current_platform.is_rocm()): - warnings.warn( - "override-attention-dtype is set but not using ROCm platform", - stacklevel=2) - - if (self.enable_sleep_mode - and not current_platform.is_sleep_mode_available()): - raise ValueError( - "Sleep mode is not supported on current platform.") - - hf_config = get_config(self.hf_config_path or self.model, - self.trust_remote_code, - self.revision, - self.code_revision, - self.config_format, - hf_overrides_kw=hf_overrides_kw, - hf_overrides_fn=hf_overrides_fn) - - self.hf_config = hf_config - self.hf_text_config = get_hf_text_config(self.hf_config) - self.attention_chunk_size = getattr(self.hf_text_config, - "attention_chunk_size", None) - self.encoder_config = self._get_encoder_config() - self.hf_image_processor_config = get_hf_image_processor_config( - self.model, hf_token=self.hf_token, revision=self.revision) - - architectures = self.architectures - registry = self.registry - is_generative_model = registry.is_text_generation_model( - architectures, self) - is_pooling_model = registry.is_pooling_model(architectures, self) - - def _task_to_convert(task: TaskOption) -> ConvertType: - if task == "embedding" or task == "embed": - return "embed" - if task == "classify": - return "classify" - if task == "reward": - return "reward" - if task == "score": - new_task = self._get_default_pooling_task(architectures) - return "classify" if new_task == "classify" else "embed" - - return "none" - - if self.task is not None: - runner: RunnerOption = "auto" - convert: ConvertOption = "auto" - msg_prefix = ("The 'task' option has been deprecated and will be " - "removed in v0.13.0 or v1.0, whichever comes first.") - msg_hint = "Please remove this option." 
- - is_generative_task = self.task in _RUNNER_TASKS["generate"] - is_pooling_task = self.task in _RUNNER_TASKS["pooling"] - - if is_generative_model and is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = ("Please replace this option with `--runner " - "generate` to continue using this model " - "as a generative model.") - elif is_pooling_task: - runner = "pooling" - convert = "auto" - msg_hint = ("Please replace this option with `--runner " - "pooling` to continue using this model " - "as a pooling model.") - else: # task == "auto" - pass - elif is_generative_model or is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = "Please remove this option" - elif is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ("Please replace this option with `--convert " - f"{convert}` to continue using this model " - "as a pooling model.") - else: # task == "auto" - pass - else: - raise AssertionError("The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}.") - - self.runner = runner - self.convert = convert - - msg = f"{msg_prefix} {msg_hint}" - warnings.warn(msg, DeprecationWarning, stacklevel=2) - - self.runner_type = self._get_runner_type(architectures, self.runner) - self.convert_type = self._get_convert_type(architectures, - self.runner_type, - self.convert) - - if self.runner_type == "generate" and not is_generative_model: - generate_converts = _RUNNER_CONVERTS["generate"] - if self.convert_type not in generate_converts: - # Currently we don't have any converters for generative models - raise ValueError( - "This model does not support `--runner generate`.") - if self.runner_type == "pooling" and not is_pooling_model: - pooling_converts = _RUNNER_CONVERTS["pooling"] - if self.convert_type not in pooling_converts: - convert_option = "<" + "|".join(pooling_converts) + ">" - raise ValueError( - "This model does not support `--runner pooling`. " - f"You can pass `--convert {convert_option} to adapt " - "it into a pooling model.") - - self.supported_tasks = self._get_supported_tasks( - architectures, self.runner_type, self.convert_type) - - # Note: Initialize these attributes early because transformers fallback - # may fail to load dynamic modules in child processes - model_info, arch = registry.inspect_model_cls(architectures, self) - self._model_info = model_info - self._architecture = arch - logger.info("Resolved architecture: %s", arch) - - # Init pooler config if needed - if self.runner_type == "pooling": - if self.override_pooler_config is not None: - logger.warning_once( - "`override_pooler_config` is deprecated and will be " - "removed in v0.12.0 or v1.0.0, whichever is sooner. 
" - "Please use `pooler_config` instead.") - - if isinstance(self.override_pooler_config, dict): - self.pooler_config = PoolerConfig( - **self.override_pooler_config) - else: - self.pooler_config = self.override_pooler_config - - if self.pooler_config is None: - self.pooler_config = PoolerConfig() - - base_config = get_pooling_config(self.model, self.revision) - if base_config is not None: - # Only set values that are not overridden by the user - for k, v in base_config.items(): - if getattr(self.pooler_config, k) is None: - setattr(self.pooler_config, k, v) - - default_pooling_type = self._model_info.default_pooling_type - if self.pooler_config.pooling_type is None: - self.pooler_config.pooling_type = default_pooling_type - - self.dtype: torch.dtype = _get_and_verify_dtype( - self.model, - self.hf_config, - self.dtype, - is_pooling_model=self.runner_type == "pooling", - revision=self.revision, - ) - - # Interleaved attention is not supported by some backends in V0 - if (not self.disable_sliding_window - and is_interleaved(self.hf_text_config) - and not envs.VLLM_USE_V1 - and (backend := envs.VLLM_ATTENTION_BACKEND) - in ("XFORMERS", "FLASHINFER")): - logger.warning_once( - "%s has interleaved attention, which is currently not " - "supported by the %s backend. Disabling sliding window and " - "capping the max length to the sliding window size (%d).", - self.hf_text_config.model_type, - backend, - self.hf_text_config.sliding_window, - ) - self.disable_sliding_window = True - - self.original_max_model_len = self.max_model_len - self.max_model_len = self.get_and_verify_max_len(self.max_model_len) - # Init multimodal config if needed - if self._model_info.supports_multimodal: - if (mm_encoder_tp_mode == "data" and - not self._model_info.supports_multimodal_encoder_tp_data): - logger.warning_once( - "This model does not support `--mm-encoder-tp-mode data`. 
" - "Falling back to `--mm-encoder-tp-mode weights`.") - mm_encoder_tp_mode = "weights" - - mm_config_kwargs = dict( - limit_per_prompt=limit_mm_per_prompt, - media_io_kwargs=media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, - mm_processor_cache_gb=mm_processor_cache_gb, - mm_processor_cache_type=mm_processor_cache_type, - mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, - mm_encoder_tp_mode=mm_encoder_tp_mode, - interleave_mm_strings=interleave_mm_strings, - skip_mm_profiling=skip_mm_profiling, - ) - - mm_config_kwargs = { - k: v - for k, v in mm_config_kwargs.items() if v is not None - } - - self.multimodal_config = MultiModalConfig(**mm_config_kwargs) - - if self.disable_sliding_window: - # Set after get_and_verify_max_len to ensure that max_model_len - # can be correctly capped to sliding window size - self.hf_text_config.sliding_window = None - - if not self.skip_tokenizer_init: - self._verify_tokenizer_mode() - - # Avoid running try_verify_and_update_config multiple times - self.config_updated = False - - self._verify_quantization() - self._verify_cuda_graph() - self._verify_bnb_config() - - @field_validator("quantization", mode="before") - @classmethod - def validate_quantization_before(cls, value: Any) -> Any: - if isinstance(value, str): - return value.lower() - return value - - @model_validator(mode="after") - def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": - if not isinstance(self.tokenizer, str): - raise ValueError("tokenizer must be a string after __post_init__.") - if not isinstance(self.max_model_len, int): - raise ValueError( - "max_model_len must be an integer after __post_init__.") - return self - - def _get_transformers_backend_cls(self) -> str: - """Determine which Transformers backend class will be used if - `model_impl` is set to `transformers` or `auto`.""" - if getattr(self, "runner_type", self.runner) == "pooling": - return "TransformersModel" - if self.hf_config != self.hf_text_config: - # If 'hf_text_config' is the same as 'hf_config'. If not, it is - # probably a composite config, i.e. multimodal - return "TransformersForMultimodalLM" - return "TransformersForCausalLM" - - def using_transformers_backend(self) -> bool: - """Check if the model is using the Transformers backend class.""" - return self.architecture == self._get_transformers_backend_cls() - - @property - def registry(self): - return me_models.ModelRegistry - - @property - def architectures(self) -> list[str]: - return getattr(self.hf_config, "architectures", []) - - @property - def architecture(self) -> str: - """The architecture vllm actually used.""" - return self._architecture - - def maybe_pull_model_tokenizer_for_runai(self, model: str, - tokenizer: str) -> None: - """Pull model/tokenizer from Object Storage to temporary - directory when needed. 
- - Args: - model: Model name or path - tokenizer: Tokenizer name or path - """ - if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): - return - - if is_runai_obj_uri(model): - object_storage_model = ObjectStorageModel() - object_storage_model.pull_files( - model, allow_pattern=["*.model", "*.py", "*.json"]) - self.model_weights = model - self.model = object_storage_model.dir - - # If tokenizer is same as model, download to same directory - if model == tokenizer: - object_storage_model.pull_files(model, - ignore_pattern=[ - "*.pt", "*.safetensors", - "*.bin", "*.tensors", - "*.pth" - ]) - self.tokenizer = object_storage_model.dir - return - - # Only download tokenizer if needed and not already handled - if is_runai_obj_uri(tokenizer): - object_storage_tokenizer = ObjectStorageModel() - object_storage_tokenizer.pull_files(model, - ignore_pattern=[ - "*.pt", "*.safetensors", - "*.bin", "*.tensors", - "*.pth" - ]) - self.tokenizer = object_storage_tokenizer.dir - - def _get_encoder_config(self): - return get_sentence_transformer_tokenizer_config( - self.model, self.revision) - - def _verify_tokenizer_mode(self) -> None: - tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) - if tokenizer_mode not in get_args(TokenizerMode): - raise ValueError( - f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - f"one of {get_args(TokenizerMode)}.") - self.tokenizer_mode = tokenizer_mode - - def _get_default_runner_type( - self, - architectures: list[str], - ) -> RunnerType: - registry = self.registry - - # Some Sentence Transformers models use *ForCausalLM archs - if get_pooling_config(self.model, self.revision): - return "pooling" - - for arch in architectures: - if arch in registry.get_supported_archs(): - if registry.is_pooling_model(architectures, self): - return "pooling" - if registry.is_text_generation_model(architectures, self): - return "generate" - - match = try_match_architecture_defaults(arch) - if match: - _, (runner_type, _) = match - return runner_type - - return "generate" - - def _get_runner_type( - self, - architectures: list[str], - runner: RunnerOption, - ) -> RunnerType: - if runner != "auto": - return runner - - runner_type = self._get_default_runner_type(architectures) - - # Don't log the most common case - if runner_type != "generate": - logger.info( - "Resolved `--runner auto` to `--runner %s`. 
" - "Pass the value explicitly to silence this message.", - runner_type) - - return runner_type - - def _get_default_convert_type( - self, - architectures: list[str], - runner_type: RunnerType, - ) -> ConvertType: - registry = self.registry - - for arch in architectures: - if arch in registry.get_supported_archs(): - if (runner_type == "generate" - and registry.is_text_generation_model( - architectures, self)): - return "none" - if (runner_type == "pooling" - and registry.is_pooling_model(architectures, self)): - return "none" - - match = try_match_architecture_defaults(arch, - runner_type=runner_type) - if match: - _, (_, convert_type) = match - return convert_type - - # This is to handle Sentence Transformers models that use *ForCausalLM - # and also multi-modal pooling models which are not defined as - # Sentence Transformers models - if runner_type == "pooling": - return "embed" - - return "none" - - def _get_convert_type( - self, - architectures: list[str], - runner_type: RunnerType, - convert: ConvertOption, - ) -> ConvertType: - if convert != "auto": - return convert - - convert_type = self._get_default_convert_type(architectures, - runner_type) - - # Don't log the most common case - if convert_type != "none": - logger.info( - "Resolved `--convert auto` to `--convert %s`. " - "Pass the value explicitly to silence this message.", - convert_type) - - return convert_type - - def _get_supported_generation_tasks( - self, - architectures: list[str], - convert_type: ConvertType, - ) -> list[_ResolvedTask]: - registry = self.registry - - if registry.is_transcription_only_model(architectures, self): - return ["transcription"] - - # TODO: Use get_supported_generation_tasks once V0 is removed - supported_tasks = list[_ResolvedTask]() - if (registry.is_text_generation_model(architectures, self) - or convert_type in _RUNNER_CONVERTS["generate"]): - supported_tasks.append("generate") - - if registry.is_transcription_model(architectures, self): - supported_tasks.append("transcription") - - return supported_tasks - - def _get_default_pooling_task( - self, - architectures: list[str], - ) -> Literal["embed", "classify", "reward"]: - if self.registry.is_cross_encoder_model(architectures, self): - return "classify" - - for arch in architectures: - match = try_match_architecture_defaults(arch, - runner_type="pooling") - if match: - _, (_, convert_type) = match - assert convert_type != "none" - return convert_type - - return "embed" - - def _get_supported_pooling_tasks( - self, - architectures: list[str], - convert_type: ConvertType, - ) -> list[_ResolvedTask]: - registry = self.registry - - # TODO: Use get_supported_pooling_tasks once V0 is removed - supported_tasks = list[_ResolvedTask]() - if (registry.is_pooling_model(architectures, self) - or convert_type in _RUNNER_CONVERTS["pooling"]): - supported_tasks.append("encode") - - extra_task = (self._get_default_pooling_task(architectures) - if convert_type == "none" else convert_type) - supported_tasks.append(extra_task) - - return supported_tasks - - def _get_supported_tasks( - self, - architectures: list[str], - runner_type: RunnerType, - convert_type: ConvertType, - ) -> list[_ResolvedTask]: - if runner_type == "generate": - return self._get_supported_generation_tasks( - architectures, convert_type) - if runner_type == "pooling": - return self._get_supported_pooling_tasks(architectures, - convert_type) - if runner_type == "draft": - return ["draft"] - - assert_never(runner_type) - - def _parse_quant_hf_config(self, hf_config: PretrainedConfig): - 
quant_cfg = getattr(hf_config, "quantization_config", None) - if quant_cfg is None: - # compressed-tensors uses a "compression_config" key - quant_cfg = getattr(hf_config, "compression_config", None) - - else: - # Set quant_method for ModelOpt models. - producer_name = quant_cfg.get("producer", {}).get("name") - if producer_name == "modelopt": - quant_algo = quant_cfg.get("quantization", - {}).get("quant_algo") - if quant_algo == "FP8": - quant_cfg["quant_method"] = "modelopt" - elif quant_algo == "NVFP4": - quant_cfg["quant_method"] = "modelopt_fp4" - elif quant_algo is not None: - raise ValueError( - f"Unknown ModelOpt quant algo: {quant_algo}") - - return quant_cfg - - def _verify_quantization(self) -> None: - supported_quantization = me_quant.QUANTIZATION_METHODS - if self.quantization is not None: - self.quantization = cast(me_quant.QuantizationMethods, - self.quantization) - - # Parse quantization method from the HF model config, if available. - quant_cfg = self._parse_quant_hf_config(self.hf_config) - if quant_cfg is None and (text_config := getattr( - self.hf_config, "text_config", None)): - # Check the text config as well for multi-modal models. - quant_cfg = self._parse_quant_hf_config(text_config) - - if quant_cfg is not None: - # Use the community standard 'quant_method' - quant_method = quant_cfg.get("quant_method", "").lower() - - # Normalize library names - quant_method = quant_method.replace("compressed_tensors", - "compressed-tensors") - - quant_cfg["quant_method"] = quant_method - - # Quantization methods which are overrides (i.e. they have a - # `override_quantization_method` method) must be checked in order - # of preference (this is particularly important for GPTQ). - overrides = [ - "bitblas", - "gptq_marlin_24", - "gptq_marlin", - "gptq_bitblas", - "awq_marlin", - "ipex", - "moe_wna16", - "modelopt", - "modelopt_fp4", - "petit_nvfp4", - ] - quantization_methods = [ - q for q in supported_quantization if q not in overrides - ] - # Any custom overrides will be in quantization_methods so we place - # them at the start of the list so custom overrides have preference - # over the built-in ones. - quantization_methods = quantization_methods + overrides - - # Detect which checkpoint is it - for name in quantization_methods: - method = me_quant.get_quantization_config(name) - quantization_override = method.override_quantization_method( - quant_cfg, self.quantization) - if quantization_override is not None: - # Raise error if the override is not custom (custom would - # be in QUANTIZATION_METHODS but not QuantizationMethods) - # and hasn't been added to the overrides list. - if (name in get_args(me_quant.QuantizationMethods) - and name not in overrides): - raise ValueError( - f"Quantization method {name} is an override but " - "is has not been added to the `overrides` list " - "above. This is necessary to ensure that the " - "overrides are checked in order of preference.") - quant_method = quantization_override - self.quantization = quantization_override - break - - # Verify quantization configurations. 
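The "community standard" normalization earlier in this hunk is just a lowercase plus a library rename. As a standalone illustration (the config dict is invented, not taken from a real checkpoint):

```python
# Normalize the quant_method string found in an HF-style quantization_config.
quant_cfg = {"quant_method": "Compressed_Tensors", "bits": 8}  # made-up example

quant_method = quant_cfg.get("quant_method", "").lower()
quant_method = quant_method.replace("compressed_tensors", "compressed-tensors")

assert quant_method == "compressed-tensors"
quant_cfg["quant_method"] = quant_method
```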
- if self.quantization is None: - self.quantization = quant_method - elif self.quantization != quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization}).") - - if self.quantization is not None: - if self.quantization not in supported_quantization: - raise ValueError( - f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}.") - from vllm.platforms import current_platform - current_platform.verify_quantization(self.quantization) - - def _verify_cuda_graph(self) -> None: - # The `max_seq_len_to_capture` was incorrectly - # based on the encoder's input length (448) - # but not the decoder's larger input length (1500). - # This change ensures the CUDA Graph captures the correct, - # larger sequence length, allowing it to work as intended. - effective_max_seq_len = self.max_model_len - if self.is_encoder_decoder: - effective_max_seq_len = max( - effective_max_seq_len, - getattr(self.hf_config, "max_source_positions", 0)) - self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, - effective_max_seq_len) - # CUDAGraph capture not supported for encoder-decoder models on ROCm - unsupported_rocm = self.is_encoder_decoder - - if (unsupported_rocm and not self.enforce_eager - and current_platform.is_rocm()): - logger.warning( - "CUDA graph is not supported for %s on ROCm yet, fallback " - "to eager mode.", self.hf_config.model_type) - self.enforce_eager = True - - def _verify_bnb_config(self) -> None: - """ - The current version of bitsandbytes (0.46.1) with 8-bit models does not - yet support CUDA graph. - # TODO Remove this when bitsandbytes supports. 
- """ - is_bitsandbytes = self.quantization == "bitsandbytes" - has_quantization_config = (getattr(self.hf_config, - "quantization_config", None) - is not None) - is_8bit = (self.hf_config.quantization_config.get( - "load_in_8bit", False) if has_quantization_config else False) - if all([ - is_bitsandbytes, - has_quantization_config, - is_8bit, - not self.enforce_eager, - ]): - logger.warning( - "CUDA graph is not supported on BitsAndBytes 8bit yet, " - "fallback to the eager mode.") - - self.enforce_eager = True - - def _verify_with_expert_parallelism(self) -> None: - num_expert_names = [ - "moe_num_experts", # Dbrx - "num_experts", # Jamba - "n_routed_experts", # DeepSeek - "num_local_experts", # Mixtral - ] - num_experts = 0 - for name in num_expert_names: - num_experts = getattr(self.hf_text_config, name, 0) - if num_experts > 0: - break - if num_experts < 1: - raise ValueError( - "Number of experts in the model must be greater than 0 " - "when expert parallelism is enabled.") - - def verify_dual_chunk_attention_config( - self, - load_config: "LoadConfig", - ) -> None: - if hasattr(self.hf_config, "dual_chunk_attention_config"): - # Try loading the sparse attention config - from vllm.model_executor.model_loader.weight_utils import ( - get_sparse_attention_config) - sparse_attn_config = get_sparse_attention_config(self, load_config) - if sparse_attn_config: - self.hf_config.dual_chunk_attention_config[ - "sparse_attention_config"] = sparse_attn_config - if "sparse_attention_enabled" not in \ - self.hf_config.dual_chunk_attention_config: - self.hf_config.dual_chunk_attention_config[ - "sparse_attention_enabled"] = True - - if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL: - raise ValueError("please set VLLM_ATTENTION_BACKEND to " - f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") - - def verify_async_output_proc(self, parallel_config, speculative_config, - device_config) -> None: - if not self.use_async_output_proc: - # Nothing to check - return - - if parallel_config.pipeline_parallel_size > 1: - self.use_async_output_proc = False - return - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - from vllm.platforms import current_platform - if not current_platform.is_async_output_supported(self.enforce_eager): - self.use_async_output_proc = False - return - - if envs.VLLM_USE_RAY_SPMD_WORKER: - self.use_async_output_proc = False - return - - # Async postprocessor is not necessary for pooling models - # since there is no token generation - if self.runner_type == "pooling": - self.use_async_output_proc = False - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if speculative_config: - self.use_async_output_proc = False - - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: - - if parallel_config.distributed_executor_backend == "external_launcher": - assert self.seed is not None, ( - "Seed must be set when using external launcher backend to " - "make sure sampling results are the same across workers.") - - total_num_attention_heads = getattr(self.hf_text_config, - "num_attention_heads", 0) - tensor_parallel_size = parallel_config.tensor_parallel_size - if total_num_attention_heads % tensor_parallel_size != 0: - raise ValueError( - f"Total number of attention heads ({total_num_attention_heads})" - " must be divisible by tensor parallel size " - f"({tensor_parallel_size}).") - - if parallel_config.enable_expert_parallel: - 
self._verify_with_expert_parallelism() - - pipeline_parallel_size = parallel_config.pipeline_parallel_size - if pipeline_parallel_size > 1: - if not self.registry.is_pp_supported_model(self.architectures, - self): - raise NotImplementedError( - "Pipeline parallelism is not supported for this model. " - "Supported models implement the `SupportsPP` interface.") - - if self.use_async_output_proc: - self.use_async_output_proc = False - - def get_sliding_window(self) -> Optional[int]: - """Get the sliding window size from the HF text config if present.""" - return getattr(self.hf_text_config, "sliding_window", None) - - def get_vocab_size(self) -> int: - return getattr(self.hf_text_config, "vocab_size", 0) - - def get_hidden_size(self) -> int: - return getattr(self.hf_text_config, "hidden_size", 0) - - @property - def is_deepseek_mla(self) -> bool: - if not hasattr(self.hf_text_config, "model_type"): - return False - elif self.hf_text_config.model_type in \ - ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'kimi_k2'): - return self.hf_text_config.kv_lora_rank is not None - elif self.hf_text_config.model_type == 'eagle': - # if the model is an EAGLE module, check for the - # underlying architecture - return self.hf_text_config.model.model_type in \ - ('deepseek_v2', 'deepseek_v3') \ - and self.hf_text_config.kv_lora_rank is not None - return False - - def get_head_size(self) -> int: - # TODO remove hard code - if self.is_deepseek_mla: - qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", - 0) - if self.use_mla: - return self.hf_text_config.kv_lora_rank + qk_rope_head_dim - else: - qk_nope_head_dim = getattr(self.hf_text_config, - "qk_nope_head_dim", 0) - if qk_rope_head_dim and qk_nope_head_dim: - return qk_rope_head_dim + qk_nope_head_dim - - if hasattr(self.hf_text_config, - "model_type") and (self.hf_text_config.model_type - == "zamba2"): - return self.hf_text_config.attention_head_dim - - if self.is_attention_free: - return 0 - - # NOTE: Some configs may set head_dim=None in the config - if getattr(self.hf_text_config, "head_dim", None) is not None: - return self.hf_text_config.head_dim - - # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` - if getattr(self.hf_text_config, "hidden_size_per_head", - None) is not None: - return self.hf_text_config.hidden_size_per_head - - # FIXME(woosuk): This may not be true for all models. - return (self.hf_text_config.hidden_size // - self.hf_text_config.num_attention_heads) - - def get_total_num_kv_heads(self) -> int: - """Returns the total number of KV heads.""" - # For GPTBigCode & Falcon: - # NOTE: for falcon, when new_decoder_architecture is True, the - # multi_query flag is ignored and we use n_head_kv for the number of - # KV heads. - falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] - new_decoder_arch_falcon = ( - self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False)) - if not new_decoder_arch_falcon and getattr(self.hf_text_config, - "multi_query", False): - # Multi-query attention, only one KV head. - # Currently, tensor parallelism is not supported in this case. 
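The KV-head accounting in this hunk reduces to one rule once the per-architecture attribute lookup is done: divide the total by the tensor-parallel size, replicating when there are fewer heads than GPUs. A standalone sketch of that rule:

```python
def kv_heads_per_gpu(total_kv_heads: int, tensor_parallel_size: int) -> int:
    # Replicate KV heads when tp_size exceeds the head count so that every
    # GPU still holds at least one head (e.g. multi-query attention).
    return max(1, total_kv_heads // tensor_parallel_size)


assert kv_heads_per_gpu(8, 4) == 2   # grouped-query attention, split evenly
assert kv_heads_per_gpu(1, 8) == 1   # multi-query attention, replicated
```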
- return 1 - - # For DBRX and MPT - if self.hf_config.model_type == "mpt": - if "kv_n_heads" in self.hf_config.attn_config: - return self.hf_config.attn_config["kv_n_heads"] - return self.hf_config.num_attention_heads - if self.hf_config.model_type == "dbrx": - return getattr(self.hf_config.attn_config, "kv_n_heads", - self.hf_config.num_attention_heads) - - if self.hf_config.model_type == "nemotron-nas": - for block in self.hf_config.block_configs: - if not block.attention.no_op: - return self.hf_config.num_attention_heads \ - // block.attention.n_heads_in_group - - raise RuntimeError("Couldn't determine number of kv heads") - - if self.is_attention_free: - return 0 - - attributes = [ - # For Falcon: - "n_head_kv", - "num_kv_heads", - # For LLaMA-2: - "num_key_value_heads", - # For ChatGLM: - "multi_query_group_num", - ] - for attr in attributes: - num_kv_heads = getattr(self.hf_text_config, attr, None) - if num_kv_heads is not None: - return num_kv_heads - - # For non-grouped-query attention models, the number of KV heads is - # equal to the number of attention heads. - return self.hf_text_config.num_attention_heads - - def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: - """Returns the number of KV heads per GPU.""" - if self.use_mla: - # When using MLA during decode it becomes MQA - return 1 - - total_num_kv_heads = self.get_total_num_kv_heads() - # If tensor parallelism is used, we divide the number of KV heads by - # the tensor parallel size. We will replicate the KV heads in the - # case where the number of KV heads is smaller than the tensor - # parallel size so each GPU has at least one KV head. - return max(1, - total_num_kv_heads // parallel_config.tensor_parallel_size) - - def get_num_attention_heads(self, - parallel_config: "ParallelConfig") -> int: - num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) - return num_heads // parallel_config.tensor_parallel_size - - def get_layers_start_end_indices( - self, parallel_config: "ParallelConfig") -> tuple[int, int]: - from vllm.distributed.utils import get_pp_indices - if (self.hf_text_config.model_type == "deepseek_mtp" - or self.hf_config.model_type == "mimo_mtp" - or self.hf_config.model_type == "glm4_moe_mtp" - or self.hf_config.model_type == "ernie_mtp" - or self.hf_config.model_type == "qwen3_next_mtp"): - total_num_hidden_layers = getattr(self.hf_text_config, - "num_nextn_predict_layers", 0) - else: - total_num_hidden_layers = getattr(self.hf_text_config, - "num_hidden_layers", 0) - # the layout order is: DP x PP x TP - pp_rank = (parallel_config.rank // parallel_config.tensor_parallel_size - ) % parallel_config.pipeline_parallel_size - pp_size = parallel_config.pipeline_parallel_size - start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) - return start, end - - def get_num_layers(self, parallel_config: "ParallelConfig") -> int: - start, end = self.get_layers_start_end_indices(parallel_config) - return end - start - - def get_num_layers_by_block_type( - self, - parallel_config: "ParallelConfig", - block_type: LayerBlockType = LayerBlockType.attention, - ) -> int: - # This function relies on 'layers_block_type' in hf_config, - # for w/o this attribute, we will need to have workarounds like so - attn_block_type = block_type == LayerBlockType.attention - is_transformer = not self.is_hybrid and \ - not self.has_noops and \ - not self.is_attention_free - start, end = self.get_layers_start_end_indices(parallel_config) - - if is_transformer: - # Handle the basic case first - return 
end - start if attn_block_type else 0 - elif self.is_attention_free: - # Attention free - # Note that this code assumes there - # is only one type of attention-free block type. - return 0 if attn_block_type else end - start - elif self.has_noops: - block_configs = self.hf_config.block_configs - return sum(not bc.attention.no_op - for bc in block_configs[start:end]) - else: - # Hybrid model Jamba - layers_block_type_value = getattr(self.hf_text_config, - "layers_block_type", None) - if layers_block_type_value is not None: - if hasattr(self.hf_text_config, - "model_type") and (self.hf_text_config.model_type - == "zamba2"): - if attn_block_type: - return sum(t == "hybrid" - for t in layers_block_type_value[start:end]) - else: - return self.get_num_layers(parallel_config) - return sum(t == block_type.value - for t in layers_block_type_value[start:end]) - - # Hybrid model Minimax - attn_type_list = getattr(self.hf_config, "attn_type_list", None) - if attn_type_list: - return sum(t == 1 for t in attn_type_list[start:end]) - - # Hybrid model Qwen3Next - layer_types_value = getattr(self.hf_config, "layer_types", None) - if layer_types_value is not None: - if getattr(block_type, "value", block_type) == "attention": - return sum(t == "full_attention" - for t in layer_types_value[start:end]) - elif getattr(block_type, "value", - block_type) == "linear_attention": - return sum(t == "linear_attention" - for t in layer_types_value[start:end]) - else: - return sum(t == getattr(block_type, "value", block_type) - for t in layer_types_value[start:end]) - - if (layers_block_type_value is None and attn_type_list is None - and layer_types_value is None): - raise ValueError( - "The model is an hybrid without a" - "layers_block_type or an attn_type_list, or a layer_types " - "in the hf_config, cannot determine the num of " - f"{block_type.value} layers") - - def get_mamba_chunk_size(self) -> Optional[int]: - """ - Returns the mamba chunk size if it exists - """ - # used by e.g. Bamba, FalconH1, Granite, PLaMo2 - chunk_size = getattr(self.hf_text_config, "mamba_chunk_size", None) - if chunk_size is None: - # used by e.g. Mamba2, NemotronH, Zamba - chunk_size = getattr(self.hf_text_config, "chunk_size", None) - return chunk_size - - def get_multimodal_config(self) -> "MultiModalConfig": - """ - Get the multimodal configuration of the model. - - Raises: - ValueError: If the model is not multimodal. - """ - if self.multimodal_config is None: - raise ValueError("The model is not multimodal.") - - return self.multimodal_config - - def try_get_generation_config(self) -> dict[str, Any]: - """ - This method attempts to retrieve the non-default values of the - generation config for this model. - - The generation config can contain information about special tokens, as - well as sampling parameters. Which is why this method exists separately - to `get_diff_sampling_param`. - - Returns: - A dictionary containing the non-default generation config. - """ - if self.generation_config in {"auto", "vllm"}: - config = try_get_generation_config( - self.hf_config_path or self.model, - trust_remote_code=self.trust_remote_code, - revision=self.revision, - ) - else: - config = try_get_generation_config( - self.generation_config, - trust_remote_code=self.trust_remote_code, - ) - - if config is None: - return {} - - return config.to_diff_dict() - - def get_diff_sampling_param(self) -> dict[str, Any]: - """ - This method returns a dictionary containing the non-default sampling - parameters with `override_generation_config` applied. 
- - The default sampling parameters are: - - - vLLM's neutral defaults if `self.generation_config="vllm"` - - the model's defaults if `self.generation_config="auto"` - - as defined in `generation_config.json` if - `self.generation_config="path/to/generation_config/dir"` - - Returns: - A dictionary containing the non-default sampling parameters. - """ - if self.generation_config == "vllm": - config = {} - else: - config = self.try_get_generation_config() - - # Overriding with given generation config - config.update(self.override_generation_config) - - available_params = [ - "repetition_penalty", - "temperature", - "top_k", - "top_p", - "min_p", - "max_new_tokens", - ] - if any(p in config for p in available_params): - diff_sampling_param = { - p: config.get(p) - for p in available_params if config.get(p) is not None - } - # Huggingface definition of max_new_tokens is equivalent - # to vLLM's max_tokens - if "max_new_tokens" in diff_sampling_param: - diff_sampling_param["max_tokens"] = diff_sampling_param.pop( - "max_new_tokens") - else: - diff_sampling_param = {} - - if diff_sampling_param: - logger.warning_once( - "Default sampling parameters have been overridden by the " - "model's Hugging Face generation config recommended from the " - "model creator. If this is not intended, please relaunch " - "vLLM instance with `--generation-config vllm`.") - return diff_sampling_param - - @property - def is_encoder_decoder(self) -> bool: - """Extract the HF encoder/decoder model flag.""" - return is_encoder_decoder(self.hf_config) - - @property - def uses_mrope(self) -> bool: - return uses_mrope(self.hf_config) - - @property - def is_multimodal_model(self) -> bool: - return self.multimodal_config is not None - - @property - def is_multimodal_raw_input_only_model(self) -> bool: - return self._model_info.supports_multimodal_raw_input_only - - @property - def is_cross_encoder(self) -> bool: - return (self._model_info.supports_cross_encoding - or self.convert_type == "classify") - - @property - def is_pp_supported(self) -> bool: - return self._model_info.supports_pp - - @property - def is_attention_free(self) -> bool: - return self._model_info.is_attention_free - - @property - def is_hybrid(self) -> bool: - return self._model_info.is_hybrid - - @property - def has_noops(self) -> bool: - return self._model_info.has_noops - - @property - def has_inner_state(self): - return self._model_info.has_inner_state - - @property - def is_v1_compatible(self) -> bool: - return not self._model_info.supports_v0_only - - @property - def use_mla(self) -> bool: - return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE - - @property - def is_matryoshka(self) -> bool: - return (bool(getattr(self.hf_config, "matryoshka_dimensions", None)) - or getattr(self.hf_config, "is_matryoshka", False)) - - @property - def matryoshka_dimensions(self): - return getattr(self.hf_config, "matryoshka_dimensions", None) - - @property - def use_pad_token(self) -> bool: - # cross_encoder models defaults to using pad_token. - # `llm as reranker` models defaults to not using pad_token. - return getattr(self.hf_config, "use_pad_token", True) - - @property - def head_dtype(self) -> torch.dtype: - """ - "head" refers to the last Linear layer(s) of an LLM, - such as the lm_head in a generation model, - or the score or classifier in a classification model. - - `head_dtype` currently only supports pooling models.\n - - The pooling model defaults to using fp32 head, - you can use --hf-overrides '{"head_dtype": "model"}' to disable it. 
- """ - - head_dtype = _get_head_dtype(config=self.hf_config, - dtype=self.dtype, - runner_type=self.runner_type) - - if self.runner_type != "pooling" and head_dtype != self.dtype: - logger.warning_once( - "`head_dtype` currently only supports pooling models." - "fallback to model dtype [%s].", self.dtype) - return self.dtype - - if head_dtype not in current_platform.supported_dtypes: - logger.warning_once( - "The current platform does not support [%s] head dtype, " - "fallback to model dtype [%s].", head_dtype, self.dtype) - return self.dtype - - logger.debug_once("head dtype: %s", head_dtype) - return head_dtype - - def get_and_verify_max_len(self, max_model_len: int): - # Consider max_model_len in tokenizer_config only when - # pooling models use absolute position_embedding. - tokenizer_config = None - if (self.runner_type == "pooling" and getattr( - self.hf_config, "position_embedding_type", "") == "absolute"): - tokenizer_config = try_get_tokenizer_config( - self.tokenizer, - trust_remote_code=self.trust_remote_code, - revision=self.tokenizer_revision) - max_model_len = _get_and_verify_max_len( - hf_config=self.hf_text_config, - tokenizer_config=tokenizer_config, - max_model_len=max_model_len, - disable_sliding_window=self.disable_sliding_window, - sliding_window=self.get_sliding_window(), - spec_target_max_model_len=self.spec_target_max_model_len, - encoder_config=self.encoder_config) - logger.info("Using max model len %s", max_model_len) - return max_model_len - - Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] @@ -1838,365 +144,6 @@ class DeviceConfig: self.device = torch.device(self.device_type) -_STR_DTYPE_TO_TORCH_DTYPE = { - "half": torch.float16, - "float16": torch.float16, - "float": torch.float32, - "float32": torch.float32, - "bfloat16": torch.bfloat16, -} - -# model_type -> reason -_FLOAT16_NOT_SUPPORTED_MODELS = { - "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", - "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", - "gemma3_text": - "Numerical instability. Please use bfloat16 or float32 instead.", - "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.", - "glm4": "Numerical instability. Please use bfloat16 or float32 instead.", -} - - -def _is_valid_dtype(model_type: str, dtype: torch.dtype): - if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 - return False - - return True - - -def _check_valid_dtype(model_type: str, dtype: torch.dtype): - if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: - reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] - raise ValueError(f"The model type {model_type!r} " - f"does not support float16. Reason: {reason}") - - return True - - -def _find_dtype( - model_id: str, - config: PretrainedConfig, - *, - revision: Optional[str], -): - # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct - # because config.torch_dtype can be None. 
- config_dtype = getattr(config, "torch_dtype", None) - - # Fallbacks for multi-modal models if the root config - # does not define torch_dtype - if config_dtype is None: - config_dtype = getattr(config.get_text_config(), "torch_dtype", None) - if config_dtype is None and hasattr(config, "vision_config"): - config_dtype = getattr(config.vision_config, "torch_dtype", None) - if config_dtype is None and hasattr(config, "encoder_config"): - config_dtype = getattr(config.encoder_config, "torch_dtype", None) - - # Try to read the dtype of the weights if they are in safetensors format - if config_dtype is None: - repo_mt = try_get_safetensors_metadata(model_id, revision=revision) - - if repo_mt and (files_mt := repo_mt.files_metadata): - param_dtypes: set[torch.dtype] = { - _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] - for file_mt in files_mt.values() - for dtype_str in file_mt.parameter_count - if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE - } - - if param_dtypes: - return common_broadcastable_dtype(param_dtypes) - - if config_dtype is None: - config_dtype = torch.float32 - - return config_dtype - - -def _resolve_auto_dtype( - model_type: str, - config_dtype: torch.dtype, - *, - is_pooling_model: bool, -): - from vllm.platforms import current_platform - - supported_dtypes = [ - dtype for dtype in current_platform.supported_dtypes - if _is_valid_dtype(model_type, dtype) - ] - - if is_pooling_model and torch.float16 in supported_dtypes: - preferred_dtype = torch.float16 - else: - preferred_dtype = supported_dtypes[0] - - # Downcast for float32 models - if config_dtype == torch.float32: - config_dtype = preferred_dtype - - if config_dtype in supported_dtypes: - return config_dtype - - # Ensure device compatibility - device_name = current_platform.get_device_name() - device_capability = current_platform.get_device_capability() - - if device_capability is None: - device_str = f"{device_name!r}" - else: - version_str = device_capability.as_version_str() - device_str = f"{device_name!r} (with compute capability {version_str})" - - logger.warning( - "Your device %s doesn't support %s. " - "Falling back to %s for compatibility.", - device_str, - config_dtype, - preferred_dtype, - ) - - return preferred_dtype - - -def _get_and_verify_dtype( - model_id: str, - config: PretrainedConfig, - dtype: Union[str, torch.dtype], - *, - is_pooling_model: bool, - revision: Optional[str] = None, -) -> torch.dtype: - config_dtype = _find_dtype(model_id, config, revision=revision) - model_type = config.model_type - - if isinstance(dtype, str): - dtype = dtype.lower() - if dtype == "auto": - # Set default dtype from model config - torch_dtype = _resolve_auto_dtype( - model_type, - config_dtype, - is_pooling_model=is_pooling_model, - ) - else: - if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: - raise ValueError(f"Unknown dtype: {dtype!r}") - torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] - elif isinstance(dtype, torch.dtype): - torch_dtype = dtype - else: - raise ValueError(f"Unknown dtype: {dtype}") - - _check_valid_dtype(model_type, torch_dtype) - - if torch_dtype != config_dtype: - if torch_dtype == torch.float32: - # Upcasting to float32 is allowed. - logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) - elif config_dtype == torch.float32: - # Downcasting from float32 to float16 or bfloat16 is allowed. - logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) - else: - # Casting between float16 and bfloat16 is allowed with a warning. 
- logger.warning("Casting %s to %s.", config_dtype, torch_dtype) - - return torch_dtype - - -def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype, - runner_type: str) -> torch.dtype: - head_dtype: Optional[Union[str, - torch.dtype]] = getattr(config, "head_dtype", - None) - - if head_dtype == "model": - return dtype - elif isinstance(head_dtype, str): - head_dtype = head_dtype.lower() - if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE: - raise ValueError(f"Unknown dtype: {head_dtype!r}") - return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype] - elif isinstance(head_dtype, torch.dtype): - return head_dtype - elif head_dtype is None: - if torch.float32 not in current_platform.supported_dtypes: - return dtype - if runner_type == "pooling": - return torch.float32 - return dtype - else: - raise ValueError(f"Unknown dtype: {head_dtype}") - - -def _get_and_verify_max_len( - hf_config: PretrainedConfig, - tokenizer_config: Optional[dict], - max_model_len: Optional[int], - disable_sliding_window: bool, - sliding_window: Optional[int], - spec_target_max_model_len: Optional[int] = None, - encoder_config: Optional[Any] = None, -) -> int: - """Get and verify the model's maximum length.""" - derived_max_model_len = float("inf") - possible_keys = [ - # OPT - "max_position_embeddings", - # GPT-2 - "n_positions", - # MPT - "max_seq_len", - # ChatGLM2 - "seq_length", - # Command-R - "model_max_length", - # Whisper - "max_target_positions", - # Others - "max_sequence_length", - "max_seq_length", - "seq_len", - ] - # Choose the smallest "max_length" from the possible keys - max_len_key = None - for key in possible_keys: - max_len = getattr(hf_config, key, None) - if max_len is not None: - max_len_key = key if max_len < derived_max_model_len \ - else max_len_key - derived_max_model_len = min(derived_max_model_len, max_len) - # For Command-R / Cohere, Cohere2 / Aya Vision models - if tmp_max_len := getattr(hf_config, "model_max_length", None): - max_len_key = "model_max_length" - derived_max_model_len = tmp_max_len - - # If sliding window is manually disabled, max_length should be less - # than the sliding window length in the model config. - if (disable_sliding_window and sliding_window is not None - and sliding_window < derived_max_model_len): - max_len_key = "sliding_window" - derived_max_model_len = sliding_window - - # Consider model_max_length in tokenizer_config - if tokenizer_config: - tokenizer_model_max_length = tokenizer_config.get( - "model_max_length", derived_max_model_len) - derived_max_model_len = min(derived_max_model_len, - tokenizer_model_max_length) - - # If none of the keys were found in the config, use a default and - # log a warning. - if derived_max_model_len == float("inf"): - if max_model_len is not None: - # If max_model_len is specified, we use it. - return max_model_len - - if spec_target_max_model_len is not None: - # If this is a speculative draft model, we use the max model len - # from the target model. - return spec_target_max_model_len - - default_max_len = 2048 - logger.warning( - "The model's config.json does not contain any of the following " - "keys to determine the original maximum length of the model: " - "%s. Assuming the model's maximum length is %d.", possible_keys, - default_max_len) - derived_max_model_len = default_max_len - - rope_scaling = getattr(hf_config, "rope_scaling", None) - # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE - # scaling, so we skip applying the scaling factor again. 
- if rope_scaling is not None and "gemma3" not in hf_config.model_type: - # No need to consider "type" key because of patch_rope_scaling when - # loading HF config - rope_type = rope_scaling["rope_type"] - - if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window: - # TODO(robertgshaw): Find a model that supports rope_scaling - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "with rope_scaling. Please raise an issue so we can " - "investigate.") - - # NOTE: rope_type == "default" does not define factor - # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py - scaling_factor = rope_scaling.get("factor", 1.0) - - if rope_type == "yarn": - derived_max_model_len = rope_scaling[ - "original_max_position_embeddings"] - derived_max_model_len *= scaling_factor - - if encoder_config and "max_seq_length" in encoder_config: - derived_max_model_len = encoder_config["max_seq_length"] - - # If the user specified a max length, make sure it is smaller than the - # derived length from the HF model config. - if max_model_len is None: - max_model_len = int(derived_max_model_len) - if current_platform.is_tpu(): - logger.warning( - "--max-model-len is not specified, " - "it's currently using model's default length %s, " - "which might be too large." - "Please input with --max-model-len based on your " - "request input length and output length, to avoid " - "unnecessary degradation.", max_model_len) - elif max_model_len > derived_max_model_len: - # Some models might have a separate key for specifying model_max_length - # that will be bigger than derived_max_model_len. We compare user input - # with model_max_length and allow this override when it's smaller. - model_max_length = getattr(hf_config, "model_max_length", None) - if model_max_length is not None and max_model_len <= model_max_length: - if disable_sliding_window: - # TODO(robertgshaw): Find a model that has model_max_length - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "model_max_length in the config. Please raise an issue " - "so we can investigate.") - else: - msg = ( - f"User-specified max_model_len ({max_model_len}) is greater " - f"than the derived max_model_len ({max_len_key}=" - f"{derived_max_model_len} or model_max_length=" - f"{model_max_length} in model's config.json).") - warning = ( - "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme " - "caution. If the model uses relative position encoding (RoPE), " - "positions exceeding derived_max_model_len lead to nan. If the " - "model uses absolute position encoding, positions exceeding " - "derived_max_model_len will cause a CUDA array out-of-bounds " - "error.") - if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN: - logger.warning_once("%s %s", msg, warning) - else: - raise ValueError( - f"{msg} To allow overriding this maximum, set " - f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}") - return int(max_model_len) - - -def get_served_model_name(model: str, - served_model_name: Optional[Union[str, list[str]]]): - """ - If the input is a non-empty list, the first model_name in - `served_model_name` is taken. - If the input is a non-empty string, it is used directly. - For cases where the input is either an empty string or an - empty list, the fallback is to use `self.model`. 
- """ - if not served_model_name: - return model - if isinstance(served_model_name, list): - return served_model_name[0] - return served_model_name - - DetailedTraceModules = Literal["model", "worker", "all"] @@ -3012,33 +959,6 @@ def get_current_model_prefix() -> str: return _current_prefix -def contains_object_print(text): - """ - Check if the text looks like a printed Python object, e.g. - contains any substring matching the pattern: "at 0xFFFFFFF>" - We match against 0x followed by 2-16 hex chars (there's - a max of 16 on a 64-bit system). - - Args: - text (str): The text to check - - Returns: - result (bool): `True` if a match is found, `False` otherwise. - """ - pattern = r'at 0x[a-fA-F0-9]{2,16}>' - match = re.search(pattern, text) - return match is not None - - -def assert_hashable(text): - if not contains_object_print(text): - return True - raise AssertionError( - f"vLLM tried to hash some configs that may have Python objects ids " - f"in them. This is a bug, please file an issue. " - f"Text being hashed: {text}") - - T = TypeVar("T") diff --git a/vllm/config/model.py b/vllm/config/model.py new file mode 100644 index 0000000000000..21457d3660a23 --- /dev/null +++ b/vllm/config/model.py @@ -0,0 +1,2006 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +import json +import warnings +from dataclasses import InitVar, field +from importlib.util import find_spec +from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Union, + cast, get_args) + +import torch +from pydantic import (ConfigDict, SkipValidation, field_validator, + model_validator) +from pydantic.dataclasses import dataclass +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from typing_extensions import assert_never + +import vllm.envs as envs +from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, + MultiModalConfig) +from vllm.config.pooler import PoolerConfig +from vllm.config.utils import assert_hashable, config +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.transformers_utils.config import ( + ConfigFormat, get_config, get_hf_image_processor_config, + get_hf_text_config, get_pooling_config, + get_sentence_transformer_tokenizer_config, is_encoder_decoder, + is_interleaved, maybe_override_with_speculators_target_model, + try_get_generation_config, try_get_safetensors_metadata, + try_get_tokenizer_config, uses_mrope) +from vllm.transformers_utils.runai_utils import (ObjectStorageModel, + is_runai_obj_uri) +from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, + LazyLoader, common_broadcastable_dtype) + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + import vllm.model_executor.layers.quantization as me_quant + import vllm.model_executor.models as me_models + from vllm.config.load import LoadConfig + from vllm.config.parallel import ParallelConfig + from vllm.config.scheduler import RunnerType + from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.v1.sample.logits_processor import LogitsProcessor +else: + PretrainedConfig = Any + + me_quant = LazyLoader("model_executor", globals(), + "vllm.model_executor.layers.quantization") + me_models = LazyLoader("model_executor", globals(), + "vllm.model_executor.models") + LoadConfig = Any + ParallelConfig = Any + RunnerType = Any + QuantizationMethods = Any + LogitsProcessor = Any + 
+logger = init_logger(__name__) + +RunnerOption = Literal["auto", "generate", "pooling", "draft"] +ConvertType = Literal["none", "embed", "classify", "reward"] +ConvertOption = Literal["auto", ConvertType] +TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", + "score", "reward", "transcription", "draft"] +_ResolvedTask = Literal["generate", "transcription", "encode", "embed", + "classify", "reward", "draft"] +TokenizerMode = Literal["auto", "slow", "mistral", "custom"] +ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] +LogprobsMode = Literal["raw_logits", "raw_logprobs", "processed_logits", + "processed_logprobs"] +HfOverrides = Union[dict[str, Any], Callable[[type], type]] +ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"] + +_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { + "generate": ["generate", "transcription"], + "pooling": ["embedding", "embed", "classify", "score", "reward"], + "draft": ["draft"], +} + +_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { + "generate": [], + "pooling": ["embed", "classify", "reward"], + "draft": [], +} + + +@config +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class ModelConfig: + """Configuration for the model.""" + + model: str = "Qwen/Qwen3-0.6B" + """Name or path of the Hugging Face model to use. It is also used as the + content for `model_name` tag in metrics output when `served_model_name` is + not specified.""" + runner: RunnerOption = "auto" + """The type of model runner to use. Each vLLM instance only supports one + model runner, even if the same model can be used for multiple types.""" + convert: ConvertOption = "auto" + """Convert the model using adapters defined in + [vllm.model_executor.models.adapters][]. The most common use case is to + adapt a text generation model to be used for pooling tasks.""" + task: Optional[TaskOption] = None + """[DEPRECATED] The task to use the model for. If the model supports more + than one model runner, this is used to select which model runner to run. + + Note that the model may support other tasks using the same model runner. + """ + tokenizer: SkipValidation[str] = None # type: ignore + """Name or path of the Hugging Face tokenizer to use. If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode = "auto" + """Tokenizer mode:\n + - "auto" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "custom" will use --tokenizer to select the preregistered tokenizer.""" + trust_remote_code: bool = False + """Trust remote code (e.g., from HuggingFace) when downloading the model + and tokenizer.""" + dtype: Union[ModelDType, torch.dtype] = "auto" + """Data type for model weights and activations:\n + - "auto" will use FP16 precision for FP32 and FP16 models, and BF16 + precision for BF16 models.\n + - "half" for FP16. Recommended for AWQ quantization.\n + - "float16" is the same as "half".\n + - "bfloat16" for a balance between precision and range.\n + - "float" is shorthand for FP32 precision.\n + - "float32" for FP32 precision.""" + seed: Optional[int] = None + """Random seed for reproducibility. Initialized to None in V0, but + initialized to 0 in V1.""" + hf_config_path: Optional[str] = None + """Name or path of the Hugging Face config to use. 
If unspecified, model + name or path will be used.""" + allowed_local_media_path: str = "" + """Allowing API requests to read local images or videos from directories + specified by the server file system. This is a security risk. Should only + be enabled in trusted environments.""" + revision: Optional[str] = None + """The specific model version to use. It can be a branch name, a tag name, + or a commit id. If unspecified, will use the default version.""" + code_revision: Optional[str] = None + """The specific revision to use for the model code on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + rope_scaling: dict[str, Any] = field(default_factory=dict) + """RoPE scaling configuration. For example, + `{"rope_type":"dynamic","factor":2.0}`.""" + rope_theta: Optional[float] = None + """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE + theta improves the performance of the scaled model.""" + tokenizer_revision: Optional[str] = None + """The specific revision to use for the tokenizer on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + max_model_len: SkipValidation[int] = None # type: ignore + """Model context length (prompt and output). If unspecified, will be + automatically derived from the model config. + + When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable + format. Examples:\n + - 1k -> 1000\n + - 1K -> 1024\n + - 25.6k -> 25,600""" + spec_target_max_model_len: Optional[int] = None + """Specify the maximum length for spec decoding draft models.""" + quantization: SkipValidation[Optional[QuantizationMethods]] = None + """Method used to quantize the weights. If `None`, we first check the + `quantization_config` attribute in the model config file. If that is + `None`, we assume the model weights are not quantized and use `dtype` to + determine the data type of the weights.""" + enforce_eager: bool = False + """Whether to always use eager-mode PyTorch. If True, we will disable CUDA + graph and always execute the model in eager mode. If False, we will use + CUDA graph and eager execution in hybrid for maximal performance and + flexibility.""" + max_seq_len_to_capture: int = 8192 + """Maximum sequence len covered by CUDA graphs. When a sequence has context + length larger than this, we fall back to eager mode. Additionally for + encoder-decoder models, if the sequence length of the encoder input is + larger than this, we fall back to the eager mode.""" + max_logprobs: int = 20 + """Maximum number of log probabilities to return when `logprobs` is + specified in `SamplingParams`. The default value comes the default for the + OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * + vocab_size) logprobs are allowed to be returned and it may cause OOM.""" + logprobs_mode: LogprobsMode = "raw_logprobs" + """Indicates the content returned in the logprobs and prompt_logprobs. + Supported mode: + 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. + Raw means the values before applying any logit processors, like bad words. + Processed means the values after applying all processors, including + temperature and top_k/top_p. + """ + disable_sliding_window: bool = False + """Whether to disable sliding window. If True, we will disable the sliding + window functionality of the model, capping to sliding window size. 
If the + model does not support sliding window, this argument is ignored.""" + disable_cascade_attn: bool = False + """Disable cascade attention for V1. While cascade attention does not + change the mathematical correctness, disabling it could be useful for + preventing potential numerical issues. Note that even if this is set to + False, cascade attention will be only used when the heuristic tells that + it's beneficial.""" + skip_tokenizer_init: bool = False + """Skip initialization of tokenizer and detokenizer. Expects valid + `prompt_token_ids` and `None` for prompt from the input. The generated + output will contain token ids.""" + enable_prompt_embeds: bool = False + """If `True`, enables passing text embeddings as inputs via the + `prompt_embeds` key. Note that enabling this will double the time required + for graph compilation.""" + served_model_name: Optional[Union[str, list[str]]] = None + """The model name(s) used in the API. If multiple names are provided, the + server will respond to any of the provided names. The model name in the + model field of a response will be the first name in this list. If not + specified, the model name will be the same as the `--model` argument. Noted + that this name(s) will also be used in `model_name` tag content of + prometheus metrics, if multiple names provided, metrics tag will take the + first one.""" + use_async_output_proc: bool = True + """Whether to use async output processor.""" + config_format: Union[str, ConfigFormat] = "auto" + """The format of the model config to load:\n + - "auto" will try to load the config in hf format if available else it + will try to load in mistral format.\n + - "hf" will load the config in hf format.\n + - "mistral" will load the config in mistral format.""" + hf_token: Optional[Union[bool, str]] = None + """The token to use as HTTP bearer authorization for remote files . If + `True`, will use the token generated when running `huggingface-cli login` + (stored in `~/.huggingface`).""" + hf_overrides: HfOverrides = field(default_factory=dict) + """If a dictionary, contains arguments to be forwarded to the Hugging Face + config. If a callable, it is called to update the HuggingFace config.""" + logits_processor_pattern: Optional[str] = None + """Optional regex pattern specifying valid logits processor qualified names + that can be passed with the `logits_processors` extra completion argument. + Defaults to `None`, which allows no processors.""" + generation_config: str = "auto" + """The folder path to the generation config. Defaults to `"auto"`, the + generation config will be loaded from model path. If set to `"vllm"`, no + generation config is loaded, vLLM defaults will be used. If set to a folder + path, the generation config will be loaded from the specified folder path. + If `max_new_tokens` is specified in generation config, then it sets a + server-wide limit on the number of output tokens for all requests.""" + override_generation_config: dict[str, Any] = field(default_factory=dict) + """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If + used with `--generation-config auto`, the override parameters will be + merged with the default config from the model. 
If used with + `--generation-config vllm`, only the override parameters are used.""" + enable_sleep_mode: bool = False + """Enable sleep mode for the engine (only cuda platform is supported).""" + model_impl: Union[str, ModelImpl] = "auto" + """Which implementation of the model to use:\n + - "auto" will try to use the vLLM implementation, if it exists, and fall + back to the Transformers implementation if no vLLM implementation is + available.\n + - "vllm" will use the vLLM model implementation.\n + - "transformers" will use the Transformers model implementation.\n + - "terratorch" will use the TerraTorch model implementation. + """ + override_attention_dtype: Optional[str] = None + """Override dtype for attention""" + logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None + """One or more logits processors' fully-qualified class names or class + definitions""" + io_processor_plugin: Optional[str] = None + """IOProcessor plugin name to load at model startup""" + + # Pooler config + pooler_config: Optional[PoolerConfig] = None + """Pooler config which controls the behaviour of output pooling in pooling + models.""" + override_pooler_config: Optional[Union[dict, PoolerConfig]] = None + """[DEPRECATED] Use `pooler_config` instead. This field will be removed in + v0.12.0 or v1.0.0, whichever is sooner.""" + + # Multimodal config and init vars + multimodal_config: Optional[MultiModalConfig] = None + """Configuration for multimodal model. If `None`, this will be inferred + from the architecture of `self.model`.""" + limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None + media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None + mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None + mm_processor_cache_gb: InitVar[Optional[float]] = None + mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None + mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None + mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None + interleave_mm_strings: InitVar[Optional[bool]] = None + skip_mm_profiling: InitVar[Optional[bool]] = None + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.model) + factors.append(self.dtype) + factors.append(self.quantization) + factors.append(self.revision) + factors.append(self.code_revision) + factors.append(self.max_model_len) + factors.append(self.max_logprobs) + factors.append(self.disable_sliding_window) + factors.append(self.trust_remote_code) + factors.append(self.generation_config) + factors.append(self.model_impl) + factors.append(self.override_generation_config) + factors.append(self.rope_scaling) + factors.append(self.rope_theta) + # hf_config can control how the model looks! 
+ factors.append(self.hf_config.to_json_string()) + str_factors = str(factors) + assert_hashable(str_factors) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __post_init__( + self, + # Multimodal config init vars + limit_mm_per_prompt: Optional[dict[str, int]], + media_io_kwargs: Optional[dict[str, dict[str, Any]]], + mm_processor_kwargs: Optional[dict[str, Any]], + mm_processor_cache_gb: Optional[float], + mm_processor_cache_type: Optional[MMCacheType], + mm_shm_cache_max_object_size_mb: Optional[int], + mm_encoder_tp_mode: Optional[MMEncoderTPMode], + interleave_mm_strings: Optional[bool], + skip_mm_profiling: Optional[bool]) -> None: + # Set the default seed to 0 in V1. + # NOTE(woosuk): In V0, we set the default seed to None because the + # driver worker shares the same process as the user process, and thus + # setting a seed affects the user process as well. + # In V1, we use separate processes for workers (unless + # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here + # doesn't affect the user process. However, without a consistent seed, + # different tensor parallel workers would sample different tokens, + # leading to inconsistent results. + if envs.VLLM_USE_V1 and self.seed is None: + self.seed = 0 + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", self.seed) + + # Keep set served_model_name before maybe_model_redirect(self.model) + self.served_model_name = get_served_model_name(self.model, + self.served_model_name) + self.model = maybe_model_redirect(self.model) + # The tokenizer is consistent with the model by default. + if self.tokenizer is None: + self.tokenizer = self.model + if self.tokenizer_revision is None: + self.tokenizer_revision = self.revision + self.tokenizer = maybe_model_redirect(self.tokenizer) + + if isinstance(self.hf_config_path, str): + self.hf_config_path = maybe_model_redirect(self.hf_config_path) + + if callable(self.hf_overrides): + hf_overrides_kw = {} + hf_overrides_fn = self.hf_overrides + else: + hf_overrides_kw = self.hf_overrides + hf_overrides_fn = None + + if self.rope_scaling: + hf_override: dict[str, Any] = {"rope_scaling": self.rope_scaling} + hf_overrides_kw.update(hf_override) + hf_overrides_str = json.dumps(hf_overrides_kw) + msg = ( + "`--rope-scaling` will be removed in a future release. " + f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + if self.rope_theta is not None: + hf_override = {"rope_theta": self.rope_theta} + hf_overrides_kw.update(hf_override) + hf_overrides_str = json.dumps(hf_overrides_kw) + msg = ( + "`--rope-theta` will be removed in a future release. 
" + f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + + if self.runner != "draft": + # If we're not running the draft model, check for speculators config + # If speculators config, set model / tokenizer to be target model + self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code) + + if (backend := envs.VLLM_ATTENTION_BACKEND + ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: + raise ValueError( + "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " + "module was not found. See " + "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 + "for instructions on how to install it.") + + from vllm.platforms import current_platform + + if (self.override_attention_dtype is not None + and not current_platform.is_rocm()): + warnings.warn( + "override-attention-dtype is set but not using ROCm platform", + stacklevel=2) + + if (self.enable_sleep_mode + and not current_platform.is_sleep_mode_available()): + raise ValueError( + "Sleep mode is not supported on current platform.") + + hf_config = get_config(self.hf_config_path or self.model, + self.trust_remote_code, + self.revision, + self.code_revision, + self.config_format, + hf_overrides_kw=hf_overrides_kw, + hf_overrides_fn=hf_overrides_fn) + + self.hf_config = hf_config + self.hf_text_config = get_hf_text_config(self.hf_config) + self.attention_chunk_size = getattr(self.hf_text_config, + "attention_chunk_size", None) + self.encoder_config = self._get_encoder_config() + self.hf_image_processor_config = get_hf_image_processor_config( + self.model, hf_token=self.hf_token, revision=self.revision) + + architectures = self.architectures + registry = self.registry + is_generative_model = registry.is_text_generation_model( + architectures, self) + is_pooling_model = registry.is_pooling_model(architectures, self) + + def _task_to_convert(task: TaskOption) -> ConvertType: + if task == "embedding" or task == "embed": + return "embed" + if task == "classify": + return "classify" + if task == "reward": + return "reward" + if task == "score": + new_task = self._get_default_pooling_task(architectures) + return "classify" if new_task == "classify" else "embed" + + return "none" + + if self.task is not None: + runner: RunnerOption = "auto" + convert: ConvertOption = "auto" + msg_prefix = ("The 'task' option has been deprecated and will be " + "removed in v0.13.0 or v1.0, whichever comes first.") + msg_hint = "Please remove this option." 
+ + is_generative_task = self.task in _RUNNER_TASKS["generate"] + is_pooling_task = self.task in _RUNNER_TASKS["pooling"] + + if is_generative_model and is_pooling_model: + if is_generative_task: + runner = "generate" + convert = "auto" + msg_hint = ("Please replace this option with `--runner " + "generate` to continue using this model " + "as a generative model.") + elif is_pooling_task: + runner = "pooling" + convert = "auto" + msg_hint = ("Please replace this option with `--runner " + "pooling` to continue using this model " + "as a pooling model.") + else: # task == "auto" + pass + elif is_generative_model or is_pooling_model: + if is_generative_task: + runner = "generate" + convert = "auto" + msg_hint = "Please remove this option" + elif is_pooling_task: + runner = "pooling" + convert = _task_to_convert(self.task) + msg_hint = ("Please replace this option with `--convert " + f"{convert}` to continue using this model " + "as a pooling model.") + else: # task == "auto" + pass + else: + raise AssertionError("The model should be a generative or " + "pooling model when task is set to " + f"{self.task!r}.") + + self.runner = runner + self.convert = convert + + msg = f"{msg_prefix} {msg_hint}" + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + self.runner_type = self._get_runner_type(architectures, self.runner) + self.convert_type = self._get_convert_type(architectures, + self.runner_type, + self.convert) + + if self.runner_type == "generate" and not is_generative_model: + generate_converts = _RUNNER_CONVERTS["generate"] + if self.convert_type not in generate_converts: + # Currently we don't have any converters for generative models + raise ValueError( + "This model does not support `--runner generate`.") + if self.runner_type == "pooling" and not is_pooling_model: + pooling_converts = _RUNNER_CONVERTS["pooling"] + if self.convert_type not in pooling_converts: + convert_option = "<" + "|".join(pooling_converts) + ">" + raise ValueError( + "This model does not support `--runner pooling`. " + f"You can pass `--convert {convert_option} to adapt " + "it into a pooling model.") + + self.supported_tasks = self._get_supported_tasks( + architectures, self.runner_type, self.convert_type) + + # Note: Initialize these attributes early because transformers fallback + # may fail to load dynamic modules in child processes + model_info, arch = registry.inspect_model_cls(architectures, self) + self._model_info = model_info + self._architecture = arch + logger.info("Resolved architecture: %s", arch) + + # Init pooler config if needed + if self.runner_type == "pooling": + if self.override_pooler_config is not None: + logger.warning_once( + "`override_pooler_config` is deprecated and will be " + "removed in v0.12.0 or v1.0.0, whichever is sooner. 
" + "Please use `pooler_config` instead.") + + if isinstance(self.override_pooler_config, dict): + self.pooler_config = PoolerConfig( + **self.override_pooler_config) + else: + self.pooler_config = self.override_pooler_config + + if self.pooler_config is None: + self.pooler_config = PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(self.pooler_config, k) is None: + setattr(self.pooler_config, k, v) + + default_pooling_type = self._model_info.default_pooling_type + if self.pooler_config.pooling_type is None: + self.pooler_config.pooling_type = default_pooling_type + + self.dtype: torch.dtype = _get_and_verify_dtype( + self.model, + self.hf_config, + self.dtype, + is_pooling_model=self.runner_type == "pooling", + revision=self.revision, + ) + + # Interleaved attention is not supported by some backends in V0 + if (not self.disable_sliding_window + and is_interleaved(self.hf_text_config) + and not envs.VLLM_USE_V1 + and (backend := envs.VLLM_ATTENTION_BACKEND) + in ("XFORMERS", "FLASHINFER")): + logger.warning_once( + "%s has interleaved attention, which is currently not " + "supported by the %s backend. Disabling sliding window and " + "capping the max length to the sliding window size (%d).", + self.hf_text_config.model_type, + backend, + self.hf_text_config.sliding_window, + ) + self.disable_sliding_window = True + + self.original_max_model_len = self.max_model_len + self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + # Init multimodal config if needed + if self._model_info.supports_multimodal: + if (mm_encoder_tp_mode == "data" and + not self._model_info.supports_multimodal_encoder_tp_data): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. 
" + "Falling back to `--mm-encoder-tp-mode weights`.") + mm_encoder_tp_mode = "weights" + + mm_config_kwargs = dict( + limit_per_prompt=limit_mm_per_prompt, + media_io_kwargs=media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + mm_processor_cache_gb=mm_processor_cache_gb, + mm_processor_cache_type=mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=mm_encoder_tp_mode, + interleave_mm_strings=interleave_mm_strings, + skip_mm_profiling=skip_mm_profiling, + ) + + mm_config_kwargs = { + k: v + for k, v in mm_config_kwargs.items() if v is not None + } + + self.multimodal_config = MultiModalConfig(**mm_config_kwargs) + + if self.disable_sliding_window: + # Set after get_and_verify_max_len to ensure that max_model_len + # can be correctly capped to sliding window size + self.hf_text_config.sliding_window = None + + if not self.skip_tokenizer_init: + self._verify_tokenizer_mode() + + # Avoid running try_verify_and_update_config multiple times + self.config_updated = False + + self._verify_quantization() + self._verify_cuda_graph() + self._verify_bnb_config() + + @field_validator("quantization", mode="before") + @classmethod + def validate_quantization_before(cls, value: Any) -> Any: + if isinstance(value, str): + return value.lower() + return value + + @model_validator(mode="after") + def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": + if not isinstance(self.tokenizer, str): + raise ValueError("tokenizer must be a string after __post_init__.") + if not isinstance(self.max_model_len, int): + raise ValueError( + "max_model_len must be an integer after __post_init__.") + return self + + def _get_transformers_backend_cls(self) -> str: + """Determine which Transformers backend class will be used if + `model_impl` is set to `transformers` or `auto`.""" + if getattr(self, "runner_type", self.runner) == "pooling": + return "TransformersModel" + if self.hf_config != self.hf_text_config: + # If 'hf_text_config' is the same as 'hf_config'. If not, it is + # probably a composite config, i.e. multimodal + return "TransformersForMultimodalLM" + return "TransformersForCausalLM" + + def using_transformers_backend(self) -> bool: + """Check if the model is using the Transformers backend class.""" + return self.architecture == self._get_transformers_backend_cls() + + @property + def registry(self): + return me_models.ModelRegistry + + @property + def architectures(self) -> list[str]: + return getattr(self.hf_config, "architectures", []) + + @property + def architecture(self) -> str: + """The architecture vllm actually used.""" + return self._architecture + + def maybe_pull_model_tokenizer_for_runai(self, model: str, + tokenizer: str) -> None: + """Pull model/tokenizer from Object Storage to temporary + directory when needed. 
+ + Args: + model: Model name or path + tokenizer: Tokenizer name or path + """ + if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): + return + + if is_runai_obj_uri(model): + object_storage_model = ObjectStorageModel() + object_storage_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"]) + self.model_weights = model + self.model = object_storage_model.dir + + # If tokenizer is same as model, download to same directory + if model == tokenizer: + object_storage_model.pull_files(model, + ignore_pattern=[ + "*.pt", "*.safetensors", + "*.bin", "*.tensors", + "*.pth" + ]) + self.tokenizer = object_storage_model.dir + return + + # Only download tokenizer if needed and not already handled + if is_runai_obj_uri(tokenizer): + object_storage_tokenizer = ObjectStorageModel() + object_storage_tokenizer.pull_files(model, + ignore_pattern=[ + "*.pt", "*.safetensors", + "*.bin", "*.tensors", + "*.pth" + ]) + self.tokenizer = object_storage_tokenizer.dir + + def _get_encoder_config(self): + return get_sentence_transformer_tokenizer_config( + self.model, self.revision) + + def _verify_tokenizer_mode(self) -> None: + tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) + if tokenizer_mode not in get_args(TokenizerMode): + raise ValueError( + f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " + f"one of {get_args(TokenizerMode)}.") + self.tokenizer_mode = tokenizer_mode + + def _get_default_runner_type( + self, + architectures: list[str], + ) -> RunnerType: + registry = self.registry + + # Some Sentence Transformers models use *ForCausalLM archs + if get_pooling_config(self.model, self.revision): + return "pooling" + + for arch in architectures: + if arch in registry.get_supported_archs(): + if registry.is_pooling_model(architectures, self): + return "pooling" + if registry.is_text_generation_model(architectures, self): + return "generate" + + match = try_match_architecture_defaults(arch) + if match: + _, (runner_type, _) = match + return runner_type + + return "generate" + + def _get_runner_type( + self, + architectures: list[str], + runner: RunnerOption, + ) -> RunnerType: + if runner != "auto": + return runner + + runner_type = self._get_default_runner_type(architectures) + + # Don't log the most common case + if runner_type != "generate": + logger.info( + "Resolved `--runner auto` to `--runner %s`. 
" + "Pass the value explicitly to silence this message.", + runner_type) + + return runner_type + + def _get_default_convert_type( + self, + architectures: list[str], + runner_type: RunnerType, + ) -> ConvertType: + registry = self.registry + + for arch in architectures: + if arch in registry.get_supported_archs(): + if (runner_type == "generate" + and registry.is_text_generation_model( + architectures, self)): + return "none" + if (runner_type == "pooling" + and registry.is_pooling_model(architectures, self)): + return "none" + + match = try_match_architecture_defaults(arch, + runner_type=runner_type) + if match: + _, (_, convert_type) = match + return convert_type + + # This is to handle Sentence Transformers models that use *ForCausalLM + # and also multi-modal pooling models which are not defined as + # Sentence Transformers models + if runner_type == "pooling": + return "embed" + + return "none" + + def _get_convert_type( + self, + architectures: list[str], + runner_type: RunnerType, + convert: ConvertOption, + ) -> ConvertType: + if convert != "auto": + return convert + + convert_type = self._get_default_convert_type(architectures, + runner_type) + + # Don't log the most common case + if convert_type != "none": + logger.info( + "Resolved `--convert auto` to `--convert %s`. " + "Pass the value explicitly to silence this message.", + convert_type) + + return convert_type + + def _get_supported_generation_tasks( + self, + architectures: list[str], + convert_type: ConvertType, + ) -> list[_ResolvedTask]: + registry = self.registry + + if registry.is_transcription_only_model(architectures, self): + return ["transcription"] + + # TODO: Use get_supported_generation_tasks once V0 is removed + supported_tasks = list[_ResolvedTask]() + if (registry.is_text_generation_model(architectures, self) + or convert_type in _RUNNER_CONVERTS["generate"]): + supported_tasks.append("generate") + + if registry.is_transcription_model(architectures, self): + supported_tasks.append("transcription") + + return supported_tasks + + def _get_default_pooling_task( + self, + architectures: list[str], + ) -> Literal["embed", "classify", "reward"]: + if self.registry.is_cross_encoder_model(architectures, self): + return "classify" + + for arch in architectures: + match = try_match_architecture_defaults(arch, + runner_type="pooling") + if match: + _, (_, convert_type) = match + assert convert_type != "none" + return convert_type + + return "embed" + + def _get_supported_pooling_tasks( + self, + architectures: list[str], + convert_type: ConvertType, + ) -> list[_ResolvedTask]: + registry = self.registry + + # TODO: Use get_supported_pooling_tasks once V0 is removed + supported_tasks = list[_ResolvedTask]() + if (registry.is_pooling_model(architectures, self) + or convert_type in _RUNNER_CONVERTS["pooling"]): + supported_tasks.append("encode") + + extra_task = (self._get_default_pooling_task(architectures) + if convert_type == "none" else convert_type) + supported_tasks.append(extra_task) + + return supported_tasks + + def _get_supported_tasks( + self, + architectures: list[str], + runner_type: RunnerType, + convert_type: ConvertType, + ) -> list[_ResolvedTask]: + if runner_type == "generate": + return self._get_supported_generation_tasks( + architectures, convert_type) + if runner_type == "pooling": + return self._get_supported_pooling_tasks(architectures, + convert_type) + if runner_type == "draft": + return ["draft"] + + assert_never(runner_type) + + def _parse_quant_hf_config(self, hf_config: PretrainedConfig): + 
quant_cfg = getattr(hf_config, "quantization_config", None) + if quant_cfg is None: + # compressed-tensors uses a "compression_config" key + quant_cfg = getattr(hf_config, "compression_config", None) + + else: + # Set quant_method for ModelOpt models. + producer_name = quant_cfg.get("producer", {}).get("name") + if producer_name == "modelopt": + quant_algo = quant_cfg.get("quantization", + {}).get("quant_algo") + if quant_algo == "FP8": + quant_cfg["quant_method"] = "modelopt" + elif quant_algo == "NVFP4": + quant_cfg["quant_method"] = "modelopt_fp4" + elif quant_algo is not None: + raise ValueError( + f"Unknown ModelOpt quant algo: {quant_algo}") + + return quant_cfg + + def _verify_quantization(self) -> None: + supported_quantization = me_quant.QUANTIZATION_METHODS + if self.quantization is not None: + self.quantization = cast(me_quant.QuantizationMethods, + self.quantization) + + # Parse quantization method from the HF model config, if available. + quant_cfg = self._parse_quant_hf_config(self.hf_config) + if quant_cfg is None and (text_config := getattr( + self.hf_config, "text_config", None)): + # Check the text config as well for multi-modal models. + quant_cfg = self._parse_quant_hf_config(text_config) + + if quant_cfg is not None: + # Use the community standard 'quant_method' + quant_method = quant_cfg.get("quant_method", "").lower() + + # Normalize library names + quant_method = quant_method.replace("compressed_tensors", + "compressed-tensors") + + quant_cfg["quant_method"] = quant_method + + # Quantization methods which are overrides (i.e. they have a + # `override_quantization_method` method) must be checked in order + # of preference (this is particularly important for GPTQ). + overrides = [ + "bitblas", + "gptq_marlin_24", + "gptq_marlin", + "gptq_bitblas", + "awq_marlin", + "ipex", + "moe_wna16", + "modelopt", + "modelopt_fp4", + "petit_nvfp4", + ] + quantization_methods = [ + q for q in supported_quantization if q not in overrides + ] + # Any custom overrides will be in quantization_methods so we place + # them at the start of the list so custom overrides have preference + # over the built-in ones. + quantization_methods = quantization_methods + overrides + + # Detect which checkpoint is it + for name in quantization_methods: + method = me_quant.get_quantization_config(name) + quantization_override = method.override_quantization_method( + quant_cfg, self.quantization) + if quantization_override is not None: + # Raise error if the override is not custom (custom would + # be in QUANTIZATION_METHODS but not QuantizationMethods) + # and hasn't been added to the overrides list. + if (name in get_args(me_quant.QuantizationMethods) + and name not in overrides): + raise ValueError( + f"Quantization method {name} is an override but " + "is has not been added to the `overrides` list " + "above. This is necessary to ensure that the " + "overrides are checked in order of preference.") + quant_method = quantization_override + self.quantization = quantization_override + break + + # Verify quantization configurations. 
+ if self.quantization is None: + self.quantization = quant_method + elif self.quantization != quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization}).") + + if self.quantization is not None: + if self.quantization not in supported_quantization: + raise ValueError( + f"Unknown quantization method: {self.quantization}. Must " + f"be one of {supported_quantization}.") + from vllm.platforms import current_platform + current_platform.verify_quantization(self.quantization) + + def _verify_cuda_graph(self) -> None: + # The `max_seq_len_to_capture` was incorrectly + # based on the encoder's input length (448) + # but not the decoder's larger input length (1500). + # This change ensures the CUDA Graph captures the correct, + # larger sequence length, allowing it to work as intended. + effective_max_seq_len = self.max_model_len + if self.is_encoder_decoder: + effective_max_seq_len = max( + effective_max_seq_len, + getattr(self.hf_config, "max_source_positions", 0)) + self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, + effective_max_seq_len) + # CUDAGraph capture not supported for encoder-decoder models on ROCm + unsupported_rocm = self.is_encoder_decoder + + if (unsupported_rocm and not self.enforce_eager + and current_platform.is_rocm()): + logger.warning( + "CUDA graph is not supported for %s on ROCm yet, fallback " + "to eager mode.", self.hf_config.model_type) + self.enforce_eager = True + + def _verify_bnb_config(self) -> None: + """ + The current version of bitsandbytes (0.46.1) with 8-bit models does not + yet support CUDA graph. + # TODO Remove this when bitsandbytes supports. 
+ """ + is_bitsandbytes = self.quantization == "bitsandbytes" + has_quantization_config = (getattr(self.hf_config, + "quantization_config", None) + is not None) + is_8bit = (self.hf_config.quantization_config.get( + "load_in_8bit", False) if has_quantization_config else False) + if all([ + is_bitsandbytes, + has_quantization_config, + is_8bit, + not self.enforce_eager, + ]): + logger.warning( + "CUDA graph is not supported on BitsAndBytes 8bit yet, " + "fallback to the eager mode.") + + self.enforce_eager = True + + def _verify_with_expert_parallelism(self) -> None: + num_expert_names = [ + "moe_num_experts", # Dbrx + "num_experts", # Jamba + "n_routed_experts", # DeepSeek + "num_local_experts", # Mixtral + ] + num_experts = 0 + for name in num_expert_names: + num_experts = getattr(self.hf_text_config, name, 0) + if num_experts > 0: + break + if num_experts < 1: + raise ValueError( + "Number of experts in the model must be greater than 0 " + "when expert parallelism is enabled.") + + def verify_dual_chunk_attention_config( + self, + load_config: LoadConfig, + ) -> None: + if hasattr(self.hf_config, "dual_chunk_attention_config"): + # Try loading the sparse attention config + from vllm.model_executor.model_loader.weight_utils import ( + get_sparse_attention_config) + sparse_attn_config = get_sparse_attention_config(self, load_config) + if sparse_attn_config: + self.hf_config.dual_chunk_attention_config[ + "sparse_attention_config"] = sparse_attn_config + if "sparse_attention_enabled" not in \ + self.hf_config.dual_chunk_attention_config: + self.hf_config.dual_chunk_attention_config[ + "sparse_attention_enabled"] = True + + if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL: + raise ValueError("please set VLLM_ATTENTION_BACKEND to " + f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") + + def verify_async_output_proc(self, parallel_config, speculative_config, + device_config) -> None: + if not self.use_async_output_proc: + # Nothing to check + return + + if parallel_config.pipeline_parallel_size > 1: + self.use_async_output_proc = False + return + + # Reminder: Please update docs/features/compatibility_matrix.md + # If the feature combo become valid + from vllm.platforms import current_platform + if not current_platform.is_async_output_supported(self.enforce_eager): + self.use_async_output_proc = False + return + + if envs.VLLM_USE_RAY_SPMD_WORKER: + self.use_async_output_proc = False + return + + # Async postprocessor is not necessary for pooling models + # since there is no token generation + if self.runner_type == "pooling": + self.use_async_output_proc = False + + # Reminder: Please update docs/features/compatibility_matrix.md + # If the feature combo become valid + if speculative_config: + self.use_async_output_proc = False + + def verify_with_parallel_config( + self, + parallel_config: ParallelConfig, + ) -> None: + + if parallel_config.distributed_executor_backend == "external_launcher": + assert self.seed is not None, ( + "Seed must be set when using external launcher backend to " + "make sure sampling results are the same across workers.") + + total_num_attention_heads = getattr(self.hf_text_config, + "num_attention_heads", 0) + tensor_parallel_size = parallel_config.tensor_parallel_size + if total_num_attention_heads % tensor_parallel_size != 0: + raise ValueError( + f"Total number of attention heads ({total_num_attention_heads})" + " must be divisible by tensor parallel size " + f"({tensor_parallel_size}).") + + if parallel_config.enable_expert_parallel: + 
self._verify_with_expert_parallelism() + + pipeline_parallel_size = parallel_config.pipeline_parallel_size + if pipeline_parallel_size > 1: + if not self.registry.is_pp_supported_model(self.architectures, + self): + raise NotImplementedError( + "Pipeline parallelism is not supported for this model. " + "Supported models implement the `SupportsPP` interface.") + + if self.use_async_output_proc: + self.use_async_output_proc = False + + def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size from the HF text config if present.""" + return getattr(self.hf_text_config, "sliding_window", None) + + def get_vocab_size(self) -> int: + return getattr(self.hf_text_config, "vocab_size", 0) + + def get_hidden_size(self) -> int: + return getattr(self.hf_text_config, "hidden_size", 0) + + @property + def is_deepseek_mla(self) -> bool: + if not hasattr(self.hf_text_config, "model_type"): + return False + elif self.hf_text_config.model_type in \ + ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'kimi_k2'): + return self.hf_text_config.kv_lora_rank is not None + elif self.hf_text_config.model_type == 'eagle': + # if the model is an EAGLE module, check for the + # underlying architecture + return self.hf_text_config.model.model_type in \ + ('deepseek_v2', 'deepseek_v3') \ + and self.hf_text_config.kv_lora_rank is not None + return False + + def get_head_size(self) -> int: + # TODO remove hard code + if self.is_deepseek_mla: + qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", + 0) + if self.use_mla: + return self.hf_text_config.kv_lora_rank + qk_rope_head_dim + else: + qk_nope_head_dim = getattr(self.hf_text_config, + "qk_nope_head_dim", 0) + if qk_rope_head_dim and qk_nope_head_dim: + return qk_rope_head_dim + qk_nope_head_dim + + if hasattr(self.hf_text_config, + "model_type") and (self.hf_text_config.model_type + == "zamba2"): + return self.hf_text_config.attention_head_dim + + if self.is_attention_free: + return 0 + + # NOTE: Some configs may set head_dim=None in the config + if getattr(self.hf_text_config, "head_dim", None) is not None: + return self.hf_text_config.head_dim + + # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` + if getattr(self.hf_text_config, "hidden_size_per_head", + None) is not None: + return self.hf_text_config.hidden_size_per_head + + # FIXME(woosuk): This may not be true for all models. + return (self.hf_text_config.hidden_size // + self.hf_text_config.num_attention_heads) + + def get_total_num_kv_heads(self) -> int: + """Returns the total number of KV heads.""" + # For GPTBigCode & Falcon: + # NOTE: for falcon, when new_decoder_architecture is True, the + # multi_query flag is ignored and we use n_head_kv for the number of + # KV heads. + falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] + new_decoder_arch_falcon = ( + self.hf_config.model_type in falcon_model_types + and getattr(self.hf_config, "new_decoder_architecture", False)) + if not new_decoder_arch_falcon and getattr(self.hf_text_config, + "multi_query", False): + # Multi-query attention, only one KV head. + # Currently, tensor parallelism is not supported in this case. 
+ return 1 + + # For DBRX and MPT + if self.hf_config.model_type == "mpt": + if "kv_n_heads" in self.hf_config.attn_config: + return self.hf_config.attn_config["kv_n_heads"] + return self.hf_config.num_attention_heads + if self.hf_config.model_type == "dbrx": + return getattr(self.hf_config.attn_config, "kv_n_heads", + self.hf_config.num_attention_heads) + + if self.hf_config.model_type == "nemotron-nas": + for block in self.hf_config.block_configs: + if not block.attention.no_op: + return self.hf_config.num_attention_heads \ + // block.attention.n_heads_in_group + + raise RuntimeError("Couldn't determine number of kv heads") + + if self.is_attention_free: + return 0 + + attributes = [ + # For Falcon: + "n_head_kv", + "num_kv_heads", + # For LLaMA-2: + "num_key_value_heads", + # For ChatGLM: + "multi_query_group_num", + ] + for attr in attributes: + num_kv_heads = getattr(self.hf_text_config, attr, None) + if num_kv_heads is not None: + return num_kv_heads + + # For non-grouped-query attention models, the number of KV heads is + # equal to the number of attention heads. + return self.hf_text_config.num_attention_heads + + def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: + """Returns the number of KV heads per GPU.""" + if self.use_mla: + # When using MLA during decode it becomes MQA + return 1 + + total_num_kv_heads = self.get_total_num_kv_heads() + # If tensor parallelism is used, we divide the number of KV heads by + # the tensor parallel size. We will replicate the KV heads in the + # case where the number of KV heads is smaller than the tensor + # parallel size so each GPU has at least one KV head. + return max(1, + total_num_kv_heads // parallel_config.tensor_parallel_size) + + def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int: + num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + return num_heads // parallel_config.tensor_parallel_size + + def get_layers_start_end_indices( + self, parallel_config: ParallelConfig) -> tuple[int, int]: + from vllm.distributed.utils import get_pp_indices + if (self.hf_text_config.model_type == "deepseek_mtp" + or self.hf_config.model_type == "mimo_mtp" + or self.hf_config.model_type == "glm4_moe_mtp" + or self.hf_config.model_type == "ernie_mtp" + or self.hf_config.model_type == "qwen3_next_mtp"): + total_num_hidden_layers = getattr(self.hf_text_config, + "num_nextn_predict_layers", 0) + else: + total_num_hidden_layers = getattr(self.hf_text_config, + "num_hidden_layers", 0) + # the layout order is: DP x PP x TP + pp_rank = (parallel_config.rank // parallel_config.tensor_parallel_size + ) % parallel_config.pipeline_parallel_size + pp_size = parallel_config.pipeline_parallel_size + start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) + return start, end + + def get_num_layers(self, parallel_config: ParallelConfig) -> int: + start, end = self.get_layers_start_end_indices(parallel_config) + return end - start + + def get_num_layers_by_block_type( + self, + parallel_config: ParallelConfig, + block_type: LayerBlockType = LayerBlockType.attention, + ) -> int: + # This function relies on 'layers_block_type' in hf_config, + # for w/o this attribute, we will need to have workarounds like so + attn_block_type = block_type == LayerBlockType.attention + is_transformer = not self.is_hybrid and \ + not self.has_noops and \ + not self.is_attention_free + start, end = self.get_layers_start_end_indices(parallel_config) + + if is_transformer: + # Handle the basic case first + return end - start 
if attn_block_type else 0 + elif self.is_attention_free: + # Attention free + # Note that this code assumes there + # is only one type of attention-free block type. + return 0 if attn_block_type else end - start + elif self.has_noops: + block_configs = self.hf_config.block_configs + return sum(not bc.attention.no_op + for bc in block_configs[start:end]) + else: + # Hybrid model Jamba + layers_block_type_value = getattr(self.hf_text_config, + "layers_block_type", None) + if layers_block_type_value is not None: + if hasattr(self.hf_text_config, + "model_type") and (self.hf_text_config.model_type + == "zamba2"): + if attn_block_type: + return sum(t == "hybrid" + for t in layers_block_type_value[start:end]) + else: + return self.get_num_layers(parallel_config) + return sum(t == block_type.value + for t in layers_block_type_value[start:end]) + + # Hybrid model Minimax + attn_type_list = getattr(self.hf_config, "attn_type_list", None) + if attn_type_list: + return sum(t == 1 for t in attn_type_list[start:end]) + + # Hybrid model Qwen3Next + layer_types_value = getattr(self.hf_config, "layer_types", None) + if layer_types_value is not None: + if getattr(block_type, "value", block_type) == "attention": + return sum(t == "full_attention" + for t in layer_types_value[start:end]) + elif getattr(block_type, "value", + block_type) == "linear_attention": + return sum(t == "linear_attention" + for t in layer_types_value[start:end]) + else: + return sum(t == getattr(block_type, "value", block_type) + for t in layer_types_value[start:end]) + + if (layers_block_type_value is None and attn_type_list is None + and layer_types_value is None): + raise ValueError( + "The model is an hybrid without a" + "layers_block_type or an attn_type_list, or a layer_types " + "in the hf_config, cannot determine the num of " + f"{block_type.value} layers") + + def get_mamba_chunk_size(self) -> Optional[int]: + """ + Returns the mamba chunk size if it exists + """ + # used by e.g. Bamba, FalconH1, Granite, PLaMo2 + chunk_size = getattr(self.hf_text_config, "mamba_chunk_size", None) + if chunk_size is None: + # used by e.g. Mamba2, NemotronH, Zamba + chunk_size = getattr(self.hf_text_config, "chunk_size", None) + return chunk_size + + def get_multimodal_config(self) -> MultiModalConfig: + """ + Get the multimodal configuration of the model. + + Raises: + ValueError: If the model is not multimodal. + """ + if self.multimodal_config is None: + raise ValueError("The model is not multimodal.") + + return self.multimodal_config + + def try_get_generation_config(self) -> dict[str, Any]: + """ + This method attempts to retrieve the non-default values of the + generation config for this model. + + The generation config can contain information about special tokens, as + well as sampling parameters. Which is why this method exists separately + to `get_diff_sampling_param`. + + Returns: + A dictionary containing the non-default generation config. + """ + if self.generation_config in {"auto", "vllm"}: + config = try_get_generation_config( + self.hf_config_path or self.model, + trust_remote_code=self.trust_remote_code, + revision=self.revision, + ) + else: + config = try_get_generation_config( + self.generation_config, + trust_remote_code=self.trust_remote_code, + ) + + if config is None: + return {} + + return config.to_diff_dict() + + def get_diff_sampling_param(self) -> dict[str, Any]: + """ + This method returns a dictionary containing the non-default sampling + parameters with `override_generation_config` applied. 
+ + The default sampling parameters are: + + - vLLM's neutral defaults if `self.generation_config="vllm"` + - the model's defaults if `self.generation_config="auto"` + - as defined in `generation_config.json` if + `self.generation_config="path/to/generation_config/dir"` + + Returns: + A dictionary containing the non-default sampling parameters. + """ + if self.generation_config == "vllm": + config = {} + else: + config = self.try_get_generation_config() + + # Overriding with given generation config + config.update(self.override_generation_config) + + available_params = [ + "repetition_penalty", + "temperature", + "top_k", + "top_p", + "min_p", + "max_new_tokens", + ] + if any(p in config for p in available_params): + diff_sampling_param = { + p: config.get(p) + for p in available_params if config.get(p) is not None + } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens") + else: + diff_sampling_param = {} + + if diff_sampling_param: + logger.warning_once( + "Default sampling parameters have been overridden by the " + "model's Hugging Face generation config recommended from the " + "model creator. If this is not intended, please relaunch " + "vLLM instance with `--generation-config vllm`.") + return diff_sampling_param + + @property + def is_encoder_decoder(self) -> bool: + """Extract the HF encoder/decoder model flag.""" + return is_encoder_decoder(self.hf_config) + + @property + def uses_mrope(self) -> bool: + return uses_mrope(self.hf_config) + + @property + def is_multimodal_model(self) -> bool: + return self.multimodal_config is not None + + @property + def is_multimodal_raw_input_only_model(self) -> bool: + return self._model_info.supports_multimodal_raw_input_only + + @property + def is_cross_encoder(self) -> bool: + return (self._model_info.supports_cross_encoding + or self.convert_type == "classify") + + @property + def is_pp_supported(self) -> bool: + return self._model_info.supports_pp + + @property + def is_attention_free(self) -> bool: + return self._model_info.is_attention_free + + @property + def is_hybrid(self) -> bool: + return self._model_info.is_hybrid + + @property + def has_noops(self) -> bool: + return self._model_info.has_noops + + @property + def has_inner_state(self): + return self._model_info.has_inner_state + + @property + def is_v1_compatible(self) -> bool: + return not self._model_info.supports_v0_only + + @property + def use_mla(self) -> bool: + return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE + + @property + def is_matryoshka(self) -> bool: + return (bool(getattr(self.hf_config, "matryoshka_dimensions", None)) + or getattr(self.hf_config, "is_matryoshka", False)) + + @property + def matryoshka_dimensions(self): + return getattr(self.hf_config, "matryoshka_dimensions", None) + + @property + def use_pad_token(self) -> bool: + # cross_encoder models defaults to using pad_token. + # `llm as reranker` models defaults to not using pad_token. + return getattr(self.hf_config, "use_pad_token", True) + + @property + def head_dtype(self) -> torch.dtype: + """ + "head" refers to the last Linear layer(s) of an LLM, + such as the lm_head in a generation model, + or the score or classifier in a classification model. + + `head_dtype` currently only supports pooling models.\n + - The pooling model defaults to using fp32 head, + you can use --hf-overrides '{"head_dtype": "model"}' to disable it. 
+ """ + + head_dtype = _get_head_dtype(config=self.hf_config, + dtype=self.dtype, + runner_type=self.runner_type) + + if self.runner_type != "pooling" and head_dtype != self.dtype: + logger.warning_once( + "`head_dtype` currently only supports pooling models." + "fallback to model dtype [%s].", self.dtype) + return self.dtype + + if head_dtype not in current_platform.supported_dtypes: + logger.warning_once( + "The current platform does not support [%s] head dtype, " + "fallback to model dtype [%s].", head_dtype, self.dtype) + return self.dtype + + logger.debug_once("head dtype: %s", head_dtype) + return head_dtype + + def get_and_verify_max_len(self, max_model_len: int): + # Consider max_model_len in tokenizer_config only when + # pooling models use absolute position_embedding. + tokenizer_config = None + if (self.runner_type == "pooling" and getattr( + self.hf_config, "position_embedding_type", "") == "absolute"): + tokenizer_config = try_get_tokenizer_config( + self.tokenizer, + trust_remote_code=self.trust_remote_code, + revision=self.tokenizer_revision) + max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + tokenizer_config=tokenizer_config, + max_model_len=max_model_len, + disable_sliding_window=self.disable_sliding_window, + sliding_window=self.get_sliding_window(), + spec_target_max_model_len=self.spec_target_max_model_len, + encoder_config=self.encoder_config) + logger.info("Using max model len %s", max_model_len) + return max_model_len + + +def get_served_model_name(model: str, + served_model_name: Optional[Union[str, list[str]]]): + """ + If the input is a non-empty list, the first model_name in + `served_model_name` is taken. + If the input is a non-empty string, it is used directly. + For cases where the input is either an empty string or an + empty list, the fallback is to use `self.model`. 
+ """ + if not served_model_name: + return model + if isinstance(served_model_name, list): + return served_model_name[0] + return served_model_name + + +# Some model suffixes are based on auto classes from Transformers: +# https://huggingface.co/docs/transformers/en/model_doc/auto +# NOTE: Items higher on this list priority over lower ones +_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [ + ("ForCausalLM", ("generate", "none")), + ("ForConditionalGeneration", ("generate", "none")), + ("ChatModel", ("generate", "none")), + ("LMHeadModel", ("generate", "none")), + ("ForTextEncoding", ("pooling", "embed")), + ("EmbeddingModel", ("pooling", "embed")), + ("ForSequenceClassification", ("pooling", "classify")), + ("ForAudioClassification", ("pooling", "classify")), + ("ForImageClassification", ("pooling", "classify")), + ("ForVideoClassification", ("pooling", "classify")), + ("ClassificationModel", ("pooling", "classify")), + ("ForRewardModeling", ("pooling", "reward")), + ("RewardModel", ("pooling", "reward")), + # Let other `*Model`s take priority + ("Model", ("pooling", "embed")), +] + + +def iter_architecture_defaults(): + yield from _SUFFIX_TO_DEFAULTS + + +def try_match_architecture_defaults( + architecture: str, + *, + runner_type: Optional[RunnerType] = None, + convert_type: Optional[ConvertType] = None, +) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]: + for suffix, (default_runner_type, + default_convert_type) in iter_architecture_defaults(): + if ((runner_type is None or runner_type == default_runner_type) and + (convert_type is None or convert_type == default_convert_type) + and architecture.endswith(suffix)): + return suffix, (default_runner_type, default_convert_type) + + return None + + +_STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.float16, + "float16": torch.float16, + "float": torch.float32, + "float32": torch.float32, + "bfloat16": torch.bfloat16, +} + +# model_type -> reason +_FLOAT16_NOT_SUPPORTED_MODELS = { + "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3_text": + "Numerical instability. Please use bfloat16 or float32 instead.", + "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.", + "glm4": "Numerical instability. Please use bfloat16 or float32 instead.", +} + + +def _is_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 + return False + + return True + + +def _check_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: + reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] + raise ValueError(f"The model type {model_type!r} " + f"does not support float16. Reason: {reason}") + + return True + + +def _find_dtype( + model_id: str, + config: PretrainedConfig, + *, + revision: Optional[str], +): + # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct + # because config.torch_dtype can be None. 
+ config_dtype = getattr(config, "torch_dtype", None) + + # Fallbacks for multi-modal models if the root config + # does not define torch_dtype + if config_dtype is None: + config_dtype = getattr(config.get_text_config(), "torch_dtype", None) + if config_dtype is None and hasattr(config, "vision_config"): + config_dtype = getattr(config.vision_config, "torch_dtype", None) + if config_dtype is None and hasattr(config, "encoder_config"): + config_dtype = getattr(config.encoder_config, "torch_dtype", None) + + # Try to read the dtype of the weights if they are in safetensors format + if config_dtype is None: + repo_mt = try_get_safetensors_metadata(model_id, revision=revision) + + if repo_mt and (files_mt := repo_mt.files_metadata): + param_dtypes: set[torch.dtype] = { + _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] + for file_mt in files_mt.values() + for dtype_str in file_mt.parameter_count + if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE + } + + if param_dtypes: + return common_broadcastable_dtype(param_dtypes) + + if config_dtype is None: + config_dtype = torch.float32 + + return config_dtype + + +def _resolve_auto_dtype( + model_type: str, + config_dtype: torch.dtype, + *, + is_pooling_model: bool, +): + from vllm.platforms import current_platform + + supported_dtypes = [ + dtype for dtype in current_platform.supported_dtypes + if _is_valid_dtype(model_type, dtype) + ] + + if is_pooling_model and torch.float16 in supported_dtypes: + preferred_dtype = torch.float16 + else: + preferred_dtype = supported_dtypes[0] + + # Downcast for float32 models + if config_dtype == torch.float32: + config_dtype = preferred_dtype + + if config_dtype in supported_dtypes: + return config_dtype + + # Ensure device compatibility + device_name = current_platform.get_device_name() + device_capability = current_platform.get_device_capability() + + if device_capability is None: + device_str = f"{device_name!r}" + else: + version_str = device_capability.as_version_str() + device_str = f"{device_name!r} (with compute capability {version_str})" + + logger.warning( + "Your device %s doesn't support %s. " + "Falling back to %s for compatibility.", + device_str, + config_dtype, + preferred_dtype, + ) + + return preferred_dtype + + +def _get_and_verify_dtype( + model_id: str, + config: PretrainedConfig, + dtype: Union[str, torch.dtype], + *, + is_pooling_model: bool, + revision: Optional[str] = None, +) -> torch.dtype: + config_dtype = _find_dtype(model_id, config, revision=revision) + model_type = config.model_type + + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype == "auto": + # Set default dtype from model config + torch_dtype = _resolve_auto_dtype( + model_type, + config_dtype, + is_pooling_model=is_pooling_model, + ) + else: + if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {dtype!r}") + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] + elif isinstance(dtype, torch.dtype): + torch_dtype = dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + _check_valid_dtype(model_type, torch_dtype) + + if torch_dtype != config_dtype: + if torch_dtype == torch.float32: + # Upcasting to float32 is allowed. + logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) + elif config_dtype == torch.float32: + # Downcasting from float32 to float16 or bfloat16 is allowed. + logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) + else: + # Casting between float16 and bfloat16 is allowed with a warning. 
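            # For example (hypothetical scenario, for illustration): loading a
            # bfloat16 checkpoint with dtype="float16" reaches this branch and
            # only warns, whereas dtype="float32" on the same checkpoint takes
            # the "Upcasting" branch above.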
+ logger.warning("Casting %s to %s.", config_dtype, torch_dtype) + + return torch_dtype + + +def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype, + runner_type: str) -> torch.dtype: + head_dtype: Optional[Union[str, + torch.dtype]] = getattr(config, "head_dtype", + None) + + if head_dtype == "model": + return dtype + elif isinstance(head_dtype, str): + head_dtype = head_dtype.lower() + if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {head_dtype!r}") + return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype] + elif isinstance(head_dtype, torch.dtype): + return head_dtype + elif head_dtype is None: + if torch.float32 not in current_platform.supported_dtypes: + return dtype + if runner_type == "pooling": + return torch.float32 + return dtype + else: + raise ValueError(f"Unknown dtype: {head_dtype}") + + +def _get_and_verify_max_len( + hf_config: PretrainedConfig, + tokenizer_config: Optional[dict], + max_model_len: Optional[int], + disable_sliding_window: bool, + sliding_window: Optional[int], + spec_target_max_model_len: Optional[int] = None, + encoder_config: Optional[Any] = None, +) -> int: + """Get and verify the model's maximum length.""" + derived_max_model_len = float("inf") + possible_keys = [ + # OPT + "max_position_embeddings", + # GPT-2 + "n_positions", + # MPT + "max_seq_len", + # ChatGLM2 + "seq_length", + # Command-R + "model_max_length", + # Whisper + "max_target_positions", + # Others + "max_sequence_length", + "max_seq_length", + "seq_len", + ] + # Choose the smallest "max_length" from the possible keys + max_len_key = None + for key in possible_keys: + max_len = getattr(hf_config, key, None) + if max_len is not None: + max_len_key = key if max_len < derived_max_model_len \ + else max_len_key + derived_max_model_len = min(derived_max_model_len, max_len) + # For Command-R / Cohere, Cohere2 / Aya Vision models + if tmp_max_len := getattr(hf_config, "model_max_length", None): + max_len_key = "model_max_length" + derived_max_model_len = tmp_max_len + + # If sliding window is manually disabled, max_length should be less + # than the sliding window length in the model config. + if (disable_sliding_window and sliding_window is not None + and sliding_window < derived_max_model_len): + max_len_key = "sliding_window" + derived_max_model_len = sliding_window + + # Consider model_max_length in tokenizer_config + if tokenizer_config: + tokenizer_model_max_length = tokenizer_config.get( + "model_max_length", derived_max_model_len) + derived_max_model_len = min(derived_max_model_len, + tokenizer_model_max_length) + + # If none of the keys were found in the config, use a default and + # log a warning. + if derived_max_model_len == float("inf"): + if max_model_len is not None: + # If max_model_len is specified, we use it. + return max_model_len + + if spec_target_max_model_len is not None: + # If this is a speculative draft model, we use the max model len + # from the target model. + return spec_target_max_model_len + + default_max_len = 2048 + logger.warning( + "The model's config.json does not contain any of the following " + "keys to determine the original maximum length of the model: " + "%s. Assuming the model's maximum length is %d.", possible_keys, + default_max_len) + derived_max_model_len = default_max_len + + rope_scaling = getattr(hf_config, "rope_scaling", None) + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE + # scaling, so we skip applying the scaling factor again. 
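    # Worked example (values assumed purely for illustration): with
    #     rope_scaling = {"rope_type": "yarn", "factor": 4.0,
    #                     "original_max_position_embeddings": 32768}
    # the branch below resets the base length to 32768 and then multiplies by
    # the factor, deriving 32768 * 4.0 = 131072.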
+    if rope_scaling is not None and "gemma3" not in hf_config.model_type:
+        # No need to consider "type" key because of patch_rope_scaling when
+        # loading HF config
+        rope_type = rope_scaling["rope_type"]
+
+        if rope_type not in ("su", "longrope", "llama3"):
+            if disable_sliding_window:
+                # TODO(robertgshaw): Find a model that supports rope_scaling
+                # with sliding window to see if this case should be allowed.
+                raise NotImplementedError(
+                    "Disabling sliding window is not supported for models "
+                    "with rope_scaling. Please raise an issue so we can "
+                    "investigate.")
+
+            # NOTE: rope_type == "default" does not define factor
+            # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
+            scaling_factor = rope_scaling.get("factor", 1.0)
+
+            if rope_type == "yarn":
+                derived_max_model_len = rope_scaling[
+                    "original_max_position_embeddings"]
+            derived_max_model_len *= scaling_factor
+
+    if encoder_config and "max_seq_length" in encoder_config:
+        derived_max_model_len = encoder_config["max_seq_length"]
+
+    # If the user specified a max length, make sure it is smaller than the
+    # derived length from the HF model config.
+    if max_model_len is None:
+        max_model_len = int(derived_max_model_len)
+        if current_platform.is_tpu():
+            logger.warning(
+                "--max-model-len is not specified, "
+                "so it is currently using the model's default length %s, "
+                "which might be too large. "
+                "Please set --max-model-len based on your "
+                "request input length and output length, to avoid "
+                "unnecessary degradation.", max_model_len)
+    elif max_model_len > derived_max_model_len:
+        # Some models might have a separate key for specifying model_max_length
+        # that will be bigger than derived_max_model_len. We compare user input
+        # with model_max_length and allow this override when it's smaller.
+        model_max_length = getattr(hf_config, "model_max_length", None)
+        if model_max_length is not None and max_model_len <= model_max_length:
+            if disable_sliding_window:
+                # TODO(robertgshaw): Find a model that has model_max_length
+                # with sliding window to see if this case should be allowed.
+                raise NotImplementedError(
+                    "Disabling sliding window is not supported for models "
+                    "with model_max_length in the config. Please raise an "
+                    "issue so we can investigate.")
+        else:
+            msg = (
+                f"User-specified max_model_len ({max_model_len}) is greater "
+                f"than the derived max_model_len ({max_len_key}="
+                f"{derived_max_model_len} or model_max_length="
+                f"{model_max_length} in model's config.json).")
+            warning = (
+                "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
+                "caution. If the model uses relative position encoding (RoPE), "
+                "positions exceeding derived_max_model_len lead to nan. If the "
+                "model uses absolute position encoding, positions exceeding "
+                "derived_max_model_len will cause a CUDA array out-of-bounds "
+                "error.")
+            if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
+                logger.warning_once("%s %s", msg, warning)
+            else:
+                raise ValueError(
+                    f"{msg} To allow overriding this maximum, set "
+                    f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. 
{warning}") + return int(max_model_len) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 93002012799ab..f0f67bab9d6ff 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -3,7 +3,7 @@ import hashlib from dataclasses import field -from typing import TYPE_CHECKING, Any, Literal, Optional, Union +from typing import Any, Literal, Optional, Union from pydantic import SkipValidation, model_validator from pydantic.dataclasses import dataclass @@ -15,13 +15,9 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS) -if TYPE_CHECKING: - from vllm.config import RunnerType -else: - RunnerType = Any - logger = init_logger(__name__) +RunnerType = Literal["generate", "pooling", "draft"] PreemptionMode = Literal["swap", "recompute"] SchedulerPolicy = Literal["fcfs", "priority"] diff --git a/vllm/config/utils.py b/vllm/config/utils.py index db8c05ef8be4a..91e61b3302738 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -1,8 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import inspect +import textwrap from dataclasses import MISSING, Field, field, fields, is_dataclass -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar + +import regex as re if TYPE_CHECKING: from _typeshed import DataclassInstance @@ -45,3 +50,96 @@ def get_field(cls: ConfigType, name: str) -> Field: return field(default=default) raise ValueError( f"{cls.__name__}.{name} must have a default value or default factory.") + + +def contains_object_print(text: str) -> bool: + """ + Check if the text looks like a printed Python object, e.g. + contains any substring matching the pattern: "at 0xFFFFFFF>" + We match against 0x followed by 2-16 hex chars (there's + a max of 16 on a 64-bit system). + + Args: + text (str): The text to check + + Returns: + result (bool): `True` if a match is found, `False` otherwise. + """ + pattern = r'at 0x[a-fA-F0-9]{2,16}>' + match = re.search(pattern, text) + return match is not None + + +def assert_hashable(text: str) -> bool: + if not contains_object_print(text): + return True + raise AssertionError( + f"vLLM tried to hash some configs that may have Python objects ids " + f"in them. This is a bug, please file an issue. " + f"Text being hashed: {text}") + + +def get_attr_docs(cls: type[Any]) -> dict[str, str]: + """ + Get any docstrings placed after attribute assignments in a class body. + + https://davidism.com/mit-license/ + """ + + def pairwise(iterable): + """ + Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise + + Can be removed when Python 3.9 support is dropped. + """ + iterator = iter(iterable) + a = next(iterator, None) + + for b in iterator: + yield a, b + a = b + + try: + cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] + except (OSError, KeyError, TypeError): + # HACK: Python 3.13+ workaround - set missing __firstlineno__ + # Workaround can be removed after we upgrade to pydantic==2.12.0 + with open(inspect.getfile(cls)) as f: + for i, line in enumerate(f): + if f"class {cls.__name__}" in line and ":" in line: + cls.__firstlineno__ = i + 1 + break + cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] + + if not isinstance(cls_node, ast.ClassDef): + raise TypeError("Given object was not a class.") + + out = {} + + # Consider each pair of nodes. 
+ for a, b in pairwise(cls_node.body): + # Must be an assignment then a constant string. + if (not isinstance(a, (ast.Assign, ast.AnnAssign)) + or not isinstance(b, ast.Expr) + or not isinstance(b.value, ast.Constant) + or not isinstance(b.value.value, str)): + continue + + doc = inspect.cleandoc(b.value.value) + + # An assignment can have multiple targets (a = b = v), but an + # annotated assignment only has one target. + targets = a.targets if isinstance(a, ast.Assign) else [a.target] + + for target in targets: + # Must be assigning to a plain name. + if not isinstance(target, ast.Name): + continue + + out[target.id] = doc + + return out + + +def is_init_field(cls: ConfigType, name: str) -> bool: + return next(f for f in fields(cls) if f.name == name).init diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 27462b8fa0dad..ecf4e486a016f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -27,11 +27,11 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, EPLBConfig, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, - ModelDType, ModelImpl, ObservabilityConfig, - ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, - RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, StructuredOutputsConfig, - TaskOption, TokenizerMode, VllmConfig, get_attr_docs) + ModelDType, ObservabilityConfig, ParallelConfig, + PoolerConfig, PrefixCachingHashAlgo, RunnerOption, + SchedulerConfig, SchedulerPolicy, SpeculativeConfig, + StructuredOutputsConfig, TaskOption, TokenizerMode, + VllmConfig, get_attr_docs) from vllm.config.multimodal import MMCacheType, MultiModalConfig from vllm.config.parallel import ExpertPlacementStrategy from vllm.config.utils import get_field @@ -548,7 +548,6 @@ class EngineArgs: model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", - choices=[f.value for f in LogprobsMode], **model_kwargs["logprobs_mode"]) model_group.add_argument("--disable-sliding-window", **model_kwargs["disable_sliding_window"]) @@ -593,9 +592,7 @@ class EngineArgs: **model_kwargs["override_generation_config"]) model_group.add_argument("--enable-sleep-mode", **model_kwargs["enable_sleep_mode"]) - model_group.add_argument("--model-impl", - choices=[f.value for f in ModelImpl], - **model_kwargs["model_impl"]) + model_group.add_argument("--model-impl", **model_kwargs["model_impl"]) model_group.add_argument("--override-attention-dtype", **model_kwargs["override_attention_dtype"]) model_group.add_argument("--logits-processors", diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 0c2441a6db44d..bd1773c753a93 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -13,8 +13,7 @@ from torch import nn from typing_extensions import assert_never from vllm.attention import Attention -from vllm.config import (ModelConfig, ModelImpl, VllmConfig, - set_current_vllm_config) +from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.linear import QKVCrossParallelLinear from vllm.model_executor.layers.quantization.base_config import ( @@ -176,8 +175,8 @@ def get_model_architecture( ) if arch == model_config._get_transformers_backend_cls(): - assert model_config.model_impl != ModelImpl.VLLM - if model_config.model_impl == 
ModelImpl.AUTO: + assert model_config.model_impl != "vllm" + if model_config.model_impl == "auto": logger.warning_once( "%s has no vLLM implementation, falling back to Transformers " "implementation. Some features may not be supported and " diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 1382fd9e93ea3..76f2bd087624c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -19,7 +19,7 @@ from typing import Callable, Optional, TypeVar, Union import torch.nn as nn import transformers -from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults, +from vllm.config import (ModelConfig, iter_architecture_defaults, try_match_architecture_defaults) from vllm.logger import init_logger from vllm.transformers_utils.dynamic_module import ( @@ -587,7 +587,7 @@ class _ModelRegistry: if model_module is not None: break else: - if model_config.model_impl != ModelImpl.TRANSFORMERS: + if model_config.model_impl != "transformers": return None raise ValueError( @@ -598,7 +598,7 @@ class _ModelRegistry: "'auto_map' (relevant if the model is custom).") if not model_module.is_backend_compatible(): - if model_config.model_impl != ModelImpl.TRANSFORMERS: + if model_config.model_impl != "transformers": return None raise ValueError( @@ -644,20 +644,20 @@ class _ModelRegistry: raise ValueError("No model architectures are specified") # Require transformers impl - if model_config.model_impl == ModelImpl.TRANSFORMERS: + if model_config.model_impl == "transformers": arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: return (model_info, arch) - elif model_config.model_impl == ModelImpl.TERRATORCH: + elif model_config.model_impl == "terratorch": model_info = self._try_inspect_model_cls("Terratorch") return (model_info, "Terratorch") # Fallback to transformers impl (after resolving convert_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO + and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") == "none"): arch = self._try_resolve_transformers(architectures[0], model_config) @@ -674,7 +674,7 @@ class _ModelRegistry: # Fallback to transformers impl (before resolving runner_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO): + and model_config.model_impl == "auto"): arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: @@ -695,14 +695,14 @@ class _ModelRegistry: raise ValueError("No model architectures are specified") # Require transformers impl - if model_config.model_impl == ModelImpl.TRANSFORMERS: + if model_config.model_impl == "transformers": arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: return (model_cls, arch) - elif model_config.model_impl == ModelImpl.TERRATORCH: + elif model_config.model_impl == "terratorch": arch = "Terratorch" model_cls = self._try_load_model_cls(arch) if model_cls is not None: @@ -710,7 +710,7 @@ class _ModelRegistry: # Fallback to transformers impl (after resolving convert_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO + and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") 
== "none"): arch = self._try_resolve_transformers(architectures[0], model_config) @@ -727,7 +727,7 @@ class _ModelRegistry: # Fallback to transformers impl (before resolving runner_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO): + and model_config.model_impl == "auto"): arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index cc5653b10ec1d..747e52f2e5892 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -29,15 +29,12 @@ class TopKTopPSampler(nn.Module): Implementations may update the logits tensor in-place. """ - def __init__( - self, - logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None: + def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None: super().__init__() self.logprobs_mode = logprobs_mode # flashinfer optimization does not apply if intermediate # logprobs/logits after top_k/top_p need to be returned - if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS, - LogprobsMode.PROCESSED_LOGPROBS + if logprobs_mode not in ("processed_logits", "processed_logprobs" ) and current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ @@ -90,9 +87,9 @@ class TopKTopPSampler(nn.Module): """ logits = self.apply_top_k_top_p(logits, k, p) logits_to_return = None - if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + if self.logprobs_mode == "processed_logits": logits_to_return = logits - elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + elif self.logprobs_mode == "processed_logprobs": logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32) return random_sample(probs, generators), logits_to_return @@ -115,7 +112,7 @@ class TopKTopPSampler(nn.Module): "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) assert self.logprobs_mode not in ( - LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS + "processed_logits", "processed_logprobs" ), "FlashInfer does not support returning logits/logprobs" # flashinfer sampling functions expect contiguous logits. # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 546531a91610f..fa2a6e590f22d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -60,8 +60,7 @@ class Sampler(nn.Module): 9. Return the final `SamplerOutput`. """ - def __init__(self, - logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS): + def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"): super().__init__() self.topk_topp_sampler = TopKTopPSampler(logprobs_mode) self.pin_memory = is_pin_memory_available() @@ -78,9 +77,9 @@ class Sampler(nn.Module): # is used for sampling (after penalties and temperature scaling). num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: - if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS: + if self.logprobs_mode == "raw_logprobs": raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == LogprobsMode.RAW_LOGITS: + elif self.logprobs_mode == "raw_logits": raw_logprobs = logits.clone() # Use float32 for the logits. 
@@ -156,9 +155,9 @@ class Sampler(nn.Module): if sampling_metadata.all_greedy: processed_logprobs = None if sampling_metadata.max_num_logprobs is not None: - if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + if self.logprobs_mode == "processed_logits": processed_logprobs = logits - elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + elif self.logprobs_mode == "processed_logprobs": processed_logprobs = self.compute_logprobs(logits) return greedy_sampled, processed_logprobs From ce75e153735363eca01ce67b9f69e7b9ea440c63 Mon Sep 17 00:00:00 2001 From: samzong Date: Sat, 20 Sep 2025 00:36:52 +0800 Subject: [PATCH 016/185] refactor(benchmarks): add type annotations to wait_for_endpoint parameters (#25218) Signed-off-by: samzong --- vllm/benchmarks/lib/endpoint_request_func.py | 15 +++++++++++++-- vllm/benchmarks/lib/ready_checker.py | 5 +++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 066b8fe834380..725b7df8b1871 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -8,8 +8,9 @@ import os import sys import time import traceback +from collections.abc import Awaitable from dataclasses import dataclass, field -from typing import Optional, Union +from typing import Optional, Protocol, Union import aiohttp from tqdm.asyncio import tqdm @@ -92,6 +93,16 @@ class RequestFuncOutput: start_time: float = 0.0 +class RequestFunc(Protocol): + def __call__( + self, + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, + ) -> Awaitable[RequestFuncOutput]: + ... + + async def async_request_openai_completions( request_func_input: RequestFuncInput, session: aiohttp.ClientSession, @@ -507,7 +518,7 @@ async def async_request_openai_embeddings( # TODO: Add more request functions for different API protocols. 
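# Sketch of why the Protocol is useful (the function below is hypothetical,
# not in this patch): any async callable with a matching signature satisfies
# RequestFunc structurally, with no inheritance needed.
#
#     async def my_backend(request_func_input, session, pbar=None):
#         return RequestFuncOutput()
#
#     func: RequestFunc = my_backend  # accepted by static type checkers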
-ASYNC_REQUEST_FUNCS = { +ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "vllm": async_request_openai_completions, "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, diff --git a/vllm/benchmarks/lib/ready_checker.py b/vllm/benchmarks/lib/ready_checker.py index 7e836158386a9..87fc16b55012e 100644 --- a/vllm/benchmarks/lib/ready_checker.py +++ b/vllm/benchmarks/lib/ready_checker.py @@ -8,11 +8,12 @@ import time import aiohttp from tqdm.asyncio import tqdm -from .endpoint_request_func import RequestFuncInput, RequestFuncOutput +from .endpoint_request_func import (RequestFunc, RequestFuncInput, + RequestFuncOutput) async def wait_for_endpoint( - request_func, + request_func: RequestFunc, test_input: RequestFuncInput, session: aiohttp.ClientSession, timeout_seconds: int = 600, From 7ac67ea5255c764e87bdfc5c712bfaa35f491764 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Fri, 19 Sep 2025 19:53:45 +0300 Subject: [PATCH 017/185] [KV offload][3/N] Add worker-side CPU support (#21448) Signed-off-by: Or Ozeri --- tests/v1/kv_offload/test_cpu_gpu.py | 177 +++++++++++++++++++++++++++ vllm/v1/kv_offload/worker/cpu_gpu.py | 171 ++++++++++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 tests/v1/kv_offload/test_cpu_gpu.py create mode 100644 vllm/v1/kv_offload/worker/cpu_gpu.py diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py new file mode 100644 index 0000000000000..0edb9513e3ff9 --- /dev/null +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import time + +import pytest +import torch + +from vllm.platforms import current_platform +from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.attention.backends.flashinfer import FlashInferBackend +from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec +from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler + +NUM_GPU_BLOCKS = [64] +NUM_CPU_BLOCKS = [256] +GPU_BLOCK_SIZES = [16] +GPU_BLOCKS_PER_CPU_BLOCK = [1, 3] +HEAD_SIZES = [64] +NUM_HEADS = [8] +NUM_LAYERS = [4] +DTYPES = [torch.bfloat16] +SEEDS = [0] +CUDA_DEVICES = ['cuda:0'] +NUM_MAPPINGS = [3] + + +@pytest.mark.parametrize("gpu_to_cpu", [True, False]) +@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES) +@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK) +@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS) +@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS) +@pytest.mark.parametrize("num_layers", NUM_LAYERS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_transfer( + gpu_to_cpu: bool, + num_mappings: int, + head_size: int, + num_heads: int, + gpu_block_size: int, + gpu_blocks_per_cpu_block: int, + num_gpu_blocks: int, + num_cpu_blocks: int, + num_layers: int, + dtype: torch.dtype, + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + + # create per-layer GPU KV caches + attn_backends_list = [ + FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend + ] + + gpu_caches = {} 
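    # (sketch) with num_layers=4 and the three backends listed above, the loop
    # below assigns them round-robin: layer 0 -> FlashAttention,
    # 1 -> FlashInfer, 2 -> FlashAttnMLA, 3 -> FlashAttention again.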
+ attn_backends = {} + for i in range(num_layers): + layer_name = f'layer {i}' + + attn_backend = attn_backends_list[i % len(attn_backends_list)] + attn_backends[layer_name] = attn_backend + + gpu_cache_shape = attn_backend.get_kv_cache_shape( + num_gpu_blocks, gpu_block_size, num_heads, head_size) + gpu_caches[layer_name] = torch.rand(gpu_cache_shape, + dtype=dtype, + device=device) + + # create handler + cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size + handler = CpuGpuOffloadingHandler(attn_backends=attn_backends, + gpu_block_size=gpu_block_size, + cpu_block_size=cpu_block_size, + num_cpu_blocks=num_cpu_blocks, + gpu_caches=gpu_caches) + + # select block mappings + gpu_blocks = random.sample(range(num_gpu_blocks), + num_mappings * gpu_blocks_per_cpu_block) + cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings) + + # convert cpu blocks to gpu block size + cpu_blocks_in_gpu_block_size = [] + for cpu_block in cpu_blocks: + base_block_id = cpu_block * gpu_blocks_per_cpu_block + for i in range(gpu_blocks_per_cpu_block): + cpu_blocks_in_gpu_block_size.append(i + base_block_id) + + # maybe skip a GPU block to test writing to the middle of a CPU block + if gpu_to_cpu: + gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1:] + cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[ + gpu_blocks_per_cpu_block - 1:] + + # set transfer direction + if gpu_to_cpu: + src_kv_caches = handler.gpu_tensors + dst_kv_caches = handler.cpu_tensors + src_spec_class = GPULoadStoreSpec + dst_spec_class = CPULoadStoreSpec + src_blocks = gpu_blocks + dst_blocks = cpu_blocks + src_blocks_in_gpu_block_size = gpu_blocks + dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size + dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block + else: + src_kv_caches = handler.cpu_tensors + dst_kv_caches = handler.gpu_tensors + src_spec_class = CPULoadStoreSpec + dst_spec_class = GPULoadStoreSpec + src_blocks = cpu_blocks + dst_blocks = gpu_blocks + src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size + dst_blocks_in_gpu_block_size = gpu_blocks + dst_size_in_gpu_blocks = num_gpu_blocks + + # build dst -> src mapping + dst_to_src = {} + for src_block, dst_block in zip(src_blocks_in_gpu_block_size, + dst_blocks_in_gpu_block_size): + dst_to_src[dst_block] = src_block + + # build transfer specs + src_spec = src_spec_class(src_blocks) + dst_spec = dst_spec_class(dst_blocks) + + # clone src and dst tensors before transfer + orig_src_caches = [x.clone() for x in src_kv_caches] + orig_dst_caches = [x.clone() for x in dst_kv_caches] + + # call transfer function + assert handler.transfer_async(1, (src_spec, dst_spec)) + assert set(handler.transfer_events.keys()) == {1} + + # wait for transfer to complete + end_time = time.time() + 10 + while time.time() < end_time: + finished = handler.get_finished() + if finished: + assert finished == [(1, True)] + break + time.sleep(0.1) + + # verify src tensors did not change + for orig_tensor, tensor in zip(orig_src_caches, src_kv_caches): + assert torch.equal(orig_tensor, tensor) + + # verify dst tensors + for dst_block in range(dst_size_in_gpu_blocks): + src_block_candidate = dst_to_src.get(dst_block) + for src_cache, dst_cache, orig_dst_cache, kv_dim in zip( + src_kv_caches, dst_kv_caches, orig_dst_caches, + handler.kv_dim_before_num_blocks): + if kv_dim: + # iterate over key, value + for i in range(2): + if src_block_candidate is not None: + expected_value = src_cache[i][src_block_candidate] + else: + expected_value = orig_dst_cache[i][dst_block] 
+ torch.testing.assert_close(dst_cache[i][dst_block].cpu(), + expected_value.cpu()) + else: + if src_block_candidate is not None: + expected_value = src_cache[src_block_candidate] + else: + expected_value = orig_dst_cache[dst_block] + torch.testing.assert_close(dst_cache[dst_block].cpu(), + expected_value.cpu()) diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py new file mode 100644 index 0000000000000..556c29247e5e7 --- /dev/null +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import numpy as np +import torch + +from vllm import _custom_ops as ops +from vllm.attention import AttentionBackend +from vllm.logger import init_logger +from vllm.utils import is_pin_memory_available +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec +from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, + TransferResult, TransferSpec) + +logger = init_logger(__name__) + + +def expand_block_ids(block_ids: np.ndarray, + block_size_factor: int, + output: np.ndarray, + skip_count: int = 0): + """ + Convert a list of block IDs to a list of matching block ids, + assuming each block is composed of actual block_size_factor blocks. + Outputs to output tensor. + The first skip_count blocks will be skipped. + Note that skip_count must be less than block_size_factor. + + For example, if block_ids = [0, 1, 3] and block_size_factor = 4, + then it yields [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15] + since 0 maps to [0, 1, 2, 3] + 1 maps to [4, 5, 6, 7] + and 3 maps to [12, 13, 14, 15] + """ + assert skip_count < block_size_factor + + first_range = np.arange(skip_count, block_size_factor) + full_range = np.arange(0, block_size_factor) + + output_idx = 0 + for i, block_id in enumerate(block_ids): + base_block_id = block_id * block_size_factor + indices = first_range if i == 0 else full_range + output_end_idx = output_idx + len(indices) + output[output_idx:output_end_idx] = base_block_id + indices + output_idx = output_end_idx + + +class CpuGpuOffloadingHandler(OffloadingHandler): + + def __init__(self, gpu_block_size: int, cpu_block_size: int, + num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]]): + assert cpu_block_size % gpu_block_size == 0 + self.block_size_factor = cpu_block_size // gpu_block_size + + # cuda streams for gpu->cpu and cpu->gpu + self.d2h_stream = torch.cuda.Stream() + self.h2d_stream = torch.cuda.Stream() + + # job_id -> transfer cuda event + self.transfer_events: dict[int, torch.cuda.Event] = {} + # list of cuda events available for re-use + self.events_pool: list[torch.cuda.Event] = [] + + pin_memory = is_pin_memory_available() + + # allocate cpu tensors + logger.info("Allocating %d CPU tensors...", len(gpu_caches)) + self.gpu_tensors: list[torch.Tensor] = [] + self.cpu_tensors: list[torch.Tensor] = [] + self.kv_dim_before_num_blocks: list[bool] = [] + for layer_name, gpu_tensor in gpu_caches.items(): + self.gpu_tensors.append(gpu_tensor) + + gpu_shape = gpu_tensor.shape + test_shape = attn_backends[layer_name].get_kv_cache_shape( + num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256) + if test_shape[0] == 1234: + # shape is (num_blocks, ...) + num_blocks_idx = 0 + self.kv_dim_before_num_blocks.append(False) + else: + # shape should be (2, num_blocks, ...) 
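                # (illustration) the sentinel num_blocks=1234 probe above tells
                # the two layouts apart: a (2, num_blocks, block_size,
                # num_kv_heads, head_size) cache reports 1234 at index 1, while
                # a (num_blocks, ...) cache reports it at index 0.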
+ assert test_shape[0] == 2 + assert test_shape[1] == 1234 + assert gpu_shape[0] == 2 + + num_blocks_idx = 1 + self.kv_dim_before_num_blocks.append(True) + + cpu_shape = list(gpu_shape) + cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor + + logger.debug("Allocating CPU tensor of shape %r", cpu_shape) + self.cpu_tensors.append( + torch.zeros(cpu_shape, + dtype=gpu_tensor.dtype, + device="cpu", + pin_memory=pin_memory)) + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + src_spec, dst_spec = spec + if isinstance(src_spec, CPULoadStoreSpec): + assert isinstance(dst_spec, GPULoadStoreSpec) + stream = self.h2d_stream + src_tensors = self.cpu_tensors + dst_tensors = self.gpu_tensors + src_block_size_factor = self.block_size_factor + dst_block_size_factor = 1 + else: + assert isinstance(src_spec, GPULoadStoreSpec) + assert isinstance(dst_spec, CPULoadStoreSpec) + stream = self.d2h_stream + src_tensors = self.gpu_tensors + dst_tensors = self.cpu_tensors + src_block_size_factor = 1 + dst_block_size_factor = self.block_size_factor + + src_blocks = src_spec.block_ids + dst_blocks = dst_spec.block_ids + assert src_blocks.ndim == 1 + assert dst_blocks.ndim == 1 + + dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor) + src_sub_block_count = src_blocks.size * src_block_size_factor + + assert ( + src_sub_block_count == dst_blocks.size * dst_block_size_factor - + dst_sub_blocks_to_skip) + + src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64) + expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0]) + expand_block_ids(dst_blocks, + dst_block_size_factor, + src_to_dst[:, 1], + skip_count=dst_sub_blocks_to_skip) + src_to_dst_tensor = torch.from_numpy(src_to_dst) + + event = self.events_pool.pop() if self.events_pool \ + else torch.cuda.Event() + with torch.cuda.stream(stream): + for src_tensor, dst_tensor, kv_dim in zip( + src_tensors, dst_tensors, self.kv_dim_before_num_blocks): + if kv_dim: + src_key_cache = src_tensor[0] + dst_key_cache = dst_tensor[0] + ops.swap_blocks(src_key_cache, dst_key_cache, + src_to_dst_tensor) + src_value_cache = src_tensor[1] + dst_value_cache = dst_tensor[1] + ops.swap_blocks(src_value_cache, dst_value_cache, + src_to_dst_tensor) + else: + ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor) + event.record(stream) + + self.transfer_events[job_id] = event + + # success + return True + + def get_finished(self) -> list[TransferResult]: + results: list[TransferResult] = [] + for job_id, event in self.transfer_events.items(): + if event.query(): + results.append((job_id, True)) + self.events_pool.append(event) + for job_id, _ in results: + del self.transfer_events[job_id] + return results From 6c117cff7d0110c74f97f68cda009595a2fdae5e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 20 Sep 2025 01:15:19 +0800 Subject: [PATCH 018/185] [Frontend] Pass API server count to each process (#23717) Signed-off-by: DarkLight1337 --- .../kernels/benchmark_w8a8_block_fp8.py | 2 +- examples/others/tensorize_vllm_model.py | 9 +-- .../test_api_server_process_manager.py | 2 +- tests/v1/test_external_lb_dp.py | 52 +++++++++++++++-- tests/v1/test_hybrid_lb_dp.py | 54 ++++++++++++++++-- tests/v1/test_internal_lb_dp.py | 57 ++++++++++++++++--- vllm/config/parallel.py | 25 ++++++++ vllm/engine/arg_utils.py | 9 ++- vllm/entrypoints/cli/serve.py | 18 ++---- vllm/entrypoints/openai/api_server.py | 38 ++++++++++--- vllm/multimodal/cache.py | 3 +- vllm/v1/engine/core_client.py | 3 +- 12 files changed, 221 
insertions(+), 51 deletions(-) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index df2b713e46dc4..c6c8e0b0b936b 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -11,13 +11,13 @@ from datetime import datetime from typing import Any import torch -import triton from tqdm import tqdm from vllm.model_executor.layers.quantization.utils.fp8_utils import ( _w8a8_block_fp8_matmul, ) from vllm.platforms import current_platform +from vllm.triton_utils import triton from vllm.utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 559c7c493aca2..2b7f0beab2277 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import dataclasses import json import logging import os @@ -327,12 +325,7 @@ def main(): if args.command == "serialize": - eng_args_dict = {f.name: getattr(args, f.name) for f in - dataclasses.fields(EngineArgs)} - - engine_args = EngineArgs.from_cli_args( - argparse.Namespace(**eng_args_dict) - ) + engine_args = EngineArgs.from_cli_args(args) input_dir = tensorizer_dir.rstrip('/') suffix = args.suffix if args.suffix else uuid.uuid4().hex diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index a993e24ff838a..34b05ad17b025 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update): global WORKER_RUNTIME_SECONDS WORKER_RUNTIME_SECONDS = 0.5 - # Copy the args to avoid mutating the + # Copy the args to avoid mutating them args = api_server_args.copy() if not with_stats_update: diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 4a5c47fead58f..862a76f3c4e2d 100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -9,6 +9,7 @@ from contextlib import AsyncExitStack import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform @@ -70,6 +71,8 @@ class ExternalLBServerManager: sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -127,11 +130,19 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with ExternalLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, - default_server_args) as server_list: - yield server_list + server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args) + + with server_manager: + yield server_manager + + +@pytest.fixture +def servers(server_manager): + return server_manager.servers @pytest_asyncio.fixture @@ -144,6 +155,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + 
response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + +def test_external_lb_server_info(server_manager): + servers = server_manager.servers + api_server_count = server_manager.api_server_count + + for i, (server, _) in enumerate(servers): + print(f"Testing {i=}") + + # Each request will hit one of the API servers + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(server) for _ in range(n_reqs) + ] + api_process_counts = [ + c["_api_process_count"] for c in parallel_configs + ] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] + + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index 293b1257be6bb..552436f818d77 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ b/tests/v1/test_hybrid_lb_dp.py @@ -9,6 +9,7 @@ from contextlib import AsyncExitStack import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from tests.v1.test_utils import check_request_balancing @@ -92,6 +93,8 @@ class HybridLBServerManager: sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -150,12 +153,20 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with HybridLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, - default_server_args, DP_SIZE_LOCAL, - TP_SIZE) as server_list: - yield server_list + server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args, DP_SIZE_LOCAL, + TP_SIZE) + + with server_manager: + yield server_manager + + +@pytest.fixture +def servers(server_manager): + return server_manager.servers @pytest_asyncio.fixture @@ -168,6 +179,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + +def test_hybrid_dp_server_info(server_manager): + servers = server_manager.servers + api_server_count = server_manager.api_server_count + + for i, (server, _) in enumerate(servers): + print(f"Testing {i=}") + + # Each request will hit one of the API servers + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(server) for _ in range(n_reqs) + ] + api_process_counts = [ + c["_api_process_count"] for c in parallel_configs + ] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] + + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git 
a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index 2b031865cad76..e965645711ee6 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -10,6 +10,7 @@ from typing import Optional, cast import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from tests.v1.test_utils import check_request_balancing @@ -101,6 +102,8 @@ class MultinodeInternalLBServerManager: sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -214,7 +217,10 @@ class APIOnlyServerManager: self.model_name, api_server_args, auto_port=False, - env_dict={}) # No GPUs needed for API-only server + env_dict={ + "VLLM_SERVER_DEV_MODE": "1", + # No GPUs needed for API-only server + }) server.__enter__() print(f"API-only server started successfully with " f"{self.api_server_count} API servers") @@ -293,14 +299,21 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, - api_server_count, - default_server_args, - DP_SIZE // NUM_NODES, - TP_SIZE) as server_list: - yield server_list + server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args, + DP_SIZE // NUM_NODES, + TP_SIZE) + + with server_manager: + yield server_manager + + +@pytest.fixture +def servers(server_manager): + return server_manager.servers @pytest.fixture(scope="module", params=[1, 4]) @@ -331,6 +344,34 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer, yield client +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + +def test_multinode_dp_server_info(server_manager): + head_server = server_manager.servers[0][0] + api_server_count = server_manager.api_server_count + + # Each request will hit one of the API servers + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(head_server) for _ in range(n_reqs) + ] + api_process_counts = [c["_api_process_count"] for c in parallel_configs] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] + + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 8e92e54a96780..37a41bf6de71a 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -193,6 +193,25 @@ class ParallelConfig: not change by dcp, it simply reuse the GPUs of TP group, and tp_size needs to be divisible by dcp_size.""" + _api_process_count: int = 1 + """ + The number of API processes initialized. + + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. + """ + + _api_process_rank: int = 0 + """ + The rank of this API process, or `-1` for engine core processes + under API server scale-out. 
+ + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. + """ + @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world @@ -428,6 +447,12 @@ class ParallelConfig: if self.distributed_executor_backend is None and self.world_size == 1: self.distributed_executor_backend = "uni" + if not -1 <= self._api_process_rank < self._api_process_count: + raise ValueError( + "Invalid value of `_api_process_rank`. " + f"Expected to be `-1` or `[0, {self._api_process_count})`, " + f"but found: {self._api_process_rank}") + @property def use_ray(self) -> bool: return self.distributed_executor_backend == "ray" or ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ecf4e486a016f..7a4bb0d41d236 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -333,6 +333,8 @@ class EngineArgs: enable_eplb: bool = ParallelConfig.enable_eplb expert_placement_strategy: ExpertPlacementStrategy = \ ParallelConfig.expert_placement_strategy + _api_process_count: int = ParallelConfig._api_process_count + _api_process_rank: int = ParallelConfig._api_process_rank num_redundant_experts: int = EPLBConfig.num_redundant_experts eplb_window_size: int = EPLBConfig.window_size eplb_step_interval: int = EPLBConfig.step_interval @@ -952,7 +954,10 @@ class EngineArgs: # Get the list of attributes of this dataclass. attrs = [attr.name for attr in dataclasses.fields(cls)] # Set the attributes from the parsed arguments. - engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) + engine_args = cls(**{ + attr: getattr(args, attr) + for attr in attrs if hasattr(args, attr) + }) return engine_args def create_model_config(self) -> ModelConfig: @@ -1366,6 +1371,8 @@ class EngineArgs: worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, decode_context_parallel_size=self.decode_context_parallel_size, + _api_process_count=self._api_process_count, + _api_process_rank=self._api_process_rank, ) speculative_config = self.create_speculative_config( diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 803a3e004656a..de47bf00932e0 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -135,23 +135,20 @@ def run_headless(args: argparse.Namespace): def run_multi_api_server(args: argparse.Namespace): assert not args.headless - num_api_servers = args.api_server_count + num_api_servers: int = args.api_server_count assert num_api_servers > 0 - orig_mm_processor_cache_gb = args.mm_processor_cache_gb - if num_api_servers > 1: setup_multiprocess_prometheus() - # Not compatible with API server scale-out - args.mm_processor_cache_gb = 0 - listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) + engine_args._api_process_count = num_api_servers + engine_args._api_process_rank = -1 + usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) - model_config = vllm_config.model_config if num_api_servers > 1: if not envs.VLLM_USE_V1: @@ -161,10 +158,6 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") - if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0: - logger.warning("Multi-modal processor cache is disabled because " - "it is not compatible with `api_server_count > 1`.") - executor_class = 
Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats @@ -221,9 +214,10 @@ def run_api_server_worker_proc(listen_address, client_config=None, **uvicorn_kwargs) -> None: """Entrypoint for individual API server worker processes.""" + client_config = client_config or {} + server_index = client_config.get("client_index", 0) # Set process title and add process-specific prefix to stdout and stderr. - server_index = client_config.get("client_index", 0) if client_config else 0 set_process_title("APIServer", str(server_index)) decorate_logs() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 11031cd616d20..b8ba7e81ef5f6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,13 +17,14 @@ from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Annotated, Any, Callable, Optional +from typing import Annotated, Any, Callable, Literal, Optional import prometheus_client import pydantic import regex as re import uvloop -from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request +from fastapi import (APIRouter, Depends, FastAPI, Form, HTTPException, Query, + Request) from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -166,6 +167,9 @@ async def build_async_engine_client( # Context manager to handle engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit engine_args = AsyncEngineArgs.from_cli_args(args) + if client_config: + engine_args._api_process_count = client_config.get("client_count", 1) + engine_args._api_process_rank = client_config.get("client_index", 0) if disable_frontend_multiprocessing is None: disable_frontend_multiprocessing = bool( @@ -209,8 +213,12 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop("client_count") if client_config else 1 - client_index = client_config.pop("client_index") if client_config else 0 + + # Don't mutate the input client_config + client_config = dict(client_config) if client_config else {} + client_count = client_config.pop("client_count", 1) + client_index = client_config.pop("client_index", 0) + try: async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, @@ -956,9 +964,22 @@ if envs.VLLM_SERVER_DEV_MODE: logger.warning("SECURITY WARNING: Development endpoints are enabled! " "This should NOT be used in production!") + PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) + @router.get("/server_info") - async def show_server_info(raw_request: Request): - server_info = {"vllm_config": str(raw_request.app.state.vllm_config)} + async def show_server_info( + raw_request: Request, + config_format: Annotated[Literal["text", "json"], + Query()] = "text", + ): + vllm_config: VllmConfig = raw_request.app.state.vllm_config + server_info = { + "vllm_config": + str(vllm_config) + if config_format == "text" else PydanticVllmConfig.dump_python( + vllm_config, mode="json", fallback=str) + # fallback=str is needed to handle e.g. 
torch.dtype + } return JSONResponse(content=server_info) @router.post("/reset_prefix_cache") @@ -1856,8 +1877,6 @@ async def run_server_worker(listen_address, if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) - server_index = client_config.get("client_index", 0) if client_config else 0 - # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) if log_config is not None: @@ -1873,7 +1892,8 @@ async def run_server_worker(listen_address, vllm_config = await engine_client.get_vllm_config() await init_app_state(engine_client, vllm_config, app.state, args) - logger.info("Starting vLLM API server %d on %s", server_index, + logger.info("Starting vLLM API server %d on %s", + vllm_config.parallel_config._api_process_rank, listen_address) shutdown_task = await serve_http( app, diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 297b4c7fa7fbd..642ec3fd7e3f7 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -494,7 +494,8 @@ def _enable_processor_cache( def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: parallel_config = vllm_config.parallel_config - supports_ipc_cache = (parallel_config.data_parallel_size == 1 + supports_ipc_cache = ((parallel_config._api_process_count == 1 + and parallel_config.data_parallel_size == 1) or parallel_config.data_parallel_external_lb) return supports_ipc_cache diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index bb0f37c6e0264..a84b0e55105b2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -437,7 +437,7 @@ class MPClient(EngineCoreClient): self.engines_running = False self.stats_update_address: Optional[str] = None - if client_addresses is not None: + if client_addresses: # Engines are managed externally to this client. 
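A minimal sketch of the behavioural difference behind the changed guard above: an empty `client_addresses` mapping is not `None`, but it is falsy, so the branch that treats engines as externally managed is no longer taken for it.

```python
# Illustrative only: old condition vs. new condition for an empty mapping.
client_addresses: dict[str, str] = {}

old_branch_taken = client_addresses is not None  # True even with no addresses
new_branch_taken = bool(client_addresses)        # False for an empty dict

assert old_branch_taken and not new_branch_taken
```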
input_address = client_addresses["input_address"] output_address = client_addresses["output_address"] @@ -774,6 +774,7 @@ class AsyncMPClient(MPClient): client_addresses=client_addresses, ) + self.client_count = client_count self.client_index = client_index self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, Exception]]() From 2821986450bc31869714885ed4203650a42f3cb0 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 20 Sep 2025 02:01:28 +0800 Subject: [PATCH 019/185] [Core] Modify the initialization parameters of the lora manager (#25249) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_manager.py | 36 ++++++++++++++++++----- tests/lora/utils.py | 2 +- vllm/lora/{lora.py => lora_weights.py} | 0 vllm/lora/models.py | 2 +- vllm/lora/worker_manager.py | 23 ++++++++------- vllm/v1/worker/cpu_model_runner.py | 5 ++-- vllm/v1/worker/gpu_model_runner.py | 5 +--- vllm/v1/worker/lora_model_runner_mixin.py | 15 ++-------- vllm/v1/worker/tpu_model_runner.py | 4 +-- vllm/worker/model_runner.py | 11 ++----- 10 files changed, 51 insertions(+), 52 deletions(-) rename vllm/lora/{lora.py => lora_weights.py} (100%) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index a5802c108c6be..d7684fbf34abb 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,11 +8,12 @@ import torch from safetensors.torch import load_file from torch import nn +from vllm.config import ModelConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, RowParallelLinearWithLoRA) -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager) from vllm.lora.peft_helper import PEFTHelper @@ -435,10 +436,19 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, target_modules=["layer1.dense1", "dense2"], lora_dtype=DEFAULT_DTYPE, ) + + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig(model_config=model_config, + lora_config=lora_config) + + vllm_config.scheduler_config.max_num_seqs = 4 + vllm_config.scheduler_config.max_num_batched_tokens = 2 worker_adapter_manager = LRUCacheWorkerLoRAManager( - 4, 2, - dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size, - lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + + worker_adapter_manager.max_num_seqs = 4 + worker_adapter_manager.max_num_batched_tokens = 2 + worker_adapter_manager.create_lora_manager(dummy_model) mapping = LoRAMapping([], []) @@ -517,10 +527,20 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE) - worker_adapter_manager = WorkerLoRAManager( - 4, 2, dummy_model_gate_up.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, device, - EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig(model_config=model_config, + lora_config=lora_config) + + vllm_config.scheduler_config.max_num_seqs = 4 + vllm_config.scheduler_config.max_num_batched_tokens = 2 + + worker_adapter_manager = WorkerLoRAManager(vllm_config, device, + EMBEDDING_MODULES, + EMBEDDING_PADDING_MODULES) + worker_adapter_manager.vocab_size = ( + 
dummy_model_gate_up.unpadded_vocab_size - + lora_config.lora_extra_vocab_size) worker_adapter_manager.create_lora_manager(dummy_model_gate_up) dummy_lora_files = f"{tmp_path}/lora_adapter" diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 7cda90787b6f1..ab475904d4938 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -9,7 +9,7 @@ from typing import Optional, Union import torch from safetensors.torch import save_file -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights class DummyLoRAManager: diff --git a/vllm/lora/lora.py b/vllm/lora/lora_weights.py similarity index 100% rename from vllm/lora/lora.py rename to vllm/lora/lora_weights.py diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 25f90f2fa932b..9ea46be65cff3 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -14,7 +14,7 @@ from torch import nn from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index e27b7d5fcf223..cdb2f86611d81 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -6,7 +6,7 @@ from typing import Any, Literal, Optional, Union import torch -from vllm.config.lora import LoRAConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) @@ -27,25 +27,26 @@ class WorkerLoRAManager: def __init__( self, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, + vllm_config: VllmConfig, device: torch.device, embedding_modules: dict[str, str], embedding_padding_modules: list[str], lora_model_cls: type[LoRAModel] = LoRAModel, - max_position_embeddings: Optional[int] = None, ): self._lora_model_cls = lora_model_cls self.embedding_modules = embedding_modules self.embedding_padding_modules = embedding_padding_modules self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False - self.max_num_seqs = max_num_seqs - self.max_num_batched_tokens = max_num_batched_tokens - self.vocab_size = vocab_size - self.lora_config = lora_config - self.max_position_embeddings = max_position_embeddings + self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs + self.max_num_batched_tokens = ( + vllm_config.scheduler_config.max_num_batched_tokens) + self.vocab_size = vllm_config.model_config.get_vocab_size() + self.lora_config = vllm_config.lora_config + + # Use get_text_config() in case of multimodal models + text_config = vllm_config.model_config.hf_config.get_text_config() + + self.max_position_embeddings = text_config.max_position_embeddings self.device = device # Lazily initialized by create_lora_manager. 
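A short sketch of the `get_text_config()` indirection used in the constructor above; it assumes a transformers release that provides `PretrainedConfig.get_text_config()`, and the config classes here are only examples:

```python
from transformers import LlamaConfig, LlavaConfig

# A plain text config resolves to itself ...
text_only = LlamaConfig()
print(text_only.get_text_config().max_position_embeddings)

# ... while a multimodal config resolves to its nested language-model config,
# which is where max_position_embeddings actually lives.
multimodal = LlavaConfig()
print(multimodal.get_text_config().max_position_embeddings)
```

This is why the manager reads `max_position_embeddings` through the text config rather than off `hf_config` directly.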
self._adapter_manager: LoRAModelManager diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index cd0f0af43e7e7..6a97f7ebc3fcd 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -107,9 +107,8 @@ class CPUModelRunner(GPUModelRunner): self.model = get_model(vllm_config=self.vllm_config) if self.lora_config: - self.model = self.load_lora_model(self.model, self.model_config, - self.scheduler_config, - self.lora_config, self.device) + self.model = self.load_lora_model(self.model, self.vllm_config, + self.device) def get_model(self) -> nn.Module: return self.model diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3ee2160a42ffe..9d0f26266f0c5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2552,10 +2552,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.model = model_loader.load_model( vllm_config=self.vllm_config, model_config=self.model_config) if self.lora_config: - self.model = self.load_lora_model(self.model, - self.model_config, - self.scheduler_config, - self.lora_config, + self.model = self.load_lora_model(self.model, self.vllm_config, self.device) if hasattr(self, "drafter"): logger.info("Loading drafter model...") diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 01d5f0525c4e2..e416f50322f48 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -11,7 +11,7 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import ModelConfig, SchedulerConfig +from vllm.config import VllmConfig from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -31,9 +31,7 @@ class LoRAModelRunnerMixin: LORA_WARMUP_RANK = 8 - def load_lora_model(self, model: nn.Module, model_config: ModelConfig, - scheduler_config: SchedulerConfig, - lora_config: LoRAConfig, + def load_lora_model(self, model: nn.Module, vllm_config: VllmConfig, device: torch.device) -> nn.Module: if not supports_lora(model): @@ -44,19 +42,12 @@ class LoRAModelRunnerMixin: logger.warning("Regarding multimodal models, vLLM currently " "only supports adding LoRA to language model.") - # Use get_text_config() in case of multimodal models - text_config = model_config.hf_config.get_text_config() - # Add LoRA Manager to the Model Runner self.lora_manager = LRUCacheWorkerLoRAManager( - scheduler_config.max_num_seqs, - scheduler_config.max_num_batched_tokens, - model_config.get_vocab_size(), - lora_config, + vllm_config, device, model.embedding_modules, model.embedding_padding_modules, - max_position_embeddings=text_config.max_position_embeddings, ) return self.lora_manager.create_lora_manager(model) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 01a8e5c3f0dba..48070c1e3e7cb 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1178,9 +1178,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): "or sharding the weights on more chips. 
" f"See the detailed error: {e}") from e if self.lora_config is not None: - model = self.load_lora_model(model, self.model_config, - self.scheduler_config, - self.lora_config, self.device) + model = self.load_lora_model(model, self.vllm_config, self.device) replace_set_lora(model) # Sync all pending XLA execution during model initialization and weight diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 594382650e3c1..c91c871766cff 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1078,20 +1078,13 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): "Regarding multimodal models, vLLM currently " "only supports adding LoRA to language model.") - # Use get_text_config() in case of multimodal models - text_config = self.model_config.hf_config.get_text_config() - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, - self.vocab_size, - self.lora_config, + self.vllm_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules, - max_position_embeddings=text_config. - max_position_embeddings, ) + self.model = self.lora_manager.create_lora_manager(self.model) time_after_load = time.perf_counter() From d90e212a3a586b8a6ca9a424868abd0e6ef6779a Mon Sep 17 00:00:00 2001 From: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com> Date: Sat, 20 Sep 2025 02:15:13 +0800 Subject: [PATCH 020/185] Remove Redundant Assignment in Qwen3_VisionPatchMerger (#25224) Signed-off-by: Junhong Co-authored-by: Junhong Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen3_vl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c224b78e2c27c..17375ff0959d7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -223,9 +223,7 @@ class Qwen3_VisionPatchMerger(nn.Module): if norm_layer is None: norm_layer = partial(nn.LayerNorm, eps=1e-6) - self.use_postshuffle_norm = use_postshuffle_norm - self.norm = norm_layer( - self.hidden_size if use_postshuffle_norm else context_dim) + self.norm = norm_layer(context_dim) self.linear_fc1 = ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=True, From 12aed7e453aea713495bd7cced6f9e2bb78aaa79 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 19:15:22 +0100 Subject: [PATCH 021/185] Encoder model support for the Transformers backend (#25174) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 37 ++++++++++----- tests/models/test_transformers.py | 36 ++++++++++++++- vllm/attention/backends/abstract.py | 8 ++-- vllm/model_executor/models/transformers.py | 54 +++++++++++++++++++--- 4 files changed, 111 insertions(+), 24 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 3a6738a27be09..cbc0a56a645ea 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -17,9 +17,24 @@ These models are what we list in [supported-text-models][supported-text-models] ### Transformers -vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. 
Support for video inputs will be added in future releases. +vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend". -To check if the modeling backend is Transformers, you can simply do this: +Currently, the Transformers backend works for the following: + +- Modalities: embedding models, language models and vision-language models* +- Architectures: encoder-only, decoder-only +- Attention types: full attention and/or sliding attention + +_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._ + +If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM: + +- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature) +- Any combination of the following vLLM parallelisation schemes: + - Pipeline parallel + - Tensor parallel + +Checking if the modeling backend is Transformers is as simple as: ```python from vllm import LLM @@ -27,16 +42,12 @@ llm = LLM(model=...) # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers! +If the printed type starts with `Transformers...` then it's using the Transformers model implementation! -!!! tip - You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md). +If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). !!! note - vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. - -!!! note - In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. + For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. #### Custom models @@ -66,10 +77,11 @@ This section details the necessary modifications to make to a Transformers compa To make your model compatible with the Transformers backend, it needs: 1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. + 1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`. 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 3. `MyModel` must contain `_supports_attention_backend = True`. -
+
modeling_my_model.py ```python @@ -78,6 +90,7 @@ from transformers import PreTrainedModel from torch import nn class MyAttention(nn.Module): + is_causal = False # Only do this for encoder-only models def forward(self, hidden_states, **kwargs): ... @@ -101,13 +114,13 @@ Here is what happens in the background when this model is loaded: 1. The config is loaded. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see ) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. +3. `MyModel` is loaded into one of the Transformers backend classes in which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: -
+
configuration_my_model.py ```python diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index ba9c3bebc437e..1817d4aeee9f9 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -9,7 +9,7 @@ from vllm.platforms import current_platform from ..conftest import HfRunner, VllmRunner from ..utils import multi_gpu_test, prep_prompts -from .utils import check_logprobs_close +from .utils import check_embeddings_close, check_logprobs_close def check_implementation( @@ -165,6 +165,40 @@ def test_embed_loading(vllm_runner, model): assert model_config.using_transformers_backend() +@pytest.mark.parametrize( + "model", + [ + # Encoder model + "BAAI/bge-base-en-v1.5", + ]) +def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model): + import transformers + from packaging.version import Version + installed = Version(transformers.__version__) + required = Version("4.57.0.dev0") + if installed < required: + pytest.skip("Encoder models with the Transformers backend require " + f"transformers>={required}, but got {installed}") + + with vllm_runner(model, max_model_len=512, + model_impl="transformers") as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + assert model_config.using_transformers_backend() + + vllm_outputs = vllm_model.embed(example_prompts) + + with hf_runner(model, is_sentence_transformer=True) as hf_model: + hf_outputs = hf_model.encode(example_prompts) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) + + @pytest.mark.parametrize( "model", ["jason9693/Qwen2.5-1.5B-apeach"], diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 75bcdc4bbcf0d..dfde67e1713c5 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -23,14 +23,14 @@ class AttentionType: Attention type. Use string to be compatible with `torch.compile`. """ - # Decoder attention between previous layer Q/K/V DECODER = "decoder" - # Encoder attention between previous layer Q/K/V for encoder-decoder + """Decoder attention between previous layer Q/K/V.""" ENCODER = "encoder" - # Encoder attention between previous layer Q/K/V + """Encoder attention between previous layer Q/K/V for encoder-decoder.""" ENCODER_ONLY = "encoder_only" - # Attention between dec. Q and enc. K/V for encoder-decoder + """Encoder attention between previous layer Q/K/V.""" ENCODER_DECODER = "encoder_decoder" + """Attention between dec. Q and enc. 
K/V for encoder-decoder.""" class AttentionBackend(ABC): diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 4f51441e28efa..f40a20dee63d7 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -27,7 +27,7 @@ from transformers import (AutoModel, BatchFeature, PretrainedConfig, PreTrainedModel) from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS -from vllm.attention import Attention +from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, VllmConfig) @@ -452,8 +452,9 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): self.pp_rank = self.pp_group.rank_in_group self.tp_size = get_tensor_model_parallel_world_size() - # To be updated in child classes for use in `load_weights` - self.skip_prefixes: Optional[list[str]] = None + # Weights to skip in `self.load_weights` + self.skip_prefixes: list[str] = [] + self.skip_substrs: list[str] = [] # Set correct attn and init on "meta" to delay allocating GPU tensors # TODO: @raushan, use the public `model.set_attn_implementation()` @@ -596,7 +597,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): _tensor_parallel(self.model) - def create_attention_instances(self) -> dict[int, Attention]: + def create_attention_instances( + self, + attn_type: AttentionType = AttentionType.DECODER + ) -> dict[int, Attention]: """ Create `Attention` instances to inform KV cache allocation. """ @@ -625,7 +629,8 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): cache_config=self.cache_config, quant_config=self.quant_config, per_layer_sliding_window=per_layer_sliding_window, - prefix=f"{i}.attn") + prefix=f"{i}.attn", + attn_type=attn_type) return attention_instances def init_parameters(self, module: nn.Module): @@ -685,7 +690,11 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self, skip_prefixes=self.skip_prefixes) + loader = AutoWeightsLoader( + self, + skip_prefixes=self.skip_prefixes, + skip_substrs=self.skip_substrs, + ) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) @@ -700,6 +709,37 @@ class TransformersModel(TransformersBase): "model.score": "score", }) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + # Some encoder models have the position_ids buffer in the checkpoint + # vLLM will always pass position_ids as an argument, so we skip loading + # the buffer if it exists + self.skip_substrs.append("position_ids") + + def create_attention_instances( + self, attn_type: AttentionType = AttentionType.DECODER): + # TODO(hmellor): Better way to detect encoder models + # In encoder models, the attention layers will have `is_causal=False` + is_encoder = lambda m: not getattr(m, "is_causal", True) + # vLLM does not support encoder-decoder models, so if any encoder layer + # is found, we assume the whole model is an encoder model + if any(is_encoder(m) for m in self.model.modules()): + attn_type = AttentionType.ENCODER_ONLY + + # Check minimum transformers version for encoder models support + if attn_type == AttentionType.ENCODER_ONLY: + import transformers + from packaging.version import Version + 
installed = Version(transformers.__version__) + required = Version("4.57.0.dev0") + if installed < required: + raise ValueError( + "Encoder models with the Transformers backend require " + f"transformers>={required}, but got {installed}") + + return super().create_attention_instances(attn_type) + @support_torch_compile(enable_if=can_enable_torch_compile) class TransformersForCausalLM(TransformersBase): @@ -710,7 +750,7 @@ class TransformersForCausalLM(TransformersBase): # Tell `TransformersBase.load_weights` to skip # `lm_head` if the model has tied word embeddings if self.text_config.tie_word_embeddings: - self.skip_prefixes = ["lm_head."] + self.skip_prefixes.append("lm_head.") if get_pp_group().is_last_rank: self.unpadded_vocab_size = self.text_config.vocab_size From 47fd08aaf9fe6616d5daf1f30c0377d8b8b7cf21 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 20 Sep 2025 02:16:32 +0800 Subject: [PATCH 022/185] [CI/Build] fix test function_calling (#25072) Signed-off-by: chaunceyjiang --- tests/entrypoints/openai/test_response_api_with_harmony.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 8d974d56b4450..40a22c04b08a5 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -515,6 +515,7 @@ async def test_function_calling(client: OpenAI, model_name: str): model=model_name, input="What's the weather like in Paris today?", tools=tools, + temperature=0.0, ) assert response is not None assert response.status == "completed" From 2506ce5189fa67fea905c1f18ca735191cec0d29 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Fri, 19 Sep 2025 11:22:53 -0700 Subject: [PATCH 023/185] [Core][Prefix Hash] Fix prefix hash metrics sliding window maintainance (#24990) Signed-off-by: Jialin Ouyang --- tests/v1/core/test_kv_cache_utils.py | 46 +++++++++++++++++++++++----- vllm/v1/core/kv_cache_utils.py | 13 ++++++-- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 319e6e84fba1e..4bf6bbbfeae28 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -513,27 +513,27 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn): assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None)) +def _stats(requests: int, queries: int, hits: int) -> PrefixCacheStats: + return PrefixCacheStats(requests=requests, queries=queries, hits=hits) + + def test_metrics(): """ Test the prefix caching metrics. """ - - def stats(requests, queries, hits): - return PrefixCacheStats(requests=requests, queries=queries, hits=hits) - metrics = PrefixCachingMetrics(max_recent_requests=5) assert metrics.hit_rate == 0.0 - metrics.observe(stats(1, 20, 9)) + metrics.observe(_stats(1, 20, 9)) # 9 / 20 = 0.45 assert metrics.hit_rate == 0.45 - metrics.observe(stats(4, 80, 16)) + metrics.observe(_stats(4, 80, 16)) # 25 / 100 = 0.25 assert metrics.hit_rate == 0.25 - metrics.observe(stats(1, 10, 2)) + metrics.observe(_stats(1, 10, 2)) # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 assert metrics.aggregated_requests == 5 @@ -549,6 +549,38 @@ def test_metrics(): assert not metrics.query_queue +def test_metrics_empty_stats(): + """ + Test the prefix caching metrics with empty stats. 
+ """ + metrics = PrefixCachingMetrics(max_recent_requests=5) + metrics.observe(_stats(0, 0, 0)) + metrics.observe(_stats(1, 20, 9)) + metrics.observe(_stats(0, 0, 0)) + metrics.observe(_stats(4, 80, 16)) + metrics.observe(_stats(0, 0, 0)) + metrics.observe(_stats(1, 10, 2)) + # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 + assert metrics.aggregated_requests == 5 + assert metrics.aggregated_query_total == 90 + assert metrics.aggregated_query_hit == 18 + assert metrics.hit_rate == 0.2 + + # Only the latest added stats preserved 10 / 20 = 0.5 + metrics.observe(_stats(11, 20, 10)) + assert metrics.aggregated_requests == 11 + assert metrics.aggregated_query_total == 20 + assert metrics.aggregated_query_hit == 10 + assert metrics.hit_rate == 0.5 + + # Only the latest added stats preserved 30 / 40 = 0.75 + metrics.observe(_stats(22, 40, 30)) + assert metrics.aggregated_requests == 22 + assert metrics.aggregated_query_total == 40 + assert metrics.aggregated_query_hit == 30 + assert metrics.hit_rate == 0.75 + + def test_get_kv_cache_configs_multiple_workers(): model_config = ModelConfig(max_model_len=16) vllm_config = VllmConfig(model_config=model_config) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9fab36aba91b3..bc2ec5e42ea20 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -127,14 +127,23 @@ class PrefixCachingMetrics: if stats.reset: self.reset() + # DO NOT appending empty stats to avoid helpful info get kicked out + # due to sliding window. + if stats.requests == 0: + return + # Update the metrics. self.query_queue.append((stats.requests, stats.queries, stats.hits)) self.aggregated_requests += stats.requests self.aggregated_query_total += stats.queries self.aggregated_query_hit += stats.hits - # Remove the oldest stats if the number of requests exceeds. - if self.aggregated_requests > self.max_recent_requests: + # Remove the oldest stats until number of requests does not exceed + # the limit. + # NOTE: We preserve the latest added stats regardless. 
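A small usage sketch of the invariant described in the note above, mirroring the new `test_metrics_empty_stats` case; the import path for `PrefixCacheStats` is assumed and may differ:

```python
from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
from vllm.v1.metrics.stats import PrefixCacheStats  # assumed import path

metrics = PrefixCachingMetrics(max_recent_requests=5)
metrics.observe(PrefixCacheStats(requests=0, queries=0, hits=0))  # now ignored
metrics.observe(PrefixCacheStats(requests=11, queries=20, hits=10))
# 11 > 5, but the most recently added entry is never evicted,
# so it remains as the sole window entry: 10 / 20 = 0.5.
assert metrics.hit_rate == 0.5
```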
+ while len( + self.query_queue + ) > 1 and self.aggregated_requests > self.max_recent_requests: old_requests, old_queries, old_hits = self.query_queue.popleft() self.aggregated_requests -= old_requests self.aggregated_query_total -= old_queries From 138f0d1e752d3a35cd959f3df8bf00370a2ace7b Mon Sep 17 00:00:00 2001 From: samzong Date: Sat, 20 Sep 2025 02:32:27 +0800 Subject: [PATCH 024/185] [Docs] add __init__.py to vllm/model_executor/layers/quantization/compressed_tensors/transform (#24974) Signed-off-by: samzong --- .../quantization/compressed_tensors/transform/__init__.py | 0 .../compressed_tensors/transform/schemes/__init__.py | 0 vllm/model_executor/models/blip2.py | 2 +- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/llava_next.py | 5 +++-- 5 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c1e7a7d498b11..a3131aa3812ef 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -680,7 +680,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, batch. Info: - [Blip2ImageInputs][] + [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs] """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 9591deea06ce9..4f15e1b5762ea 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -737,7 +737,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): inputs_embeds: Optional tensor of input embeddings. Info: - [LlavaImageInputs][] + [`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 5e82f9799e0fe..beb3c33100595 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -527,7 +527,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, Unlike in LLaVA-1.5, the number of image tokens inputted to the language model depends on the original size of the input image. Including the original image token in the input, the required number of image tokens - is given by [get_llava_next_image_feature_size][]. + is given by [`LlavaNextProcessingInfo.get_num_image_tokens`][vllm.\ +model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens]. This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. @@ -540,7 +541,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, inputs_embeds: Optional tensor of input embeddings. 
Info: - [LlavaNextImageInputs][] + [`LlavaNextImageInputs`][vllm.model_executor.models.llava_next.LlavaNextImageInputs] """ if intermediate_tensors is not None: inputs_embeds = None From b716ab93a781156ab178513afc0e407cc72d443b Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:37:57 -0700 Subject: [PATCH 025/185] [bugfix] fix structured outputs key missing issue from #24929 (#25195) Signed-off-by: Lu Fang --- vllm/v1/core/sched/scheduler.py | 4 +++- vllm/v1/structured_output/utils.py | 11 ++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b08898d253cab..ef77d9e2d3ffd 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -578,8 +578,10 @@ class Scheduler(SchedulerInterface): scheduled_spec_decode_tokens, req_to_new_blocks, ) + scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs + + scheduled_resumed_reqs) structured_output_request_ids, grammar_bitmask = ( - self.get_grammar_bitmask(self.running, + self.get_grammar_bitmask(scheduled_requests, scheduled_spec_decode_tokens)) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 127c8876525b5..b9b09bea1e80f 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -90,13 +90,14 @@ def apply_grammar_bitmask( seq = sorted(scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1]) for req_id, _ in seq: - logit_index = struct_out_req_batch_indices[req_id] num_spec_tokens = len( scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = \ - grammar_bitmask[cumulative_index + i] - out_indices.append(logit_index + i) + if req_id in struct_out_req_batch_indices: + logit_index = struct_out_req_batch_indices[req_id] + for i in range(1 + num_spec_tokens): + sorted_bitmask[logit_index + i] = \ + grammar_bitmask[cumulative_index + i] + out_indices.append(logit_index + i) cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask From c59a0eca4204522ecaa28af1f1e38d50b1e2626f Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Fri, 19 Sep 2025 22:07:17 +0300 Subject: [PATCH 026/185] [KV offload][4/N] Offloading KV connector (#22595) Signed-off-by: Or Ozeri --- .../unit/test_offloading_connector.py | 505 ++++++++++++++++++ tests/v1/kv_connector/unit/utils.py | 3 +- .../kv_transfer/kv_connector/factory.py | 5 + .../kv_connector/v1/offloading_connector.py | 485 +++++++++++++++++ vllm/v1/kv_offload/factory.py | 53 ++ vllm/v1/kv_offload/spec.py | 61 +++ 6 files changed, 1111 insertions(+), 1 deletion(-) create mode 100644 tests/v1/kv_connector/unit/test_offloading_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py create mode 100644 vllm/v1/kv_offload/factory.py create mode 100644 vllm/v1/kv_offload/spec.py diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py new file mode 100644 index 0000000000000..f9a4d2fb4de44 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -0,0 +1,505 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy +from collections.abc import Iterable, Iterator +from dataclasses import 
dataclass +from typing import Any +from unittest.mock import MagicMock + +import pytest +import torch + +from vllm import SamplingParams +from vllm.config import KVTransferConfig, VllmConfig +from vllm.distributed.kv_events import BlockRemoved, BlockStored +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole +from vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector import ( + OffloadingConnector, OffloadingConnectorMetadata) +from vllm.forward_context import ForwardContext +from vllm.utils import sha256 +from vllm.v1.core.kv_cache_utils import (BlockHash, get_request_block_hasher, + init_none_hash) +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent, + OffloadingManager, PrepareStoreOutput) +from vllm.v1.kv_offload.mediums import GPULoadStoreSpec +from vllm.v1.kv_offload.spec import OffloadingSpec +from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, + TransferResult, TransferSpec) +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput +from vllm.v1.request import Request + +from .utils import (EOS_TOKEN_ID, create_model_runner_output, create_scheduler, + create_vllm_config) + + +class MockLoadStoreSpec(LoadStoreSpec): + + def __init__(self, block_hashes: Iterable[BlockHash]): + self.block_hashes: list[BlockHash] = list(block_hashes) + + @staticmethod + def medium() -> str: + return "Mock" + + def __repr__(self) -> str: + return repr(self.block_hashes) + + +class MockOffloadingHandler(OffloadingHandler): + + def __init__(self): + self.completed_transfers: list[TransferResult] = [] + self.completed_specs: list[TransferSpec] = [] + + def get_finished(self) -> list[TransferResult]: + finished = self.completed_transfers + self.completed_transfers = [] + return finished + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + self.completed_specs.append(spec) + self.completed_transfers.append((job_id, True)) + return True + + +class MockOffloadingSpec(OffloadingSpec): + + def __init__(self, vllm_config: VllmConfig): + super().__init__(vllm_config) + + self.manager = MagicMock(spec=OffloadingManager) + self.manager.lookup.return_value = 0 + self.manager.prepare_load = lambda block_hashes: (MockLoadStoreSpec( + block_hashes)) + self.handler = MockOffloadingHandler() + + def get_manager(self) -> OffloadingManager: + return self.manager + + def get_handlers( + self, _ + ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], + OffloadingHandler]]: + + yield GPULoadStoreSpec, MockLoadStoreSpec, self.handler + yield MockLoadStoreSpec, GPULoadStoreSpec, self.handler + + def get_completed_transfers(self) -> list[TransferSpec]: + specs = self.handler.completed_specs + self.handler.completed_specs = [] + return specs + + +@dataclass +class TransferSummary: + gpu_block_indices: list[int] + offload_addresses: list[Any] + + +class RequestRunner: + + def __init__(self, offloaded_block_size: int, gpu_block_size: int, + num_gpu_blocks: int): + self.offloaded_block_size: int = offloaded_block_size + self.gpu_block_size: int = gpu_block_size + self.num_gpu_blocks: int = num_gpu_blocks + + self.req_id: int = -1 + + vllm_config = create_vllm_config(block_size=gpu_block_size, + max_num_batched_tokens=1000) + vllm_config.kv_transfer_config = KVTransferConfig( + kv_connector="OffloadingConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "spec_name": "MockOffloadingSpec", + "spec_module_path": + 
"tests.v1.kv_connector.unit.test_offloading_connector", + "block_size": offloaded_block_size, + }) + + self.scheduler: Scheduler = create_scheduler(vllm_config, + num_blocks=num_gpu_blocks) + self.worker_connector = OffloadingConnector(vllm_config, + KVConnectorRole.WORKER) + + # register worker kv_caches to enable OffloadingWorker creations + self.worker_connector.register_kv_caches( + kv_caches={"a": torch.empty(0)}) + + # extract connector of scheduler + scheduler_connector = self.scheduler.connector + assert scheduler_connector is not None + assert isinstance(scheduler_connector, OffloadingConnector) + self.scheduler_connector: OffloadingConnector = scheduler_connector + + # extract mocked OffloadingManager of scheduler connector + connector_scheduler = scheduler_connector.connector_scheduler + assert connector_scheduler is not None + manager = connector_scheduler.manager + assert isinstance(manager, MagicMock) + self.manager: MagicMock = manager + + assert connector_scheduler.gpu_block_size == gpu_block_size + assert connector_scheduler.offloaded_block_size == offloaded_block_size + + # extract OffloadingSpec of worker_connector + connector_worker = self.worker_connector.connector_worker + assert connector_worker is not None + offloading_spec = connector_worker.spec + assert isinstance(offloading_spec, MockOffloadingSpec) + self.offloading_spec: MockOffloadingSpec = offloading_spec + + # mapping (offloading address) -> gpu_block_index + self.offloaded: dict[Any, int] = {} + + self.pending_loads_count: int = 0 + self.pending_stores_count: int = 0 + + self.completed_loads: list[TransferSummary] = [] + self.completed_stores: list[TransferSummary] = [] + + # maps {block_id: block_offset} + self.gpu_block_index: dict[int, int] = {} + + init_none_hash(sha256) + self._block_hasher = get_request_block_hasher(gpu_block_size, sha256) + + self._dummy_ctx: ForwardContext = ForwardContext(no_compile_layers={}, + attn_metadata={}, + virtual_engine=0) + + def new_request(self, token_ids: list[int]): + assert not self.scheduler.requests + self.req_id += 1 + + req = Request( + request_id=str(self.req_id), + prompt_token_ids=token_ids, + sampling_params=SamplingParams(max_tokens=1000), + pooling_params=None, + eos_token_id=EOS_TOKEN_ID, + block_hasher=self._block_hasher, + ) + + self.scheduler.add_request(req) + + def _wait_for_transfers(self): + block_size_factor = self.offloaded_block_size // self.gpu_block_size + + while self.pending_loads_count or self.pending_stores_count: + for transfer_spec in ( + self.offloading_spec.get_completed_transfers()): + src_spec, dst_spec = transfer_spec + + if isinstance(src_spec, GPULoadStoreSpec): + store = True + gpu_spec = src_spec + offload_spec = dst_spec + else: + store = False + gpu_spec = dst_spec + offload_spec = src_spec + + assert isinstance(offload_spec, MockLoadStoreSpec) + assert isinstance(gpu_spec, GPULoadStoreSpec) + + gpu_block_indices: list[int] = [] + for block_id in gpu_spec.block_ids: + gpu_block_indices.append( + self.gpu_block_index[block_id.item()]) + + # list of (block_hash, sub_block_offset) + offload_addresses: list[Any] = [] + for block_hash in offload_spec.block_hashes: + for sub_block_idx in range(block_size_factor): + offload_addresses.append((block_hash, sub_block_idx)) + + if store: + assert len(gpu_block_indices) == len(offload_addresses) + + self.completed_stores.append( + TransferSummary(gpu_block_indices, offload_addresses)) + self.pending_stores_count -= 1 + else: + remainder_sub_block_count = (len(offload_addresses) - + 
len(gpu_block_indices)) + assert remainder_sub_block_count >= 0 + assert remainder_sub_block_count < block_size_factor + offload_addresses = offload_addresses[ + remainder_sub_block_count:] + + self.completed_loads.append( + TransferSummary(gpu_block_indices, offload_addresses)) + self.pending_loads_count -= 1 + + def _update_gpu_block_idx(self): + for blocks in (self.scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks.values()): + for block_idx, block in enumerate(blocks): + self.gpu_block_index[block.block_id] = block_idx + + def _run(self, decoded_tokens: list[int]): + """ + Runs multiple engine (scheduler + worker) steps. + Assumes a single request is running. + + Args: + decoded_tokens: the tokens to yield at each step. + """ + + tokens_iter = iter(decoded_tokens) + token_id = next(tokens_iter, None) + while token_id is not None: + assert self.scheduler.requests + + scheduler_output = self.scheduler.schedule() + self._update_gpu_block_idx() + + kv_connector_metadata = scheduler_output.kv_connector_metadata + assert kv_connector_metadata is not None + assert isinstance(kv_connector_metadata, + OffloadingConnectorMetadata) + + self.pending_loads_count += len(kv_connector_metadata.reqs_to_load) + self.pending_stores_count += len( + kv_connector_metadata.reqs_to_store) + + self.worker_connector.bind_connector_metadata( + kv_connector_metadata) + self.worker_connector.start_load_kv(self._dummy_ctx) + + if scheduler_output.total_num_scheduled_tokens > 0: + self.worker_connector.wait_for_save() + + finished_sending, finished_recving = ( + self.worker_connector.get_finished( + scheduler_output.finished_req_ids)) + + self.worker_connector.clear_connector_metadata() + + model_runner_output = create_model_runner_output( + reqs=self.scheduler.running, + finished_sending=list(finished_sending), + finished_recving=list(finished_recving), + token_id=token_id) + + if self.scheduler.running: + token_id = next(tokens_iter, None) + + self.scheduler.update_from_output(scheduler_output, + model_runner_output) + + self._wait_for_transfers() + + # run one more step to update finished stored + if EOS_TOKEN_ID in decoded_tokens: + assert not self.scheduler.running + + while self.scheduler.requests: + scheduler_output = self.scheduler.schedule() + + finished_sending, finished_recving = ( + self.worker_connector.get_finished( + scheduler_output.finished_req_ids)) + + assert not finished_recving + + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending) + + self.scheduler.update_from_output(scheduler_output, + model_runner_output) + + def run( + self, + decoded_tokens: list[int], + expected_stored_gpu_block_indexes: tuple[int, ...] = (), + expected_loaded_gpu_block_indexes: tuple[int, ...] = (), + ): + """ + Runs multiple engine (scheduler + worker) steps. + Assumes a single request is running. + + Args: + decoded_tokens: the tokens to yield at each step. + expected_stored_gpu_block_indexes: GPU block indexes + that are expected to be written during the run. + expected_loaded_gpu_block_indexes: GPU block indexes + that are expected to be loaded during the run. 
+ """ + + self.manager.reset_mock() + self._run(decoded_tokens) + + loaded_gpu_block_indexes: set[int] = set() + for transfer in self.completed_loads: + for gpu_block_idx, offloaded_address in zip( + transfer.gpu_block_indices, transfer.offload_addresses): + loaded_gpu_block_indexes.add(gpu_block_idx) + assert gpu_block_idx == self.offloaded[offloaded_address] + + assert ( + set(expected_loaded_gpu_block_indexes) == loaded_gpu_block_indexes) + self.completed_loads.clear() + + stored_gpu_block_indexes: set[int] = set() + for transfer in self.completed_stores: + for gpu_block_idx, offloaded_address in zip( + transfer.gpu_block_indices, transfer.offload_addresses): + stored_gpu_block_indexes.add(gpu_block_idx) + self.offloaded[offloaded_address] = gpu_block_idx + + assert ( + set(expected_stored_gpu_block_indexes) == stored_gpu_block_indexes) + self.completed_stores.clear() + + +@pytest.fixture +def request_runner(): + runners = [] + + def runner_factory(offloaded_block_size, gpu_block_size, num_gpu_blocks): + runner = RequestRunner(offloaded_block_size=offloaded_block_size, + gpu_block_size=gpu_block_size, + num_gpu_blocks=num_gpu_blocks) + runners.append(runner) + return runner + + yield runner_factory # pass factory to the test + + +def generate_store_output(block_hashes: Iterable[BlockHash]): + block_hashes = list(block_hashes) + return PrepareStoreOutput( + block_hashes_to_store=list(block_hashes), + store_spec=MockLoadStoreSpec(block_hashes), + block_hashes_evicted=[], + ) + + +def test_offloading_connector(request_runner): + offloaded_block_size = 12 + gpu_block_size = 4 + num_gpu_blocks = 100 + block_size_factor = offloaded_block_size // gpu_block_size + + runner = request_runner(offloaded_block_size=offloaded_block_size, + gpu_block_size=gpu_block_size, + num_gpu_blocks=num_gpu_blocks) + + # 3 blocks, store just the middle block (skip first and last) + # blocks = [0, 1, 2], [3, 4, 5], [6, 7, 8] + runner.new_request(token_ids=[0] * offloaded_block_size * 3) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output(list(block_hashes)[1:2]) + runner.run(decoded_tokens=[0], expected_stored_gpu_block_indexes=(3, 4, 5)) + + # add block missing 1 token -> no offload + runner.run(decoded_tokens=[0] * (offloaded_block_size - 1)) + runner.manager.prepare_store.assert_not_called() + + # +1 token -> single block, fail prepare_store + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: None + runner.run(decoded_tokens=[0]) + runner.manager.prepare_store.assert_called() + + # 1 more block, now set block_hashes_to_store = [] + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.run(decoded_tokens=[0] * offloaded_block_size) + + # 1 more block, now check touch was called with all 6 blocks + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output(block_hashes) + runner.run(decoded_tokens=[0] * offloaded_block_size, + expected_stored_gpu_block_indexes=(15, 16, 17)) + runner.manager.touch.assert_called() + block_hashes1 = list(runner.manager.touch.call_args.args[0]) + assert len(block_hashes1) == 6 + + # terminate request + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + + # create a new request differing only on the last token + runner.new_request(token_ids=[0] * (offloaded_block_size * 6 - 1) + [1]) + runner.run(decoded_tokens=[0], + expected_stored_gpu_block_indexes=tuple( + range(6 * block_size_factor))) + runner.manager.touch.assert_called() + block_hashes2 = 
list(runner.manager.touch.call_args.args[0]) + assert len(block_hashes2) == 6 + + # verify hashes are the same, except for the last block + assert block_hashes1[:5] == block_hashes2[:5] + assert block_hashes1[5] != block_hashes2[5] + + # terminate request + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + + # full_block_tokens - num_computed_tokens < offloaded_block_size + runner.new_request(token_ids=[0] * gpu_block_size + [1] * + (offloaded_block_size - gpu_block_size)) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + runner.manager.lookup.assert_not_called() + + # single block lookup with no hits + runner.new_request(token_ids=[1] * offloaded_block_size) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + runner.manager.lookup.assert_called() + assert len(list(runner.manager.lookup.call_args.args[0])) == 1 + + # single block lookup with a hit + runner.scheduler.reset_prefix_cache() + runner.new_request(token_ids=[0] * offloaded_block_size) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.manager.lookup.return_value = 1 + runner.run(decoded_tokens=[EOS_TOKEN_ID], + expected_loaded_gpu_block_indexes=(0, 1, 2)) + + # single block lookup with a hit in a middle block + runner.new_request(token_ids=[0] * offloaded_block_size * 2 + + [1] * offloaded_block_size) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.manager.lookup.return_value = 1 + runner.run(decoded_tokens=[EOS_TOKEN_ID], + expected_loaded_gpu_block_indexes=(3, 4, 5)) + + # test take_events + def to_hashes(int_hashes: list[int]) -> list[BlockHash]: + return [BlockHash(str(i).encode()) for i in int_hashes] + + def take_events() -> Iterable[OffloadingEvent]: + yield OffloadingEvent(block_hashes=to_hashes([1, 2, 3]), + block_size=16, + medium="A", + removed=False) + yield OffloadingEvent(block_hashes=to_hashes([4, 5, 6]), + block_size=32, + medium="B", + removed=True) + + runner.manager.take_events.side_effect = take_events + events = list(runner.scheduler_connector.take_events()) + assert len(events) == 2 + event = events[0] + assert isinstance(event, BlockStored) + assert event.block_hashes == to_hashes([1, 2, 3]) + assert event.block_size == 16 + assert event.medium == "A" + assert event.token_ids == [] + assert event.parent_block_hash is None + assert event.lora_id is None + event = events[1] + assert isinstance(event, BlockRemoved) + assert event.block_hashes == to_hashes([4, 5, 6]) + assert event.medium == "B" diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 0cae1c7bc0518..de52668e3dcf1 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -176,6 +176,7 @@ def create_model_runner_output( finished_sending: Optional[list[str]] = None, finished_recving: Optional[list[str]] = None, use_eos: bool = False, + token_id: int = 0, ) -> ModelRunnerOutput: """Make dummy model runner output for testing.""" @@ -184,7 +185,7 @@ def create_model_runner_output( req_id_to_index = {req_id: idx for idx, req_id in enumerate(req_ids)} # Make sampled tokens. 
- sampled_token = EOS_TOKEN_ID if use_eos else 0 + sampled_token = EOS_TOKEN_ID if use_eos else token_id sampled_token_ids = [[sampled_token] for _ in req_ids] kv_connector_output = None if ( diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 670f9c26b2104..873f130ed827e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -106,3 +106,8 @@ KVConnectorFactory.register_connector( "MultiConnector", "vllm.distributed.kv_transfer.kv_connector.v1.multi_connector", "MultiConnector") + +KVConnectorFactory.register_connector( + "OffloadingConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector", + "OffloadingConnector") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py new file mode 100644 index 0000000000000..c23efa604544d --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -0,0 +1,485 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections import defaultdict +from collections.abc import Iterable, Iterator +from dataclasses import dataclass +from itertools import islice +from typing import Any, Optional + +import torch + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent +from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, + KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorMetadata) +from vllm.forward_context import ForwardContext +from vllm.logger import init_logger +from vllm.v1.core.kv_cache_manager import KVCacheBlocks +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_offload.abstract import OffloadingManager +from vllm.v1.kv_offload.factory import OffloadingSpecFactory +from vllm.v1.kv_offload.mediums import GPULoadStoreSpec +from vllm.v1.kv_offload.spec import OffloadingSpec +from vllm.v1.kv_offload.worker.worker import OffloadingWorker, TransferSpec +from vllm.v1.outputs import KVConnectorOutput +from vllm.v1.request import Request + +ReqId = str + +logger = init_logger(__name__) + + +@dataclass +class OffloadingConnectorMetadata(KVConnectorMetadata): + reqs_to_load: dict[ReqId, TransferSpec] + reqs_to_store: dict[ReqId, TransferSpec] + + +class OffloadingConnector(KVConnectorBase_V1): + + def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): + super().__init__(vllm_config, role) + + spec = OffloadingSpecFactory.create_spec(vllm_config) + + self.connector_scheduler: Optional[OffloadingConnectorScheduler] = None + self.connector_worker: Optional[OffloadingConnectorWorker] = None + if role == KVConnectorRole.SCHEDULER: + self.connector_scheduler = OffloadingConnectorScheduler(spec) + elif role == KVConnectorRole.WORKER: + self.connector_worker = OffloadingConnectorWorker(spec) + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + assert self.connector_worker is not None + self.connector_worker.register_kv_caches(kv_caches) + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, + OffloadingConnectorMetadata) + 
self.connector_worker.start_load_kv(self._connector_metadata) + + def wait_for_layer_load(self, layer_name: str) -> None: + pass + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + pass + + def wait_for_save(self): + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, + OffloadingConnectorMetadata) + self.connector_worker.start_store_kv(self._connector_metadata) + + def get_finished(self, + finished_req_ids: set[str]) -> tuple[set[str], set[str]]: + assert self.connector_worker is not None + return self.connector_worker.get_finished(finished_req_ids) + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + assert self.connector_scheduler is not None + return self.connector_scheduler.get_num_new_matched_tokens( + request, num_computed_tokens) + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + assert self.connector_scheduler is not None + return self.connector_scheduler.update_state_after_alloc( + request, blocks, num_external_tokens) + + def build_connector_meta( + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: + assert self.connector_scheduler is not None + return self.connector_scheduler.build_connector_meta(scheduler_output) + + def update_connector_output(self, connector_output: KVConnectorOutput): + assert self.connector_scheduler is not None + self.connector_scheduler.update_connector_output(connector_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + assert self.connector_scheduler is not None + return self.connector_scheduler.request_finished(request, block_ids) + + def take_events(self) -> Iterable[KVCacheEvent]: + assert self.connector_scheduler is not None + return self.connector_scheduler.take_events() + + +class OffloadingConnectorScheduler: + """Implementation of Scheduler side methods""" + + def __init__(self, spec: OffloadingSpec): + self.gpu_block_size = spec.gpu_block_size + self.offloaded_block_size = spec.offloaded_block_size + self.block_size_factor = (self.offloaded_block_size // + self.gpu_block_size) + self.manager: OffloadingManager = spec.get_manager() + + self._requests: dict[ReqId, Request] = {} + # list of GPU block IDs per request + self._request_block_ids: dict[ReqId, list[int]] = {} + # requests to load for the current scheduler step + self._reqs_to_load: dict[ReqId, TransferSpec] = {} + # request blocks are stored in order + # index of next block (of size offloaded_block_size) to offload + self._next_stored_block_idx: dict[ReqId, int] = {} + + # request ID -> set(block hashes being stored/load) + self._reqs_being_stored = defaultdict[ReqId, set[BlockHash]](set) + self._reqs_being_loaded = defaultdict[ReqId, set[BlockHash]](set) + + def _get_block_hashes( + self, + req: Request, + start_idx: int = 0, + end_idx: Optional[int] = None, + ) -> Iterable[BlockHash]: + return islice( + req.block_hashes, + self.block_size_factor * start_idx + self.block_size_factor - 1, + self.block_size_factor * end_idx if end_idx else None, + self.block_size_factor) + + def get_num_new_matched_tokens( + self, request: Request, + num_computed_tokens: int) -> tuple[int, bool]: + """ + Get number of new tokens that can be loaded beyond the + num_computed_tokens. + + Args: + request (Request): the request object. 
+ num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + A tuple with the following elements: + - The number of tokens that can be loaded beyond what is + already computed. + - `True` if tokens will be loaded asynchronously + (between scheduler steps). + """ + num_blocks = request.num_tokens // self.offloaded_block_size + + assert (len(request.block_hashes) // + self.block_size_factor == num_blocks) + block_hashes = self._get_block_hashes(request) + + self.manager.touch(block_hashes) + + full_block_tokens = self.offloaded_block_size * num_blocks + if full_block_tokens - num_computed_tokens < self.offloaded_block_size: + # we can load less than a block, skip + return 0, False + + start_block_idx = num_computed_tokens // self.offloaded_block_size + hits = self.manager.lookup( + self._get_block_hashes(request, start_idx=start_block_idx)) + if hits == 0: + return 0, False + + num_hit_tokens = (self.offloaded_block_size * + (start_block_idx + hits) - num_computed_tokens) + logger.debug( + "Request %s hit %s offloaded tokens after %s GPU hit tokens", + request.request_id, + num_hit_tokens, + num_computed_tokens, + ) + if num_hit_tokens < self.offloaded_block_size: + return 0, False + + return num_hit_tokens, True + + def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks, + num_external_tokens: int): + self._requests[request.request_id] = request + # the block ids are updated in _get_reqs_to_store + self._request_block_ids[request.request_id] = [] + + if num_external_tokens == 0: + return + + block_groups = blocks.get_block_ids() + block_ids = block_groups[0] + + num_computed_gpu_blocks = sum(block.block_hash is not None + for block in blocks.blocks[0]) + num_computed_tokens = num_computed_gpu_blocks * self.gpu_block_size + full_block_tokens = num_computed_tokens + num_external_tokens + assert full_block_tokens % self.offloaded_block_size == 0 + + num_pending_gpu_blocks = len(block_ids) - num_computed_gpu_blocks + assert (num_external_tokens == num_pending_gpu_blocks * + self.gpu_block_size) + + start_block_idx = num_computed_tokens // self.offloaded_block_size + num_blocks = full_block_tokens // self.offloaded_block_size + + assert (len(request.block_hashes) // self.block_size_factor + >= num_blocks) + block_hashes = self._get_block_hashes(request, + start_idx=start_block_idx, + end_idx=num_blocks) + + src_spec = self.manager.prepare_load(block_hashes) + dst_spec = GPULoadStoreSpec(block_ids[num_computed_gpu_blocks:]) + + block_hashes = self._get_block_hashes(request, + start_idx=start_block_idx, + end_idx=num_blocks) + + self._reqs_to_load[request.request_id] = (src_spec, dst_spec) + self._reqs_being_loaded[request.request_id].update(block_hashes) + self._next_stored_block_idx[request.request_id] = num_blocks + + def _get_reqs_to_store(self, scheduler_output: SchedulerOutput): + reqs_to_store: dict[ReqId, TransferSpec] = {} + # iterate over both new and cached requests + for req_id, new_block_id_groups, preempted in yield_req_data( + scheduler_output): + + if preempted: + self._request_block_ids[req_id] = [] + + if new_block_id_groups: + new_block_ids = new_block_id_groups[0] + self._request_block_ids[req_id] += new_block_ids + + block_ids = self._request_block_ids[req_id] + + req = self._requests[req_id] + new_tokens = scheduler_output.num_scheduled_tokens[req_id] + total_tokens = req.num_computed_tokens + new_tokens + num_blocks = total_tokens // self.offloaded_block_size + start_block_idx = 
self._next_stored_block_idx.get(req_id, 0) + num_new_blocks = num_blocks - start_block_idx + + if num_new_blocks <= 0: + continue + + num_gpu_blocks = num_blocks * self.block_size_factor + assert len(req.block_hashes) >= num_gpu_blocks + + new_block_hashes = self._get_block_hashes( + req, start_idx=start_block_idx, end_idx=num_blocks) + store_output = self.manager.prepare_store(new_block_hashes) + if store_output is None: + logger.warning("Cannot store %s blocks", num_new_blocks) + break + + self._next_stored_block_idx[req_id] = num_blocks + + if not store_output.block_hashes_to_store: + continue + block_hashes_to_store = set(store_output.block_hashes_to_store) + + block_hashes = self._get_block_hashes(req, end_idx=num_blocks) + self.manager.touch(block_hashes) + + new_block_hashes = self._get_block_hashes( + req, start_idx=start_block_idx, end_idx=num_blocks) + dst_spec = store_output.store_spec + src_block_ids: list[int] = [] + for idx, blk_hash in enumerate(new_block_hashes): + if blk_hash not in block_hashes_to_store: + continue + offloaded_block_idx = start_block_idx + idx + gpu_block_idx = offloaded_block_idx * self.block_size_factor + for i in range(self.block_size_factor): + src_block_ids.append(block_ids[gpu_block_idx + i]) + src_spec = GPULoadStoreSpec(src_block_ids) + + reqs_to_store[req_id] = (src_spec, dst_spec) + self._reqs_being_stored[req_id] |= block_hashes_to_store + + logger.debug( + "Request %s offloading %s blocks starting from block #%d", + req_id, + len(block_hashes_to_store), + start_block_idx, + ) + + return reqs_to_store + + def build_connector_meta( + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: + meta = OffloadingConnectorMetadata( + reqs_to_load=self._reqs_to_load, + reqs_to_store=self._get_reqs_to_store(scheduler_output)) + self._reqs_to_load = {} + return meta + + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. + """ + for req_id in connector_output.finished_sending or []: + block_hashes = self._reqs_being_stored.pop(req_id, None) + if block_hashes: + self.manager.complete_store(block_hashes) + + for req_id in connector_output.finished_recving or []: + block_hashes = self._reqs_being_loaded.pop(req_id, None) + if block_hashes: + self.manager.complete_load(block_hashes) + + def request_finished( + self, + request: Request, + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + """ + Called when a request has finished, before its blocks are freed. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. + """ + req_id = request.request_id + self._requests.pop(req_id, None) + self._request_block_ids.pop(req_id, None) + self._next_stored_block_idx.pop(req_id, None) + + request_being_stored = req_id in self._reqs_being_stored + return request_being_stored, None + + def take_events(self) -> Iterable[KVCacheEvent]: + """Take the KV cache events from the connector. + + Returns: + A list of KV cache events. 
+ """ + for event in self.manager.take_events(): + if event.removed: + yield BlockRemoved(block_hashes=event.block_hashes, + medium=event.medium) + else: + yield BlockStored(block_hashes=event.block_hashes, + parent_block_hash=None, + token_ids=[], + lora_id=None, + block_size=event.block_size, + medium=event.medium) + + +class OffloadingConnectorWorker: + """Implementation of Worker side methods""" + + def __init__(self, spec: OffloadingSpec): + self.spec = spec + self.worker = OffloadingWorker() + + self._job_counter = 0 + + # req_id -> (job_id, store) + self._jobs: dict[int, tuple[ReqId, bool]] = {} + # req_id -> active job IDs + self._load_job: dict[ReqId, int] = {} + # req_id -> set(active job IDs) + self._store_jobs = defaultdict[ReqId, set[int]](set) + + self._finished_reqs_waiting_for_store: set[ReqId] = set() + + def _generate_job_id(self) -> int: + job_id = self._job_counter + self._job_counter = job_id + 1 + return job_id + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + for src_cls, dst_cls, handler in (self.spec.get_handlers(kv_caches)): + self.worker.register_handler(src_cls, dst_cls, handler) + + def start_load_kv(self, metadata: OffloadingConnectorMetadata): + for req_id, transfer_spec in metadata.reqs_to_load.items(): + job_id = self._generate_job_id() + self._jobs[job_id] = (req_id, False) + assert req_id not in self._load_job + self._load_job[req_id] = job_id + assert self.worker.transfer_async(job_id, transfer_spec) + + def start_store_kv(self, metadata: OffloadingConnectorMetadata): + for req_id, transfer_spec in metadata.reqs_to_store.items(): + job_id = self._generate_job_id() + self._jobs[job_id] = (req_id, True) + self._store_jobs[req_id].add(job_id) + assert self.worker.transfer_async(job_id, transfer_spec) + + def get_finished(self, + finished_req_ids: set[str]) -> tuple[set[str], set[str]]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + Returns a list of request IDs that finished loading or storing. + + Returns: + ids of requests that have finished asynchronous transfer + tuple of (sending/saving ids, recving/loading ids). 
+ """ + finished_sending = set() + finished_recving = set() + for job_id, success in self.worker.get_finished(): + # we currently do not support job failures + assert success + req_id, store = self._jobs.pop(job_id) + if store: + req_jobs = self._store_jobs[req_id] + req_jobs.remove(job_id) + if req_jobs: + continue + + if req_id in self._finished_reqs_waiting_for_store: + self._finished_reqs_waiting_for_store.remove(req_id) + finished_sending.add(req_id) + del self._store_jobs[req_id] + else: + req_job = self._load_job[req_id] + assert job_id == req_job + del self._load_job[req_id] + finished_recving.add(req_id) + + for req_id in finished_req_ids: + pending_req_jobs = self._store_jobs.get(req_id) + if pending_req_jobs: + self._finished_reqs_waiting_for_store.add(req_id) + elif pending_req_jobs is not None: + finished_sending.add(req_id) + del self._store_jobs[req_id] + + return finished_sending, finished_recving + + +def yield_req_data( + scheduler_output) -> Iterator[tuple[str, tuple[list[int], ...], bool]]: + """ + Yields: + (req_id, new_block_id_groups, preempted) + """ + # new requests + for req_data in scheduler_output.scheduled_new_reqs: + yield req_data.req_id, req_data.block_ids, False + + # cached requests + cached_reqs = scheduler_output.scheduled_cached_reqs + yield from zip(cached_reqs.req_ids, cached_reqs.new_block_ids, + cached_reqs.resumed_from_preemption) diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py new file mode 100644 index 0000000000000..6365ab4a6db75 --- /dev/null +++ b/vllm/v1/kv_offload/factory.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +from typing import TYPE_CHECKING, Callable + +from vllm.logger import init_logger +from vllm.v1.kv_offload.spec import OffloadingSpec + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + + +class OffloadingSpecFactory: + _registry: dict[str, Callable[[], type[OffloadingSpec]]] = {} + + @classmethod + def register_spec(cls, name: str, module_path: str, + class_name: str) -> None: + """Register a spec with a lazy-loading module and class name.""" + if name in cls._registry: + raise ValueError(f"Connector '{name}' is already registered.") + + def loader() -> type[OffloadingSpec]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + cls._registry[name] = loader + + @classmethod + def create_spec( + cls, + config: "VllmConfig", + ) -> OffloadingSpec: + kv_transfer_config = config.kv_transfer_config + assert kv_transfer_config is not None + extra_config = kv_transfer_config.kv_connector_extra_config + spec_name = extra_config.get("spec_name", "CPUOffloadingSpec") + if spec_name in cls._registry: + spec_cls = cls._registry[spec_name]() + else: + spec_module_path = extra_config.get("spec_module_path") + if spec_module_path is None: + raise ValueError(f"Unsupported spec type: {spec_name}") + spec_module = importlib.import_module(spec_module_path) + spec_cls = getattr(spec_module, spec_name) + assert issubclass(spec_cls, OffloadingSpec) + logger.info("Creating offloading spec with name: %s", spec_name) + return spec_cls(config) + + +# Register various specs here. 
diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py new file mode 100644 index 0000000000000..ed23d5e51934f --- /dev/null +++ b/vllm/v1/kv_offload/spec.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from collections.abc import Iterator +from typing import TYPE_CHECKING + +import torch + +from vllm.logger import init_logger +from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager +from vllm.v1.kv_offload.worker.worker import OffloadingHandler + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + + +class OffloadingSpec(ABC): + """Spec for an offloading connector""" + + def __init__(self, vllm_config: "VllmConfig"): + logger.warning( + "Initializing OffloadingSpec. This API is experimental and " + "subject to change in the future as we iterate the design.") + self.vllm_config = vllm_config + + kv_transfer_config = vllm_config.kv_transfer_config + assert kv_transfer_config is not None + self.extra_config = kv_transfer_config.kv_connector_extra_config + + self.gpu_block_size = vllm_config.cache_config.block_size + self.offloaded_block_size = int( + self.extra_config.get("block_size", self.gpu_block_size)) + + assert self.offloaded_block_size % self.gpu_block_size == 0 + + @abstractmethod + def get_manager(self) -> OffloadingManager: + """ + Get an OffloadingManager that will be used + by the scheduler-side offloading connector to track + offloaded blocks and manage evictions. + """ + pass + + @abstractmethod + def get_handlers( + self, kv_caches: dict[str, torch.Tensor] + ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], + OffloadingHandler]]: + """ + Get offloading handlers along with their respective src and dst types. + + Args: + kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor. + + Yields: + Tuples of (src_type, dst_type, offloading_handler). 
+ """ + pass From a2a5f79e09075f50943750a2376b3b5d95a47ac5 Mon Sep 17 00:00:00 2001 From: qizixi <22851944+zixi-qi@users.noreply.github.com> Date: Fri, 19 Sep 2025 12:07:26 -0700 Subject: [PATCH 027/185] Optimize triton unified attention performance for sliding window attention (#24390) Signed-off-by: zixi-qi --- .../test_triton_unified_attention.py | 2 +- .../attention/ops/triton_unified_attention.py | 26 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index ab91560e995c8..5cff29b15aa3f 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -83,7 +83,7 @@ def ref_paged_attn( @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("sliding_window", [None, 64, 128, 256]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 591b68bfa6468..9e7cafc174287 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -184,8 +184,30 @@ def kernel_unified_attention_2d( # this prefix can be skipped) num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) - # iterate through tiles - for j in range(0, num_tiles): + # ---- Sliding-window tile pruning -------------------- + # Default: keep previous global behavior + tile_start = 0 + tile_end = num_tiles + if SLIDING_WINDOW > 0: + # Query rows covered by this Q-block + qpos_lo = q_block_local_idx * BLOCK_Q + qpos_hi = tl.minimum( + qpos_lo + (BLOCK_M - 1) // num_queries_per_kv, + cur_batch_query_len - 1, + ) + # For sliding window, each query position q can only attend to + # keys in the range [q_abs - SLIDING_WINDOW + 1, q_abs] + # where q_abs = context_len + q + # The union of allowed key positions for this Q-block is: + # [context_len + qpos_lo - SLIDING_WINDOW + 1, context_len + qpos_hi] + first_allowed_key = context_len + qpos_lo - SLIDING_WINDOW + 1 + last_allowed_key = context_len + qpos_hi + # Convert to tile indices and clamp + tile_start = tl.maximum(0, first_allowed_key // TILE_SIZE) + tile_end = tl.minimum((last_allowed_key // TILE_SIZE) + 1, num_tiles) + + # iterate through tiles (now limited to the sliding window range) + for j in range(tile_start, tile_end): seq_offset = j * TILE_SIZE + offs_t tile_mask = seq_offset < max_seq_prefix_len From 7852b82b93d189a5374d403f9735d6849ba4c64e Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 19 Sep 2025 15:14:09 -0400 Subject: [PATCH 028/185] [Bugfix] GPT OSS Attritbute error on H100 (#25228) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/model_executor/layers/fused_moe/config.py | 4 ++-- vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 742df3dbdc6af..b14bc06e913cf 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -1,7 +1,7 @@ # 
SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import torch @@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.utils import cdiv, has_triton_kernels from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe -if TYPE_CHECKING and has_triton_kernels: +if has_triton_kernels(): from triton_kernels.matmul_ogs import PrecisionConfig logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 28c1e60ccd08a..5c3f8a891276b 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -638,8 +638,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): return None if self.mxfp4_backend == Mxfp4Backend.TRITON: - w1_scale = layer.w13_precision_config - w2_scale = layer.w2_precision_config + w1_scale = self.w13_precision_config + w2_scale = self.w2_precision_config else: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale From 4bdf40021821dfb085e2cbe2f4dc0b7ad7b2e635 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:42:01 -0400 Subject: [PATCH 029/185] [Bugfix] Fix chunked a2_scales in modular kernels (#25264) Signed-off-by: Bill Nell --- .../model_executor/layers/fused_moe/batched_deep_gemm_moe.py | 1 + .../layers/fused_moe/batched_triton_or_deep_gemm_moe.py | 3 ++- vllm/model_executor/layers/fused_moe/cutlass_moe.py | 4 +++- vllm/model_executor/layers/fused_moe/deep_gemm_moe.py | 3 ++- .../layers/fused_moe/flashinfer_cutlass_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_batched_moe.py | 4 +++- vllm/model_executor/layers/fused_moe/fused_moe.py | 3 ++- .../layers/fused_moe/gpt_oss_triton_kernels_moe.py | 1 + vllm/model_executor/layers/fused_moe/modular_kernel.py | 5 +++++ vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py | 2 ++ vllm/model_executor/layers/fused_moe/trtllm_moe.py | 1 + 11 files changed, 23 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index e9dfb22bea27b..cf0b965cc8c51 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -286,6 +286,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py index 8b9070f098898..c3c4f4a5d190e 100644 --- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py @@ -126,6 +126,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], 
@@ -136,5 +137,5 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): assert experts is not None experts.apply(output, hidden_states, w1, w2, topk_weights, topk_ids, activation, global_num_experts, expert_map, a1q_scale, - workspace13, workspace2, expert_tokens_meta, + a2_scale, workspace13, workspace2, expert_tokens_meta, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 957ffca0d1246..8c2ff580575f5 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -241,6 +241,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -262,7 +263,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, global_num_experts, expert_map, self.w1_scale, self.w2_scale, - a1q_scale, self.a2_scale, self.ab_strides1, self.ab_strides2, + a1q_scale, a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, self.c_strides2, workspace13, workspace2, expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, @@ -705,6 +706,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], # unused + a2_scale: Optional[torch.Tensor], # unused workspace13: Optional[torch.Tensor], workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 8830b95df7cf0..51a4f275e98cc 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -214,13 +214,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): assert a1q_scale is not None - assert self.a2_scale is None + assert a2_scale is None assert self.block_shape is not None assert self.w1_scale is not None assert self.w2_scale is not None diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 6eeec18a6ec87..a074da883088e 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -129,6 +129,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: Optional[torch.Tensor], workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index fe6ac458a9593..660bae3146026 100644 --- 
a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -688,6 +688,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -879,6 +880,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -970,7 +972,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): intermediate_cache1.view(-1, N)) qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input( - intermediate_cache2, self.a2_scale, max_num_tokens, E, N, + intermediate_cache2, a2_scale, max_num_tokens, E, N, expert_num_tokens, self.quant_dtype, self.per_act_token_quant, self.block_shape) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index d4de3f640865e..6c2a5bda7cbaa 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1598,6 +1598,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -1690,7 +1691,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): a2q_scale: Optional[torch.Tensor] = None qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( - intermediate_cache2, self.a2_scale, self.quant_dtype, + intermediate_cache2, a2_scale, self.quant_dtype, self.per_act_token_quant, self.block_shape) invoke_fused_moe_kernel( diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 614a83ad1158c..08a9b34a42457 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -179,6 +179,7 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 729f8e39cf0f7..a16c254fadf66 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -519,6 +519,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[ExpertTokensMetadata], @@ -634,6 +635,7 @@ class FusedMoEModularKernel(torch.nn.Module): local_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], 
expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, ) -> torch.Tensor: @@ -671,6 +673,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts=global_num_experts, expert_map=expert_map, a1q_scale=a1q_scale, + a2_scale=a2_scale, workspace13=workspace13, workspace2=workspace2, expert_tokens_meta=expert_tokens_meta, @@ -718,6 +721,7 @@ class FusedMoEModularKernel(torch.nn.Module): local_num_experts=local_num_experts, expert_map=expert_map, a1q_scale=a1q_scale, + a2_scale=self.fused_experts.a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -803,6 +807,7 @@ class FusedMoEModularKernel(torch.nn.Module): local_num_experts=local_num_experts, expert_map=expert_map, a1q_scale=c_a1q_scale, + a2_scale=c_a2_scale, expert_tokens_meta=c_expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index b2dbc306a6148..3de80ff85747e 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -111,6 +111,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -134,6 +135,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts, expert_map, a1q_scale, + a2_scale, workspace13, workspace2, expert_tokens_meta, diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 8e5f6acc9df63..05ed93c942c8e 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -103,6 +103,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], From e57fc15971dd3c3aff9b2bdcb43741b5f34c329e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 20:43:33 +0100 Subject: [PATCH 030/185] Specify platform in `pip-compile` `pre-commit` hook so it runs on MacOS (#25273) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- requirements/test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 13ad3af97d839..a4ea888af3f3e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,7 +49,7 @@ repos: rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/requirements/test.txt b/requirements/test.txt index 39040f210b2fd..3519aa524f418 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This 
file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28 absl-py==2.1.0 # via rouge-score accelerate==1.0.1 From 48ecb4438b2845f757edf228c5455ca6095938af Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 19 Sep 2025 16:06:49 -0400 Subject: [PATCH 031/185] [Perf] Use FlashInfer RoPE for RotaryEmbedding.forward_cuda when available (#21126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: Luka Govedič --- .../layers/rotary_embedding/base.py | 38 +++++++++++---- .../layers/rotary_embedding/common.py | 46 +++++++++++++++++++ .../rotary_embedding/deepseek_scaling_rope.py | 4 +- .../rotary_embedding/llama4_vision_rope.py | 2 +- .../layers/rotary_embedding/mrope.py | 2 + 5 files changed, 78 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index db50eb08db3ff..3dc249ae9adb9 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -6,6 +6,8 @@ from typing import Optional import torch from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer from .common import apply_rotary_emb_torch @@ -30,9 +32,17 @@ class RotaryEmbedding(CustomOp): self.base = base self.is_neox_style = is_neox_style self.dtype = dtype + # Flashinfer only supports head_size=64, 128, 256, 512. 
+ # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202 + self.use_flashinfer = (self.enabled() + and dtype in (torch.float16, torch.bfloat16) + and current_platform.is_cuda() + and has_flashinfer() + and self.head_size in [64, 128, 256, 512]) cache = self._compute_cos_sin_cache() - cache = cache.to(dtype) + if not self.use_flashinfer: + cache = cache.to(dtype) self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) @@ -57,6 +67,14 @@ class RotaryEmbedding(CustomOp): cache = torch.cat((cos, sin), dim=-1) return cache + def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None: + # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) + # is expensive, so avoid calling it if possible + if self.cos_sin_cache.device != query.device or \ + self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + def forward_native( self, positions: torch.Tensor, @@ -94,15 +112,16 @@ class RotaryEmbedding(CustomOp): query: torch.Tensor, key: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.use_flashinfer: + torch.ops.vllm.flashinfer_rotary_embedding(positions, query, key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style) + return query, key + from vllm import _custom_ops as ops - # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) - # is expensive, so avoid calling it if possible - if self.cos_sin_cache.device != query.device or \ - self.cos_sin_cache.dtype != query.dtype: - self.cos_sin_cache = self.cos_sin_cache.to(query.device, - dtype=query.dtype) - + self._match_cos_sin_cache_dtype(query) # ops.rotary_embedding() is an in-place operation # that updates the query and key tensors. ops.rotary_embedding(positions, query, key, self.head_size, @@ -117,8 +136,7 @@ class RotaryEmbedding(CustomOp): ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: from vllm._ipex_ops import ipex_ops as ops - self.cos_sin_cache = self.cos_sin_cache.to(positions.device, - dtype=query.dtype) + self._match_cos_sin_cache_dtype(query) # ops.rotary_embedding() is an in-place operation # that updates the query and key tensors. if key is None: diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 8d821bea19e3e..e3cd0a8e788eb 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -6,6 +6,7 @@ import math import torch from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op if current_platform.is_cuda(): from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb @@ -103,3 +104,48 @@ def yarn_get_mscale(scale: float = 1) -> float: if scale <= 1: return 1.0 return 0.1 * math.log(scale) + 1.0 + + +def _flashinfer_rotary_embedding( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + """Custom op wrapper for flashinfer's rotary embedding. + + This is an in-place operation that modifies query and key tensors directly. 
+ """ + from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace + + apply_rope_with_cos_sin_cache_inplace( + positions=positions, + query=query, + key=key, + head_size=head_size, + cos_sin_cache=cos_sin_cache, + is_neox=is_neox, + ) + + +def _flashinfer_rotary_embedding_fake( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + return + + +# Register flashinfer rotary embedding custom op +direct_register_custom_op( + op_name="flashinfer_rotary_embedding", + op_func=_flashinfer_rotary_embedding, + mutates_args=["query", "key"], # These tensors are modified in-place + fake_impl=_flashinfer_rotary_embedding_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index 7ac2e4bb6c34f..736ec2c1dd3a3 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -97,15 +97,13 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" assert key is not None + self._match_cos_sin_cache_dtype(query) query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] if self.rotary_dim < self.head_size: query_pass = query[..., self.rotary_dim:] key_pass = key[..., self.rotary_dim:] - if self.cos_sin_cache.device != positions.device: - self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to( - positions.device) cos_sin = self.cos_sin_cache[torch.add(positions, offsets) if offsets is not None else positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py index 37ead43e22bc4..8717280353068 100644 --- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -59,7 +59,7 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): key: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert key is not None - self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device) + self._match_cos_sin_cache_dtype(query) query_ = torch.view_as_complex(query.float().reshape( *query.shape[:-1], -1, 2)) key_ = torch.view_as_complex(key.float().reshape( diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index ccc59bbbe233f..17d04a1ad715c 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -245,6 +245,7 @@ class MRotaryEmbedding(RotaryEmbedding): assert positions.ndim == 1 or positions.ndim == 2 assert key is not None + self._match_cos_sin_cache_dtype(query) num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) @@ -293,6 +294,7 @@ class MRotaryEmbedding(RotaryEmbedding): assert positions.ndim == 1 or positions.ndim == 2 assert key is not None + self._match_cos_sin_cache_dtype(query) num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) From b1a63d1b3be996babec6411e3abe559796f76ca9 Mon Sep 17 00:00:00 2001 From: nvjullin Date: Sat, 20 Sep 2025 04:36:34 
+0800 Subject: [PATCH 032/185] [BugFix] Make FlashInferMetadataBuilder non-blocking (#25040) Signed-off-by: Julien Lin Co-authored-by: Michael Goin --- vllm/v1/attention/backends/flashinfer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index dda6dd4fbea7a..cb092aa74e7f1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -585,9 +585,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): kv_data_type=self.kv_cache_dtype, ) else: - attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) + attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to( + self.device, non_blocking=True) attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( - self.device) + self.device, non_blocking=True) if num_decodes > 0: pure_decode = num_prefills == 0 From ddc9048394ae6294d0db7fd67270efea59c3a065 Mon Sep 17 00:00:00 2001 From: David-Wen <18927700430@163.com> Date: Sat, 20 Sep 2025 04:44:24 +0800 Subject: [PATCH 033/185] Fix: Correct FusedMoE layer reference in auto_round quantization (#24818) Signed-off-by: David-Wen <18927700430@163.com> Signed-off-by: Michael Goin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/auto_round.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index 1ca92273430dd..bf5141fa48943 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -241,7 +241,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return AWQMoEMethod(quant_args_marlin, layer.moe) + return AWQMoEMethod(quant_args_marlin, layer.moe_config) from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -327,7 +327,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config) else: from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) From e69e0b8b5fc5ef2958f7b1fc159119e9c4c0e2d2 Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:40:16 -0400 Subject: [PATCH 034/185] [Frontend] Responses API messages out, just harmony for now (#24985) Signed-off-by: Alec Solder Co-authored-by: Alec Solder Co-authored-by: Ye (Charlotte) Qi --- .../openai/test_response_api_with_harmony.py | 15 +++++++++++++++ vllm/entrypoints/openai/protocol.py | 17 ++++++++++++++++- vllm/entrypoints/openai/serving_responses.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 40a22c04b08a5..f3c3148577b85 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -744,3 +744,18 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str): assert response_2 is not None assert response_2.status == "completed" 
    assert response_2.output_text is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_output_messages_enabled(client: OpenAI, model_name: str,
+                                        server):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the capital of South Korea?",
+        extra_body={"enable_response_messages": True})
+
+    assert response is not None
+    assert response.status == "completed"
+    assert len(response.input_messages) > 0
+    assert len(response.output_messages) > 0
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 05d5d6d964dd3..c30681318f693 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -328,6 +328,13 @@ class ResponsesRequest(OpenAIBaseModel):
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
             "to 256 bit). Not supported by vLLM engine V0."))
+
+    enable_response_messages: bool = Field(
+        default=False,
+        description=(
+            "Whether to return messages as part of the "
+            "response object. Currently supported only for non-streaming, "
+            "non-background requests with gpt-oss."))
     # --8<-- [end:responses-extra-params]

     _DEFAULT_SAMPLING_PARAMS = {
@@ -1831,6 +1838,11 @@ class ResponsesResponse(OpenAIBaseModel):
     model: str
     object: Literal["response"] = "response"
     output: list[ResponseOutputItem]
+    # These are populated when enable_response_messages is set to True.
+    # TODO: there is currently an issue where the content of harmony
+    # messages is not available when these are serialized; only metadata is.
+    input_messages: Optional[list[ChatCompletionMessageParam]] = None
+    output_messages: Optional[list[ChatCompletionMessageParam]] = None
     parallel_tool_calls: bool
     temperature: float
     tool_choice: ToolChoice
@@ -1860,6 +1872,8 @@ class ResponsesResponse(OpenAIBaseModel):
         output: list[ResponseOutputItem],
         status: ResponseStatus,
         usage: Optional[ResponseUsage] = None,
+        input_messages: Optional[list[ChatCompletionMessageParam]] = None,
+        output_messages: Optional[list[ChatCompletionMessageParam]] = None,
     ) -> "ResponsesResponse":

         incomplete_details: Optional[IncompleteDetails] = None
@@ -1868,7 +1882,6 @@ class ResponsesResponse(OpenAIBaseModel):
         # TODO: implement the other reason for incomplete_details,
         # which is content_filter
         # incomplete_details = IncompleteDetails(reason='content_filter')
-
         return cls(
             id=request.request_id,
             created_at=created_time,
@@ -1877,6 +1890,8 @@ class ResponsesResponse(OpenAIBaseModel):
             metadata=request.metadata,
             model=model_name,
             output=output,
+            input_messages=input_messages,
+            output_messages=output_messages,
             parallel_tool_calls=request.parallel_tool_calls,
             temperature=sampling_params.temperature,
             tool_choice=request.tool_choice,
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 4894623aeac28..6e243671af242 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -475,9 +475,14 @@ class OpenAIServingResponses(OpenAIServing):
         # "completed" is implemented as the "catch-all" for now.
status: ResponseStatus = "completed" + input_messages = None + output_messages = None if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) + if request.enable_response_messages: + input_messages = context.messages[:context.num_init_messages] + output_messages = context.messages[context.num_init_messages:] num_tool_output_tokens = context.num_tool_output_tokens if len(output) > 0: if context.finish_reason == "length": @@ -496,6 +501,12 @@ class OpenAIServingResponses(OpenAIServing): output = self._make_response_output_items(request, final_output, tokenizer) + # TODO: context for non-gptoss models doesn't use messages + # so we can't get them out yet + if request.enable_response_messages: + raise NotImplementedError( + "enable_response_messages is currently" + " only supported for gpt-oss") # Calculate usage. assert final_res.prompt_token_ids is not None num_tool_output_tokens = 0 @@ -519,6 +530,8 @@ class OpenAIServingResponses(OpenAIServing): response = ResponsesResponse.from_request( request, sampling_params, + input_messages=input_messages, + output_messages=output_messages, model_name=model_name, created_time=created_time, output=output, From 711e912946d23f4ccc1f554b1524c960553c5e28 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:23:19 -0400 Subject: [PATCH 035/185] [Compile] Fix Compile Warning for Ignoring `MIN_BLOCK_PER_SM` (#25193) Signed-off-by: yewentao256 --- csrc/launch_bounds_utils.h | 38 +++++++++++++++++++ .../activation_nvfp4_quant_fusion_kernels.cu | 6 ++- csrc/quantization/fp4/nvfp4_experts_quant.cu | 10 +++-- csrc/quantization/fp4/nvfp4_quant_kernels.cu | 8 ++-- 4 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 csrc/launch_bounds_utils.h diff --git a/csrc/launch_bounds_utils.h b/csrc/launch_bounds_utils.h new file mode 100644 index 0000000000000..d5a89690111bc --- /dev/null +++ b/csrc/launch_bounds_utils.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +// maximum blocks per SM cap +#ifndef VLLM_LAUNCH_BLOCKS_CAP + #define VLLM_LAUNCH_BLOCKS_CAP 4 +#endif + +// compile-time estimate of max threads per SM for launch bounds. +#ifndef VLLM_MAX_THREADS_PER_SM + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + #define VLLM_MAX_THREADS_PER_SM 1536 + #else + #define VLLM_MAX_THREADS_PER_SM 2048 + #endif +#endif + +// compute the number of blocks per SM to request in __launch_bounds__ +#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL)) +#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \ + (((VAL) <= 0) \ + ? 1 \ + : (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP)) +#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \ + VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS)) + +// runtime-time helper to compute blocks/SM +static inline int vllm_runtime_blocks_per_sm(int block_threads) { + int device = -1; + cudaGetDevice(&device); + int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM; + cudaDeviceGetAttribute(&max_threads_per_sm, + cudaDevAttrMaxThreadsPerMultiProcessor, device); + int blocks = (block_threads > 0) ? 
(max_threads_per_sm / block_threads) : 1; + return VLLM_CLAMP_BLOCKS_PER_SM(blocks); +} diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 74fde23782ce5..7539f836ecf37 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -26,6 +26,7 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { @@ -63,7 +64,7 @@ __inline__ __device__ PackedVec compute_silu_mul(PackedVec& vec, // Use UE4M3 by default. template -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { @@ -131,7 +132,8 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); - int const numBlocksPerSM = 2048 / block.x; + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES( diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index ce3ba2c19b9eb..6d385e0dd94e7 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -26,12 +26,13 @@ #include "dispatch_utils.h" #include "nvfp4_utils.cuh" +#include "launch_bounds_utils.h" namespace vllm { // Use UE4M3 by default. template -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -129,7 +130,7 @@ __global__ void __launch_bounds__(512, 4) // Kernel for LARGE_M_TOPK = true (large m_topk optimized version) template -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -233,8 +234,9 @@ void quant_impl(void* output, void* output_scale, void* input, int const workSizePerRow = k / ELTS_PER_THREAD; int const totalWorkSize = m_topk * workSizePerRow; dim3 block(std::min(workSizePerRow, 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). - int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(static_cast((totalWorkSize + block.x - 1) / block.x), multiProcessorCount * numBlocksPerSM)); while (grid.x <= multiProcessorCount && block.x > 64) { diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 0c1b9ef0664d7..5575ee8e4197e 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -26,13 +26,14 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { // Use UE4M3 by default. 
template -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { using PackedVec = PackedVec; @@ -75,8 +76,9 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, // Grid, Block size. // Each thread converts 8 values. dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). - int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); // Launch the cvt kernel. From 431535b522c62f1422848e22dfd83bec2d04111a Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Fri, 19 Sep 2025 15:40:33 -0700 Subject: [PATCH 036/185] Enable modelopt gemma3 nvfp4/fp8, make workflow more robust (#22771) Signed-off-by: Zhiyu Cheng Signed-off-by: Michael Goin Co-authored-by: Michael Goin --- .../moe/test_modular_kernel_combinations.py | 3 +- vllm/compilation/backends.py | 7 ++- vllm/config/model.py | 3 ++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 4 +- .../layers/quantization/modelopt.py | 53 +++++++++++++------ vllm/model_executor/models/gemma3.py | 16 ++++++ vllm/model_executor/models/siglip.py | 18 ++++++- 7 files changed, 82 insertions(+), 22 deletions(-) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 19c4301bd23d5..1c7e62d7aa4c8 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -11,7 +11,8 @@ import pytest import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.config import VllmConfig, current_platform, set_current_vllm_config +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 3cc0fc3106f5a..d6bdb31a3c630 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -31,8 +31,11 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: - if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( - "2.8.0.dev"): + # Use standalone compile only if requested, version is new enough, + # and the symbol actually exists in this PyTorch build. 
+ if (envs.VLLM_USE_STANDALONE_COMPILE + and is_torch_equal_or_newer("2.8.0.dev") + and hasattr(torch._inductor, "standalone_compile")): logger.debug("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: diff --git a/vllm/config/model.py b/vllm/config/model.py index 21457d3660a23..4e847922b61e6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -964,6 +964,9 @@ class ModelConfig: "modelopt", "modelopt_fp4", "petit_nvfp4", + # Ensure heavy backends are probed last to avoid unnecessary + # imports during override detection (e.g., MXFP4 imports Triton) + "mxfp4", ] quantization_methods = [ q for q in supported_quantization if q not in overrides diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 08a9b34a42457..f12d3807517ff 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -20,10 +20,10 @@ if has_triton_kernels(): from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, matmul_ogs) from triton_kernels.routing import routing - except ModuleNotFoundError: + except (ModuleNotFoundError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " - "version is compatible.") + "version is compatible. Error: %s", e) def triton_kernel_moe_forward( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7eac40825ac33..1083f398a3a20 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -160,6 +160,7 @@ class ModelOptFp8Config(QuantizationConfig): def is_layer_excluded(self, prefix: str) -> bool: """ Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and substring matching. This method handles both regular models and multimodal models that use the language_model prefix. 
For multimodal models, it checks if the @@ -168,11 +169,18 @@ class ModelOptFp8Config(QuantizationConfig): if self.exclude_modules is None: return False - # Check if any excluded module matches the prefix + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Then check substring matching for patterns not caught by exact match for module in self.exclude_modules: - if (module in prefix - or (prefix.startswith("language_model.") - and module in prefix.removeprefix("language_model."))): + # Skip exact matches already handled above + if (module != prefix and + (module in prefix or + (prefix.startswith("language_model.") + and module in prefix.removeprefix("language_model.")))): return True return False @@ -180,9 +188,10 @@ class ModelOptFp8Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): @@ -778,22 +787,34 @@ class ModelOptNvFp4Config(QuantizationConfig): return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) - def is_layer_excluded(self, prefix: str, - exclude_modules: list[str]) -> bool: + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and pattern matching. + """ + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Check regex pattern matching for patterns not caught by exact match import regex as re - for pattern in exclude_modules: - regex_str = pattern.replace('.', r'\.').replace('*', r'.*') - if re.fullmatch(regex_str, prefix): - return True + for pattern in self.exclude_modules: + # Skip patterns that would be caught by exact matching + if '*' in pattern or '.' 
in pattern: + regex_str = pattern.replace('.', r'\.').replace('*', r'.*') + if re.fullmatch(regex_str, prefix): + return True return False def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix, self.exclude_modules)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptNvFp4LinearMethod(self) elif isinstance(layer, Attention): diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 1263e3049a14a..7246308d59028 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -446,6 +446,22 @@ class Gemma3Model(nn.Module): weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 3630f59f53e0a..eb49d6d2c3350 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -20,7 +20,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs @@ -506,6 +507,21 @@ class SiglipVisionModel(nn.Module): if layer_idx >= layer_count: continue + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From ee7a66dd9a5ead46f062502af33766f45076f05d Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Fri, 19 Sep 2025 
15:59:41 -0700 Subject: [PATCH 037/185] allow disable flashinfer prefill (#25276) Signed-off-by: Lu Fang --- vllm/envs.py | 3 +++ vllm/v1/attention/backends/mla/common.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 19e2f8635275d..294a0b920fb78 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" VLLM_NO_USAGE_STATS: bool = False + VLLM_DISABLE_FLASHINFER_PREFILL: bool = False VLLM_DO_NOT_TRACK: bool = False VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 @@ -479,6 +480,8 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", + "VLLM_DISABLE_FLASHINFER_PREFILL": + lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1", "VLLM_DO_NOT_TRACK": lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( "DO_NOT_TRACK", None) or "0") == "1", diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index a990cb2f1a972..5b307810de930 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -412,7 +412,8 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. - return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL + return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available + and not envs.VLLM_USE_CUDNN_PREFILL and current_platform.is_device_capability(100)) From 14c1432789c9c1b66308481b2c37439d3ee6661a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 16:34:07 -0700 Subject: [PATCH 038/185] [BugFix] Fix async scheduling CPU tensor race take 2 (#25279) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu_model_runner.py | 52 ++++++++++++++++++------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d0f26266f0c5..3539f75612050 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1903,7 +1903,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } - elif (self.enable_prompt_embeds and get_pp_group().is_first_rank): + elif self.enable_prompt_embeds and get_pp_group().is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. # TODO(qthequartermasterman): Since even when prompt embeds are @@ -2125,6 +2125,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): invalid_req_indices, ) + @contextmanager + def synchronize_input_prep(self): + if self.prepare_inputs_event is None: + yield + return + + # Ensure prior step has finished with reused CPU tensors. + # This is required in the async scheduling case because + # the CPU->GPU transfer happens async. 
+ self.prepare_inputs_event.synchronize() + try: + yield + finally: + self.prepare_inputs_event.record() + @torch.inference_mode() def execute_model( self, @@ -2132,33 +2147,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: with record_function_or_nullcontext("Preprocess"): - self._update_states(scheduler_output) - if not scheduler_output.total_num_scheduled_tokens: - if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if there's no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, - self.vllm_config) - if self.cache_config.kv_sharing_fast_prefill: - assert not self.input_batch.num_prompt_logprobs, ( - "--kv-sharing-fast-prefill produces incorrect logprobs for " - "prompt tokens, tokens, please disable it when the requests" - " need prompt logprobs") + with self.synchronize_input_prep(): + # Update persistent batch states. + self._update_states(scheduler_output) + + if not scheduler_output.total_num_scheduled_tokens: + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward( + scheduler_output, self.vllm_config) + if self.cache_config.kv_sharing_fast_prefill: + assert not self.input_batch.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect " + "logprobs for prompt tokens, tokens, please disable " + "it when the requests need prompt logprobs") - if self.prepare_inputs_event is not None: - # Ensure prior step has finished with reused CPU tensors. - self.prepare_inputs_event.synchronize() - try: # Prepare the decoder inputs. 
(attn_metadata, logits_indices, spec_decode_metadata, num_scheduled_tokens_np, spec_decode_common_attn_metadata, max_query_len, ubatch_slices, num_tokens_after_padding ) = self._prepare_inputs(scheduler_output) - finally: - if self.prepare_inputs_event is not None: - self.prepare_inputs_event.record() - ( num_scheduled_tokens, num_input_tokens, From 3da17c2cc2c2e1d750020e033535f942f156f64c Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Fri, 19 Sep 2025 17:27:21 -0700 Subject: [PATCH 039/185] [Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090) Signed-off-by: Lucas Kabela --- tests/compile/test_basic_correctness.py | 16 ++-------------- tests/compile/test_full_graph.py | 4 +--- vllm/compilation/wrapper.py | 10 ++++------ vllm/envs.py | 5 ----- vllm/v1/worker/gpu_model_runner.py | 4 +--- vllm/worker/model_runner.py | 8 +++----- 6 files changed, 11 insertions(+), 36 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index fd2b1866e62e1..a1e5127ebeeb2 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -20,7 +20,6 @@ class TestSetting: tp_size: int attn_backend: str method: str - fullgraph: bool # we cannot afford testing the full Cartesian product @@ -36,7 +35,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # llama model with quantization TestSetting( @@ -46,7 +44,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # MoE model TestSetting( @@ -56,7 +53,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # embedding model TestSetting( @@ -73,7 +69,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), TestSetting( model="BAAI/bge-base-en-v1.5", @@ -82,7 +77,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), # vision language model TestSetting( @@ -92,7 +86,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate_with_image", - fullgraph=False, ), ], ) @@ -109,9 +102,8 @@ def test_compile_correctness( tp_size = test_setting.tp_size attn_backend = test_setting.attn_backend method = test_setting.method - fullgraph = test_setting.fullgraph - if cuda_device_count_stateless() != pp_size * tp_size: - pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " + if cuda_device_count_stateless() < pp_size * tp_size: + pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got " f"{cuda_device_count_stateless()}") with monkeypatch.context() as m: @@ -149,9 +141,5 @@ def test_compile_correctness( ]: all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 84178344a5f36..3439a1b29038d 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -79,9 +79,7 @@ def test_full_graph( ): model, model_kwargs = model_info - with monkeypatch.context() as m: - # make sure these models can be captured in full graph mode - m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + with monkeypatch.context(): print(f"MODEL={model}") 
run_model(optimization_level, model, model_kwargs) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 96d4eae2ee9aa..930e4d27b410f 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -10,7 +10,6 @@ from typing import Callable, Optional import torch -import vllm.envs as envs from vllm.config import (CompilationLevel, CUDAGraphMode, get_current_vllm_config) from vllm.logger import init_logger @@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher: options = get_current_vllm_config( ).compilation_config.inductor_compile_config - compiled_callable = torch.compile( - self.forward, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend, - options=options) + compiled_callable = torch.compile(self.forward, + fullgraph=True, + backend=backend, + options=options) self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ diff --git a/vllm/envs.py b/vllm/envs.py index 294a0b920fb78..3991a789d80f6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), - # Internal flag to enable Dynamo fullgraph capture - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": - lambda: bool( - os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is # enabled by default. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3539f75612050..dffadd1d769b7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model.compile( - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model.compile(fullgraph=True, backend=backend) return # for other compilation levels, cudagraph behavior is controlled by # CudagraphWraper and CudagraphDispatcher of vllm. 
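For readers skimming this patch, the change to both model runners reduces to the sketch below. It is illustrative only and not part of the diff: compile_model is a hypothetical helper, and "inductor" stands in for whatever backend vLLM's compilation config actually resolves.

    # Illustrative sketch, not part of the patch: with the
    # VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE gate removed, the Dynamo-as-is path
    # always requests full-graph capture.
    import torch
    import torch.nn as nn

    def compile_model(model: nn.Module, backend: str = "inductor") -> nn.Module:
        # Before this patch: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
        # After this patch:  fullgraph=True, so a graph break raises instead of
        # silently falling back to partial capture.
        return torch.compile(model, fullgraph=True, backend=backend)
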
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c91c871766cff..f662f5a85eff6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,7 +18,6 @@ import torch.distributed import torch.nn as nn from tqdm.auto import tqdm -import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState @@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model = torch.compile( - self.model, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model = torch.compile(self.model, + fullgraph=True, + backend=backend) def get_model(self) -> nn.Module: return self.model From a36c675817867235d368faf7e8d81e0ed3333d9c Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Fri, 19 Sep 2025 21:33:25 -0300 Subject: [PATCH 040/185] Don't skip special tokens with hermes-style tool calling (#25281) Signed-off-by: Max de Bayser --- .../openai/tool_parsers/hermes_tool_parser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index a6ce33af6bd00..e74c420da1d3c 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -98,6 +98,15 @@ class Hermes2ProToolParser(ToolParser): else: return delta_text + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + if request.tools and request.tool_choice != 'none': + # do not skip special tokens because the tool_call tokens are + # marked "special" in some models. Since they are skipped + # prior to the call to the tool parser, it breaks tool calling. 
+ request.skip_special_tokens = False + return request + def extract_tool_calls( self, model_output: str, From c7e713616a53a097809609d5a7b536e8bfad4ab8 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Fri, 19 Sep 2025 19:33:40 -0500 Subject: [PATCH 041/185] test: Remove vestigial skip for prompt embeds tests after landing v1 Prompt Embeds support (#25291) Signed-off-by: Andrew Sansom --- tests/entrypoints/openai/test_completion_with_prompt_embeds.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 0e3fc82f0c033..176c1825530e4 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,9 +14,6 @@ from transformers import AutoConfig from ...utils import RemoteOpenAIServer -pytest.skip("Skipping prompt_embeds test until V1 supports it.", - allow_module_level=True) - # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" From b8a287a0a8a035073d67b5101687e3a753dd02ac Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Fri, 19 Sep 2025 19:46:23 -0500 Subject: [PATCH 042/185] [docs] Prompt Embedding feature support (#25288) Signed-off-by: Andrew Sansom --- docs/features/README.md | 34 ++++++++++++++++++---------------- docs/features/prompt_embeds.md | 3 --- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/docs/features/README.md b/docs/features/README.md index d8e26ec02aecc..10cc448cc2ee3 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -36,22 +36,23 @@ th:not(:first-child) { } -| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | -| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | -| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | -| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | -| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | -| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | -| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | -| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | [prompt-embeds](prompt_embeds.md) | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | +| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | +| 
[LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | +| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | +| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | | +| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | | +| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | | +| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | | +| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | | +| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | | +| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ | \* Chunked prefill and prefix caching are only applicable to last-token pooling. ^ LoRA is only applicable to the language backbone of multimodal models. @@ -76,3 +77,4 @@ th:not(:first-child) { | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) | diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 83993bd0140fa..f9d3c1fb6c23d 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -6,9 +6,6 @@ This page teaches you how to pass prompt embedding inputs to vLLM. The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary. -!!! note - Prompt embeddings are currently only supported in the v0 engine. 
- ## Offline Inference To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: From 8945b001db3202f882108e50d16b6f9c5e6f01ed Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 19 Sep 2025 18:02:15 -0700 Subject: [PATCH 043/185] [torch.compile] CUDAGraph Inductor partition integration (#24281) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Boyuan Feng Signed-off-by: Boyuan Feng Signed-off-by: boyuanfeng Co-authored-by: Luka Govedič --- tests/compile/piecewise/test_simple.py | 71 ++++++++++++++++++---- tests/compile/silly_attention.py | 1 + tests/compile/test_full_graph.py | 59 +++++++++++++++++- tests/compile/test_fusion_attn.py | 16 ++++- vllm/attention/layer.py | 2 + vllm/compilation/backends.py | 10 ++- vllm/compilation/decorators.py | 57 ++++++++++++++++- vllm/config/compilation.py | 84 ++++++++++++++++++++++---- vllm/v1/cudagraph_dispatcher.py | 12 ++-- 9 files changed, 280 insertions(+), 32 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 84f4945c82725..41055f431569c 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -15,6 +15,7 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from ..silly_attention import get_global_counter, reset_global_counter @@ -50,16 +51,21 @@ class SillyModel(nn.Module): return x -@pytest.mark.parametrize("use_inductor", [True, False]) -@torch.inference_mode() -def test_simple_piecewise_compile(use_inductor): - assert VLLM_USE_V1 - +def _run_simple_model( + splitting_ops, + use_inductor_graph_partition, + use_inductor, + expected_num_piecewise_graphs_seen, + expected_num_piecewise_capturable_graphs_seen, + expected_num_backend_compilations, + expected_num_cudagraph_captured, +): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, use_inductor=use_inductor, - splitting_ops=["silly.attention"], + splitting_ops=splitting_ops, + use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_copy_inputs=True, cudagraph_capture_sizes=[1, 2], )) @@ -70,11 +76,11 @@ def test_simple_piecewise_compile(use_inductor): with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=5, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen= + expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ), set_forward_context(None, vllm_config=vllm_config): # background context # warm up with background context @@ -104,3 +110,46 @@ def test_simple_piecewise_compile(use_inductor): output = model(input) assert get_global_counter() == 2 assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) + + +@pytest.mark.parametrize("use_inductor", [True, False]) +@torch.inference_mode() +def 
test_simple_piecewise_compile(use_inductor): + assert VLLM_USE_V1 + _run_simple_model( + splitting_ops=["silly.attention"], + use_inductor_graph_partition=False, + use_inductor=use_inductor, + expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + expected_num_backend_compilations= + 3, # num_piecewise_capturable_graphs_seen + expected_num_cudagraph_captured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ) + + +@torch.inference_mode() +@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) +def test_simple_inductor_graph_partition(splitting_ops): + assert VLLM_USE_V1 + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + _run_simple_model( + # inductor graph partition automatically resets splitting_ops + # to be an empty list + splitting_ops=splitting_ops, + use_inductor_graph_partition=True, + use_inductor=True, + expected_num_piecewise_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_piecewise_capturable_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_backend_compilations= + 1, # since not splitting at fx graph level + expected_num_cudagraph_captured= + 6, # inductor graph partition still captures 6 + # graph, same as fx graph partition. + ) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index 13eb0bf4b1fa1..baedafbae99f1 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -60,4 +60,5 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 3439a1b29038d..870aa553ca628 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -3,6 +3,7 @@ from __future__ import annotations +import logging import tempfile from typing import Any, Optional, Union @@ -10,9 +11,13 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported +from tests.v1.attention.utils import _Backend from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel, PassConfig +from vllm.attention.selector import global_force_attn_backend_context_manager +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + PassConfig) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from ..utils import create_new_process_for_each_test @@ -105,6 +110,18 @@ def test_full_graph( (CompilationConfig(level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()), ("facebook/opt-125m", {})), + ] + [ + # graph inductor partition + ( + CompilationConfig( + level=CompilationLevel.PIECEWISE, + # inductor graph partition uses + # torch._C.Tag.cudagraph_unsafe to specify splitting ops + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + compile_sizes=[1, 2]), + model) for model in models_list(all=False) + if is_torch_equal_or_newer("2.9.0.dev") ]) # only test some of the models @create_new_process_for_each_test() @@ -112,11 +129,51 @@ def test_custom_compile_config( compilation_config: CompilationConfig, model_info: tuple[str, dict[str, Any]], ): + if (compilation_config.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + pytest.skip("inductor graph 
partition is only available " + "in PyTorch 2.9+") + model, model_kwargs = model_info print(f"MODEL={model}") run_model(compilation_config, model, model_kwargs) +def test_inductor_graph_partition_attn_fusion(caplog_vllm): + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + custom_ops=["+quant_fp8"], + pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True), + ) + model_kwargs = { + "kv_cache_dtype": "fp8", + "max_model_len": 1024, + } + with caplog_vllm.at_level( + logging.DEBUG), global_force_attn_backend_context_manager( + _Backend.FLASHINFER): + run_model(compilation_config, model, model_kwargs) + + try: + assert ("Fused quantization onto 48 attention nodes" + in caplog_vllm.text), caplog_vllm.text + except AssertionError: + # Note: this message is only triggered when the compilation goes + # through the custom pass. Due to multiple layers of cache on + # PyTorch side, the compilation of a graph may be cached such + # that custom pass directly goes through cache. In this case, + # we go through this branch and assert that the pass is not + # triggered. + assert "Fused quantization" not in caplog_vllm.text + + def run_model(compile_config: Union[int, CompilationConfig], model: str, model_kwargs: dict[str, Any]): prompts = [ diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 6baf4bf83f499..022f183b31932 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from vllm.v1.kv_cache_interface import AttentionSpec FP8_DTYPE = current_platform.fp8_dtype() @@ -339,6 +340,10 @@ else: @pytest.mark.parametrize( "split_attention", [False, True] if current_platform.is_rocm() else [False]) +# TODO(boyuan): test inductor graph partition on rocm +@pytest.mark.parametrize( + "use_inductor_graph_partition", + [False] if current_platform.is_rocm() else [False, True]) @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA") @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") @@ -352,9 +357,15 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, dtype: torch.dtype, model_name: str, model_class: type[AttentionQuantPatternModel], backend: _Backend, split_attention: bool, - monkeypatch, dist_init): + use_inductor_graph_partition: bool, + monkeypatch, dist_init, caplog_vllm): """Test AttentionStaticQuantPattern fusion pass""" + if use_inductor_graph_partition and not is_torch_equal_or_newer( + "2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + monkeypatch.setenv("VLLM_USE_V1", "1") if split_attention: monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1") @@ -372,6 +383,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, custom_ops=["+quant_fp8"], + use_inductor_graph_partition=use_inductor_graph_partition, ), cache_config=CacheConfig(cache_dtype="fp8")) @@ 
-444,6 +456,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, backend=test_backend, fullgraph=True) assert model_compiled.attn._o_scale_float is None + result_fused_1 = model_compiled(q, k, v) if backend == _Backend.FLASHINFER: @@ -453,6 +466,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, # _o_scale_float assert model_compiled.attn._o_scale_float is not None result_fused_2 = model_compiled(q, k, v) + assert model_compiled.attn._o_scale_float is not None torch.testing.assert_close(result_unfused, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8d5ebd93e063d..3d1269c0ecea8 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -577,6 +577,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) @@ -627,4 +628,5 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index d6bdb31a3c630..17fc727b8fc70 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -329,6 +329,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] global compilation_start_time + compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( submod, @@ -339,7 +340,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): num_graphs=len(self.compile_submod_names), runtime_shape=None) # Lazy import here to avoid circular import - from .cuda_graph import CUDAGraphOptions from .cuda_piecewise_backend import PiecewiseBackend piecewise_backend = PiecewiseBackend( @@ -347,7 +347,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_dynamic_shape, self.vllm_backend) - if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and + not self.compilation_config.use_inductor_graph_partition): + # We're using Dynamo-based piecewise splitting, so we wrap + # the whole subgraph with a static graph wrapper. + from .cuda_graph import CUDAGraphOptions + # resolve the static graph wrapper class (e.g. CUDAGraphWrapper # class) as platform dependent. 
static_graph_wrapper_class = resolve_obj_by_qualname( diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 41d9fcb824b01..b7a6e23c1aa79 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib import inspect from typing import Callable, Optional, TypeVar, Union, overload from unittest.mock import patch @@ -14,7 +15,7 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -from vllm.utils import supports_dynamo +from vllm.utils import resolve_obj_by_qualname, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -301,8 +302,11 @@ def _support_torch_compile( with patch.object(InliningInstructionTranslator, 'inline_call', patched_inline_call), torch._dynamo.config.patch( - **dynamo_config_patches): + **dynamo_config_patches + ), maybe_use_cudagraph_partition_wrapper( + self.vllm_config): output = self.compiled_callable(*args, **kwargs) + return output # usually, capturing the model once is enough, and then we can @@ -314,3 +318,52 @@ def _support_torch_compile( cls.__call__ = __call__ return cls + + +@contextlib.contextmanager +def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): + """ + Context manager to set/unset customized cudagraph partition wrappers. + + If we're using Inductor-based graph partitioning, we currently have the + whole `fx.Graph` before Inductor lowering and and the piecewise + splitting happens after all graph passes and fusions. Here, we add + a custom hook for Inductor to wrap each partition with our static + graph wrapper class to maintain more control over static graph + capture and replay. + """ + from vllm.config import CUDAGraphMode + + compilation_config = vllm_config.compilation_config + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from vllm.compilation.cuda_graph import CUDAGraphOptions + from vllm.platforms import current_platform + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + def customized_cudagraph_wrapper(f, + metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + )) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) + + yield + + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + torch._inductor.utils.set_customized_partition_wrappers(None) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 3618f472e742d..22b38daf46c39 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -299,6 +299,26 @@ class CompilationConfig: minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead. 
""" + use_inductor_graph_partition: bool = False + """Use inductor graph partition to split the graph at cudagraph_unsafe ops. + This partition happens at inductor codegen time after all passes and fusions + are finished. It generates a single `call` function which wraps + cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops + outside the partition functions. For a graph with N cudagraph-unsafe ops + (e.g., Attention), there would be N+1 partitions. To mark an op as + cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when + register the custom op. + + This config supports both full cudagraph and piecewise cudagraph without + compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper + to each partition. For N+1 partitions, there would be N+1 + CUDAGraph wrapper instances. + + For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the + inductor `call` function in the model runner. The top-level full cudagraph + capture ignores all partitioning. + """ + pass_config: PassConfig = field(default_factory=PassConfig) """Custom inductor passes, see PassConfig for more details""" @@ -461,6 +481,12 @@ class CompilationConfig: "since full_cuda_graph is deprecated.") self.cudagraph_mode = CUDAGraphMode.FULL + if (self.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + raise ValueError("use_inductor_graph_partition is only " + "supported with torch>=2.9.0.dev. Set " + "use_inductor_graph_partition=False instead.") + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -540,19 +566,36 @@ class CompilationConfig: "set_splitting_ops_for_v1 should only be called when " "level is CompilationLevel.PIECEWISE") + use_inductor_graph_partition_msg = ( + "When use_inductor_graph_partition=True, splitting_ops " + "are ignored and set to an empty list. Instead, " + "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is " + "used to annotate custom ops for graph partition.") + if self.splitting_ops is None: - # NOTE: When using full cudagraph, instead of setting an empty - # list and capture the full cudagraph inside the flattened fx - # graph, we keep the piecewise fx graph structure but capture the - # full cudagraph outside the fx graph. This reduces some cpu - # overhead when the runtime batch_size is not cudagraph captured. - # see https://github.com/vllm-project/vllm/pull/20059 for details. - # make a copy to avoid mutating the class-level list via reference. - self.splitting_ops = list(self._attention_ops) + if self.use_inductor_graph_partition: + # When using inductor graph partition, we set splitting_ops + # to be empty and rely on torch._C.Tag.cudagraph_unsafe to + # annotate custom ops as splitting ops. + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] + else: + # NOTE: When using full cudagraph, instead of setting an empty + # list and capture the full cudagraph inside the flattened fx + # graph, we keep the piecewise fx graph structure but capture + # the full cudagraph outside the fx graph. This reduces some + # cpu overhead when the runtime batch_size is not cudagraph + # captured. see https://github.com/vllm-project/vllm/pull/20059 + # for details. make a copy to avoid mutating the class-level + # list via reference. 
+ self.splitting_ops = list(self._attention_ops) elif len(self.splitting_ops) == 0: - logger.warning_once("Using piecewise compilation with empty " - "splitting_ops.") - if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + logger.warning_once( + "Using piecewise compilation with empty " + "splitting_ops and use_inductor_graph_partition" + f"={self.use_inductor_graph_partition}.") + if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE + and not self.use_inductor_graph_partition): logger.warning_once( "When compilation level is piecewise with empty " "splitting_ops, PIECEWISE cudagraph_mode will be " @@ -562,7 +605,26 @@ class CompilationConfig: "any problems.") self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] + elif self.use_inductor_graph_partition: + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) + + def is_attention_compiled_piecewise(self) -> bool: + use_fx_graph_piecewise_compilation = ( + self.level == CompilationLevel.PIECEWISE + and self.splitting_ops_contain_attention()) + + inductor_used = (self.level == CompilationLevel.PIECEWISE + and self.use_inductor) or ( + self.level >= CompilationLevel.DYNAMO_AS_IS + and self.backend == "inductor") + use_inductor_piecewise_compilation = ( + inductor_used and self.use_inductor_graph_partition + and not self.splitting_ops_contain_attention()) + + return use_fx_graph_piecewise_compilation or \ + use_inductor_piecewise_compilation diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index d2db7dcb3f091..ea4fba8eeea6d 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional -from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.forward_context import BatchDescriptor from vllm.logger import init_logger @@ -39,11 +39,15 @@ class CudagraphDispatcher: CUDAGraphMode.FULL: set(), } - assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - (self.compilation_config.level == CompilationLevel.PIECEWISE and - self.compilation_config.splitting_ops_contain_attention()), \ + not_use_piecewise_compilation = ( + not self.cudagraph_mode.requires_piecewise_compilation()) + + assert not_use_piecewise_compilation or \ + self.compilation_config.is_attention_compiled_piecewise(), \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ + "and attention should be in splitting_ops or "\ + "inductor splitting should be used. 
" \ f"cudagraph_mode={self.cudagraph_mode}, "\ f"compilation_level={self.compilation_config.level}, "\ f"splitting_ops={self.compilation_config.splitting_ops}" From a25ade5d473fc00107bd3950141d8211331d3377 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 18:06:34 -0700 Subject: [PATCH 044/185] [BugFix] Ensure appropriate guards in destructors (#25284) Signed-off-by: Nick Hill Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/compilation/collective_fusion.py | 2 +- .../kv_transfer/kv_connector/v1/nixl_connector.py | 9 +++++---- vllm/executor/executor_base.py | 3 --- vllm/v1/worker/gpu_worker.py | 3 ++- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 71274420c3426..0658b59a2e215 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -1183,7 +1183,7 @@ class AllReduceFusionPass(VllmInductorPass): self.end_and_log() def __del__(self): - if self.disabled: + if getattr(self, "disabled", True): return if flashinfer_comm is not None: flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ff62f60e5a42c..d3a08af088c11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -569,9 +569,10 @@ class NixlConnectorWorker: def __del__(self): """Cleanup background threads on destruction.""" - self._handshake_initiation_executor.shutdown(wait=False) - if self._nixl_handshake_listener_t: - self._nixl_handshake_listener_t.join(timeout=0) + if executor := getattr(self, "_handshake_initiation_executor", None): + executor.shutdown(wait=False) + if listener_t := getattr(self, "_nixl_handshake_listener_t", None): + listener_t.join(timeout=0) @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, @@ -1379,4 +1380,4 @@ class NixlKVConnectorStats(KVConnectorStats): # TODO: reduce stats to a single value, calculate latency/throughput return { "num_successful_transfers": self.data["num_successful_transfers"] - } \ No newline at end of file + } diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index d18bef1256af5..42aa8d14a21eb 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -235,9 +235,6 @@ class ExecutorBase(ABC): """Shutdown the executor.""" self.collective_rpc("shutdown") - def __del__(self): - self.shutdown() - async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 6855526583f04..8b1e1bb8f45ca 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -683,7 +683,8 @@ class Worker(WorkerBase): tensorizer_config=tensorizer_config, ) def shutdown(self) -> None: - self.model_runner.ensure_kv_transfer_shutdown() + if runner := getattr(self, "model_runner", None): + runner.ensure_kv_transfer_shutdown() def init_worker_distributed_environment( From 535d80056b72443e68a96c1e4a1049cd9a85587d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 19:02:38 -0700 Subject: [PATCH 045/185] [Misc] Support more collective_rpc return types (#25294) Signed-off-by: Nick Hill --- tests/v1/engine/test_engine_core_client.py | 203 ++++++++++++++++++++- 
vllm/v1/serial_utils.py | 60 ++++-- 2 files changed, 246 insertions(+), 17 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 625a3470e8025..992c4e01386e5 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -8,7 +8,7 @@ import time import uuid from dataclasses import dataclass from threading import Thread -from typing import Optional, Union +from typing import Any, Optional, Union from unittest.mock import MagicMock import pytest @@ -331,6 +331,46 @@ def echo_dc( return [val for _ in range(3)] if return_list else val +# Dummy utility function to test dict serialization with custom types. +def echo_dc_dict( + self, + msg: str, + return_dict: bool = False, +) -> Union[MyDataclass, dict[str, MyDataclass]]: + print(f"echo dc dict util function called: {msg}") + val = None if msg is None else MyDataclass(msg) + # Return dict of dataclasses to verify support for returning dicts + # with custom value types. + if return_dict: + return {"key1": val, "key2": val, "key3": val} + else: + return val + + +# Dummy utility function to test nested structures with custom types. +def echo_dc_nested( + self, + msg: str, + structure_type: str = "list_of_dicts", +) -> Any: + print(f"echo dc nested util function called: {msg}, " + f"structure: {structure_type}") + val = None if msg is None else MyDataclass(msg) + + if structure_type == "list_of_dicts": # noqa + # Return list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + return [{"a": val, "b": val}, {"c": val, "d": val}] + elif structure_type == "dict_of_lists": + # Return dict of lists: {"list1": [val, val], "list2": [val, val]} + return {"list1": [val, val], "list2": [val, val]} + elif structure_type == "deep_nested": + # Return deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + return {"outer": [{"inner": [val, val]}, {"inner": [val]}]} + else: + return val + + @pytest.mark.asyncio(loop_scope="function") async def test_engine_core_client_util_method_custom_return( monkeypatch: pytest.MonkeyPatch): @@ -384,6 +424,167 @@ async def test_engine_core_client_util_method_custom_return( client.shutdown() +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_custom_dict_return( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_dict", echo_dc_dict, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + # Test utility method returning custom / non-native data type. 
+ core_client: AsyncMPClient = client + + # Test single object return + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", False) + assert isinstance(result, + MyDataclass) and result.message == "testarg3" + + # Test dict return with custom value types + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert isinstance(val, + MyDataclass) and val.message == "testarg3" + + # Test returning dict with None values + result = await core_client.call_utility_async( + "echo_dc_dict", None, True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert val is None + + finally: + client.shutdown() + + +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_nested_structures( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_nested", echo_dc_nested, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + core_client: AsyncMPClient = client + + # Test list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + result = await core_client.call_utility_async( + "echo_dc_nested", "nested1", "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for i, item in enumerate(result): + assert isinstance(item, dict) + if i == 0: + assert "a" in item and "b" in item + assert isinstance( + item["a"], + MyDataclass) and item["a"].message == "nested1" + assert isinstance( + item["b"], + MyDataclass) and item["b"].message == "nested1" + else: + assert "c" in item and "d" in item + assert isinstance( + item["c"], + MyDataclass) and item["c"].message == "nested1" + assert isinstance( + item["d"], + MyDataclass) and item["d"].message == "nested1" + + # Test dict of lists: {"list1": [val, val], "list2": [val, val]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested2", "dict_of_lists") + assert isinstance(result, dict) and len(result) == 2 + assert "list1" in result and "list2" in result + for key, lst in result.items(): + assert isinstance(lst, list) and len(lst) == 2 + for item in lst: + assert isinstance( + item, MyDataclass) and item.message == "nested2" + + # Test deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested3", "deep_nested") + assert isinstance(result, dict) and "outer" in result + outer_list = result["outer"] + assert isinstance(outer_list, list) and len(outer_list) == 2 + + # First dict in outer list should have "inner" with 2 items + inner_dict1 = outer_list[0] + assert isinstance(inner_dict1, dict) and "inner" in inner_dict1 + inner_list1 = inner_dict1["inner"] + assert isinstance(inner_list1, list) and 
len(inner_list1) == 2 + for item in inner_list1: + assert isinstance(item, + MyDataclass) and item.message == "nested3" + + # Second dict in outer list should have "inner" with 1 item + inner_dict2 = outer_list[1] + assert isinstance(inner_dict2, dict) and "inner" in inner_dict2 + inner_list2 = inner_dict2["inner"] + assert isinstance(inner_list2, list) and len(inner_list2) == 1 + assert isinstance( + inner_list2[0], + MyDataclass) and inner_list2[0].message == "nested3" + + # Test with None values in nested structures + result = await core_client.call_utility_async( + "echo_dc_nested", None, "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for item in result: + assert isinstance(item, dict) + for val in item.values(): + assert val is None + + finally: + client.shutdown() + + @pytest.mark.parametrize( "multiprocessing_mode,publisher_config", [(True, "tcp"), (False, "inproc")], diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 50c1470c67edc..c812a2ec6427a 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -7,7 +7,7 @@ import pickle from collections.abc import Sequence from inspect import isclass from types import FunctionType -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import cloudpickle import msgspec @@ -59,6 +59,42 @@ def _typestr(val: Any) -> Optional[tuple[str, str]]: return t.__module__, t.__qualname__ +def _encode_type_info_recursive(obj: Any) -> Any: + """Recursively encode type information for nested structures of + lists/dicts.""" + if obj is None: + return None + if type(obj) is list: + return [_encode_type_info_recursive(item) for item in obj] + if type(obj) is dict: + return {k: _encode_type_info_recursive(v) for k, v in obj.items()} + return _typestr(obj) + + +def _decode_type_info_recursive( + type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any], + Any]) -> Any: + """Recursively decode type information for nested structures of + lists/dicts.""" + if type_info is None: + return data + if isinstance(type_info, dict): + assert isinstance(data, dict) + return { + k: _decode_type_info_recursive(type_info[k], data[k], convert_fn) + for k in type_info + } + if isinstance(type_info, list) and ( + # Exclude serialized tensors/numpy arrays. + len(type_info) != 2 or not isinstance(type_info[0], str)): + assert isinstance(data, list) + return [ + _decode_type_info_recursive(ti, d, convert_fn) + for ti, d in zip(type_info, data) + ] + return convert_fn(type_info, data) + + class MsgpackEncoder: """Encoder with custom torch tensor and numpy array serialization. @@ -129,12 +165,10 @@ class MsgpackEncoder: result = obj.result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: return None, result - # Since utility results are not strongly typed, we also encode - # the type (or a list of types in the case it's a list) to - # help with correct msgspec deserialization. - return _typestr(result) if type(result) is not list else [ - _typestr(v) for v in result - ], result + # Since utility results are not strongly typed, we recursively + # encode type information for nested structures of lists/dicts + # to help with correct msgspec deserialization. 
+ return _encode_type_info_recursive(result), result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError(f"Object of type {type(obj)} is not serializable" @@ -288,15 +322,9 @@ class MsgpackDecoder: if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must " "be set to use custom utility result types") - assert isinstance(result_type, list) - if len(result_type) == 2 and isinstance(result_type[0], str): - result = self._convert_result(result_type, result) - else: - assert isinstance(result, list) - result = [ - self._convert_result(rt, r) - for rt, r in zip(result_type, result) - ] + # Use recursive decoding to handle nested structures + result = _decode_type_info_recursive(result_type, result, + self._convert_result) return UtilityResult(result) def _convert_result(self, result_type: Sequence[str], result: Any) -> Any: From c308501cb6a922af8c4183bd65be0094dd73de9a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 20 Sep 2025 04:11:03 +0100 Subject: [PATCH 046/185] Improve weight loading for encoder models in Transformers backend (#25289) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 28 ++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f40a20dee63d7..3bd4d10316ec6 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -702,21 +702,45 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): class TransformersModel(TransformersBase): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ + # Handle BERT-like models + "bert": "model", # Add `model.` prefix for base model checkpoints "": "model.", - # Remove `model.` from places it should not be + # Remove `model.` prefix if it was already there "model.model.": "model.", + # Pooling adapters will be adjacent to `model` + "model.pooler": "pooler", "model.score": "score", + # Classifier adapter's classifier layer is renamed to score + "model.classifier": "score", + }, + orig_to_new_suffix={ + # Replace legacy suffixes used for norms + ".gamma": ".weight", + ".beta": ".bias", }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - # Some encoder models have the position_ids buffer in the checkpoint + # After creating a pooling model, `pooler` will be duplicated. + # The one inside `model` comes from the Transformers modelling code. + # The one after `model` is an adapter from vLLM. + # We want to use the adapter so we nullify the original pooler. + if getattr(self.model, "pooler", None) is not None: + self.skip_prefixes.append("pooler.") + self.model.pooler = torch.nn.Identity() + + # Some encoder models have the position_ids buffer in the checkpoint. # vLLM will always pass position_ids as an argument, so we skip loading # the buffer if it exists self.skip_substrs.append("position_ids") + # Some encoder models have the bias of the final classifier layer + # in the checkpoint. 
vLLM does not use this bias, so we skip loading + # it if it exists + self.skip_substrs.append("score.bias") + def create_attention_instances( self, attn_type: AttentionType = AttentionType.DECODER): # TODO(hmellor): Better way to detect encoder models From 36429096171ff8785645c40c662d859dddedd829 Mon Sep 17 00:00:00 2001 From: JartX Date: Sat, 20 Sep 2025 05:18:13 +0200 Subject: [PATCH 047/185] [BUGFIX] GPTQ quantization compatibility for Qwen3 Next MOE models (AutoGPTQ and AutoRound-GPTQ) (#25268) Signed-off-by: JartX --- vllm/model_executor/models/qwen3_next.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 98749c160ba4d..ce917f92bd2e5 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -148,9 +148,11 @@ class Qwen3NextSparseMoeBlock(nn.Module): def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ - # seems to avoid gate quantization. - # See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4 - if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + # seems to avoid gate quantization while AutoRound does. + if isinstance( + quant_config, + (GPTQConfig, + GPTQMarlinConfig)) and not quant_config.autoround_version: return None return quant_config From b7f186bbb3101e97bb9ad42b7ffb3cdb4bb590fd Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 21:28:31 -0700 Subject: [PATCH 048/185] [BugFix] Exclude self when checking for port collision (#25286) Signed-off-by: Nick Hill --- vllm/utils/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d4013a69e99fe..fd1c0af31269c 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -987,8 +987,10 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]: if sys.platform.startswith("darwin"): return None + our_pid = os.getpid() for conn in psutil.net_connections(): - if conn.laddr.port == port: + if conn.laddr.port == port and (conn.pid is not None + and conn.pid != our_pid): try: return psutil.Process(conn.pid) except psutil.NoSuchProcess: From 6c5f82e5aa87cd73ce03ce10fc44138f75ee1aea Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 19 Sep 2025 23:41:23 -0500 Subject: [PATCH 049/185] [BUG FIX][NON-CUDA]quick fix to avoid call cudagraph_unsafe in attention (#25298) Signed-off-by: Chendi Xue --- vllm/attention/layer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 3d1269c0ecea8..544a720524429 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -29,6 +29,10 @@ from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None +try: + tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, ) +except AttributeError: + tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -577,7 +581,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, - tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) @@ -628,5 +632,5 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, - 
tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) From f91480b2d44c263fb600b5cba5b0e6c7a195f742 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 20 Sep 2025 13:29:54 +0800 Subject: [PATCH 050/185] [Bugfix] fix tool call arguments is empty (#25223) Signed-off-by: chaunceyjiang Co-authored-by: xin.li --- .../test_completion_with_function_calling.py | 60 +++++++++++++++++++ vllm/entrypoints/chat_utils.py | 8 ++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 3649cefa9bf42..4355603fcd70b 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import datetime from typing import Union import openai # use the official client for correctness check @@ -284,3 +285,62 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) for o in output: assert o.id is None or o.id == 'functions.get_current_weather:0' + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("arguments", ["{}", '']) +async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str, + arguments: str): + # Step 1: Define a tool that requires no parameters + tools = [{ + "type": "function", + "function": { + "name": "get_current_time", + "description": + "Get the current date and time. No parameters needed.", + "parameters": { + "type": "object", + "properties": {}, # No parameters + "required": [] # No required fields + } + } + }] + messages = [{"role": "user", "content": "What time is it now?"}] + # Step 2: Send user message and let model decide whether to call the tool + response = await client.chat.completions.create( + model=model_name, + messages=messages, + tools=tools, + tool_choice="auto" # Let model choose automatically + ) + + # Step 3: Check if model wants to call a tool + message = response.choices[0].message + if message.tool_calls: + # Get the first tool call + tool_call = message.tool_calls[0] + tool_name = tool_call.function.name + # Step 4: Execute the tool locally (no parameters) + if tool_name == "get_current_time": + # Test both empty string and "{}" for no-arg tool calls + tool_call.function.arguments = arguments + messages.append(message) + current_time = datetime.datetime.now() + result = current_time.isoformat() + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": result, + }) + # Step 5: Send tool result back to model to continue conversation + final_response = await client.chat.completions.create( + model=model_name, + messages=messages, + ) + # Output final natural language response + assert final_response.choices[0].message.content is not None + + else: + # No tool called — just print model's direct reply + assert message.content is not None diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 00ef39f134653..c2c0ad74ef431 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1450,9 +1450,11 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: and isinstance(message["tool_calls"], list) ): for item in message["tool_calls"]: - item["function"]["arguments"] = json.loads( - 
item["function"]["arguments"] - ) + # if arguments is None or empty string, set to {} + if content := item["function"].get("arguments"): + item["function"]["arguments"] = json.loads(content) + else: + item["function"]["arguments"] = {} def parse_chat_messages( From c60e6137f0bf2034853919b3a9d705d7e06b93cf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 20 Sep 2025 13:30:22 +0800 Subject: [PATCH 051/185] [Optimization] Avoid repeated model architecture conversion for pooling models (#25261) Signed-off-by: DarkLight1337 --- vllm/config/model.py | 22 +++++++++++++++++++++- vllm/model_executor/model_loader/utils.py | 17 ++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 4e847922b61e6..921322bb475c5 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -322,8 +322,28 @@ class ModelConfig: factors.append(self.override_generation_config) factors.append(self.rope_scaling) factors.append(self.rope_theta) + # hf_config can control how the model looks! - factors.append(self.hf_config.to_json_string()) + try: + hf_config_json = self.hf_config.to_json_string(use_diff=False) + except TypeError: + from transformers import PretrainedConfig + + from vllm.utils.jsontree import json_map_leaves + + # Handle nested HF configs with unserializable values gracefully + hf_config_json = json.dumps( + json_map_leaves( + lambda v: v.to_dict() + if isinstance(v, PretrainedConfig) else str(v), + self.hf_config.to_dict(), + ), + indent=2, + sort_keys=True, + ) + "\n" + + factors.append(hf_config_json) + str_factors = str(factors) assert_hashable(str_factors) return hashlib.sha256(str(factors).encode()).hexdigest() diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index bd1773c753a93..e007d431880eb 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -165,7 +165,11 @@ def device_loading_context(module: torch.nn.Module, # New parameters or parameters already on target device are untouched -def get_model_architecture( +_MODEL_ARCH_BY_HASH = dict[str, tuple[type[nn.Module], str]]() +"""Caches the outputs of `_get_model_architecture`.""" + + +def _get_model_architecture( model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) @@ -209,6 +213,17 @@ def get_model_architecture( return model_cls, arch +def get_model_architecture( + model_config: ModelConfig) -> tuple[type[nn.Module], str]: + key = model_config.compute_hash() + if key in _MODEL_ARCH_BY_HASH: + return _MODEL_ARCH_BY_HASH[key] + + model_arch = _get_model_architecture(model_config) + _MODEL_ARCH_BY_HASH[key] = model_arch + return model_arch + + def get_model_cls(model_config: ModelConfig) -> type[nn.Module]: return get_model_architecture(model_config)[0] From 9607d5eb449711b349d4c2bee0a9c94afcc7ed14 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 19 Sep 2025 23:43:59 -0700 Subject: [PATCH 052/185] [Hybrid Allocator] Support full attention with different hidden size (#25101) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 118 +++++++++++++++++++--- vllm/v1/core/kv_cache_utils.py | 144 ++++++++++++++++++++------- vllm/v1/engine/core.py | 16 ++- vllm/v1/kv_cache_interface.py | 70 +++++++++++++ vllm/v1/worker/gpu_model_runner.py | 65 ++++++------ vllm/v1/worker/utils.py | 3 +- 6 files changed, 324 insertions(+), 92 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py 
b/tests/v1/core/test_kv_cache_utils.py index 4bf6bbbfeae28..4cb7ed6ce3824 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -18,12 +18,14 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_utils import ( BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, - get_kv_cache_configs, get_max_concurrency_for_kv_cache_config, - get_request_block_hasher, hash_block_tokens, init_none_hash, - is_kv_cache_type_uniform, make_block_hash_with_group_id) + generate_scheduler_kv_cache_config, get_kv_cache_configs, + get_max_concurrency_for_kv_cache_config, get_request_block_hasher, + hash_block_tokens, init_none_hash, is_kv_cache_spec_uniform, + make_block_hash_with_group_id) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -927,36 +929,36 @@ def test_merge_kv_cache_spec(): assert merged_layer_spec.sliding_window == 1 -def test_is_kv_cache_type_uniform(): +def test_is_kv_cache_spec_uniform(): kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=2), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) @pytest.mark.parametrize( @@ -1286,14 +1288,28 @@ def test_get_kv_cache_config_one_worker(): ], ) - # different hidden size, unimplemented + # different hidden size kv_cache_specs_hybrid = { 'layer_1': new_kv_cache_spec(head_size=128), - 'layer_2': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=64), } - with pytest.raises(NotImplementedError): - get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid], - [mem_per_block_per_layer * 2 * 32])[0] + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 3 * 32])[0] + assert kv_cache_config_hybrid == KVCacheConfig( + num_blocks=32, + kv_cache_tensors=[ + KVCacheTensor(size=mem_per_block_per_layer * 32 * 2, + shared_by=["layer_1"]), + KVCacheTensor(size=mem_per_block_per_layer * 32, + shared_by=["layer_2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer_1", "layer_2"], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs_hybrid)) + ]) # Test num_gpu_blocks_override 
vllm_config.cache_config.num_gpu_blocks_override = 16 @@ -1324,3 +1340,75 @@ def test_get_kv_cache_configs_attention_free(): kv_cache_groups=[], ) ] + + +def test_generate_uniform_type_kv_cache_specs(): + # All layers are full attention, can be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # Full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_sliding_window_spec(sliding_window=1), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # different order of full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_kv_cache_spec(), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # Same-size sliding window, can be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_sliding_window_spec(sliding_window=1, head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # different block sizes, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(block_size=16), + 'layer_2': new_kv_cache_spec(block_size=32), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + +def test_generate_scheduler_kv_cache_config(): + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + kv_cache_configs = [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs)), + ], + ) + ] + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + assert scheduler_kv_cache_config == KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], new_kv_cache_spec()) + ], + ) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bc2ec5e42ea20..3ccd00121f8ed 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """KV-Cache Utilities.""" +import copy import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence @@ -15,7 +16,8 @@ from vllm.utils import GiB_bytes, cdiv, sha256_cbor from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -750,7 +752,7 @@ def create_kv_cache_group_specs( return kv_cache_groups -def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: +def is_kv_cache_spec_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ Whether all layers in the given KVCacheSpec have the same KV cache spec. 
Note that we regard FullAttentionSpec with and without sliding window as @@ -793,6 +795,21 @@ def get_max_concurrency_for_kv_cache_config( return max_concurrency +def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int: + """ + Override the number of kv cache blocks if `num_gpu_blocks_override` is set. + """ + if vllm_config.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = \ + vllm_config.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override + + return num_blocks + + def get_num_blocks(vllm_config: VllmConfig, num_layers: int, available_memory: int, page_size: int) -> int: """ @@ -806,13 +823,7 @@ def get_num_blocks(vllm_config: VllmConfig, num_layers: int, """ num_blocks = int(available_memory // page_size // num_layers) num_blocks = max(num_blocks, 0) - if vllm_config.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = \ - vllm_config.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) - num_blocks = num_gpu_blocks_override + num_blocks = may_override_num_blocks(vllm_config, num_blocks) return num_blocks @@ -825,11 +836,11 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: return page_sizes.pop() -def _get_kv_cache_groups_uniform_type( +def _get_kv_cache_groups_uniform_spec( kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for a model with one type of KV cache. - Divide the available memory equally among all layers. + Generates the KV cache configuration for a model with the same KV cache + spec for all layers. Args: kv_cache_specs: The kv cache spec of each attention layer in the model @@ -842,6 +853,22 @@ def _get_kv_cache_groups_uniform_type( [list(kv_cache_specs.keys())]) +def _get_kv_cache_groups_uniform_type( + spec: UniformTypeKVCacheSpecs) -> list[KVCacheGroupSpec]: + """ + Generates the KV cache configuration for a model with one type of KV cache + but different hidden sizes. All layers are merged into one group. + + Args: + spec: The UniformTypeKVCacheSpecs of the model + + Returns: + The generated KVCacheGroupSpecs + """ + + return [KVCacheGroupSpec(list(spec.kv_cache_specs.keys()), spec)] + + def is_kv_cache_page_size_uniform( kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ @@ -1000,28 +1027,45 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig, ) # Determine how model runners should initialize the KV cache tensors. - # We will have group_size memory pools, each is shared by one layer from - # each group. As layers of different groups have different block table, - # they will use different parts of the shared Tensor. - # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), - # (sw.1, padding) will be: (group_size = 2) - # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 - # full.1, sw.2: share another Tensor with size=available_memory//2 - group_size = max(len(group.layer_names) for group in kv_cache_groups) + if len(kv_cache_groups) == 1 and \ + isinstance(kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs): + # Special case: all layers have the same type of KV cache but with + # different hidden size. Allocate different amount of memory for each + # layer based on its hidden size. 
+ num_blocks = available_memory // kv_cache_groups[ + 0].kv_cache_spec.page_size_bytes + num_blocks = may_override_num_blocks(vllm_config, num_blocks) + per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs + kv_cache_tensors = [ + KVCacheTensor(size=per_layer_specs[layer_name].page_size_bytes * + num_blocks, + shared_by=[layer_name]) + for layer_name in kv_cache_groups[0].layer_names + ] + else: + # General case: + # We will have group_size memory pools, each is shared by one layer from + # each group. As layers of different groups have different block table, + # they will use different parts of the shared Tensor. + # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), + # (sw.1, padding) will be: (group_size = 2) + # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 + # full.1, sw.2: share another Tensor with size=available_memory//2 + group_size = max(len(group.layer_names) for group in kv_cache_groups) - page_size = get_uniform_page_size(kv_cache_specs) - assert group_size > 0, "group_size must be greater than 0" - num_blocks = get_num_blocks(vllm_config, group_size, available_memory, - page_size) - per_memory_pool_size = page_size * num_blocks - kv_cache_tensors = [] - for i in range(group_size): - shared_by = [] - for j in range(len(kv_cache_groups)): - if i < len(kv_cache_groups[j].layer_names): - shared_by.append(kv_cache_groups[j].layer_names[i]) - kv_cache_tensors.append( - KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by)) + page_size = get_uniform_page_size(kv_cache_specs) + assert group_size > 0, "group_size must be greater than 0" + num_blocks = get_num_blocks(vllm_config, group_size, available_memory, + page_size) + kv_cache_tensors = [] + for i in range(group_size): + shared_by = [] + for j in range(len(kv_cache_groups)): + if i < len(kv_cache_groups[j].layer_names): + shared_by.append(kv_cache_groups[j].layer_names[i]) + kv_cache_tensors.append( + KVCacheTensor(size=page_size * num_blocks, + shared_by=shared_by)) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, @@ -1059,7 +1103,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): kv_cache_spec: The kv cache spec of each attention layer in the model """ - if is_kv_cache_type_uniform(kv_cache_spec): + if is_kv_cache_spec_uniform(kv_cache_spec): return logger.warning( @@ -1097,7 +1141,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): attention_chunk_size=spec.attention_chunk_size, ) - if not is_kv_cache_type_uniform(kv_cache_spec): + if not is_kv_cache_spec_uniform(kv_cache_spec): raise ValueError("Hybrid KV cache manager is disabled but failed to " "convert the KV cache specs to one unified type.") @@ -1122,11 +1166,16 @@ def get_kv_cache_groups( # This returns an empty list to allow for the KVCacheManager to handle # attention free models. return [] - elif is_kv_cache_type_uniform(kv_cache_spec): + elif is_kv_cache_spec_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. Allocate the same amount of memory for # each layer. - return _get_kv_cache_groups_uniform_type(kv_cache_spec) + return _get_kv_cache_groups_uniform_spec(kv_cache_spec) + elif uniform_spec := UniformTypeKVCacheSpecs.from_specs(kv_cache_spec): + # All layers need the same number of token slots (e.g., all layers are + # full attention, or all layers are sliding window attention with the + # same window size). Put all layers into one group. 
+ return _get_kv_cache_groups_uniform_type(uniform_spec) elif is_kv_cache_page_size_uniform(kv_cache_spec): # Model contains multiple attention types, but KV cache of all layers # have the same physical memory per block per layer. Split the layers @@ -1137,6 +1186,27 @@ def get_kv_cache_groups( raise NotImplementedError +def generate_scheduler_kv_cache_config( + kv_cache_configs: list[KVCacheConfig]) -> KVCacheConfig: + """ + Generate the KV cache configuration for the scheduler. + """ + assert all([ + cfg.num_blocks == kv_cache_configs[0].num_blocks + for cfg in kv_cache_configs + ]) + # All workers have the same kv_cache_config except layer names, so use + # an arbitrary one to initialize the scheduler. + cfg = copy.deepcopy(kv_cache_configs[0]) + for group in cfg.kv_cache_groups: + if isinstance(group.kv_cache_spec, UniformTypeKVCacheSpecs): + # All layers in the UniformTypeKVCacheSpecs have the same type, + # so use an arbitrary one to initialize the scheduler. + group.kv_cache_spec = next( + iter(group.kv_cache_spec.kv_cache_specs.values())) + return cfg + + def get_kv_cache_configs(vllm_config: VllmConfig, kv_cache_specs: list[dict[str, KVCacheSpec]], available_memory: list[int]) -> list[KVCacheConfig]: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a022e9c0d7058..a43042a5510a8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -29,7 +29,9 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs, +from vllm.v1.core.kv_cache_utils import (BlockHash, + generate_scheduler_kv_cache_config, + get_kv_cache_configs, get_request_block_hasher, init_none_hash) from vllm.v1.core.sched.interface import SchedulerInterface @@ -196,16 +198,10 @@ class EngineCore: kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, available_gpu_memory) - - # All workers have the same kv_cache_config except layer names, so use - # an arbitrary one to initialize the scheduler. - assert all([ - cfg.num_blocks == kv_cache_configs[0].num_blocks - for cfg in kv_cache_configs - ]) - num_gpu_blocks = kv_cache_configs[0].num_blocks + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + num_gpu_blocks = scheduler_kv_cache_config.num_blocks num_cpu_blocks = 0 - scheduler_kv_cache_config = kv_cache_configs[0] # Initialize kv cache and warmup the execution self.model_executor.initialize_from_config(kv_cache_configs) diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 0cf92a680a689..f72cc8f93a6c2 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -234,6 +234,76 @@ class CrossAttentionSpec(AttentionSpec): return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes +@dataclass(frozen=True) +class UniformTypeKVCacheSpecs(KVCacheSpec): + """ + A KV cache spec for multiple layers with the same type of attention. Here, + same types means always need the same number of token slots. For example, + sliding window attentions with different window sizes are not the same type + and should not be merged into one UniformTypeKVCacheSpecs. 
+    """
+    kv_cache_specs: dict[str, KVCacheSpec]
+
+    @property
+    def page_size_bytes(self) -> int:
+        return sum(spec.page_size_bytes
+                   for spec in self.kv_cache_specs.values())
+
+    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
+        max_num_pages = max(
+            cdiv(spec.max_memory_usage_bytes(vllm_config),
+                 spec.page_size_bytes)
+            for spec in self.kv_cache_specs.values())
+        return max_num_pages * self.page_size_bytes
+
+    @classmethod
+    def is_uniform_type(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> bool:
+        """
+        Whether all layers have the same type of KV cache spec.
+        """
+        block_sizes = set(spec.block_size for spec in kv_cache_specs.values())
+        if len(block_sizes) > 1:
+            # Different block sizes, not uniform.
+            return False
+        one_spec = next(iter(kv_cache_specs.values()))
+        if isinstance(one_spec, (FullAttentionSpec, CrossAttentionSpec)):
+            return all(
+                isinstance(spec, type(one_spec))
+                for spec in kv_cache_specs.values())
+        elif isinstance(one_spec, SlidingWindowSpec):
+            return all(
+                isinstance(spec, SlidingWindowSpec)
+                and spec.sliding_window == one_spec.sliding_window
+                for spec in kv_cache_specs.values())
+        elif isinstance(one_spec, ChunkedLocalAttentionSpec):
+            return all(
+                isinstance(spec, ChunkedLocalAttentionSpec)
+                and spec.attention_chunk_size == one_spec.attention_chunk_size
+                for spec in kv_cache_specs.values())
+        elif isinstance(one_spec, MambaSpec):
+            return all(
+                isinstance(spec, MambaSpec) and spec.num_speculative_blocks ==
+                one_spec.num_speculative_blocks
+                for spec in kv_cache_specs.values())
+        else:
+            # NOTE(Chen): Please add new branches for new KV cache spec types.
+            raise NotImplementedError(
+                f"Unsupported KV cache spec type: {type(one_spec)}")
+
+    @classmethod
+    def from_specs(cls, kv_cache_specs: dict[str,
+                                             KVCacheSpec]) -> Optional[Self]:
+        """
+        Return a UniformTypeKVCacheSpecs object if all layers have the same
+        type of KV cache spec. Return None if not.
+ """ + if cls.is_uniform_type(kv_cache_specs): + block_size = next(iter(kv_cache_specs.values())).block_size + return cls(block_size=block_size, kv_cache_specs=kv_cache_specs) + else: + return None + + @dataclass class KVCacheTensor: """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index dffadd1d769b7..233df8f1b0e9b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,7 +8,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager from copy import deepcopy -from typing import TYPE_CHECKING, Any, Optional, Union, cast +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast import numpy as np import torch @@ -74,7 +74,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, EncoderOnlyAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - MambaSpec, SlidingWindowSpec) + MambaSpec, SlidingWindowSpec, + UniformTypeKVCacheSpecs) # yapf: enable from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, LogprobsLists, LogprobsTensors, @@ -1187,7 +1188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, num_common_prefix_blocks, - kv_cache_group_spec.kv_cache_spec, + attn_group.kv_cache_spec, builder, ) @@ -3453,12 +3454,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(self.attn_groups) == 0, \ "Attention backends are already initialized" - def get_attn_backends_for_layers( - layer_names: list[str] - ) -> dict[type[AttentionBackend], list[str]]: - layers = get_layers_from_vllm_config(self.vllm_config, - AttentionLayerBase, - layer_names) + class AttentionGroupKey(NamedTuple): + attn_backend: type[AttentionBackend] + kv_cache_spec: KVCacheSpec + + def get_attn_backends_for_group( + kv_cache_group_spec: KVCacheGroupSpec, + ) -> dict[AttentionGroupKey, list[str]]: + layers = get_layers_from_vllm_config( + self.vllm_config, AttentionLayerBase, + kv_cache_group_spec.layer_names) attn_backends = {} attn_backend_layers = defaultdict(list) # Dedupe based on full class name; this is a bit safer than @@ -3466,7 +3471,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # attention backend subclasses (e.g. ChunkedLocalAttention) unless # they are cached correctly, there will be different objects per # layer. 
- for layer_name in layer_names: + for layer_name in kv_cache_group_spec.layer_names: attn_backend = layers[layer_name].get_attn_backend() if layer_name in self.kv_sharing_fast_prefill_eligible_layers: @@ -3475,8 +3480,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_backend, ) - key = attn_backend.full_cls_name() - attn_backends[key] = attn_backend + full_cls_name = attn_backend.full_cls_name() + layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs): + layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[ + layer_name] + key = (full_cls_name, layer_kv_cache_spec) + attn_backends[key] = AttentionGroupKey(attn_backend, + layer_kv_cache_spec) attn_backend_layers[key].append(layer_name) return { attn_backends[k]: v @@ -3484,11 +3495,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): } def create_attn_groups( - attn_backends_map: dict[AttentionBackend, list[str]], - kv_cache_spec: KVCacheSpec, + attn_backends_map: dict[AttentionGroupKey, list[str]], ) -> list[AttentionGroup]: attn_groups: list[AttentionGroup] = [] - for attn_backend, layer_names in attn_backends_map.items(): + for (attn_backend, + kv_cache_spec), layer_names in attn_backends_map.items(): attn_metadata_builders = [] attn_metadata_builders.append(attn_backend.get_builder_cls()( kv_cache_spec, @@ -3506,16 +3517,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): )) attn_group = AttentionGroup(attn_backend, attn_metadata_builders, - layer_names) + layer_names, kv_cache_spec) attn_groups.append(attn_group) return attn_groups for kv_cache_group_spec in kv_cache_config.kv_cache_groups: - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - attn_backends = get_attn_backends_for_layers( - kv_cache_group_spec.layer_names) - self.attn_groups.append( - create_attn_groups(attn_backends, kv_cache_spec)) + attn_backends = get_attn_backends_for_group(kv_cache_group_spec) + self.attn_groups.append(create_attn_groups(attn_backends)) # Calculate reorder batch threshold (if needed) self.calculate_reorder_batch_threshold() @@ -3680,14 +3688,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _attn_group_iterator(self) -> Iterator[AttentionGroup]: return itertools.chain.from_iterable(self.attn_groups) - def _kv_cache_spec_attn_group_iterator( - self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]: + def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]: if not self.kv_cache_config.kv_cache_groups: return - for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups): - for attn_group in attn_groups: - yield self.kv_cache_config.kv_cache_groups[ - kv_cache_spec_id].kv_cache_spec, attn_group + for attn_groups in self.attn_groups: + yield from attn_groups def _reshape_kv_cache_tensors( self, @@ -3707,7 +3712,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): """ kv_caches: dict[str, torch.Tensor] = {} has_attn, has_mamba = False, False - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec attn_backend = group.backend for layer_name in group.layer_names: if layer_name in self.runner_only_attn_layers: @@ -3787,7 +3793,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_caches: The KV cache buffer of each layer. 
""" - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec for layer_name in group.layer_names: kv_cache = kv_caches[layer_name] if (isinstance(kv_cache_spec, AttentionSpec) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b76ac633892f3..021d18b2500f0 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -15,7 +15,7 @@ from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget -from vllm.v1.kv_cache_interface import KVCacheGroupSpec +from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec if TYPE_CHECKING: from vllm.attention.layer import Attention @@ -132,6 +132,7 @@ class AttentionGroup: backend: type[AttentionBackend] metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] + kv_cache_spec: KVCacheSpec def get_metadata_builder(self, ubatch_id: Optional[int] = None From be874c020196080305baf988ed8c1c82047323be Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Sep 2025 00:04:05 -0700 Subject: [PATCH 053/185] [Bugfix] Fix Qwen3-VL-MoE weight loading for EP (#25300) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_vl_moe.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 625f94cf7ad77..7912cf3ea52b0 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -122,9 +122,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): def load_fused_expert_weights(self, name: str, params_dict: dict, loaded_weight: torch.Tensor, shard_id: str, - num_experts: int): + num_experts: int) -> bool: param = params_dict[name] weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False for expert_id in range(num_experts): curr_expert_weight = loaded_weight[expert_id] success = weight_loader(param, @@ -133,9 +134,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): shard_id, expert_id, return_success=True) - if not success: - return False - return True + if success: + loaded_local_expert = True + + return loaded_local_expert def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -345,4 +347,4 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None self.visual_dim = config.vision_config.out_hidden_size - self.multiscale_dim = self.visual_dim * self.deepstack_num_level \ No newline at end of file + self.multiscale_dim = self.visual_dim * self.deepstack_num_level From 3d9a1d2de5091455bb2fbf6b21fc9188fd4612a4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 20 Sep 2025 15:14:35 +0800 Subject: [PATCH 054/185] [V1] Support `LLM.apply_model` (#18465) Signed-off-by: DarkLight1337 --- tests/conftest.py | 12 +--- tests/kernels/moe/test_mxfp4_moe.py | 37 ++++++---- .../multimodal/generation/test_qwen2_vl.py | 46 ++++++------ tests/models/quantization/test_awq.py | 2 +- tests/quantization/test_compressed_tensors.py | 18 +++-- tests/quantization/test_fp8.py | 8 +-- tests/quantization/test_gptq_dynamic.py | 71 ++++++++++--------- tests/quantization/test_lm_head.py | 4 +- tests/quantization/test_modelopt.py | 10 +-- 
tests/quantization/test_ptpc_fp8.py | 47 +++++++----- tests/quantization/test_quark.py | 26 +++---- .../test_register_quantization_config.py | 17 +++-- vllm/engine/llm_engine.py | 7 +- vllm/entrypoints/llm.py | 9 ++- vllm/executor/executor_base.py | 33 +++++---- vllm/v1/engine/llm_engine.py | 7 +- vllm/worker/worker_base.py | 9 ++- 17 files changed, 194 insertions(+), 169 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3cd93f4ad3289..e8e95357ff5b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -987,17 +987,7 @@ class VllmRunner: return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - if hasattr(self.llm.llm_engine, "model_executor"): - # This works either in V0 or in V1 with - # VLLM_ENABLE_V1_MULTIPROCESSING=0 - executor = self.llm.llm_engine.model_executor - return executor.apply_model(func) - - # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1 - def _apply_model(self): - return func(self.get_model()) - - return self.llm.llm_engine.collective_rpc(_apply_model) + return self.llm.apply_model(func) def get_llm(self) -> LLM: return self.llm diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index a3b8f07638d9a..61d3311cc1624 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -1,21 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib import importlib.metadata from dataclasses import dataclass +from importlib.util import find_spec from typing import Optional import pytest import torch from packaging import version +from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 + QuarkLinearMethod, QuarkW4A4MXFP4) +from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 + QuarkW4A4MXFp4MoEMethod) from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda( ) and current_platform.is_device_capability(100) @@ -39,6 +42,12 @@ class ModelCase: tp: int +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + @pytest.mark.parametrize('model_case', [ ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1), ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8), @@ -55,21 +64,19 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): tensor_parallel_size=model_case.tp, load_format="dummy") as llm: - # TODO: llm.apply_model(check_model) currently relies on V0 internals. - # Re-enable this later. 
- # def check_model(model): - # layer = model.model.layers[0] + def check_model(model): + layer = model.model.layers[0] - # qkv_proj = layer.self_attn.qkv_proj + qkv_proj = layer.self_attn.qkv_proj - # assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - # assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) - # assert isinstance(layer.mlp.experts.quant_method, - # QuarkW4A4MXFp4MoEMethod) + assert isinstance(layer.mlp.experts.quant_method, + QuarkW4A4MXFp4MoEMethod) - # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": - # llm.apply_model(check_model) + if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": + llm.apply_model(check_model) output = llm.generate_greedy("Today I am in the French Alps and", max_tokens=20) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index a81f5e7ec8872..e56f4e4075be4 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -10,6 +10,7 @@ from PIL import Image from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video +from vllm.utils import set_default_torch_num_threads from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) @@ -17,11 +18,9 @@ from ...utils import check_logprobs_close @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - V1 Test: batch_make_xxxxx_embeddings calls a V0 internal - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") models = ["Qwen/Qwen2-VL-2B-Instruct"] @@ -126,9 +125,8 @@ def batch_make_image_embeddings( image_grid_thw_on_device = image_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + grid_thw=image_grid_thw_on_device).cpu() - # V1 Test: this calls a V0 internal. image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches @@ -210,7 +208,7 @@ def batch_make_video_embeddings( video_grid_thw_on_device = video_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + grid_thw=video_grid_thw_on_device).cpu() # V1 Test: this calls a V0 internal. 
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -266,19 +264,22 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) # max_model_len should be greater than image_feature_size - with vllm_runner(model, - runner="generate", - max_model_len=4000, - max_num_seqs=3, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: + with set_default_torch_num_threads(1): + vllm_model = vllm_runner( + model, + runner="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) + with vllm_model: outputs_per_case_for_original_input = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -329,9 +330,8 @@ def run_embedding_input_test( @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, - size_factors, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + size_factors, dtype, max_tokens, + num_logprobs, monkeypatch) -> None: images = [asset.pil_image for asset in image_assets] inputs_per_case: list[tuple[ diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index bd696198931ff..7005e435ecf46 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -112,7 +112,7 @@ def test_awq_models(vllm_runner, image_assets, source_model, quant_model, monkeypatch) -> None: # Test V1: this test hangs during setup on single-scale input. - # TODO: fixure out why and re-enable this on V1. + # TODO: figure out why and re-enable this on V1. monkeypatch.setenv("VLLM_USE_V1", "0") run_awq_test( vllm_runner, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 484f53246f349..b7949a488ad05 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -43,12 +43,9 @@ ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. 
- """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize( @@ -176,10 +173,11 @@ def test_compressed_tensors_w8a8_logprobs( dtype = "bfloat16" - # skip language translation prompt for the static per tensor asym model - if (model_path == - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" - ): # noqa: E501 + # skip language translation prompt for the static per tensor models + if model_path in ( + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + ): example_prompts = example_prompts[0:-1] with hf_runner(model_path, dtype=dtype) as hf_model: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index d781f462b4ad7..db53061cf2d1a 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,8 +60,8 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: def check_model(model): @@ -104,8 +104,8 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index aea50e99c1dd5..00a5946ed0154 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -31,41 +31,46 @@ MODEL_QUANT = [ @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, monkeypatch): - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") - - vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.llm.llm_engine.model_executor. 
- driver_worker.model_runner.model.named_modules()): - if name == "lm_head": - assert isinstance(submodule.quant_method, linear_method_cls) - elif name == 'model.layers.0.self_attn.qkv_proj': - # The first layer is quantized using bits=4, group_size=128 - # desc_act=True - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert config.weight_bits == 4 - assert config.group_size == 128 - assert config.desc_act - elif name == 'model.layers.1.self_attn.qkv_proj': - # The second layer is quantized using bits=8, group_size=32 - # desc_act=False - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert get_dynamic_override(config, layer_name=name, - key="bits") == 8 - assert get_dynamic_override(config, - layer_name=name, - key="group_size") == 32 - assert not get_dynamic_override( - config, layer_name=name, key="desc_act") - elif (name == 'model.layers.2.self_attn.qkv_proj' - or name == 'model.layers.2.mlp.gate_up_proj'): - # All other layers (layer index >= 2) are not quantized - assert isinstance(submodule.quant_method, UnquantizedLinearMethod) + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm: - del vllm_model + def check_model(model): + for name, submodule in model.named_modules(): + if name == "lm_head": + assert isinstance(submodule.quant_method, + linear_method_cls) + elif name == 'model.layers.0.self_attn.qkv_proj': + # The first layer is quantized using bits=4, group_size=128 + # desc_act=True + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert config.weight_bits == 4 + assert config.group_size == 128 + assert config.desc_act + elif name == 'model.layers.1.self_attn.qkv_proj': + # The second layer is quantized using bits=8, group_size=32 + # desc_act=False + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert get_dynamic_override(config, + layer_name=name, + key="bits") == 8 + assert get_dynamic_override(config, + layer_name=name, + key="group_size") == 32 + assert not get_dynamic_override( + config, layer_name=name, key="desc_act") + elif (name == 'model.layers.2.self_attn.qkv_proj' + or name == 'model.layers.2.mlp.gate_up_proj'): + # All other layers (layer index >= 2) are not quantized + assert isinstance(submodule.quant_method, + UnquantizedLinearMethod) + + llm.apply_model(check_model) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index b24964a9d0a9f..e69d4ad349c38 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -29,8 +29,8 @@ def test_lm_head( lm_head_quantized: bool, monkeypatch, ) -> None: - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index c60a03f44baec..e7174be73626a 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -11,16 +11,12 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.skipif(not is_quant_method_supported("modelopt"), diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 5f78bc30504c0..088b68510cffa 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -13,6 +13,16 @@ from vllm.model_executor.layers.quantization.ptpc_fp8 import ( PTPCFp8LinearMethod) from vllm.platforms import current_platform +UNSUPPORTED_STR = ( + "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only " + "support output dtype of bfloat16. torch.float16 is specified.") + + +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + @pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"), reason="PTPC FP8 is not supported on this GPU type.") @@ -21,14 +31,22 @@ from vllm.platforms import current_platform @pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"]) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"]) def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: - try: - with vllm_runner("facebook/opt-125m", - dtype=dtype, - quantization="ptpc_fp8", - kv_cache_dtype=kv_cache_dtype) as llm: + llm = vllm_runner("facebook/opt-125m", + dtype=dtype, + quantization="ptpc_fp8", + kv_cache_dtype=kv_cache_dtype) + except AssertionError as e: + if str(e) == UNSUPPORTED_STR: + # If the error message matches, the test passes + return + else: + # If the error message does not match, re-raise the exception + raise - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + with llm: + + def check_model(model): fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, PTPCFp8LinearMethod) if kv_cache_dtype == "ptpc_fp8": @@ -40,17 +58,8 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: if current_platform.has_device_capability(94): # For GPUs with hardware support, we keep weights in fp8 assert fc1.weight.dtype == torch.float8_e4m3fnuz - else: - pytest.skip() - output = llm.generate_greedy("Hello my name is", max_tokens=20) - assert output - except AssertionError as e: - if str( - e - ) == "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. 
torch.float16 is specified.": # noqa: E501 - # If the error message matches, the test passes - pass - else: - # If the error message does not match, re-raise the exception - raise + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index c09931971e6fb..930f4acb328fd 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -7,10 +7,10 @@ Run `pytest tests/quantization/test_quark.py`. See also `tests/kernels/moe/test_mxfp4_moe.py`. """ -import importlib import importlib.metadata import os from dataclasses import dataclass +from importlib.util import find_spec import huggingface_hub import lm_eval @@ -24,9 +24,8 @@ from vllm.platforms import current_platform from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') if QUARK_MXFP4_AVAILABLE: from quark.torch.export.nn.modules.realquantizer import ( @@ -43,11 +42,9 @@ except huggingface_hub.errors.RepositoryNotFoundError: @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8']) @@ -132,13 +129,12 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.llm.llm_engine.model_executor. - driver_worker.model_runner.model) - quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. - model_runner.model) - fp8_state_dict = fp8_model.state_dict() + def get_state_dict(model): + return {k: v.cpu() for k, v in model.state_dict().items()} + + quark_state_dict, = quark_handle.apply_model(get_state_dict) + fp8_state_dict, = fp8_handle.apply_model(get_state_dict) assert fp8_state_dict.keys() == quark_state_dict.keys() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 84705e92c85bb..03fe59d7e3bff 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -105,18 +105,21 @@ def test_register_quantization_config(): ]) def test_custom_quant(vllm_runner, model, monkeypatch): """Test infer with the custom quantization method.""" - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_name=model, quantization="custom_quant", enforce_eager=True) as llm: - model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj - # Check the quantization method is FakeQuantLinearMethod - assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 708f3bbeeff15..014bc56bc8ece 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -13,6 +13,7 @@ from typing import Sequence as GenericSequence from typing import Set, Type, Union, cast import torch +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -55,6 +56,7 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind from vllm.version import __version__ as VLLM_VERSION from vllm.worker.model_runner_base import InputProcessingError +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -1817,13 +1819,16 @@ class LLMEngine: return sampling_params def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.model_executor.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e21bfce0ab085..f2282c40f7073 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -522,9 +522,14 @@ class LLM: """ Run a function directly on the model inside each worker, returning the result for each of them. + + !!! warning + To reduce the overhead of data transfer, avoid returning large + arrays or tensors from this method. If you must return them, + make sure you move them to CPU first to avoid taking up additional + VRAM! 
""" - executor = self.llm_engine.model_executor - return executor.apply_model(func) + return self.llm_engine.apply_model(func) def _get_beam_search_lora_requests( self, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 42aa8d14a21eb..b75b94ad0acc2 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -5,11 +5,10 @@ import asyncio import time from abc import ABC, abstractmethod from functools import cached_property -from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, - Union) +from typing import Any, Awaitable, Callable, List, Optional, Set, Union import torch.nn as nn -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated import vllm.platforms from vllm.config import VllmConfig @@ -63,10 +62,10 @@ class ExecutorBase(ABC): @abstractmethod def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: """ Execute an RPC call on all workers. @@ -91,7 +90,7 @@ class ExecutorBase(ABC): """ raise NotImplementedError - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and swappable CPU KV cache. @@ -99,9 +98,10 @@ class ExecutorBase(ABC): ExecutorBase may require modification of the result, e.g. to ensure the selected cache sizes are compatible with all workers. - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks - are blocks that are "active" on the device and can be appended to. - num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where + `num_gpu_blocks` are blocks that are "active" on the device and can be + appended to. + `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be appended to. """ results = self.collective_rpc("determine_num_available_blocks") @@ -127,16 +127,15 @@ class ExecutorBase(ABC): self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + @deprecated("`llm_engine.model_executor.apply_model` will no longer work " + "in V1 Engine. Please replace with `llm_engine.apply_model` " + "and set `VLLM_ALLOW_INSECURE_SERIALIZATION=1`.") def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: """ Run a function directly on the model inside each worker, returning the result for each of them. 
""" - - def rpc_func(worker: WorkerBase) -> _R: - return func(worker.get_model()) - - return self.collective_rpc(rpc_func) + return self.collective_rpc("apply_model", args=(func, )) @cached_property # Avoid unnecessary RPC calls def supported_tasks(self) -> tuple[SupportedTask, ...]: @@ -308,8 +307,8 @@ class DistributedExecutorBase(ExecutorBase): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[Any]: return self._run_workers(method, *args, **(kwargs or {})) @abstractmethod diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c93bfc35f0aeb..907656d1b24cb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -5,6 +5,7 @@ from collections.abc import Mapping from copy import copy from typing import Any, Callable, Optional, Union +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -33,6 +34,7 @@ from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, StatLoggerFactory) from vllm.v1.metrics.reader import Metric, get_metrics_snapshot from vllm.v1.metrics.stats import IterationStats +from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -319,12 +321,15 @@ class LLMEngine: return self.engine_core.pin_lora(lora_id) def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.engine_core.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + def __del__(self): if dp_group := getattr(self, "dp_group", None): stateless_destroy_torch_distributed_process_group(dp_group) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index aa76d21f0fcaa..d0a56f6ff4637 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -5,7 +5,8 @@ import dataclasses import os import time from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, Type, + TypeVar, Union) import cloudpickle import torch @@ -28,6 +29,8 @@ from vllm.worker.model_runner_base import (BroadcastableModelInput, logger = init_logger(__name__) +_R = TypeVar("_R") + @warn_for_unimplemented_methods class WorkerBase: @@ -70,6 +73,10 @@ class WorkerBase: def get_model(self) -> nn.Module: raise NotImplementedError + def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R: + """Apply a function on the model inside this worker.""" + return fn(self.get_model()) + def load_model(self) -> None: """Load model onto target device.""" raise NotImplementedError From e08a3a3fdbdb5408f904a237b31ff2447a336b2f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 20 Sep 2025 04:16:56 -0400 Subject: [PATCH 055/185] [CI Failure] Disable FlashInfer RoPE to unblock CI (#25299) Signed-off-by: mgoin --- .../model_executor/layers/rotary_embedding/base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 3dc249ae9adb9..1c3576bee5392 100644 --- 
a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -6,8 +6,6 @@ from typing import Optional import torch from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer from .common import apply_rotary_emb_torch @@ -32,13 +30,15 @@ class RotaryEmbedding(CustomOp): self.base = base self.is_neox_style = is_neox_style self.dtype = dtype + # TODO(mgoin): disabled for now due to failures # Flashinfer only supports head_size=64, 128, 256, 512. # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202 - self.use_flashinfer = (self.enabled() - and dtype in (torch.float16, torch.bfloat16) - and current_platform.is_cuda() - and has_flashinfer() - and self.head_size in [64, 128, 256, 512]) + # self.use_flashinfer = (self.enabled() + # and dtype in (torch.float16, torch.bfloat16) + # and current_platform.is_cuda() + # and has_flashinfer() + # and self.head_size in [64, 128, 256, 512]) + self.use_flashinfer = False cache = self._compute_cos_sin_cache() if not self.use_flashinfer: From 032d661d27db873ddd71ffb609c79c581b3f27b9 Mon Sep 17 00:00:00 2001 From: Wenlong Wang Date: Sat, 20 Sep 2025 04:45:18 -0700 Subject: [PATCH 056/185] [Docs] Fix warnings in mkdocs build (continued) (#25042) Signed-off-by: wwl2755 --- vllm/multimodal/__init__.py | 2 +- vllm/utils/__init__.py | 6 +++--- vllm/v1/core/kv_cache_utils.py | 4 ++-- vllm/v1/sample/rejection_sampler.py | 10 +++++----- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/utils.py | 3 ++- vllm/worker/model_runner.py | 12 ++++++++++-- 7 files changed, 24 insertions(+), 15 deletions(-) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index b7d4cd298e24f..7ffa732cf3708 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -15,7 +15,7 @@ is used by model runners to dispatch data processing according to the target model. Info: - [mm_processing](../../../design/mm_processing.html) + [mm_processing](../../../design/mm_processing.md) """ __all__ = [ diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index fd1c0af31269c..968bba664f0a9 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3216,7 +3216,7 @@ def cprofile_context(save_file: Optional[str] = None): Args: save_file: path to save the profile result. "1" or - None will result in printing to stdout. + None will result in printing to stdout. """ import cProfile @@ -3273,7 +3273,7 @@ def check_use_alibi(model_config: ModelConfig) -> bool: and getattr(cfg.attn_config, "alibi", False))))) -def sha256(input) -> bytes: +def sha256(input: Any) -> bytes: """Hash any picklable Python object using SHA-256. The input is serialized using pickle before hashing, which allows @@ -3290,7 +3290,7 @@ def sha256(input) -> bytes: return hashlib.sha256(input_bytes).digest() -def sha256_cbor(input) -> bytes: +def sha256_cbor(input: Any) -> bytes: """ Hash objects using CBOR serialization and SHA-256. diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3ccd00121f8ed..47a41322c423c 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1229,8 +1229,8 @@ def get_kv_cache_configs(vllm_config: VllmConfig, Args: vllm_config: The global VllmConfig kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker. 
- available_memory: Memory available for KV cache in bytes for each - worker. + available_memory: Memory available for KV cache in bytes for each + worker. Returns: The generated KVCacheConfigs for each worker. diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 3d5e59addfcfa..ced5c7a970388 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -351,17 +351,17 @@ def generate_uniform_probs( without a seed. Args: - num_tokens : int + num_tokens: int Total number of tokens. - num_draft_tokens : List[List[int]] + num_draft_tokens: List[List[int]] Number of draft tokens per request. - generators : Optional[Dict[int, torch.Generator]] + generators: Optional[Dict[int, torch.Generator]] A dictionary mapping indices in the batch to `torch.Generator` objects. - device : torch.device + device: torch.device The device on which to allocate the tensor. Returns: - uniform_rand : torch.Tensor + uniform_rand: torch.Tensor A tensor of shape `(num_tokens, )` containing uniform random values in the range [0, 1). """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 233df8f1b0e9b..d0946e8c5d7d8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1479,7 +1479,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: scheduler_output: The scheduler output containing scheduled encoder - inputs. + inputs. Returns: A tuple of (mm_kwargs, req_ids_pos) where: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 021d18b2500f0..af922f9979d1a 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -205,7 +205,8 @@ def gather_mm_placeholders( """ Reconstructs the embeddings from the placeholder tokens. - This is the operation of [scatter_mm_placeholders][]. + This is the operation of [`scatter_mm_placeholders`] + [vllm.v1.worker.utils.scatter_mm_placeholders]. """ if is_embed is None: return placeholders diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f662f5a85eff6..bab89586b0f2c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1810,7 +1810,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): return [output] - def need_recv_kv(self, model_input, kv_caches) -> bool: + def need_recv_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor]) -> bool: """Check if we need to receive kv-cache from the other worker. We need to receive KV when 1. current vLLM instance is KV cache consumer/decode vLLM instance @@ -1825,6 +1826,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): if self.vllm_config.kv_transfer_config is None: return False + if model_input.attn_metadata is None: + raise ValueError("model_input.attn_metadata cannot be None") + prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling @@ -1835,7 +1839,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): return self.vllm_config.kv_transfer_config.is_kv_consumer and ( not is_profile_run) and is_prefill_run - def need_send_kv(self, model_input, kv_caches) -> bool: + def need_send_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor]) -> bool: """Check if we need to send kv-cache to the other worker. We need to send KV when 1. 
current vLLM instance is KV cache producer/prefill vLLM instance @@ -1850,6 +1855,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): if self.vllm_config.kv_transfer_config is None: return False + if model_input.attn_metadata is None: + raise ValueError("model_input.attn_metadata cannot be None") + prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling From bf8b26cad17f21f142de6c06ee657f01ccb2b816 Mon Sep 17 00:00:00 2001 From: Manoel Marques Date: Sat, 20 Sep 2025 07:51:13 -0400 Subject: [PATCH 057/185] Generate _ModelInfo properties file when loading to improve loading speed (#23558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Manoel Marques Signed-off-by: Manoel Marques Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Luka Govedič --- vllm/logging_utils/__init__.py | 2 + vllm/logging_utils/log_time.py | 32 +++++++ .../model_loader/weight_utils.py | 44 +++++++++ vllm/model_executor/models/registry.py | 92 ++++++++++++++++++- 4 files changed, 167 insertions(+), 3 deletions(-) create mode 100644 vllm/logging_utils/log_time.py diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index cf690a89ae9bc..7202259ca21aa 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -2,7 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "logtime", ] diff --git a/vllm/logging_utils/log_time.py b/vllm/logging_utils/log_time.py new file mode 100644 index 0000000000000..013dd144beaf8 --- /dev/null +++ b/vllm/logging_utils/log_time.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Provides a timeslice logging decorator +""" + +import functools +import time + + +def logtime(logger, msg=None): + """ + Logs the execution time of the decorated function. + Always place it beneath other decorators. + """ + + def _inner(func): + + @functools.wraps(func) + def _wrapper(*args, **kwargs): + start = time.perf_counter() + result = func(*args, **kwargs) + elapsed = time.perf_counter() - start + + prefix = f"Function '{func.__module__}.{func.__qualname__}'" \ + if msg is None else msg + logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed) + return result + + return _wrapper + + return _inner diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f2c66763d0816..a72086da18c4d 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -11,6 +11,7 @@ import tempfile import time from collections import defaultdict from collections.abc import Generator +from contextlib import contextmanager from pathlib import Path from typing import Any, Callable, Optional, Union @@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path], return lock +@contextmanager +def atomic_writer(filepath: Union[str, Path], + mode: str = 'w', + encoding: Optional[str] = None): + """ + Context manager that provides an atomic file writing routine. + + The context manager writes to a temporary file and, if successful, + atomically replaces the original file. + + Args: + filepath (str or Path): The path to the file to write. 
+ mode (str): The file mode for the temporary file (e.g., 'w', 'wb'). + encoding (str): The encoding for text mode. + + Yields: + file object: A handle to the temporary file. + """ + # Create a temporary file in the same directory as the target file + # to ensure it's on the same filesystem for an atomic replace. + temp_dir = os.path.dirname(filepath) + temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir) + + try: + # Open the temporary file for writing + with os.fdopen(temp_fd, mode=mode, encoding=encoding) as temp_file: + yield temp_file + + # If the 'with' block completes successfully, + # perform the atomic replace. + os.replace(temp_path, filepath) + + except Exception: + logger.exception( + "Error during atomic write. Original file '%s' not modified", + filepath) + raise + finally: + # Clean up the temporary file if it still exists. + if os.path.exists(temp_path): + os.remove(temp_path) + + def maybe_download_from_modelscope( model: str, revision: Optional[str] = None, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 76f2bd087624c..5dc5d545bb9c5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -4,7 +4,9 @@ Whenever you add an architecture to this page, please also update `tests/models/registry.py` with example HuggingFace models for it. """ +import hashlib import importlib +import json import os import pickle import subprocess @@ -12,16 +14,19 @@ import sys import tempfile from abc import ABC, abstractmethod from collections.abc import Set -from dataclasses import dataclass, field +from dataclasses import asdict, dataclass, field from functools import lru_cache +from pathlib import Path from typing import Callable, Optional, TypeVar, Union import torch.nn as nn import transformers +from vllm import envs from vllm.config import (ModelConfig, iter_architecture_defaults, try_match_architecture_defaults) from vllm.logger import init_logger +from vllm.logging_utils import logtime from vllm.transformers_utils.dynamic_module import ( try_get_class_from_dynamic_module) @@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel): module_name: str class_name: str - # Performed in another process to avoid initializing CUDA + @staticmethod + def _get_cache_dir() -> Path: + return Path(envs.VLLM_CACHE_ROOT) / "modelinfos" + + def _get_cache_filename(self) -> str: + cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-") + return f"{cls_name}.json" + + def _load_modelinfo_from_cache(self, + module_hash: str) -> _ModelInfo | None: + try: + try: + modelinfo_path = self._get_cache_dir( + ) / self._get_cache_filename() + with open(modelinfo_path, encoding="utf-8") as file: + mi_dict = json.load(file) + except FileNotFoundError: + logger.debug(("Cached model info file " + "for class %s.%s not found"), self.module_name, + self.class_name) + return None + + if mi_dict["hash"] != module_hash: + logger.debug(("Cached model info file " + "for class %s.%s is stale"), self.module_name, + self.class_name) + return None + + # file not changed, use cached _ModelInfo properties + return _ModelInfo(**mi_dict["modelinfo"]) + except Exception: + logger.exception(("Cached model info " + "for class %s.%s error. 
"), self.module_name, + self.class_name) + return None + + def _save_modelinfo_to_cache(self, mi: _ModelInfo, + module_hash: str) -> None: + """save dictionary json file to cache""" + from vllm.model_executor.model_loader.weight_utils import atomic_writer + try: + modelinfo_dict = { + "hash": module_hash, + "modelinfo": asdict(mi), + } + cache_dir = self._get_cache_dir() + cache_dir.mkdir(parents=True, exist_ok=True) + modelinfo_path = cache_dir / self._get_cache_filename() + with atomic_writer(modelinfo_path, encoding='utf-8') as f: + json.dump(modelinfo_dict, f, indent=2) + except Exception: + logger.exception("Error saving model info cache.") + + @logtime(logger=logger, msg="Registry inspect model class") def inspect_model_cls(self) -> _ModelInfo: - return _run_in_subprocess( + model_path = Path( + __file__).parent / f"{self.module_name.split('.')[-1]}.py" + + assert model_path.exists(), \ + f"Model {self.module_name} expected to be on path {model_path}" + with open(model_path, "rb") as f: + module_hash = hashlib.md5(f.read()).hexdigest() + + mi = self._load_modelinfo_from_cache(module_hash) + if mi is not None: + logger.debug(("Loaded model info " + "for class %s.%s from cache"), self.module_name, + self.class_name) + return mi + else: + logger.debug(("Cache model info " + "for class %s.%s miss. " + "Loading model instead."), self.module_name, + self.class_name) + + # Performed in another process to avoid initializing CUDA + mi = _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) + logger.debug("Loaded model info for class %s.%s", self.module_name, + self.class_name) + + # save cache file + self._save_modelinfo_to_cache(mi, module_hash) + + return mi def load_model_cls(self) -> type[nn.Module]: mod = importlib.import_module(self.module_name) From 3c713a97116f34ffdc75ba10c9dfbb2850437984 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 20 Sep 2025 20:46:24 +0800 Subject: [PATCH 058/185] [Model] Cleanup InternViT's data parallel implementation (#25306) Signed-off-by: Isotr0py --- vllm/model_executor/models/intern_vit.py | 158 ++++++----------------- 1 file changed, 37 insertions(+), 121 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 118cce810a1f2..892188c047228 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -164,23 +163,15 @@ class InternParallelAttention(nn.Module): self.tp_size) self.scale = self.head_dim**-0.5 - if use_data_parallel: - self.qkv = ReplicatedLinear( - self.embed_dim, - 3 * self.head_dim * self.num_heads, - bias=config.qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) - else: - self.qkv = QKVParallelLinear( - self.embed_dim, - self.head_dim, - num_dummy_heads + self.num_heads, - bias=config.qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + num_dummy_heads + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + disable_tp=use_data_parallel, + ) self.qk_normalization 
= config.qk_normalization @@ -192,20 +183,13 @@ class InternParallelAttention(nn.Module): eps=config.layer_norm_eps, var_hidden_size=self.embed_dim) - if use_data_parallel: - self.proj = ReplicatedLinear( - self.dummy_dim, - self.embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) - else: - self.proj = RowParallelLinear( - self.dummy_dim, - self.embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) + self.proj = RowParallelLinear( + self.dummy_dim, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + disable_tp=use_data_parallel, + ) self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) @@ -236,72 +220,6 @@ class InternParallelAttention(nn.Module): return out -class InternSdpaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - config: PretrainedConfig, - *, - num_dummy_heads: int = 0, - ) -> None: - super().__init__() - - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f'embed_dim must be divisible by num_heads ' - f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' - f' {self.num_heads}).') - - # Additional dummy heads are used to enable TP for common GPU counts. - self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim - - self.scale = self.head_dim**-0.5 - self.qkv = nn.Linear(self.embed_dim, - 3 * self.dummy_dim, - bias=config.qkv_bias) - - self.qk_normalization = config.qk_normalization - - if self.qk_normalization: - self.q_norm = RMSNorm(self.dummy_dim, - eps=config.layer_norm_eps, - var_hidden_size=self.embed_dim) - self.k_norm = RMSNorm(self.dummy_dim, - eps=config.layer_norm_eps, - var_hidden_size=self.embed_dim) - - self.proj = nn.Linear(self.dummy_dim, self.embed_dim) - - # Use unified MultiHeadAttention with automatic backend selection - self.attn = MultiHeadAttention(self.num_heads, self.head_dim, - self.scale) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - B, N, C = x.shape - qkv = self.qkv(x) - q, k, v = qkv.chunk(3, dim=-1) - - q = q.view(B, N, self.num_heads, self.head_dim) - k = k.view(B, N, self.num_heads, self.head_dim) - v = v.view(B, N, self.num_heads, self.head_dim) - - if self.qk_normalization: - B_, N_, H_, D_ = q.shape - q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_) - k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_) - - # Use unified MultiHeadAttention with automatic backend selection - x = self.attn(q, k, v) - - x = self.proj(x) - return x - - class InternMLP(nn.Module): def __init__( @@ -315,20 +233,18 @@ class InternMLP(nn.Module): self.config = config self.activation_fn = get_act_fn(config.hidden_act) - cls_fc1 = (ReplicatedLinear - if use_data_parallel else ColumnParallelLinear) - self.fc1 = cls_fc1(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.fc1") - cls_fc2 = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.fc2 = cls_fc2(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.fc2") + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel) + self.fc2 = RowParallelLinear(config.intermediate_size, + 
config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -385,19 +301,19 @@ class InternVisionEncoderLayer(nn.Module): use_data_parallel: bool = False, ): # fallback to sdpa attention if tp unavailable - # tp_size = get_tensor_model_parallel_world_size() tp_size = (1 if use_data_parallel else get_tensor_model_parallel_world_size()) num_heads = config.num_attention_heads - if (num_heads + num_dummy_heads) % tp_size == 0: - return InternParallelAttention(config, - quant_config=quant_config, - num_dummy_heads=num_dummy_heads, - prefix=prefix, - use_data_parallel=use_data_parallel) - - return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads) + # if the number of heads is not divisible by tp_size, + # we also disable Attention's TP + use_data_parallel = (use_data_parallel + or (num_heads + num_dummy_heads) % tp_size != 0) + return InternParallelAttention(config, + quant_config=quant_config, + num_dummy_heads=num_dummy_heads, + prefix=prefix, + use_data_parallel=use_data_parallel) def forward( self, From d88918e4c2d189eb25724a6cff9a9028c67daa07 Mon Sep 17 00:00:00 2001 From: lirong <56789630+lirong-lirong@users.noreply.github.com> Date: Sat, 20 Sep 2025 21:15:22 +0800 Subject: [PATCH 059/185] [Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308) Signed-off-by: pengdrumli --- tests/test_sharded_state_loader.py | 20 ++++++++++++-------- vllm/engine/arg_utils.py | 6 ------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 42afdfa3c7468..fd5b5fad0999c 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -57,10 +57,19 @@ def llama_3p2_1b_files(): def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): llm_sharded_writer = LLM(model=input_dir, **kwargs) - + # Check which engine version is being used + is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core") # Dump worker states to output directory - llm_sharded_writer.llm_engine.model_executor.save_sharded_state( - path=output_dir) + if is_v1_engine: + # For V1 engine, we need to use engine_core.save_sharded_state + print("Using V1 engine save path") + llm_sharded_writer.llm_engine.engine_core.save_sharded_state( + path=output_dir) + else: + # For V0 engine + print("Using V0 engine save path") + model_executor = llm_sharded_writer.llm_engine.model_executor + model_executor.save_sharded_state(path=output_dir) # Copy metadata files to output directory for file in os.listdir(input_dir): @@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, gpu_memory_utilization = 0.8 input_dir = llama_3p2_1b_files ctx = mp.get_context("spawn") - # The interface in v1 engine has changed, run in v1 engine will hang. 
- monkeypatch.setenv("VLLM_USE_V1", "0") # Run in separate processes for memory & CUDA isolation with TemporaryDirectory() as output_dir: @@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, args=(input_dir, output_dir, weights_patterns), kwargs=dict( tensor_parallel_size=tp_size, - distributed_executor_backend="mp", gpu_memory_utilization=gpu_memory_utilization, enforce_eager=True, )) @@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, p = ctx.Process(target=_run_generate, args=(input_dir, queue), kwargs=dict( - distributed_executor_backend="mp", enable_lora=enable_lora, gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, @@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, p = ctx.Process(target=_run_generate, args=(output_dir, queue), kwargs=dict( - distributed_executor_backend="mp", enable_lora=enable_lora, gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7a4bb0d41d236..8912ff8bad429 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1486,12 +1486,6 @@ class EngineArgs: ############################################################# # Unsupported Feature Flags on V1. - if self.load_format == "sharded_state": - _raise_or_fallback( - feature_name=f"--load_format {self.load_format}", - recommend_to_remove=False) - return False - if (self.logits_processor_pattern != EngineArgs.logits_processor_pattern): _raise_or_fallback(feature_name="--logits-processor-pattern", From bef180f00978186fcc84f7ca6328a6ae6c39676d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 21 Sep 2025 01:50:58 +0800 Subject: [PATCH 060/185] [V0 Deprecation] Enable the remaining multimodal tests in V1 (#25307) Signed-off-by: DarkLight1337 --- tests/conftest.py | 76 ++++++--- .../multimodal/generation/test_common.py | 158 +++++++++--------- .../multimodal/generation/test_pixtral.py | 49 +----- .../multimodal/generation/test_qwen2_vl.py | 9 +- .../multimodal/pooling/test_prithvi_mae.py | 26 ++- tests/models/quantization/test_awq.py | 38 +++-- .../models/quantization/test_bitsandbytes.py | 28 ++-- tests/models/test_terratorch.py | 25 ++- 8 files changed, 195 insertions(+), 214 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e8e95357ff5b9..ce9de3bf94b59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,6 +19,7 @@ import socket import tempfile import threading from collections.abc import Generator +from contextlib import nullcontext from enum import Enum from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast @@ -45,14 +46,14 @@ from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) -from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) +from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.sequence import Logprob from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.utils import set_default_torch_num_threads logger = init_logger(__name__) @@ -306,6 +307,35 @@ class HfRunner: is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, auto_cls: 
type[_BaseAutoModelClass] = AutoModelForCausalLM, + # Set this to avoid hanging issue + default_torch_num_threads: Optional[int] = None, + ) -> None: + init_ctx = (nullcontext() if default_torch_num_threads is None else + set_default_torch_num_threads(default_torch_num_threads)) + + with init_ctx: + self._init( + model_name=model_name, + dtype=dtype, + model_kwargs=model_kwargs, + trust_remote_code=trust_remote_code, + is_sentence_transformer=is_sentence_transformer, + is_cross_encoder=is_cross_encoder, + skip_tokenizer_init=skip_tokenizer_init, + auto_cls=auto_cls, + ) + + def _init( + self, + model_name: str, + dtype: str = "auto", + *, + model_kwargs: Optional[dict[str, Any]] = None, + trust_remote_code: bool = True, + is_sentence_transformer: bool = False, + is_cross_encoder: bool = False, + skip_tokenizer_init: bool = False, + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, ) -> None: model_name = maybe_model_redirect(model_name) self.model_name = model_name @@ -714,26 +744,32 @@ class VllmRunner: enable_chunked_prefill: Optional[bool] = False, swap_space: int = 4, enforce_eager: Optional[bool] = False, + # Set this to avoid hanging issue + default_torch_num_threads: Optional[int] = None, **kwargs, ) -> None: - self.llm = LLM( - model=model_name, - runner=runner, - convert=convert, - tokenizer=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - dtype=dtype, - seed=seed, - swap_space=swap_space, - enforce_eager=enforce_eager, - disable_log_stats=disable_log_stats, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - block_size=block_size, - enable_chunked_prefill=enable_chunked_prefill, - **kwargs, - ) + init_ctx = (nullcontext() if default_torch_num_threads is None else + set_default_torch_num_threads(default_torch_num_threads)) + + with init_ctx: + self.llm = LLM( + model=model_name, + runner=runner, + convert=convert, + tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + dtype=dtype, + seed=seed, + swap_space=swap_space, + enforce_eager=enforce_eager, + disable_log_stats=disable_log_stats, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + block_size=block_size, + enable_chunked_prefill=enable_chunked_prefill, + **kwargs, + ) def get_inputs( self, diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 79f9d607f3386..e76b58e61ec16 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -32,13 +32,6 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs, if current_platform.is_rocm(): os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" -REQUIRES_V0_MODELS = [ - # V1 Test: not enough KV cache space in C1. 
- "fuyu", - # V1 Test: Deadlock issue when processing mm_inputs - "llava-onevision-transformers", -] - # yapf: disable COMMON_BROADCAST_SETTINGS = { "test_type": VLMTestType.IMAGE, @@ -186,8 +179,11 @@ VLM_TEST_SETTINGS = { image_size_factors=[(0.25, 0.5, 1.0)], vllm_runner_kwargs={ "model_impl": "transformers", + "default_torch_num_threads": 1, }, - marks=[pytest.mark.core_model], + # FIXME: Investigate why the test hangs + # when processing the 3rd prompt in vLLM + marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")], ), "idefics3-transformers": VLMTestInfo( models=["HuggingFaceTB/SmolVLM-256M-Instruct"], @@ -320,6 +316,7 @@ VLM_TEST_SETTINGS = { vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[large_gpu_mark(min_gb=32)], ), "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], @@ -861,13 +858,14 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) test_type=VLMTestType.IMAGE, create_new_process_for_each_test=False, )) -def test_single_image_models(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_single_image_models( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( tmp_path=tmp_path, @@ -886,13 +884,14 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, test_type=VLMTestType.MULTI_IMAGE, create_new_process_for_each_test=False, )) -def test_multi_image_models(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_multi_image_models( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( tmp_path=tmp_path, @@ -911,13 +910,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_type=VLMTestType.EMBEDDING, create_new_process_for_each_test=False, )) -def test_image_embedding_models(model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_image_embedding_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( model_test_info=model_test_info, @@ -935,11 +934,13 @@ def test_image_embedding_models(model_type: str, test_type=VLMTestType.VIDEO, create_new_process_for_each_test=False, )) -def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: VideoTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - 
monkeypatch.setenv("VLLM_USE_V1", "0") +def test_video_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + video_assets: VideoTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( model_test_info=model_test_info, @@ -957,11 +958,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, test_type=VLMTestType.AUDIO, create_new_process_for_each_test=False, )) -def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - audio_assets: AudioTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_audio_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_audio_test( model_test_info=model_test_info, @@ -984,10 +987,7 @@ def test_custom_inputs_models( test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - monkeypatch, ): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( model_test_info=model_test_info, @@ -1006,13 +1006,14 @@ def test_custom_inputs_models( create_new_process_for_each_test=True, )) @create_new_process_for_each_test() -def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_single_image_models_heavy( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( tmp_path=tmp_path, @@ -1032,13 +1033,14 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, create_new_process_for_each_test=True, )) @create_new_process_for_each_test() -def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_multi_image_models_heavy( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( tmp_path=tmp_path, @@ -1058,14 +1060,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, create_new_process_for_each_test=True, )) @create_new_process_for_each_test() -def test_image_embedding_models_heavy(model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_image_embedding_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: 
ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( model_test_info=model_test_info, @@ -1083,12 +1084,13 @@ def test_image_embedding_models_heavy(model_type: str, test_type=VLMTestType.VIDEO, create_new_process_for_each_test=True, )) -def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - video_assets: VideoTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_video_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + video_assets: VideoTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( model_test_info=model_test_info, @@ -1106,12 +1108,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, test_type=VLMTestType.AUDIO, create_new_process_for_each_test=True, )) -def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - audio_assets: AudioTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_audio_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_audio_test( model_test_info=model_test_info, @@ -1135,10 +1138,7 @@ def test_custom_inputs_models_heavy( test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - monkeypatch, ): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( model_test_info=model_test_info, diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index a4e21aface41f..cb3cc1d3d330d 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -12,13 +12,12 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from mistral_common.tokens.tokenizers.multimodal import image_from_chunk from transformers import AutoProcessor -from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt +from vllm import SamplingParams, TextPrompt, TokensPrompt from vllm.multimodal import MultiModalDataBuiltins -from vllm.multimodal.inputs import PlaceholderRange from vllm.sequence import Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test -from ...utils import check_logprobs_close, dummy_hf_overrides +from ...utils import check_logprobs_close if TYPE_CHECKING: from _typeshed import StrPath @@ -185,47 +184,3 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str, outputs_1_lst=logprobs, name_0="h100_ref", name_1="output") - - -@pytest.mark.parametrize( - "image_urls,expected_ranges", - [(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]), - (IMG_URLS[1:4], [ - PlaceholderRange(offset=11, length=266), - PlaceholderRange(offset=277, length=1056), - PlaceholderRange(offset=1333, length=418) - ])]) -def test_multi_modal_placeholders(vllm_runner, image_urls: list[str], - expected_ranges: list[PlaceholderRange], - local_asset_server, monkeypatch) -> None: - local_image_urls = [local_asset_server.url_for(u) for u in 
image_urls] - prompt = _create_engine_inputs_hf(local_image_urls) - - # This placeholder checking test only works with V0 engine - # where `multi_modal_placeholders` is returned with `RequestOutput` - monkeypatch.setenv("VLLM_USE_V1", "0") - with vllm_runner( - "mistral-community/pixtral-12b", - max_model_len=8192, - limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, - load_format="dummy", - hf_overrides=dummy_hf_overrides, - ) as vllm_model: - outputs = vllm_model.llm.generate(prompt) - - assert len(outputs) == 1, f"{len(outputs)=}" - output: RequestOutput = outputs[0] - assert hasattr(output, - "multi_modal_placeholders"), f"{output.__dict__=}" - assert "image" in output.multi_modal_placeholders, \ - f"{output.multi_modal_placeholders.keys()=}" - image_placeholder_ranges: list[ - PlaceholderRange] = output.multi_modal_placeholders["image"] - assert len(image_placeholder_ranges) == len( - expected_ranges), f"{image_placeholder_ranges=}" - for real_range, expected_range in zip(image_placeholder_ranges, - expected_ranges): - assert real_range.offset == expected_range.offset, \ - f"{real_range=} {expected_range=}" - assert real_range.length == expected_range.length, \ - f"{real_range=} {expected_range=}" diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index e56f4e4075be4..8336ebc0d59cb 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -10,7 +10,6 @@ from PIL import Image from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video -from vllm.utils import set_default_torch_num_threads from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) @@ -264,8 +263,7 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) # max_model_len should be greater than image_feature_size - with set_default_torch_num_threads(1): - vllm_model = vllm_runner( + with vllm_runner( model, runner="generate", max_model_len=4000, @@ -277,9 +275,8 @@ def run_embedding_input_test( }, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, - ) - - with vllm_model: + default_torch_num_threads=1, + ) as vllm_model: outputs_per_case_for_original_input = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py index b503d42567022..7309660ea5261 100644 --- a/tests/models/multimodal/pooling/test_prithvi_mae.py +++ b/tests/models/multimodal/pooling/test_prithvi_mae.py @@ -4,8 +4,6 @@ import pytest import torch -from vllm.utils import set_default_torch_num_threads - from ....conftest import VllmRunner @@ -30,19 +28,17 @@ def _run_test( } for _ in range(10) ] - with ( - set_default_torch_num_threads(1), - vllm_runner( - model, - runner="pooling", - dtype=torch.float16, - enforce_eager=True, - skip_tokenizer_init=True, - # Limit the maximum number of sequences to avoid the - # test going OOM during the warmup run - max_num_seqs=32, - ) as vllm_model, - ): + with vllm_runner( + model, + runner="pooling", + dtype="half", + enforce_eager=True, + skip_tokenizer_init=True, + # Limit the maximum number of sequences to avoid the + # test going OOM during the warmup run + max_num_seqs=32, + default_torch_num_threads=1, + ) as vllm_model: vllm_model.encode(prompt) diff --git 
a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index 7005e435ecf46..e741e4ad90a09 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -45,12 +45,15 @@ def run_awq_test( # will hurt multiprocessing backend with fork method (the default method). # max_model_len should be greater than image_feature_size - with vllm_runner(source_model, - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner( + source_model, + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + default_torch_num_threads=1, + ) as vllm_model: source_outputs_per_image = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -59,13 +62,16 @@ def run_awq_test( for prompts, images in inputs_per_image ] - with vllm_runner(quant_model, - quantization="awq", - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner( + quant_model, + quantization="awq", + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + default_torch_num_threads=1, + ) as vllm_model: quant_outputs_per_image = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -108,12 +114,8 @@ def run_awq_test( @pytest.mark.parametrize("num_logprobs", [5]) @torch.inference_mode() def test_awq_models(vllm_runner, image_assets, source_model, quant_model, - size_factors, dtype, max_tokens, num_logprobs, - monkeypatch) -> None: + size_factors, dtype, max_tokens, num_logprobs) -> None: - # Test V1: this test hangs during setup on single-scale input. - # TODO: figure out why and re-enable this on V1. - monkeypatch.setenv("VLLM_USE_V1", "0") run_awq_test( vllm_runner, image_assets, diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index e0e919b62b217..25fc44fee90dd 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -5,10 +5,7 @@ Run `pytest tests/quantization/test_bitsandbytes.py`. 
''' -import gc - import pytest -import torch from transformers import BitsAndBytesConfig from tests.quantization.utils import is_quant_method_supported @@ -131,12 +128,15 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts, )) with vllm_runner(model_name, quantization='bitsandbytes', - enforce_eager=False) as llm: + enforce_eager=False, + default_torch_num_threads=1) as llm: vllm_outputs = llm.generate_greedy_logprobs(example_prompts, max_tokens=32, num_logprobs=5) - with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm: + with hf_runner(model_name, + model_kwargs=hf_model_kwargs, + default_torch_num_threads=1) as llm: transformers_outputs = llm.generate_greedy_logprobs_limit( example_prompts, max_tokens=32, num_logprobs=5) check_logprobs_close( @@ -174,7 +174,8 @@ def test_4bit_bnb_embedding_model( runner="pooling", dtype=dtype, gpu_memory_utilization=0.5, - quantization="bitsandbytes") as vllm_model: + quantization="bitsandbytes", + default_torch_num_threads=1) as vllm_model: vllm_outputs = vllm_model.embed(example_prompts) hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig( @@ -184,6 +185,7 @@ def test_4bit_bnb_embedding_model( dtype=dtype, model_kwargs=hf_model_kwargs, is_sentence_transformer=True, + default_torch_num_threads=1, ) as hf_model: hf_outputs = hf_model.encode(example_prompts) @@ -222,26 +224,22 @@ def validate_generated_texts(hf_runner, with vllm_runner(model_name, quantization=None if pre_quant else 'bitsandbytes', tensor_parallel_size=vllm_tp_size, - enforce_eager=False) as llm: + enforce_eager=False, + default_torch_num_threads=1) as llm: vllm_outputs = llm.generate_greedy(prompts, max_tokens) vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner") - # Clean up the GPU memory for the next test - gc.collect() - torch.cuda.empty_cache() - if hf_model_kwargs is None: hf_model_kwargs = {} # Run with HF runner - with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm: + with hf_runner(model_name, + model_kwargs=hf_model_kwargs, + default_torch_num_threads=1) as llm: hf_outputs = llm.generate_greedy(prompts, max_tokens) hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner") - # Clean up the GPU memory for the next test - gc.collect() - torch.cuda.empty_cache() # Compare the generated strings for hf_log, vllm_log in zip(hf_logs, vllm_logs): hf_str = hf_log["generated_text"] diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py index d6d43ca2f7e15..842e37ea26f67 100644 --- a/tests/models/test_terratorch.py +++ b/tests/models/test_terratorch.py @@ -5,7 +5,6 @@ import pytest import torch from tests.conftest import VllmRunner -from vllm.utils import set_default_torch_num_threads @pytest.mark.parametrize( @@ -25,19 +24,17 @@ def test_inference( prompt = dict(prompt_token_ids=[1], multi_modal_data=dict(pixel_values=pixel_values, location_coords=location_coords)) - with ( - set_default_torch_num_threads(1), - vllm_runner( - model, - runner="pooling", - dtype=torch.float16, - enforce_eager=True, - skip_tokenizer_init=True, - # Limit the maximum number of sequences to avoid the - # test going OOM during the warmup run - max_num_seqs=32, - ) as vllm_model, - ): + with vllm_runner( + model, + runner="pooling", + dtype="half", + enforce_eager=True, + skip_tokenizer_init=True, + # Limit the maximum number of sequences to avoid the + # test going OOM during the warmup run + max_num_seqs=32, + default_torch_num_threads=1, + ) as vllm_model: vllm_output = vllm_model.llm.encode(prompt) assert 
torch.equal( From 367a480bd3534edf27a8dac3c6f7ea8af9d1ed45 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Sun, 21 Sep 2025 07:39:47 +0800 Subject: [PATCH 061/185] [Docs] Fix warnings in vllm/profiler and vllm/transformers_utils (#25220) Signed-off-by: windsonsea --- mkdocs.yaml | 1 + vllm/profiler/layerwise_profile.py | 4 ++-- vllm/transformers_utils/configs/jais.py | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mkdocs.yaml b/mkdocs.yaml index 6f2be65a18af8..1535fcc622cdc 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -102,6 +102,7 @@ plugins: - https://numpy.org/doc/stable/objects.inv - https://pytorch.org/docs/stable/objects.inv - https://psutil.readthedocs.io/en/stable/objects.inv + - https://huggingface.co/docs/transformers/main/en/objects.inv markdown_extensions: - attr_list diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 2f9ebe531cbb1..41136f738c286 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -353,8 +353,8 @@ class layerwise_profile(profile): Args: num_running_seqs (Optional[int], optional): When given, - num_running_seqs will be passed to LayerProfileResults for metadata - update. Defaults to None. + num_running_seqs will be passed to LayerProfileResults + for metadata update. Defaults to None. """ super().__init__( activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index d5ca2c7b4751a..3f50638f16b53 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -74,8 +74,7 @@ class JAISConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - scale_attn_by_inverse_layer_idx - (`bool`, *optional*, defaults to `False`): + scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`): Whether to additionally scale attention weights by `1 / layer_idx + 1`. 
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): From 52c2a8d4adccf72c2cf59a18da31e44a61c43b41 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 17:56:30 -0700 Subject: [PATCH 062/185] [V0 Deprecation] Remove LLMEngine (#25033) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- .../scripts/hardware_ci/run-amd-test.sh | 4 - .buildkite/test-pipeline.yaml | 8 +- .github/CODEOWNERS | 1 - examples/offline_inference/profiling.py | 510 ----- .../test_basic_correctness.py | 16 +- tests/basic_correctness/test_cumem.py | 5 +- tests/compile/test_fusion_attn.py | 3 +- tests/conftest.py | 20 - tests/entrypoints/llm/test_generate.py | 6 - .../entrypoints/llm/test_prompt_validation.py | 8 - tests/entrypoints/openai/test_metrics.py | 2 +- .../attention/test_attention_selector.py | 91 +- tests/lora/test_lora_functions.py | 2 +- .../models/language/generation/test_common.py | 6 +- .../models/language/generation/test_hybrid.py | 122 +- tests/models/language/pooling/test_reward.py | 3 +- tests/models/quantization/test_fp8.py | 12 - tests/models/test_initialization.py | 13 +- tests/models/test_oot_registration.py | 1 + tests/plugins_tests/test_platform_plugins.py | 9 - tests/plugins_tests/test_scheduler_plugins.py | 37 +- tests/samplers/test_beam_search.py | 7 - tests/samplers/test_ignore_eos.py | 7 - tests/samplers/test_ranks.py | 6 - tests/tokenization/test_detokenize.py | 55 - vllm/engine/arg_utils.py | 8 - vllm/engine/llm_engine.py | 1833 +---------------- vllm/entrypoints/llm.py | 11 +- .../model_executor/model_loader/tensorizer.py | 22 +- 29 files changed, 65 insertions(+), 2763 deletions(-) delete mode 100644 examples/offline_inference/profiling.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 7f90181048d0f..aa4cc7b35a543 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} fi -if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then - commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"} -fi - if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c42ec4f2503d0..1e7ce6ef0a665 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -110,7 +110,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - pytest -v -s entrypoints/offline_mode # 
Needs to avoid interference with other tests - label: Entrypoints Integration Test (API Server) # 100min timeout_in_minutes: 130 @@ -163,7 +163,6 @@ steps: - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -314,12 +313,11 @@ steps: - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 @@ -894,7 +892,7 @@ steps: - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. # TODO: investigate and fix - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s models/multimodal/generation/test_maverick.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 323675993467a..f58256d38b9d1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,7 +5,6 @@ /vllm/attention @LucasWilkinson /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py deleted file mode 100644 index 392fba8fc5ead..0000000000000 --- a/examples/offline_inference/profiling.py +++ /dev/null @@ -1,510 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import inspect -import json -import os -import sys -from argparse import RawTextHelpFormatter -from collections.abc import Generator -from dataclasses import asdict, dataclass -from typing import Any, Optional, TypeAlias - -import torch -import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.profiler.layerwise_profile import layerwise_profile -from vllm.utils import 
FlexibleArgumentParser - -BATCH_SIZE_DEFAULT = 1 -PROMPT_LEN_DEFAULT = 256 - - -@dataclass -class ProfileContext: - engine_args: EngineArgs - prompt_len: int - batch_size: int - - # The profiler can run in 2 modes, - # 1. Run profiler for user specified num_steps - num_steps: Optional[int] = None - # 2. Run profiler until all requests complete - complete_num_requests_per_step: Optional[int] = None - - save_chrome_traces_folder: Optional[str] = None - - -def get_dtype(dtype: str): - if dtype == "torch.float": - return torch.float - else: - return dtype - - -OutputLen_NumReqs_Map: TypeAlias = dict[int, int] - - -def compute_request_output_lengths( - batch_size: int, step_requests: list[int] -) -> OutputLen_NumReqs_Map: - """ - Given the number of requests, batch_size, and the number of requests - that each engine-step should process, step_requests, determine the - output lengths of the requests such that step_request is honoured. - - Example: - if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1] - then return, - {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning, - 32 requests should have output length 2, - 32 requests should have output length 3, - 32 requests should have output length 4, - 31 requests should have output length 5, - 1 request should have output length 6. - - Args: - batch_size (int): Number of requests submitted for profile. This is - args.batch_size. - step_requests (list[int]): step_requests[i] is the number of requests - that the ith engine step should process. - - Returns: - OutputLen_NumReqs_Map : A dictionary with output-length as keys and the - number of requests required to have that output-length as values. - """ - ol_nr: OutputLen_NumReqs_Map = {} - - # Number of request that are assigned an output-length - num_reqs_assigned: int = 0 - num_steps: int = len(step_requests) - - # sanity check. The first step (prefill-step), must process all requests. - assert step_requests[0] == batch_size - - # Begin assignments from the last step. - output_length: int = num_steps - for num_requests_at_step in reversed(step_requests): - if num_reqs_assigned == batch_size: - break - - assert num_reqs_assigned < batch_size - - # Remove the number of requests that have been determined - # to participate in this step and beyond. - num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned - assert num_reqs_unassigned_at_step >= 0 - - if num_reqs_unassigned_at_step > 0: - ol_nr[output_length] = num_reqs_unassigned_at_step - num_reqs_assigned += num_reqs_unassigned_at_step - - output_length -= 1 - - # sanity checks. - assert sum(ol_nr.values()) == batch_size, ( - "Number of requests in output-length assignment does not match " - f"batch-size.\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}" - ) - - # Check that the output-length is in [1, num-steps]. Output length must be - # at least 1 as all requests must participate in the prefill-step. - assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), ( - "Output lengths of requests should be in range " - f"[1, num-engine-steps].\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}" - ) - - return ol_nr - - -def determine_requests_per_step(context: ProfileContext) -> list[int]: - """ - Determine number of requests each engine step should process. - If context.num_steps is set, then all engine steps process the - same number of requests and the output list is of length - context.num_steps. 
- - If context.complete_num_requests_per_step is set, then each decode step - processes fewer and fewer requests until there are no requests to process. - In this case, the output list is as big as the number of steps - required to process all requests. - - Args: - context: ProfileContext object. - - Returns: - list[int]: Number of requests to process for all engine-steps. - output[i], contains the number of requests that the ith step - should process. - """ - if context.num_steps: - # All requests must run until num_engine_steps. This implies - # that their output lengths must be equal to num_engine_steps. - return [context.batch_size] * context.num_steps - - assert ( - context.complete_num_requests_per_step - and context.complete_num_requests_per_step > 0 - ), ( - f"Expected a positive complete_num_requests_per_step argument." - f"Instead got {context.complete_num_requests_per_step}" - ) - - # We start dropping after the first decode step. - step_requests = [ - context.batch_size, # prefill - context.batch_size, # decode - ] - - num_running_requests = context.batch_size - num_running_requests -= context.complete_num_requests_per_step - while num_running_requests > 0: - step_requests.append(num_running_requests) - num_running_requests -= context.complete_num_requests_per_step - - if step_requests[-1] != 1: - # have 1 request running at the last step. This is often - # useful - step_requests.append(1) - - return step_requests - - -def run_profile( - context: ProfileContext, csv_output: Optional[str], json_output: Optional[str] -): - print("Run profile with:") - for key, value in asdict(context).items(): - print(f" {key} = {value}") - - requests_per_step: list[int] = determine_requests_per_step(context) - - ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( - context.batch_size, requests_per_step - ) - - num_steps_to_profile: int = len(requests_per_step) - max_output_len: int = max(ol_nr.keys()) - assert max_output_len >= 1 - - # Create sampling params - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - # max_tokens is set on a per-request basis. 
- max_tokens=None, - ignore_eos=True, - ) - - # Create LLM - llm = LLM(**asdict(context.engine_args)) - batch_size = context.batch_size - prompt_len = context.prompt_len - - scheduler_config = llm.llm_engine.vllm_config.scheduler_config - max_model_len = llm.llm_engine.model_config.max_model_len - max_num_batched_tokens = scheduler_config.max_num_batched_tokens - max_num_seqs = scheduler_config.max_num_seqs - - if batch_size * prompt_len > max_num_batched_tokens: - print( - f"ERROR: chosen batch_size * prompt_len " - f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " - f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " - f"and therefore cannot be run in a single profile step, please " - f"choose a smaller batch size or prompt length, or increase " - f"--max-num-batched-tokens" - ) - sys.exit(-1) - if batch_size > max_num_seqs: - print( - f"ERROR: chosen batch_size ({batch_size}) is larger than " - f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a " - f"single profile step, please choose a smaller batch size" - ) - sys.exit(-1) - print( - "llm.llm_engine.model_config.max_model_len: ", - llm.llm_engine.model_config.max_model_len, - ) - if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len: - print( - f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + " - f"{max_output_len} = {prompt_len + max_output_len}) is larger " - f"than the model's max_model_len ({max_model_len}), please " - f"choose a smaller prompt_len or max_output_len, or increase " - f"--max-model-len" - ) - sys.exit(-1) - - def add_requests(): - def get_output_len_generator() -> Generator[int, Any, Any]: - for output_len, num_reqs in ol_nr.items(): - for _ in range(num_reqs): - yield output_len - - output_len_generator = get_output_len_generator() - for i in range(batch_size): - sampling_params.max_tokens = next(output_len_generator) - assert isinstance(sampling_params.max_tokens, int) - - prompt_token_ids = torch.randint( - llm.get_tokenizer().vocab_size, size=(prompt_len,) - ).tolist() - - llm.llm_engine.add_request( - request_id=f"seq{i}", - prompt={"prompt_token_ids": prompt_token_ids}, - params=sampling_params, - ) - - def abort_requests(): - for i in range(batch_size): - llm.llm_engine.abort_request(f"seq{i}") - - # Warm up run - print("Warm up run ...") - add_requests() - llm.llm_engine.step() # Prefill - llm.llm_engine.step() # Decode - abort_requests() - - print("Profile run ...") - add_requests() - - with layerwise_profile() as prefill_prof: - llm.llm_engine.step() # First step is prefill - - decode_profs = [] - for _ in tqdm.tqdm(range(num_steps_to_profile - 1)): - num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups() - with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof: - llm.llm_engine.step() - decode_profs.append(decode_prof) - - decode_results_list = [prof.results for prof in decode_profs] - prefill_results = prefill_prof.results - has_decode = len(decode_results_list) > 0 - - LINE_WIDTH = 80 - print("=" * LINE_WIDTH) - print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * LINE_WIDTH) - print() - prefill_results.print_model_table() - - if has_decode: - print() - print("=" * LINE_WIDTH) - print( - f"= First Decode Step Model Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})" - ) - print("=" * LINE_WIDTH) - print() - decode_results_list[0].print_model_table() - - print() - print("=" * LINE_WIDTH) - print(f"= Prefill Summary Table 
(prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * LINE_WIDTH) - print() - prefill_results.print_summary_table() - - if has_decode: - print() - print("=" * LINE_WIDTH) - print( - f"= First Decode Step Summary Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})" - ) - print("=" * LINE_WIDTH) - print() - decode_results_list[0].print_summary_table() - - if csv_output: - csv_filename_base = ( - csv_output[:-4] if csv_output.endswith(".csv") else csv_output - ) - prefill_results.export_model_stats_table_csv( - csv_filename_base + "_prefill_model_table.csv" - ) - prefill_results.export_summary_stats_table_csv( - csv_filename_base + "_prefill_summary_table.csv" - ) - - if has_decode: - decode_results_list[0].export_model_stats_table_csv( - csv_filename_base + "_decode_model_table.csv" - ) - decode_results_list[0].export_summary_stats_table_csv( - csv_filename_base + "_decode_summary_table.csv" - ) - - if json_output: - cuda_devices = [ - torch.cuda.get_device_properties(dev_idx) - for dev_idx in range(torch.cuda.device_count()) - ] - - json_dict = { - "context": { - "python_version": f"{sys.version}", - "torch_version": f"{torch.__version__}", - "torch_cuda_version": f"{torch.version.cuda}", - "cuda_devices": f"{cuda_devices}", - **asdict(context), - }, - "prefill": prefill_results.convert_stats_to_dict(), - } - - if has_decode: - for idx, dr in enumerate(decode_results_list): - json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() - - # Add .json to json_output filename if it doesn't exist already. - json_output_file = ( - json_output if json_output.endswith(".json") else json_output + ".json" - ) - with open(json_output_file, "w+") as f: - json.dump(json_dict, f, indent=2) - pass - - if context.save_chrome_traces_folder is not None: - os.makedirs(context.save_chrome_traces_folder, exist_ok=True) - prefill_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + "/prefill.json" - ) - for idx, decode_prof in enumerate(decode_profs): - decode_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + f"/decode_{idx + 1}.json" - ) - print( - "Traces saved as prefill.json and decode_1.json, etc." - f" in folder {context.save_chrome_traces_folder}" - ) - - -def parse_args(): - parser = FlexibleArgumentParser( - description=""" -Profile a model - - example: - ``` - python examples/offline_inference/profiling.py \\ - --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ - --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ - --enforce-eager run_num_steps -n 2 - ``` - - then you can use various tools to analyze the json output - terminal ascii tables: - ``` - python tools/profiler/print_layerwise_table.py \\ - --json-trace Llama31-8b-FP8.json --phase prefill --table summary - ``` - or create matplotlib stacked bar charts: - ``` - python tools/profiler/visualize_layerwise_profile.py \\ - --json-trace Llama31-8b-FP8.json \\ - --output-directory profile_breakdown --plot-metric pct_cuda_time - ``` -""", - formatter_class=RawTextHelpFormatter, - ) - parser.add_argument( - "--csv", - type=str, - default=None, - help="Export the results as multiple csv file. This should be the root " - "filename, will create _prefill_model_table.csv, " - "_prefill_summary_table.csv, " - "_decode_model_table.csv, and " - "_decode_summary_table.csv", - ) - parser.add_argument( - "--json", - type=str, - default=None, - help="Export the results as a json file. 
This should be the filename", - ) - parser.add_argument( - "--save-chrome-traces-folder", - type=str, - help="Save chrome traces for the prefill and decode " - "will save traces as prefill.json and decode_1.json, " - "etc. inside this folder", - ) - parser.add_argument( - "--prompt-len", - type=int, - default=PROMPT_LEN_DEFAULT, - help=f"Length of the random prompt to use when profiling, all batched " - f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}", - ) - parser.add_argument( - "--batch-size", - type=int, - default=BATCH_SIZE_DEFAULT, - help=f"Number of requests to run as a single batch, " - f"default={BATCH_SIZE_DEFAULT}", - ) - - subparsers = parser.add_subparsers(dest="cmd") - - run_num_steps_parser = subparsers.add_parser( - "run_num_steps", help="This variation profiles n engine.step() invocations." - ) - run_num_steps_parser.add_argument( - "-n", - "--num-steps", - type=int, - help="Number of engine steps to profile.\n" - "Setting it to 1, profiles only the prefill step.\n" - "Setting it to 2, profiles the prefill and first decode step\n" - "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n" - "and so on ...", - ) - - run_to_completion_parser = subparsers.add_parser( - "run_to_completion", - help="This variation profiles all the engine.step() invocations" - "until the engine exhausts all submitted requests.", - ) - run_to_completion_parser.add_argument( - "-n", - "--complete-num-requests-per-step", - type=int, - help="Complete complete_num_requests_per_step requests every decode step." - "For e.g., with batch_size 128 and complete_num_requests_per_step 32," - "the profiler is run for 6 engine steps, with the steps processing, " - "128, 128, 96, 64, 32, 1 requests respectively.\n" - "Note that we tack-on a one-request step at the end as it is often " - "useful.", - ) - - EngineArgs.add_cli_args(parser) - - return parser.parse_args() - - -def main(args): - context = ProfileContext( - engine_args=EngineArgs.from_cli_args(args), - **{ - k: v - for k, v in vars(args).items() - if k in inspect.signature(ProfileContext).parameters - }, - ) - run_profile(context, csv_output=args.csv, json_output=args.json) - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 24b1c9a93126c..411f3e01bc2cd 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -11,7 +11,7 @@ from unittest.mock import Mock import pytest import torch -from vllm import LLM, envs +from vllm import LLM from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1 from ..conftest import HfRunner, VllmRunner @@ -26,14 +26,6 @@ MODELS = [ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def test_vllm_gc_ed(): """Verify vllm instance is GC'ed when it is deleted""" llm = LLM("distilbert/distilgpt2") @@ -76,12 +68,6 @@ def test_models( model_executor: str, enable_prompt_embeds: bool, ) -> None: - if not envs.VLLM_USE_V1: - if async_scheduling: - pytest.skip("async_scheduling only supported in v1.") - if model_executor != "uni": - pytest.skip("only test uniproc executor for v0.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": pytest.skip( f"{backend} does not 
support gemma2 with full context length.") diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index f3ad680b72b55..508740ab29389 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -122,11 +122,12 @@ def test_cumem_with_cudagraph(): # sleep mode with safetensors ("meta-llama/Llama-3.2-1B", True), # sleep mode with pytorch checkpoint - ("facebook/opt-125m", False), + ("facebook/opt-125m", True), ]) def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + assert use_v1 + m.setenv("VLLM_USE_V1", "1") free, total = torch.cuda.mem_get_info() used_bytes_baseline = total - free # in case other process is running llm = LLM(model, enable_sleep_mode=True) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 022f183b31932..b6bebbba915be 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -54,8 +54,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str, # Use global backends global backend, backend_unfused - use_v1 = False # can be made a param once V1 support added - monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1))) + monkeypatch.setenv("VLLM_USE_V1", "1") monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa))) # Prompt 4 seems too open-ended, differs between fused and unfused diff --git a/tests/conftest.py b/tests/conftest.py index ce9de3bf94b59..f14b1e8780ad9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -160,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch): monkeypatch.delenv("VLLM_USE_V1") -@pytest.fixture(params=[True, False]) -def run_with_both_engines(request, monkeypatch): - # Automatically runs tests twice, once with V1 and once without - use_v1 = request.param - # Tests decorated with `@skip_v1` are only run without v1 - skip_v0 = request.node.get_closest_marker("skip_v0") - skip_v1 = request.node.get_closest_marker("skip_v1") - - if use_v1: - if skip_v1: - pytest.skip("Skipping test on vllm V1") - monkeypatch.setenv('VLLM_USE_V1', '1') - else: - if skip_v0: - pytest.skip("Skipping test on vllm V0") - monkeypatch.setenv('VLLM_USE_V1', '0') - - yield - - @pytest.fixture(autouse=True) def init_test_http_connection(): # pytest_asyncio may use a different event loop per test diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 3bbbcc755d134..e0ecb02d4f563 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -25,12 +25,6 @@ TOKEN_IDS = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 1b7be15d5d691..b219b33d1760e 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -6,14 +6,6 @@ import pytest from vllm import LLM -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def test_empty_prompt(): llm = LLM(model="openai-community/gpt2", enforce_eager=True) with pytest.raises(ValueError, 
match='decoder prompt cannot be empty'): diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 8917aa5a5efb9..f0b61902eb568 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool): "--port", port, ], - env={"VLLM_USE_V1": "1" if use_v1 else "0"}) + env={"VLLM_USE_V1": "1"}) def is_server_up(url): try: diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 190c92e1251c2..f8454ad0a4c4d 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -69,28 +69,20 @@ def generate_params(): @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) -@pytest.mark.parametrize("use_v1", [True, False]) def test_env( device: str, name: str, use_mla: bool, block_size: int, - use_v1: bool, monkeypatch: pytest.MonkeyPatch, ): """Test attention backend selection with valid device-backend pairs.""" with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, name) m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") - if name == "FLASHINFER" and not use_v1: - pytest.skip("FlashInfer backend is only available on V1 engine") - if device == "cpu": - if not use_v1: - pytest.skip("CPU backend only supports V1") - with patch("vllm.attention.selector.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float16, None, block_size, @@ -137,7 +129,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = f"{name}_VLLM_V1" if use_v1 else name + expected = f"{name}_VLLM_V1" assert backend.get_name() == expected else: backend = get_attn_backend(16, @@ -146,7 +138,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" + expected = "TRITON_ATTN_VLLM_V1" assert backend.get_name() == expected elif device == "cuda": @@ -163,11 +155,7 @@ def test_env( # - TRITON_MLA: fallback for other cases if name == "CUTLASS_MLA": - if not use_v1: - # CUTLASS_MLA only supported on V1 engine - pytest.skip( - "CUTLASS_MLA only supported on V1 engine") - elif block_size != 128: + if block_size != 128: # CUTLASS_MLA only supports block_size == 128 pytest.skip( "CUTLASS_MLA only supports block_size 128") @@ -181,11 +169,7 @@ def test_env( expected = "CUTLASS_MLA_VLLM_V1" assert backend.get_name() == expected elif name == "FLASHINFER_MLA": - if not use_v1: - # FlashInfer MLA only supported on V1 engine - pytest.skip( - "FlashInfer MLA only supported on V1 engine") - elif block_size not in [32, 64]: + if block_size not in [32, 64]: # FlashInfer MLA only supports block_size 32 or 64 pytest.skip( "FlashInfer MLA only supports block_size 32 " @@ -217,23 +201,17 @@ def test_env( block_size, False, use_mla=use_mla) - expected = f"{name}_VLLM_V1" if use_v1 else name + expected = f"{name}_VLLM_V1" assert backend.get_name() == expected elif name == "FLASH_ATTN_MLA": - if not use_v1: - # FlashAttention MLA only supported on V1 engine - pytest.skip( - "FlashAttention MLA only supported on V1 engine" - ) - else: - backend = get_attn_backend(16, - torch.float16, - None, - block_size, - False, - use_mla=use_mla) - expected = "FLASH_ATTN_MLA" - assert backend.get_name() == expected + backend = get_attn_backend(16, + torch.float16, + None, + block_size, + False, + use_mla=use_mla) 
+ expected = "FLASH_ATTN_MLA" + assert backend.get_name() == expected else: # TRITON_MLA or other fallback backend = get_attn_backend(16, @@ -242,8 +220,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = ("TRITON_MLA_VLLM_V1" - if use_v1 else "TRITON_MLA") + expected = "TRITON_MLA_VLLM_V1" assert backend.get_name() == expected elif name == "FLASHINFER": backend = get_attn_backend(16, @@ -252,7 +229,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = "FLASHINFER_VLLM_V1" if use_v1 else name + expected = "FLASHINFER_VLLM_V1" assert backend.get_name() == expected else: backend = get_attn_backend(32, @@ -261,36 +238,30 @@ def test_env( block_size, False, use_mla=use_mla) - expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name + expected = "FLASH_ATTN_VLLM_V1" assert backend.get_name() == expected - if use_v1: - backend = get_attn_backend(16, - torch.float16, - None, - block_size, - False, - use_mla=use_mla) - assert backend.get_name() == "FLEX_ATTENTION", ( - "Should fallback to FlexAttention if head size is " - "not supported by FlashAttention") + backend = get_attn_backend(16, + torch.float16, + None, + block_size, + False, + use_mla=use_mla) + assert backend.get_name() == "FLEX_ATTENTION", ( + "Should fallback to FlexAttention if head size is " + "not supported by FlashAttention") @pytest.mark.parametrize("device", ["cpu", "cuda"]) -@pytest.mark.parametrize("use_v1", [True, False]) def test_fp32_fallback( device: str, - use_v1: bool, monkeypatch: pytest.MonkeyPatch, ): """Test attention backend selection with fp32.""" with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv("VLLM_USE_V1", "1") if device == "cpu": - if not use_v1: - pytest.skip("CPU backend only supports V1") - with patch("vllm.attention.selector.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float32, None, 16, False) @@ -300,8 +271,7 @@ def test_fp32_fallback( with patch("vllm.attention.selector.current_platform", CudaPlatform()): backend = get_attn_backend(16, torch.float32, None, 16, False) - assert (backend.get_name() == "FLEX_ATTENTION" - if use_v1 else "XFORMERS") + assert backend.get_name() == "FLEX_ATTENTION" def test_flash_attn(monkeypatch: pytest.MonkeyPatch): @@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): assert backend.get_name() != STR_FLASH_ATTN_VAL -@pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): +def test_invalid_env(monkeypatch: pytest.MonkeyPatch): """Test that invalid attention backend names raise ValueError.""" with monkeypatch.context() as m, patch( "vllm.attention.selector.current_platform", CudaPlatform()): - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) # Should raise ValueError for invalid backend diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 50c60341f0d88..221d5237823ca 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions. 
import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) from vllm.lora.request import LoRARequest +from vllm.v1.engine.llm_engine import LLMEngine MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index c14e71cbdb96d..39c4dd735b725 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -15,7 +15,8 @@ from ...utils import check_logprobs_close # have a clean way to fall back, so we fail with # a clear msg when it happens. # https://github.com/vllm-project/vllm/issues/14524 -REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] +# NOTE(woosuk): Skipping these tests until V1 supports them. +# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] # This list contains the model that are using AITER kernel. # Skip model that are not using AITER tests. @@ -113,9 +114,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - if model in REQUIRES_V0: - monkeypatch.setenv("VLLM_USE_V1", "0") - if use_rocm_aiter and (model in AITER_MODEL_LIST): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") elif use_rocm_aiter and model not in AITER_MODEL_LIST: diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 206ad1352e06e..0b1f90e27db82 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test from vllm.engine.arg_utils import EngineArgs from vllm.sampling_params import SamplingParams -from ...utils import check_logprobs_close, check_outputs_equal +from ...utils import check_logprobs_close # Mark all tests as hybrid pytestmark = pytest.mark.hybrid_model @@ -88,15 +88,6 @@ def test_models( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = None - if model in V1_SUPPORTED_MODELS: with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( @@ -104,14 +95,6 @@ def test_models( else: vllm_v1_outputs = None - if vllm_v0_outputs is not None: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v0_outputs, - name_0="hf", - name_1="vllm-v0", - ) - if model in V1_SUPPORTED_MODELS: check_logprobs_close( outputs_0_lst=hf_outputs, @@ -157,45 +140,6 @@ def test_batching( ) -@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) -def test_chunked_prefill( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, - num_logprobs: int, - chunked_prefill_token_size: int, - monkeypatch, -) -> None: - max_num_seqs = chunked_prefill_token_size - 
max_num_batched_tokens = chunked_prefill_token_size - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with vllm_runner(model, - enable_chunked_prefill=True, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - chunked = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model, - enable_chunked_prefill=False, - max_num_seqs=max_num_seqs) as vllm_model: - non_chunked = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=chunked, - outputs_1_lst=non_chunked, - name_0="chunked", - name_1="non_chunked", - ) - - @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [10]) def test_chunked_prefill_with_parallel_sampling( @@ -257,38 +201,6 @@ def test_mamba_cache_cg_padding( "Could be related to mamba cache not padded correctly") -@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) -@pytest.mark.parametrize("max_tokens", [20]) -def test_models_preemption_recompute( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, - monkeypatch, -) -> None: - """ - Tests that outputs are identical with and w/o preemptions (recompute). - """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.llm.llm_engine.scheduler[0] - scheduler.ENABLE_ARTIFICIAL_PREEMPT = True - preempt_vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) - - scheduler.ENABLE_ARTIFICIAL_PREEMPT = False - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=preempt_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="vllm_preepmtions", - name_1="vllm", - ) - - @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( vllm_runner, @@ -386,27 +298,10 @@ def test_full_cuda_graph( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = None - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - if vllm_v0_outputs is not None: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v0_outputs, - name_0="hf", - name_1="vllm-v0", - ) - check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v1_outputs, @@ -442,27 +337,12 @@ def test_fp32_cache_state( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - **{cache_dtype_param: "float32"}) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_logprobs_close( - 
outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v0_outputs, - name_0="hf", - name_1="vllm-v0", - ) - check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v1_outputs, diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index 08722ac98b7ed..4ac91b5aed506 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest import torch @@ -82,7 +81,7 @@ def test_prm_models( check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2") - if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0": + if current_platform.is_cpu(): pytest.skip("CPU only supports V1") if current_platform.is_rocm(): diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index afc27b6e0566e..97dd4d6135ac4 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -36,9 +36,6 @@ from ..utils import check_logprobs_close # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) -# Due to low-precision numerical divergence, this test is too sensitive for -# the async postprocessor -@pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models( vllm_runner, example_prompts, @@ -49,7 +46,6 @@ def test_models( enforce_eager: bool, backend: str, tensor_parallel_size: int, - disable_async_output_proc: bool, monkeypatch: pytest.MonkeyPatch, ) -> None: """ @@ -74,7 +70,6 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -85,7 +80,6 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -110,9 +104,6 @@ def test_models( ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) -# Due to low-precision numerical divergence, this test is too sensitive for -# the async postprocessor -@pytest.mark.parametrize("disable_async_output_proc", [True]) def test_cpu_models( vllm_runner, example_prompts, @@ -120,7 +111,6 @@ def test_cpu_models( base_model: str, test_model: str, max_tokens: int, - disable_async_output_proc: bool, monkeypatch: pytest.MonkeyPatch, ) -> None: """ @@ -138,7 +128,6 @@ def test_cpu_models( max_model_len=MAX_MODEL_LEN, dtype="bfloat16", kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -148,7 +137,6 @@ def test_cpu_models( max_model_len=MAX_MODEL_LEN, dtype="bfloat16", kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) diff --git a/tests/models/test_initialization.py 
b/tests/models/test_initialization.py index 9281579b71e74..b9601114a3183 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -7,7 +7,6 @@ from unittest.mock import patch import pytest from vllm import LLM -from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.utils import GiB_bytes from vllm.v1.core.kv_cache_utils import get_kv_cache_configs from vllm.v1.engine.core import EngineCore as V1EngineCore @@ -61,10 +60,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, False)) # Avoid calling model.forward() - def _initialize_kv_caches_v0(self) -> None: - self.cache_config.num_gpu_blocks = 0 - self.cache_config.num_cpu_blocks = 0 - def _initialize_kv_caches_v1(self, vllm_config): kv_cache_specs = self.model_executor.get_kv_cache_specs() scheduler_kv_cache_config = get_kv_cache_configs( @@ -76,12 +71,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config return 1, 0, scheduler_kv_cache_config - with (patch.object(V0LLMEngine, "_initialize_kv_caches", - _initialize_kv_caches_v0), - patch.object(V1EngineCore, "_initialize_kv_caches", + with (patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1), monkeypatch.context() as m): if model_info.v0_only: - m.setenv("VLLM_USE_V1", "0") + # NOTE(woosuk): skip the test for V0-only models + return + if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"): # Phi4FlashForCausalLM and MotifForCausalLM # only supports DIFFERENTIAL_FLASH_ATTN backend diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 4aa7bb7297893..cb30d77c4f0ea 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -42,6 +42,7 @@ def test_oot_registration_text_generation( assert rest == "" +@pytest.mark.skip(reason="This test is skipped because it failed on V1.") @create_new_process_for_each_test() def test_oot_registration_embedding( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 6e2089ea2e0e2..1d7e4475011d0 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -7,15 +7,6 @@ import torch from vllm.plugins import load_general_plugins -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - def test_platform_plugins(): # simulate workload by running an example import runpy diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 8c21216108685..099869a82ad21 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -3,47 +3,18 @@ import pytest -from vllm.core.scheduler import Scheduler from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams -from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler -from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.engine.llm_engine import LLMEngine -class DummyV0Scheduler(Scheduler): - - def schedule(self): - raise Exception("Exception raised by DummyV0Scheduler") - - -class DummyV1Scheduler(V1Scheduler): +class DummyV1Scheduler(Scheduler): def schedule(self): raise Exception("Exception raised by DummyV1Scheduler") -def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with pytest.raises(Exception) as exception_info: - - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV0Scheduler, - ) - - engine = LLMEngine.from_engine_args(engine_args=engine_args) - - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() - - assert str( - exception_info.value) == "Exception raised by DummyV0Scheduler" - - def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): scheduler_cls=DummyV1Scheduler, ) - engine = V1LLMEngine.from_engine_args(engine_args=engine_args) + engine = LLMEngine.from_engine_args(engine_args=engine_args) sampling_params = SamplingParams(max_tokens=1) engine.add_request("0", "foo", sampling_params) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 0320a5ef31a65..2960ffcbd9eab 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM from vllm.assets.audio import AudioAsset - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. # 2. Increase beam_width to 8. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index ea4a17dd2306f..1d77d37a5d581 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -9,13 +9,6 @@ import pytest from vllm import SamplingParams - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - # We also test with llama because it has generation_config to specify EOS # (past regression). 
MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"] diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 86fc14dc85f80..220a4a53f4671 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -8,12 +8,6 @@ from vllm import SamplingParams MODELS = ["distilbert/distilgpt2"] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_ranks( diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 15ea55afe963b..bd2b91073d568 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -352,58 +352,3 @@ def test_decode_prompt_logprobs(complete_sequence: str, logprobs[token_id + 1].decoded_token for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs) ]) - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1]) -def test_decode_prompt_logprobs_chunked_prefill( - vllm_runner, - model, - chunked_prefill_token_size: int, - example_prompts, - monkeypatch, -): - # VLLM V1 does not use incremental detokenization for - # prompt logprobs, so this test strategy is irrelevant. - monkeypatch.setenv("VLLM_USE_V1", "0") - - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - - with vllm_runner(model, - dtype="half", - max_logprobs=5, - gpu_memory_utilization=0.5, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - - vllm_sampling_params = SamplingParams(max_tokens=10, - logprobs=5, - prompt_logprobs=5, - temperature=0.0) - vllm_results = vllm_model.llm.generate( - example_prompts, sampling_params=vllm_sampling_params) - - for idx, result in enumerate(vllm_results): - assert result.prompt_logprobs is not None - assert result.prompt_logprobs[0] is None - - # Compared detokenized prompts ids to original prompt. - generated_string = "" - for (prompt_token, - prompt_logprobs) in zip(result.prompt_token_ids[1:], - result.prompt_logprobs[1:]): - # prompt_logprobs is a dict of the token_id: logprob - # We select the token_id corresponding to the actual prompt - # Decoded token in the detokenized string corresponding to this - # prompt token. - generated_string += prompt_logprobs[prompt_token].decoded_token - - assert generated_string == example_prompts[idx], ( - "Detokenized prompt logprobs do not match original prompt") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8912ff8bad429..242fcf501bfc4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1508,14 +1508,6 @@ class EngineArgs: recommend_to_remove=True) return False - if self.kv_cache_dtype != "auto": - supported = current_platform.is_kv_cache_dtype_supported( - self.kv_cache_dtype, model_config) - if not supported: - _raise_or_fallback(feature_name="--kv-cache-dtype", - recommend_to_remove=False) - return False - # No Mamba or Encoder-Decoder so far. 
if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 014bc56bc8ece..a0fe38eb320d6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,1835 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time -from collections import Counter as collectionsCounter -from collections import deque -from contextlib import contextmanager -from dataclasses import dataclass -from functools import partial -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, - Iterable, List, Literal, Mapping, NamedTuple, Optional) -from typing import Sequence as GenericSequence -from typing import Set, Type, Union, cast +from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine -import torch -import torch.nn as nn -from typing_extensions import TypeVar - -import vllm.envs as envs -from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig, - ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics_types import StatLoggerBase, Stats -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.entrypoints.openai.logits_processors import ( - get_logits_processors as get_openai_logits_processors) -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs -from vllm.inputs.parse import split_enc_dec_inputs -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.logits_process import get_bad_words_logits_processors -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.cache import processor_only_cache_from_config -from vllm.multimodal.processing import EncDecMultiModalProcessor -from vllm.outputs import (PoolingRequestOutput, RequestOutput, - RequestOutputFactory) -from vllm.reasoning import ReasoningParser, ReasoningParserManager -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, - Sequence, SequenceGroup, SequenceGroupBase, - SequenceGroupMetadata, SequenceGroupOutput, - SequenceStatus) -from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, - init_tracer) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import (AnyTokenizer, - init_tokenizer_from_configs) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind -from vllm.version import __version__ as VLLM_VERSION -from vllm.worker.model_runner_base import InputProcessingError -from vllm.worker.worker_base import WorkerBase - -logger = init_logger(__name__) -_LOCAL_LOGGING_INTERVAL_SEC = 5 - -_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) -_R = TypeVar("_R", default=Any) - - -@dataclass -class SchedulerOutputState: - """Caches the scheduler outputs for a virtual engine. 
Used for Multi-Step""" - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - allow_async_output_proc: bool = False - last_output: Optional[SamplerOutput] = None - - -class OutputData(NamedTuple): - outputs: List[SamplerOutput] - seq_group_metadata_list: List[SequenceGroupMetadata] - scheduler_outputs: SchedulerOutputs - is_async: bool - is_last_step: bool - # Indicates if this output is from the first step of the - # multi-step. When multi-step is disabled, this is always - # set to True. - # is_first_step_output is invalid when `outputs` has - # outputs from multiple steps. - is_first_step_output: Optional[bool] - skip: List[int] - - -class SchedulerContext: - - def __init__(self) -> None: - self.output_queue: Deque[OutputData] = deque() - self.request_outputs: List[RequestOutput] = [] - self.seq_group_metadata_list: Optional[ - List[SequenceGroupMetadata]] = None - self.scheduler_outputs: Optional[SchedulerOutputs] = None - - def append_output(self, outputs: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduler_outputs: SchedulerOutputs, is_async: bool, - is_last_step: bool, - is_first_step_output: Optional[bool]): - self.output_queue.append( - OutputData(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=is_async, - is_last_step=is_last_step, - is_first_step_output=is_first_step_output, - skip=[])) - - -class LLMEngine: - """An LLM engine that receives requests and generates texts. - - This is the main class for the vLLM engine. It receives requests - from clients and generates texts from the LLM. It includes a tokenizer, a - language model (possibly distributed across multiple GPUs), and GPU memory - space allocated for intermediate states (aka KV cache). This class utilizes - iteration-level scheduling and efficient memory management to maximize the - serving throughput. - - The [`LLM`][vllm.LLM] class wraps this class for offline batched inference - and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine] - class wraps this class for online serving. - - The config arguments are derived from [`EngineArgs`][vllm.EngineArgs]. - - Args: - vllm_config: The configuration for initializing and running vLLM. - executor_class: The model executor class for managing distributed - execution. - log_stats: Whether to log statistics. - usage_context: Specified entry point, used for usage info collection. 
- """ - - DO_VALIDATE_OUTPUT: ClassVar[bool] = False - """A flag to toggle whether to validate the type of request output.""" - - @classmethod - @contextmanager - def enable_output_validation(cls): - cls.DO_VALIDATE_OUTPUT = True - - yield - - cls.DO_VALIDATE_OUTPUT = False - - @classmethod - def validate_output( - cls, - output: object, - output_type: Type[_O], - ) -> _O: - do_validate = cls.DO_VALIDATE_OUTPUT - - if ((TYPE_CHECKING or do_validate) - and not isinstance(output, output_type)): - raise TypeError(f"Expected output of type {output_type}, " - f"but found type {type(output)}") - - return cast(_O, output) - - @classmethod - def validate_outputs( - cls, - outputs: GenericSequence[object], - output_type: Type[_O], - ) -> List[_O]: - do_validate = cls.DO_VALIDATE_OUTPUT - - outputs_: List[_O] - if TYPE_CHECKING or do_validate: - outputs_ = [] - for output in outputs: - if not isinstance(output, output_type): - raise TypeError(f"Expected output of type {output_type}, " - f"but found type {type(output)}") - - outputs_.append(output) - else: - outputs_ = outputs - - return outputs_ - - tokenizer: Optional[AnyTokenizer] - - def __init__( - self, - vllm_config: VllmConfig, - executor_class: Type[ExecutorBase], - log_stats: bool, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - use_cached_outputs: bool = False, - ) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. " - "This should not happen. As a workaround, try using " - "LLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config # noqa - self.load_config = vllm_config.load_config - self.structured_outputs_config = vllm_config.structured_outputs_config - self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa - ) - - logger.info( - "Initializing a V0 LLM engine (v%s) with config: %s, " - "use_cached_outputs=%s, ", - VLLM_VERSION, - vllm_config, - use_cached_outputs, - ) - - self.log_stats = log_stats - self.use_cached_outputs = use_cached_outputs - - if self.model_config.skip_tokenizer_init: - self.tokenizer = None - self.detokenizer = None - else: - self.tokenizer = self._init_tokenizer() - self.detokenizer = Detokenizer(self.tokenizer) - - self.seq_counter = Counter() - self.generation_config_fields = ( - self.model_config.try_get_generation_config()) - - self.input_preprocessor = InputPreprocessor( - self.model_config, - self.tokenizer, - mm_registry, - mm_processor_cache=processor_only_cache_from_config( - self.model_config, mm_registry), - ) - - self.model_executor = executor_class(vllm_config=vllm_config) - - self._initialize_kv_caches() - - # If usage stat is enabled, collect relevant info. 
- if is_usage_stats_enabled(): - from vllm.model_executor.model_loader import ( - get_architecture_class_name) - usage_message.report_usage( - get_architecture_class_name(self.model_config), - usage_context, - extra_kvs={ - # Common configuration - "dtype": - str(self.model_config.dtype), - "tensor_parallel_size": - self.parallel_config.tensor_parallel_size, - "block_size": - self.cache_config.block_size, - "gpu_memory_utilization": - self.cache_config.gpu_memory_utilization, - "kv_cache_memory_bytes": - self.cache_config.kv_cache_memory_bytes, - # Quantization - "quantization": - self.model_config.quantization, - "kv_cache_dtype": - str(self.cache_config.cache_dtype), - - # Feature flags - "enable_lora": - bool(self.lora_config), - "enable_prefix_caching": - self.cache_config.enable_prefix_caching, - "enforce_eager": - self.model_config.enforce_eager, - "disable_custom_all_reduce": - self.parallel_config.disable_custom_all_reduce, - }) - - self.cached_scheduler_outputs = [ - SchedulerOutputState() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - self.scheduler_contexts = [ - SchedulerContext() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - if self.model_config.use_async_output_proc: - process_model_outputs = weak_bind(self._process_model_outputs) - - self.async_callbacks = [ - partial(process_model_outputs, - ctx=self.scheduler_contexts[v_id]) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - else: - self.async_callbacks = [] - - # Currently used by AsyncLLMEngine to ensure quick append - # of request outputs to asyncio queues - self.process_request_outputs_callback: Optional[Callable] = None - - # Create the scheduler. - # NOTE: the cache_config here have been updated with the numbers of - # GPU and CPU blocks, which are profiled in the distributed executor. - if isinstance(self.vllm_config.scheduler_config.scheduler_cls, str): - Scheduler = resolve_obj_by_qualname( - self.vllm_config.scheduler_config.scheduler_cls) - else: - Scheduler = self.vllm_config.scheduler_config.scheduler_cls - self.scheduler = [ - Scheduler( - self.scheduler_config, self.cache_config, self.lora_config, - self.parallel_config.pipeline_parallel_size, - self.async_callbacks[v_id] - if self.model_config.use_async_output_proc else None) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - - # Metric Logging. - if self.log_stats: - if stat_loggers is not None: - self.stat_loggers = stat_loggers - else: - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. - # See https://prometheus.github.io/client_python/multiprocess/ - from vllm.engine.metrics import (LoggingStatLogger, - PrometheusStatLogger) - - self.stat_loggers = { - "logging": - LoggingStatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - vllm_config=vllm_config), - "prometheus": - PrometheusStatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - labels=dict( - model_name=self.model_config.served_model_name), - vllm_config=vllm_config), - } - self.stat_loggers["prometheus"].info("cache_config", - self.cache_config) - - self.tracer = None - if self.observability_config.otlp_traces_endpoint: - self.tracer = init_tracer( - "vllm.llm_engine", - self.observability_config.otlp_traces_endpoint) - - # Initialize reasoning parser if reasoning backend is set. 
- if self.structured_outputs_config.reasoning_parser and self.tokenizer: - reasoner_class = ReasoningParserManager.get_reasoning_parser( - self.structured_outputs_config.reasoning_parser) - self.reasoner: ReasoningParser = reasoner_class( - self.tokenizer.get_lora_tokenizer()) - - # Create sequence output processor, e.g. for beam search or - # speculative decoding. - self.output_processor = ( - SequenceGroupOutputProcessor.create_output_processor( - self.scheduler_config, - self.detokenizer, - self.scheduler, - self.seq_counter, - stop_checker=StopChecker( - self.scheduler_config.max_model_len, - self.reasoner - if self.structured_outputs_config.reasoning_parser - and self.tokenizer else None, - ), - )) - - self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {} - - # Flag to set when an input fails to process and the engine should run - # the next step without re-scheduling. - self._skip_scheduling_next_step = False - - # Don't keep the dummy data in memory - self.reset_mm_cache() - - def _initialize_kv_caches(self) -> None: - """Initialize the KV cache in the worker(s). - - The workers will determine the number of blocks in both the GPU cache - and the swap CPU cache. - """ - start = time.time() - num_gpu_blocks, num_cpu_blocks = ( - self.model_executor.determine_num_available_blocks()) - - if self.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_gpu_blocks, - num_gpu_blocks_override) - num_gpu_blocks = num_gpu_blocks_override - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) - elapsed = time.time() - start - logger.info(("init engine (profile, create kv cache, " - "warmup model) took %.2f seconds"), elapsed) - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - # distributed_executor_backend must be set in VllmConfig.__post_init__ - distributed_executor_backend = ( - engine_config.parallel_config.distributed_executor_backend) - # Initialize the cluster and specify the executor class. - if isinstance(distributed_executor_backend, type): - if not issubclass(distributed_executor_backend, ExecutorBase): - raise TypeError( - "distributed_executor_backend must be a subclass of " - f"ExecutorBase. Got {distributed_executor_backend}.") - executor_class = distributed_executor_backend - elif distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( - RayDistributedExecutor) - executor_class = RayDistributedExecutor - elif distributed_executor_backend == "mp": - from vllm.executor.mp_distributed_executor import ( - MultiprocessingDistributedExecutor) - assert not envs.VLLM_USE_RAY_SPMD_WORKER, ( - "multiprocessing distributed executor backend does not " - "support VLLM_USE_RAY_SPMD_WORKER=1") - executor_class = MultiprocessingDistributedExecutor - elif distributed_executor_backend == "uni": - # JAX-style, single-process, multi-device executor. 
- from vllm.executor.uniproc_executor import UniProcExecutor - executor_class = UniProcExecutor - elif distributed_executor_backend == "external_launcher": - # executor with external launcher - from vllm.executor.uniproc_executor import ( # noqa - ExecutorWithExternalLauncher) - executor_class = ExecutorWithExternalLauncher - else: - raise ValueError("unrecognized distributed_executor_backend: " - f"{distributed_executor_backend}") - return executor_class - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - vllm_config = engine_args.create_engine_config(usage_context) - - engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - engine_cls = V1LLMEngine - - return engine_cls.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - ) - - def __reduce__(self): - # This is to ensure that the LLMEngine is not referenced in - # the closure used to initialize Ray worker actors - raise RuntimeError("LLMEngine should not be pickled!") - - def __del__(self): - # Shutdown model executor when engine is garbage collected - # Use getattr since __init__ can fail before the field is set - if model_executor := getattr(self, "model_executor", None): - model_executor.shutdown() - - def get_tokenizer(self) -> AnyTokenizer: - if self.tokenizer is None: - raise ValueError("Unable to get tokenizer because " - "skip_tokenizer_init is True") - - return self.tokenizer - - def _init_tokenizer(self) -> AnyTokenizer: - return init_tokenizer_from_configs(model_config=self.model_config) - - def _verify_args(self) -> None: - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: - self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: ProcessorInputs, - params: SamplingParams, - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> Optional[SequenceGroup]: - """Add a processed request to the engine's request pool. - return the created sequence group. - """ - if isinstance(params, SamplingParams) and params.n > 1: - ParallelSampleSequenceGroup.add_request( - request_id, - self, - params, - processed_inputs=processed_inputs, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - return None - - self._validate_model_inputs(processed_inputs) - # Create the sequences. 
- block_size = self.cache_config.block_size - seq_id = next(self.seq_counter) - eos_token_id = self.input_preprocessor.get_eos_token_id() - - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - - seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, - lora_request) - - encoder_seq = (None if encoder_inputs is None else Sequence( - seq_id, encoder_inputs, block_size, eos_token_id, lora_request)) - - # Create a SequenceGroup based on SamplingParams - if isinstance(params, SamplingParams): - seq_group = self._create_sequence_group_with_sampling( - request_id, - seq, - params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - encoder_seq=encoder_seq, - priority=priority) - else: - raise ValueError("SamplingParams must be provided.") - - # Add the sequence group to the scheduler with least unfinished seqs. - costs = [ - scheduler.get_num_unfinished_seq_groups() - for scheduler in self.scheduler - ] - min_cost_scheduler = self.scheduler[costs.index(min(costs))] - min_cost_scheduler.add_seq_group(seq_group) - - return seq_group - - def stop_remote_worker_execution_loop(self) -> None: - self.model_executor.stop_remote_worker_execution_loop() - - def add_request( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - """Add a request to the engine's request pool. - - The request is added to the request pool and will be processed by the - scheduler as `engine.step()` is called. The exact scheduling policy is - determined by the scheduler. - - Args: - request_id: The unique ID of the request. - prompt: The prompt to the LLM. See - [PromptType][vllm.inputs.PromptType] - for more details about the format of each input. - params: Parameters for sampling. - [SamplingParams][vllm.SamplingParams] for text generation. - arrival_time: The arrival time of the request. If None, we use - the current monotonic time. - lora_request: The LoRA request to add. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - - Details: - - Set arrival_time to the current time if it is None. - - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of [Sequence][vllm.sequence.Sequence] objects. - - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object - from the list of [Sequence][vllm.sequence.Sequence]. - - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the - scheduler. - - Example: - >>> # initialize engine - >>> engine = LLMEngine.from_engine_args(engine_args) - >>> # set request arguments - >>> example_prompt = "Who is the president of the United States?" - >>> sampling_params = SamplingParams(temperature=0.0) - >>> request_id = 0 - >>> - >>> # add the request to the engine - >>> engine.add_request( - >>> str(request_id), - >>> example_prompt, - >>> SamplingParams(temperature=0.0)) - >>> # continue the request processing - >>> ... 
- """ - if not isinstance(request_id, str): - raise TypeError( - f"request_id must be a string, got {type(request_id)}") - - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - if isinstance(params, SamplingParams) \ - and params.logits_processors: - raise ValueError( - "Logits processors are not supported in multi-step decoding") - - if arrival_time is None: - arrival_time = time.time() - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None): - if not prompt.get("prompt_token_ids", None): - seq_len = prompt["prompt_embeds"].shape[0] - prompt["prompt_token_ids"] = [0] * seq_len - if params.prompt_logprobs is not None: - raise ValueError( - "prompt_logprobs is not compatible with prompt embeds.") - - processed_inputs = self.input_preprocessor.preprocess( - prompt, - tokenization_kwargs=tokenization_kwargs, - ) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - def _create_sequence_group_with_sampling( - self, - request_id: str, - seq: Sequence, - sampling_params: SamplingParams, - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - encoder_seq: Optional[Sequence] = None, - priority: int = 0, - ) -> SequenceGroup: - """Creates a SequenceGroup with SamplingParams.""" - max_logprobs = self.get_model_config().max_logprobs - if (sampling_params.logprobs - and sampling_params.logprobs > max_logprobs) or ( - sampling_params.prompt_logprobs - and sampling_params.prompt_logprobs > max_logprobs): - raise ValueError(f"Cannot request more than " - f"{max_logprobs} logprobs.") - - sampling_params = self._build_logits_processors( - sampling_params, lora_request) - - # Defensive copy of SamplingParams, which are used by the sampler, - # this doesn't deep-copy LogitsProcessor objects - sampling_params = sampling_params.clone() - - sampling_params.update_from_generation_config( - self.generation_config_fields, seq.eos_token_id) - - # Create the sequence group. - draft_size = 1 - if self.vllm_config.speculative_config is not None: - draft_size = \ - self.vllm_config.speculative_config.num_speculative_tokens + 1 - seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - sampling_params=sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - encoder_seq=encoder_seq, - priority=priority, - draft_size=draft_size) - - return seq_group - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - """Aborts a request(s) with the given ID. - - Args: - request_id: The ID(s) of the request to abort. - - Details: - - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][]. 
- - Example: - >>> # initialize engine and add a request with request_id - >>> request_id = str(0) - >>> # abort the request - >>> engine.abort_request(request_id) - """ - for scheduler in self.scheduler: - scheduler.abort_seq_group( - request_id, seq_id_to_seq_group=self.seq_id_to_seq_group) - - def get_vllm_config(self) -> VllmConfig: - """Gets the vllm configuration.""" - return self.vllm_config - - def get_model_config(self) -> ModelConfig: - """Gets the model configuration.""" - return self.model_config - - def get_parallel_config(self) -> ParallelConfig: - """Gets the parallel configuration.""" - return self.parallel_config - - def get_scheduler_config(self) -> SchedulerConfig: - """Gets the scheduler configuration.""" - return self.scheduler_config - - def get_lora_config(self) -> LoRAConfig: - """Gets the LoRA configuration.""" - return self.lora_config - - def get_num_unfinished_requests(self) -> int: - """Gets the number of unfinished requests.""" - return sum(scheduler.get_num_unfinished_seq_groups() - for scheduler in self.scheduler) - - def has_unfinished_requests(self) -> bool: - """Returns True if there are unfinished requests.""" - return any(scheduler.has_unfinished_seqs() - for scheduler in self.scheduler) - - def has_unfinished_requests_for_virtual_engine( - self, virtual_engine: int) -> bool: - """ - Returns True if there are unfinished requests for the virtual engine. - """ - return self.scheduler[virtual_engine].has_unfinished_seqs() - - def reset_mm_cache(self) -> bool: - """Reset the multi-modal cache.""" - self.input_preprocessor.clear_cache() - return True - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for all devices.""" - - success = True - for scheduler in self.scheduler: - success = success and scheduler.reset_prefix_cache(device) - return success - - def _process_model_outputs(self, - ctx: SchedulerContext, - request_id: Optional[str] = None) -> None: - """Apply the model output to the sequences in the scheduled seq groups - and return responses. 
- - ctx: The virtual engine context to work on - request_id: If provided, then only this request is going to be processed - """ - - now = time.time() - - if len(ctx.output_queue) == 0: - return None - - # Get pending async postprocessor - if request_id: - # When we process only one request, no pop is required - # (since later we will process all of the rest) - (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, is_first_step_output, skip) = ctx.output_queue[0] - else: - (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, is_first_step_output, - skip) = ctx.output_queue.popleft() - - # Sanity check - assert len(seq_group_metadata_list) == len( - scheduler_outputs.scheduled_seq_groups) - - has_multiple_outputs: bool = len(outputs) > 1 - outputs_by_sequence_group: List[List[SequenceGroupOutput]] - assert not has_multiple_outputs - outputs_by_sequence_group = outputs - - # Determine the requests we need to operate on - if request_id: - indices = [] - for i, seq_group_meta in enumerate(seq_group_metadata_list): - if seq_group_meta.request_id == request_id: - assert i not in skip # Cannot be called twice - indices.append(i) - break - - # If the request_id was not found, then it means that - # this is a new request that has no pending async - # postprocessor - if not indices: - return - else: - indices = range(len(seq_group_metadata_list)) # type: ignore - - finished_before: List[int] = [] - finished_now: List[int] = [] - for i in indices: - if i in skip: - continue - - seq_group_meta = seq_group_metadata_list[i] - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group: SequenceGroup = scheduled_seq_group.seq_group - - if seq_group.is_finished(): - finished_before.append(i) - continue - - output: List[SequenceGroupOutput] - if has_multiple_outputs: - output = outputs_by_sequence_group[i] - else: - output = [outputs_by_sequence_group[0][i]] - - if not is_async: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size or 0) - - if outputs: - for o in outputs: - if (isinstance(o, SamplerOutput) - and seq_group.metrics is not None): - if seq_group.metrics.model_forward_time is not None: - seq_group.metrics.model_forward_time += ( - o.model_forward_time or 0) - else: - seq_group.metrics.model_forward_time = ( - o.model_forward_time) - if seq_group.metrics.model_execute_time is not None: - seq_group.metrics.model_execute_time += ( - o.model_execute_time or 0) - else: - seq_group.metrics.model_execute_time = ( - o.model_execute_time) - - self.output_processor.process_prompt_logprob(seq_group, output) - if seq_group_meta.do_sample: - self.output_processor.process_outputs(seq_group, output, - is_async) - - if seq_group.is_finished(): - finished_now.append(i) - - # Generate outputs for the requests that finished this iteration - for i in finished_now: - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) - if not seq_group.is_prefill(): - seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs) - if request_output: - ctx.request_outputs.append(request_output) - - # When we process a single request, we skip it for the next time, - # and invoke the request output callback (if there was final output) - if request_id: - assert len(indices) == 1 - skip.append(indices[0]) - - if (finished_now - and 
self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - - # Free currently finished requests - if finished_now: - for scheduler in self.scheduler: - scheduler.free_finished_seq_groups() - - # Create the outputs - for i in indices: - if i in skip or i in finished_before or i in finished_now: - continue # Avoids double processing - - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) - if not seq_group.is_prefill(): - seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs) - if request_output: - ctx.request_outputs.append(request_output) - - # Create outputs only after processing the scheduler's results - - for seq_group in scheduler_outputs.ignored_seq_groups: - params = seq_group.sampling_params - if params is not None and params.output_kind == ( - RequestOutputKind.DELTA) and not seq_group.is_finished(): - continue - - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs, - ) - if request_output: - ctx.request_outputs.append(request_output) - - # Immediately process request outputs here (if callback is given) - if (ctx.request_outputs - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - - # For async case, we need to record the stats here. - # For non-async case, the stats are done in the - # LLMEngine/AsyncLLMEngine directly - if is_async: - # Log stats. - self.do_log_stats(scheduler_outputs, outputs, finished_before, - skip) - - # Tracing - self.do_tracing(scheduler_outputs, finished_before) - - return None - - def _advance_to_next_step( - self, output: SamplerOutput, - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done inside output processor, but it is - required if the worker is to perform async forward pass to next step. - """ - for seq_group_metadata, sequence_group_outputs, scheduled_seq_group in \ - zip(seq_group_metadata_list, output, scheduled_seq_groups): - seq_group = scheduled_seq_group.seq_group - - if seq_group.is_finished(): - continue - - token_chunk_size = (seq_group_metadata.token_chunk_size - if seq_group_metadata.token_chunk_size - is not None else 0) - seq_group.update_num_computed_tokens(token_chunk_size) - - if seq_group_metadata.do_sample: - assert len(sequence_group_outputs.samples) == 1, ( - "Async output processor expects a single sample" - " (i.e sampling_params.n == 1)") - sample = sequence_group_outputs.samples[0] - - assert len(seq_group.seqs) == 1 - seq = seq_group.seqs[0] - - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - - def step(self) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - -
 - ![Overview of the step function](https://i.imgur.com/sv2HssD.png) -
- - Details: - - Step 1: Schedules the sequences to be executed in the next - iteration and the token blocks to be swapped in/out/copy. - - - Depending on the scheduling policy, - sequences may be `preempted/reordered`. - - A Sequence Group (SG) refer to a group of sequences - that are generated from the same prompt. - - - Step 2: Calls the distributed executor to execute the model. - - Step 3: Processes the model output. This mainly includes: - - - Decodes the relevant outputs. - - Updates the scheduled sequence groups with model outputs - based on its `sampling parameters` (`use_beam_search` or not). - - Frees the finished sequence groups. - - - Finally, it creates and returns the newly generated results. - - Example: - ``` - # Please see the example/ folder for more detailed examples. - - # initialize engine and request arguments - engine = LLMEngine.from_engine_args(engine_args) - example_inputs = [(0, "What is LLM?", - SamplingParams(temperature=0.0))] - - # Start the engine with an event loop - while True: - if example_inputs: - req_id, prompt, sampling_params = example_inputs.pop(0) - engine.add_request(str(req_id),prompt,sampling_params) - - # continue the request processing - request_outputs = engine.step() - for request_output in request_outputs: - if request_output.finished: - # return or show the request output - - if not (engine.has_unfinished_requests() or example_inputs): - break - ``` - """ - if self.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError( - "Pipeline parallelism is only supported through AsyncLLMEngine " - "as performance will be severely degraded otherwise.") - - # For llm_engine, there is no pipeline parallel support, so the engine - # used is always 0. - virtual_engine = 0 - - # These are cached outputs from previous iterations. None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # Skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. - # The scheduler is also skipped if a single request caused the last - # engine step to fail, and the previous schedule needs to be rerun. - if not self._has_remaining_steps( - seq_group_metadata_list - ) and not self._skip_scheduling_next_step: - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - # When n>1, elements in self.seq_id_to_seq_group should be deleted - # here, otherwise memory leaks. 
- for finished_request_id in finished_requests_ids: - if finished_request_id in self.seq_id_to_seq_group: - del self.seq_id_to_seq_group[finished_request_id] - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration. - # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. - last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - try: - outputs = self.model_executor.execute_model( - execute_model_req=execute_model_req) - self._skip_scheduling_next_step = False - except InputProcessingError as e: - # The input for this request cannot be processed, so we must - # abort it. If there are remaining requests in the batch that - # have been scheduled, they will be retried on the next step. - invalid_request_id = e.request_id - self._abort_and_cache_schedule( - request_id=invalid_request_id, - virtual_engine=virtual_engine, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - allow_async_output_proc=allow_async_output_proc) - # Raise so the caller is notified that this request failed - raise - - else: - # Nothing scheduled => If there is pending async postprocessor, - # then finish it here. - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - # No outputs in this case - outputs = [] - - if not self._has_remaining_steps(seq_group_metadata_list): - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. - is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - # Add results to the output_queue - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len(outputs) == 1, ( - "Async postprocessor expects only a single output set") - - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - # Check if need to run the usual non-async path - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. 
- self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - # Stop the execute model loop in parallel workers until there are - # more requests to process. This avoids waiting indefinitely in - # torch.distributed ops which may otherwise time out, and unblocks - # the RPC thread in the workers so that they can process any other - # queued control plane messages, such as add/remove lora adapters. - logger.debug("Stopping remote worker execution loop.") - self.model_executor.stop_remote_worker_execution_loop() - - return ctx.request_outputs - - def _abort_and_cache_schedule( - self, request_id: str, virtual_engine: int, - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduler_outputs: SchedulerOutputs, - allow_async_output_proc: bool) -> None: - """Aborts a single request, and caches the scheduler outputs minus that - request. This allows the next step to continue processing the remaining - requests without having to re-run the scheduler.""" - - # Abort the request and remove its sequence group from the current - # schedule - self.abort_request(request_id) - for i, metadata in enumerate(seq_group_metadata_list): - if metadata.request_id == request_id: - del seq_group_metadata_list[i] - break - for i, group in enumerate(scheduler_outputs.scheduled_seq_groups): - if group.seq_group.request_id == request_id: - del scheduler_outputs.scheduled_seq_groups[i] - break - - # If there are still other sequence groups left in the schedule, cache - # them and flag the engine to reuse the schedule. - if len(seq_group_metadata_list) > 0: - self._skip_scheduling_next_step = True - # Reuse multi-step caching logic - self._cache_scheduler_outputs_for_multi_step( - virtual_engine=virtual_engine, - scheduler_outputs=scheduler_outputs, - seq_group_metadata_list=seq_group_metadata_list, - allow_async_output_proc=allow_async_output_proc) - - def _has_remaining_steps( - self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] - ) -> bool: - return False - - def _cache_scheduler_outputs_for_multi_step( - self, virtual_engine: int, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - scheduler_outputs: SchedulerOutputs, - allow_async_output_proc: bool) -> None: - co = self.cached_scheduler_outputs[virtual_engine] - - co.seq_group_metadata_list = seq_group_metadata_list - co.scheduler_outputs = scheduler_outputs - co.allow_async_output_proc = allow_async_output_proc - co.last_output = None - - def _update_cached_scheduler_output( - self, virtual_engine: int, - output: List[Optional[SamplerOutput]]) -> None: - if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0 - and output[0] is not None): - last_output = output[-1] - assert last_output is not None - assert last_output.sampled_token_ids_cpu is not None - assert last_output.sampled_token_ids is None - assert last_output.sampled_token_probs is None - self.cached_scheduler_outputs[ - virtual_engine].last_output = last_output - - def _get_last_sampled_token_ids( - self, virtual_engine: int) -> Optional[torch.Tensor]: - return None - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - if not self.log_stats: - raise RuntimeError( - "Stat logging is disabled. 
Set `disable_log_stats=False` " - "argument to enable.") - if logger_name in self.stat_loggers: - raise KeyError(f"Logger with name {logger_name} already exists.") - self.stat_loggers[logger_name] = logger - - def remove_logger(self, logger_name: str) -> None: - if not self.log_stats: - raise RuntimeError( - "Stat logging is disabled. Set `disable_log_stats=False` " - "argument to enable.") - if logger_name not in self.stat_loggers: - raise KeyError(f"Logger with name {logger_name} does not exist.") - del self.stat_loggers[logger_name] - - def do_log_stats(self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> None: - """Forced log when no requests active.""" - if self.log_stats: - stats = self._get_stats(scheduler_outputs, model_output, - finished_before, skip) - for logger in self.stat_loggers.values(): - logger.log(stats) - - def _get_stats(self, - scheduler_outputs: Optional[SchedulerOutputs], - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> Stats: - """Get Stats to be Logged to Prometheus. - - Args: - scheduler_outputs: Optional, used to populate metrics related to - the scheduled batch, - model_output: Optional, used to emit speculative decoding metrics - which are created by the workers. - finished_before: Optional, indices of sequences that were finished - before. These sequences will be ignored. - skip: Optional, indices of sequences that were preempted. These - sequences will be ignored. - """ - now = time.time() - - # System State - # Scheduler State - num_running_sys = sum( - len(scheduler.running) for scheduler in self.scheduler) - num_swapped_sys = sum( - len(scheduler.swapped) for scheduler in self.scheduler) - num_waiting_sys = sum( - len(scheduler.waiting) for scheduler in self.scheduler) - - # KV Cache Usage in % - num_total_gpu = self.cache_config.num_gpu_blocks - gpu_cache_usage_sys = 0. - if num_total_gpu: # Guard against both None and 0 - num_free_gpu = sum( - scheduler.block_manager.get_num_free_gpu_blocks() - for scheduler in self.scheduler) - gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu) - - num_total_cpu = self.cache_config.num_cpu_blocks - cpu_cache_usage_sys = 0. - if num_total_cpu: # Guard against both None and 0 - num_free_cpu = sum( - scheduler.block_manager.get_num_free_cpu_blocks() - for scheduler in self.scheduler) - cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu) - - # Prefix Cache Hit Rate. Note that we always use - # the cache hit rate of the first virtual engine. - cpu_prefix_cache_hit_rate = self.scheduler[ - 0].get_prefix_cache_hit_rate(Device.CPU) - gpu_prefix_cache_hit_rate = self.scheduler[ - 0].get_prefix_cache_hit_rate(Device.GPU) - - # Exchange the uasge and cache hit stats between gpu and cpu when - # running on cpu because the cpu_worker.py intentionally reports the - # number of cpu blocks as gpu blocks in favor of cache management. 
- if self.device_config.device_type == "cpu": - num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu - gpu_cache_usage_sys, cpu_cache_usage_sys = ( - cpu_cache_usage_sys, - gpu_cache_usage_sys, - ) - gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = ( - cpu_prefix_cache_hit_rate, - gpu_prefix_cache_hit_rate, - ) - - # Iteration stats - num_prompt_tokens_iter = 0 - num_generation_tokens_iter = 0 - num_tokens_iter = 0 - time_to_first_tokens_iter: List[float] = [] - inter_token_latencies_iter: List[float] = [] - num_preemption_iter = (0 if scheduler_outputs is None else - scheduler_outputs.preempted) - - # Request stats - # Latency - time_e2e_requests: List[float] = [] - time_queue_requests: List[float] = [] - time_inference_requests: List[float] = [] - time_prefill_requests: List[float] = [] - time_decode_requests: List[float] = [] - # Metadata - num_prompt_tokens_requests: List[int] = [] - num_generation_tokens_requests: List[int] = [] - n_requests: List[int] = [] - max_num_generation_tokens_requests: List[int] = [] - max_tokens_requests: List[int] = [] - finished_reason_requests: List[str] = [] - - # LoRA requests - running_lora_adapters = dict( - collectionsCounter([ - running_request.lora_request.lora_name - for scheduler in self.scheduler - for running_request in scheduler.running - if running_request.lora_request - ])) - waiting_lora_adapters = dict( - collectionsCounter([ - waiting_request.lora_request.lora_name - for scheduler in self.scheduler - for waiting_request in scheduler.waiting - if waiting_request.lora_request - ])) - max_lora_stat = "0" - if self.lora_config: - max_lora_stat = str(self.lora_config.max_loras) - - # NOTE: This loop assumes prefill seq_groups are before - # decode seq_groups in scheduled_seq_groups. - if scheduler_outputs is not None: - # For async postprocessor, already finished sequences need to be - # not counted (to avoid double counting) - actual_num_batched_tokens = scheduler_outputs.num_batched_tokens # type: ignore - - num_generation_tokens_from_prefill_groups = 0 - # NOTE: if scheduler_outputs.num_prefill_groups > 0 and - # the len of scheduler_outputs.scheduled_seq_groups is != - # scheduler_outputs.num_prefill_groups, this means that - # chunked prefills have been detected. - - for idx, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - # Skip double logging when using async output proc - if finished_before and idx in finished_before: - actual_num_batched_tokens -= 1 - continue - - # Currently, skip == preempted sequences, so we need to skip - # their log stats - if skip and idx in skip: - continue - - group_was_prefill = idx < scheduler_outputs.num_prefill_groups - seq_group = scheduled_seq_group.seq_group - - # NOTE: a seq_group that completed all of its prefill tokens - # in the last iteration will have seq_group.is_prefill() = False - # with group_was_prefill = True - if group_was_prefill: - # Number of prompt tokens. - num_prompt_tokens_iter += ( - scheduled_seq_group.token_chunk_size) - - # If the seq_group just finished the prefill state - # get TTFT. - if not seq_group.is_prefill(): - latency = seq_group.get_last_token_latency() - time_to_first_tokens_iter.append(latency) - - # One generation token per finished prefill. 
- num_generation_tokens_from_prefill_groups += ( - seq_group.num_seqs()) - else: - # ITLs - latency = seq_group.get_last_token_latency() - inter_token_latencies_iter.append(latency) - if seq_group.state.current_step == 0: - # For async_output_proc, the do_log_stats() - # is called following init_multi_step(), which - # sets the current_step to zero. - actual_num_batched_tokens +=\ - seq_group.state.num_steps - 1 - else: - actual_num_batched_tokens +=\ - seq_group.state.current_step - 1 - - # Because of chunked prefill, we can have a single sequence - # group that does multiple prompt_runs. To prevent logging - # the same metadata more than once per request, we standardize - # on logging request level information for finished requests, - # which can only happen once. - if seq_group.is_finished(): - # Latency timings - time_e2e_requests.append(now - - seq_group.metrics.arrival_time) - if (seq_group.metrics.first_scheduled_time is not None and - seq_group.metrics.first_token_time is not None): - time_queue_requests.append( - seq_group.metrics.first_scheduled_time - - seq_group.metrics.arrival_time) - time_prefill_requests.append( - seq_group.metrics.first_token_time - - seq_group.metrics.first_scheduled_time) - time_decode_requests.append( - now - seq_group.metrics.first_token_time) - time_inference_requests.append( - now - seq_group.metrics.first_scheduled_time) - # Metadata - num_prompt_tokens_requests.append( - len(seq_group.prompt_token_ids)) - num_generation_tokens_requests.extend([ - seq.get_output_len() - for seq in seq_group.get_finished_seqs() - ]) - max_num_generation_tokens_requests.append( - max(seq.get_output_len() - for seq in seq_group.get_seqs())) - if seq_group.sampling_params is not None: - n_requests.append(seq_group.sampling_params.n) - max_tokens_requests.append( - seq_group.sampling_params.max_tokens) - finished_reason_requests.extend([ - SequenceStatus.get_finished_reason(seq.status) - for seq in seq_group.get_finished_seqs() - ]) - - # Number of generation tokens. - # num_batched_tokens equals the number of prompt_tokens plus the - # number of decode_tokens in a single iteration. So, - # num_generation_tokens = num_batched_tokens - num_prompt_tokens - # + num_generation_tokens_from_prefill_groups (since we generate - # one token on prefills on iters where the prefill finishes). 
- num_generation_tokens_iter = ( - actual_num_batched_tokens - num_prompt_tokens_iter + - num_generation_tokens_from_prefill_groups) - num_tokens_iter = (num_generation_tokens_iter + - num_prompt_tokens_iter) - - return Stats( - now=now, - # System stats - # Scheduler State - num_running_sys=num_running_sys, - num_swapped_sys=num_swapped_sys, - num_waiting_sys=num_waiting_sys, - # KV Cache Usage in % - gpu_cache_usage_sys=gpu_cache_usage_sys, - cpu_cache_usage_sys=cpu_cache_usage_sys, - # Prefix Cache Hit Rate - cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate, - gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate, - - # Iteration stats - num_prompt_tokens_iter=num_prompt_tokens_iter, - num_generation_tokens_iter=num_generation_tokens_iter, - num_tokens_iter=num_tokens_iter, - time_to_first_tokens_iter=time_to_first_tokens_iter, - inter_token_latencies_iter=inter_token_latencies_iter, - num_preemption_iter=num_preemption_iter, - - # Request stats - # Latency - time_e2e_requests=time_e2e_requests, - time_queue_requests=time_queue_requests, - time_inference_requests=time_inference_requests, - time_prefill_requests=time_prefill_requests, - time_decode_requests=time_decode_requests, - # Metadata - num_prompt_tokens_requests=num_prompt_tokens_requests, - num_generation_tokens_requests=num_generation_tokens_requests, - max_num_generation_tokens_requests= - max_num_generation_tokens_requests, - n_requests=n_requests, - max_tokens_requests=max_tokens_requests, - finished_reason_requests=finished_reason_requests, - max_lora=str(max_lora_stat), - waiting_lora_adapters=list(waiting_lora_adapters.keys()), - running_lora_adapters=list(running_lora_adapters.keys())) - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_executor.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_executor.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_executor.list_loras() - - def pin_lora(self, lora_id: int) -> bool: - return self.model_executor.pin_lora(lora_id) - - def start_profile(self) -> None: - self.model_executor.start_profile() - - def stop_profile(self) -> None: - self.model_executor.stop_profile() - - def sleep(self, level: int = 1) -> None: - assert self.vllm_config.model_config.enable_sleep_mode, ( - "Sleep mode is not enabled in the model config") - self.model_executor.sleep(level=level) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - assert self.vllm_config.model_config.enable_sleep_mode, ( - "Sleep mode is not enabled in the model config") - self.model_executor.wake_up(tags) - - def is_sleeping(self) -> bool: - return self.model_executor.is_sleeping - - def check_health(self) -> None: - self.model_executor.check_health() - - def is_tracing_enabled(self) -> bool: - return self.tracer is not None - - def do_tracing(self, - scheduler_outputs: SchedulerOutputs, - finished_before: Optional[List[int]] = None) -> None: - if self.tracer is None: - return - - for idx, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - # Skip double tracing when using async output proc - if finished_before and idx in finished_before: - continue - - seq_group = scheduled_seq_group.seq_group - if seq_group.is_finished(): - self.create_trace_span(seq_group) - - def create_trace_span(self, seq_group: SequenceGroup) -> None: - if self.tracer is None or seq_group.sampling_params is None: - return - arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9) - - trace_context = 
extract_trace_context(seq_group.trace_headers) - - with self.tracer.start_as_current_span( - "llm_request", - kind=SpanKind.SERVER, - context=trace_context, - start_time=arrival_time_nano_seconds) as seq_span: - metrics = seq_group.metrics - - # Handle potential None values for cancelled/aborted requests - ttft = (metrics.first_token_time - metrics.arrival_time - if metrics.first_token_time is not None else None) - - e2e_time = (metrics.finished_time - metrics.arrival_time - if metrics.finished_time is not None else None) - - seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, - self.model_config.model) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, - seq_group.request_id) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, - seq_group.sampling_params.temperature) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, - seq_group.sampling_params.top_p) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, - seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, - seq_group.sampling_params.n) - seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES, - seq_group.num_seqs()) - seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, - len(seq_group.prompt_token_ids)) - seq_span.set_attribute( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, - sum([ - seq.get_output_len() - for seq in seq_group.get_finished_seqs() - ])) - - # Only set timing attributes if the values are available - if metrics.time_in_queue is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, - metrics.time_in_queue) - if ttft is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) - if e2e_time is not None: - seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, - e2e_time) - if metrics.scheduler_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER, - metrics.scheduler_time) - if metrics.model_forward_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD, - metrics.model_forward_time / 1000.0) - if metrics.model_execute_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE, - metrics.model_execute_time) - - def _validate_model_inputs(self, inputs: ProcessorInputs): - encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) - - if encoder_inputs is not None: - self._validate_model_input(encoder_inputs, prompt_type="encoder") - - self._validate_model_input(decoder_inputs, prompt_type="decoder") - - def _validate_model_input( - self, - prompt_inputs: SingletonInputs, - *, - prompt_type: Literal["encoder", "decoder"], - ): - model_config = self.model_config - tokenizer = self.tokenizer - - prompt_ids = prompt_inputs.get("prompt_token_ids", []) - if not prompt_ids: - if prompt_type == "encoder" and model_config.is_multimodal_model: - pass # Mllama may have empty encoder inputs for text-only data - elif prompt_inputs["type"] == "embeds": - pass - else: - raise ValueError(f"The {prompt_type} prompt cannot be empty") - - if tokenizer is not None: - max_input_id = max(prompt_ids, default=0) - if max_input_id > tokenizer.max_token_id: - raise ValueError( - f"Token id {max_input_id} is out of vocabulary") - - max_prompt_len = self.model_config.max_model_len - if len(prompt_ids) > max_prompt_len: - if prompt_type == "encoder" and model_config.is_multimodal_model: - mm_registry = self.input_preprocessor.mm_registry 
- mm_processor = mm_registry.create_processor( - model_config, - tokenizer=tokenizer or object(), # Dummy if no tokenizer - ) - assert isinstance(mm_processor, EncDecMultiModalProcessor) - - if mm_processor.pad_dummy_encoder_prompt: - return # Skip encoder length check for Whisper - - if model_config.is_multimodal_model: - suggestion = ( - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens plus multimodal tokens. For image " - "inputs, the number of image tokens depends on the number " - "of images, and possibly their aspect ratios as well.") - else: - suggestion = ( - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens.") - - raise ValueError( - f"The {prompt_type} prompt (length {len(prompt_ids)}) is " - f"longer than the maximum model length of {max_prompt_len}. " - f"{suggestion}") - - # TODO: Find out how many placeholder tokens are there so we can - # check that chunked prefill does not truncate them - # max_batch_len = self.scheduler_config.max_num_batched_tokens - - def _build_logits_processors( - self, sampling_params: SamplingParams, - lora_request: Optional[LoRARequest]) -> SamplingParams: - """Constructs logits processors based on the logits_bias, and - allowed_token_ids fields in sampling_params. Deletes those fields and - adds the constructed logits processors to the logits_processors field. - Returns the modified sampling params.""" - - logits_processors = [] - - if (sampling_params.logit_bias or sampling_params.allowed_token_ids): - tokenizer = self.get_tokenizer() - - processors = get_openai_logits_processors( - logit_bias=sampling_params.logit_bias, - allowed_token_ids=sampling_params.allowed_token_ids, - tokenizer=tokenizer) - logits_processors.extend(processors) - - # Unset so these don't get passed down to the model - sampling_params.logit_bias = None - sampling_params.allowed_token_ids = None - - if len(sampling_params.bad_words) > 0: - tokenizer = self.get_tokenizer() - processors = get_bad_words_logits_processors( - bad_words=sampling_params.bad_words, tokenizer=tokenizer) - logits_processors.extend(processors) - - if logits_processors: - if sampling_params.logits_processors is None: - sampling_params.logits_processors = logits_processors - else: - sampling_params.logits_processors.extend(logits_processors) - - return sampling_params - - def collective_rpc(self, - method: Union[str, Callable[[WorkerBase], _R]], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict[str, Any]] = None) -> list[_R]: - return self.model_executor.collective_rpc(method, timeout, args, - kwargs) - - def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - return self.collective_rpc("apply_model", args=(func, )) - - -if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - LLMEngine = V1LLMEngine # type: ignore +LLMEngine = V1LLMEngine # type: ignore diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f2282c40f7073..0ab806fcb8b5c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -11,7 +11,6 @@ from pydantic import ValidationError from tqdm.auto import tqdm from typing_extensions import TypeVar -import vllm.envs as envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) @@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType, StructuredOutputsConfig, TokenizerMode, is_init_field) from vllm.engine.arg_utils import 
(ConvertOption, EngineArgs, HfOverrides, PoolerConfig, RunnerOption) -from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption, apply_hf_chat_template, @@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, Device, as_iter, is_list_of +from vllm.v1.engine.llm_engine import LLMEngine from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: @@ -309,11 +308,7 @@ class LLM: self.request_counter = Counter() self.default_sampling_params: Union[dict[str, Any], None] = None - if envs.VLLM_USE_V1: - supported_tasks = self.llm_engine \ - .get_supported_tasks() # type: ignore - else: - supported_tasks = self.llm_engine.model_config.supported_tasks + supported_tasks = self.llm_engine.get_supported_tasks() # type: ignore logger.info("Supported_tasks: %s", supported_tasks) @@ -1473,8 +1468,6 @@ class LLM: Note: This method is only available with the V1 LLM engine. """ - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - assert isinstance(self.llm_engine, V1LLMEngine) return self.llm_engine.get_metrics() def _validate_and_add_requests( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 58296131fadb9..13f4eebf1038e 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -672,21 +672,15 @@ def tensorize_vllm_model(engine_args: "EngineArgs", ) as stream: stream.write(encryption_params.key) - from vllm import LLMEngine - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + assert envs.VLLM_USE_V1 - if not envs.VLLM_USE_V1: - engine = LLMEngine.from_engine_args(engine_args) - engine.model_executor.collective_rpc( - "save_tensorized_model", - kwargs={"tensorizer_config": tensorizer_config.to_serializable()}, - ) - else: - engine = V1LLMEngine.from_vllm_config(engine_config) - engine.collective_rpc( - "save_tensorized_model", - kwargs={"tensorizer_config": tensorizer_config.to_serializable()}, - ) + from vllm.v1.engine.llm_engine import LLMEngine + + engine = LLMEngine.from_vllm_config(engine_config) + engine.collective_rpc( + "save_tensorized_model", + kwargs={"tensorizer_config": tensorizer_config.to_serializable()}, + ) def tensorize_lora_adapter(lora_path: str, From 86647d1cd0f3c82c7d678324db7e925654ac5665 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 17:57:20 -0700 Subject: [PATCH 063/185] [V0 Deprecation] Remove V0 Output Processor (#25320) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- tests/build_cython.py | 39 ----- vllm/engine/output_processor/__init__.py | 0 vllm/engine/output_processor/interfaces.py | 59 -------- vllm/engine/output_processor/single_step.py | 145 ------------------- vllm/engine/output_processor/stop_checker.py | 139 ------------------ vllm/v1/engine/detokenizer.py | 42 +++++- 6 files changed, 40 insertions(+), 384 deletions(-) delete mode 100644 tests/build_cython.py delete mode 100644 vllm/engine/output_processor/__init__.py delete mode 100644 vllm/engine/output_processor/interfaces.py delete mode 100644 vllm/engine/output_processor/single_step.py delete mode 100644 vllm/engine/output_processor/stop_checker.py diff --git a/tests/build_cython.py b/tests/build_cython.py deleted file mode 100644 index 444434e8f0a79..0000000000000 --- a/tests/build_cython.py +++ 
/dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import Cython.Compiler.Options -from Cython.Build import cythonize -from setuptools import setup - -Cython.Compiler.Options.annotate = True - -infiles = [] - -infiles += [ - "vllm/engine/llm_engine.py", - "vllm/transformers_utils/detokenizer.py", - "vllm/engine/output_processor/single_step.py", - "vllm/outputs.py", - "vllm/engine/output_processor/stop_checker.py", -] - -infiles += [ - "vllm/core/scheduler.py", - "vllm/sequence.py", - "vllm/core/block_manager.py", -] - -infiles += [ - "vllm/model_executor/layers/sampler.py", - "vllm/sampling_params.py", - "vllm/utils/__init__.py", -] - -setup(ext_modules=cythonize(infiles, - annotate=False, - force=True, - compiler_directives={ - 'language_level': "3", - 'infer_types': True - })) - -# example usage: python3 build_cython.py build_ext --inplace diff --git a/vllm/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py deleted file mode 100644 index 587a9221e32c8..0000000000000 --- a/vllm/engine/output_processor/interfaces.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import List - -from vllm.config import SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sequence import SequenceGroup, SequenceGroupOutput -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - - -class SequenceGroupOutputProcessor(ABC): - """Interface for logic that processes new token ids in sequence groups, - managing detokenization, stop checking, and freeing/forking sequences with - the scheduler. - - This is highly coupled with the LLMEngine and should be seen as an extension - of it. The logic is separated to simplify the LLMEngine class and allow - separate implementations for single-step decoding (which supports beam - search sequence forking) and multi-step decoding (which does not support - beam search, but does support speculative decoding). - """ - - @staticmethod - def create_output_processor( - scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, - scheduler: List[Scheduler], - seq_counter: Counter, - stop_checker: "StopChecker", - ): - """Create an output processor. - - Multi-step scheduling is no longer supported. Always return a - single-step output processor. - """ - from vllm.engine.output_processor.single_step import ( - SingleStepOutputProcessor) - return SingleStepOutputProcessor(scheduler_config, detokenizer, - scheduler, seq_counter, stop_checker) - - @abstractmethod - def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool) -> None: - """Process new token ids for the sequence group. Handles logic such as - detokenization, stop checking, and freeing/forking sequences in the - scheduler. 
- """ - pass - - @abstractmethod - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Update prompt logprobs received from outputs to seq_group.""" - pass diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py deleted file mode 100644 index dbf6a371d050a..0000000000000 --- a/vllm/engine/output_processor/single_step.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List - -from vllm.config import SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.sequence import (CompletionSequenceGroupOutput, SequenceGroup, - SequenceGroupOutput) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - -logger = init_logger(__name__) - - -def single_step_process_prompt_logprob( - sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, - output: CompletionSequenceGroupOutput) -> None: - """Process prompt logprobs associated with the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step. - - Do nothing if the output has no prompt logprobs. - - Account for the fact that transformers do not compute first-token logprobs. - - Args: - sg_output_proc: - [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor] - instance - seq_group: the output is associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] - for a single scheduler step - """ - prompt_logprobs = output.prompt_logprobs - - # If this is the first (or only) "chunk" of the prefill, we need - # to prepend None to the list of prompt logprobs. The reason for this - # is that for N prompt tokens, the Sampler will generate N-1 total - # prompt logprobs during prefill since the token at idx 0 will not - # have a logprob associated with it. - if prompt_logprobs is not None: - if not seq_group.prompt_logprobs: - prompt_logprobs = [None] + prompt_logprobs - seq_group.prompt_logprobs = [] - - assert hasattr(sg_output_proc, 'detokenizer') - if (seq_group.sampling_params.detokenize - and sg_output_proc.detokenizer): - sg_output_proc.detokenizer.decode_prompt_logprobs_inplace( - seq_group, - prompt_logprobs, - position_offset=len(seq_group.prompt_logprobs)) - - seq_group.prompt_logprobs.extend(prompt_logprobs) - - -class SingleStepOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles "output processing" logic, - which happens after the model returns generated token ids and before - scheduling of the next batch. Output processing logic includes - detokenization, and determining if a sequence is finished (e.g. via max len - or eos token). - - The SingleStepOutputProcessor is specialized to the case where the model - emits at most a single token per invocation, which precludes configurations - such as speculative decoding or multi-step decoding. This enables beam - search sampling, which requires forking/finishing/freeing sequences in a way - that is currently difficult to schedule multiple steps ahead of time. 
- """ - - def __init__(self, scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, scheduler: List[Scheduler], - seq_counter: Counter, stop_checker: StopChecker): - self.scheduler_config = scheduler_config - self.detokenizer = detokenizer - self.scheduler = scheduler - self.seq_counter = seq_counter - self.stop_checker = stop_checker - - def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool) -> None: - """Append all new tokens to sequences in the sequence group. Fork any - surviving beam candidates; free any unsurviving ones. - - Invokes detokenizer to detokenize new tokens, and also marks sequences - as finished if they meet stop conditions. - - is_async - Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - """ - assert (len(outputs) == 1 - ), f"{type(self)} does not support multiple outputs per step" - return self._process_sequence_group_outputs(sequence_group, outputs[0], - is_async) - - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Process prompt logprobs associated with one step of a single-step- - scheduled computation. - - Args: - seq_group: the output is associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - outputs: the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] - for a single scheduler step - """ - assert len(outputs) == 1, "Single step should only have 1 output." - output = outputs[0] - assert isinstance(output, CompletionSequenceGroupOutput) - single_step_process_prompt_logprob(self, seq_group, output) - - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - outputs: SequenceGroupOutput, - is_async: bool) -> None: - sampling_params = seq_group.sampling_params - - sample = outputs.samples[0] - seq = seq_group.first_seq - if not is_async: - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - if sampling_params.detokenize and self.detokenizer: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) - else: - new_char_count = 0 - self.stop_checker.maybe_stop_sequence( - seq, - new_char_count, - sampling_params, - lora_req=seq_group.lora_request, - ) - if seq.is_finished(): - for scheduler in self.scheduler: - scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py deleted file mode 100644 index 0916f1c918c85..0000000000000 --- a/vllm/engine/output_processor/stop_checker.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Tuple - -from vllm.lora.request import LoRARequest -from vllm.reasoning import ReasoningParser -from vllm.sampling_params import SamplingParams -from vllm.sequence import Sequence, SequenceStatus - - -class StopChecker: - """LLMEngine helper class which separates out the logic involving stop - checking. This checks things such as: whether the eos token was emitted, - whether the max_tokens has been consumed, whether a stop string has been - emitted, or if we have exceeded the max model len. 
- """ - - def __init__( - self, - max_model_len: int, - reasoner: Optional[ReasoningParser] = None, - ): - # Do not use it directly, but use `self._get_max_model_len`. - self._max_model_len = max_model_len - self.reasoner = reasoner - - def _get_max_model_len(self, lora_req: Optional[LoRARequest]): - if lora_req and lora_req.long_lora_max_len: - return lora_req.long_lora_max_len - else: - return self._max_model_len - - def maybe_stop_sequence( - self, - seq: Sequence, - new_char_count: int, - sampling_params: SamplingParams, - lora_req: Optional[LoRARequest] = None, - ) -> None: - """Stop the finished sequences. - - new_char_count is the number of chars added to the - sequence's output text for the newly generated token - """ - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - # Remove the last EOS token unless explicitly specified - # This prevents unintended exposure of the EOS token - if new_char_count and ( - not sampling_params.include_stop_str_in_output): - seq.output_text = seq.output_text[:-new_char_count] - seq.status = SequenceStatus.FINISHED_STOPPED - return - - # Skip stop string/token checks if in reasoning content generation - if self.reasoner is not None and \ - not self.reasoner.is_reasoning_end(seq.get_token_ids()): - return - - # Check if a stop token was encountered. - # This assumes a single token produced per step. - last_token_id = seq.get_last_token_id() - if last_token_id in (sampling_params.stop_token_ids or ()): - if new_char_count and ( - not sampling_params.include_stop_str_in_output): - # Remove last token - seq.output_text = seq.output_text[:-new_char_count] - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if any stop strings are matched. - stop = self.check_stop_strings( - seq.output_text, new_char_count, sampling_params.stop, - sampling_params.include_stop_str_in_output) - if stop is not None: - stop_str, truncate_to = stop - if truncate_to != -1: - seq.output_text = seq.output_text[:truncate_to] - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - - # Check if the sequence has reached max_model_len. - if seq.get_len() >= self._get_max_model_len(lora_req): - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() == sampling_params.max_tokens: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - @staticmethod - def check_stop_strings( - output_text: str, - new_char_count: int, - stop: List[str], - include_in_output: bool, - ) -> Optional[Tuple[str, int]]: - """Check if any stop strings are matched and truncate sequence - output text accordingly. - - Returns tuple (stop_string, offset) if matched or else None. - - Where stop_string is the matched stop string and offset is the - length to which output_text should be truncated, or -1 for no - truncation. - """ - if not new_char_count or not stop: - return None - - for stop_str in stop: - stop_string_len = len(stop_str) - # Avoid searching already-searched text. - stop_index = output_text.find(stop_str, - 1 - new_char_count - stop_string_len) - if stop_index == -1: - continue - - if include_in_output: - # Truncate to end of stop string. 
- stop_index += stop_string_len - if stop_index >= len(output_text): - # No truncation required. - return stop_str, -1 - - # Truncate the output text to either the beginning - # or end of the stop string. - return stop_str, stop_index - return None diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8aa36d6a439c1..0f993a74c8103 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -9,7 +9,6 @@ from tokenizers import Tokenizer from tokenizers.decoders import DecodeStream from transformers import PreTrainedTokenizerFast -from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) @@ -129,7 +128,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # 2) Evaluate stop strings. stop_string = None if self.stop and len(self.output_token_ids) > self.min_tokens: - stop = StopChecker.check_stop_strings( + stop = check_stop_strings( output_text=self.output_text, new_char_count=len(self.output_text) - stop_check_offset, stop=self.stop, @@ -309,3 +308,42 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): self.read_offset = read_offset return decoded_text + + +def check_stop_strings( + output_text: str, + new_char_count: int, + stop: list[str], + include_in_output: bool, +) -> Optional[tuple[str, int]]: + """Check if any stop strings are matched and truncate sequence + output text accordingly. + + Returns tuple (stop_string, offset) if matched or else None. + + Where stop_string is the matched stop string and offset is the + length to which output_text should be truncated, or -1 for no + truncation. + """ + if not new_char_count or not stop: + return None + + for stop_str in stop: + stop_string_len = len(stop_str) + # Avoid searching already-searched text. + stop_index = output_text.find(stop_str, + 1 - new_char_count - stop_string_len) + if stop_index == -1: + continue + + if include_in_output: + # Truncate to end of stop string. + stop_index += stop_string_len + if stop_index >= len(output_text): + # No truncation required. + return stop_str, -1 + + # Truncate the output text to either the beginning + # or end of the stop string. 
+ return stop_str, stop_index + return None From 572ddf83ce1fe8d52670e01a7a6cc0d8a99fa90c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 19:53:20 -0700 Subject: [PATCH 064/185] [Chore] Remove unused sampler in models (#25324) Signed-off-by: Woosuk Kwon --- tests/lora/conftest.py | 3 --- vllm/model_executor/models/ernie_mtp.py | 10 ---------- vllm/model_executor/models/plamo2.py | 10 ---------- vllm/model_executor/models/step3_text.py | 10 ---------- vllm/model_executor/models/step3_vl.py | 16 ---------------- 5 files changed, 49 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 3475993ff8f07..b539a7bf5d76c 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -17,7 +17,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.platforms import current_platform @@ -97,7 +96,6 @@ def dummy_model() -> nn.Module: # Special handling for lm_head & sampler ("lm_head", ParallelLMHead(512, 10)), ("logits_processor", LogitsProcessor(512)), - ("sampler", Sampler()) ])) model.config = MagicMock() model.embedding_modules = {"lm_head": "lm_head"} @@ -125,7 +123,6 @@ def dummy_model_gate_up() -> nn.Module: # Special handling for lm_head & sampler ("lm_head", ParallelLMHead(512, 10)), ("logits_processor", LogitsProcessor(512)), - ("sampler", Sampler()) ])) model.config = MagicMock() model.packed_modules_mapping = { diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py index 57c5348874378..c446265230313 100644 --- a/vllm/model_executor/models/ernie_mtp.py +++ b/vllm/model_executor/models/ernie_mtp.py @@ -33,7 +33,6 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -160,7 +159,6 @@ class ErnieMTP(nn.Module, SupportsPP): self.lm_head = ParallelLMHead(self.config.vocab_size, self.config.hidden_size, prefix=maybe_prefix(prefix, "lm_head")) - self.sampler = get_sampler() if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight @@ -188,14 +186,6 @@ class ErnieMTP(nn.Module, SupportsPP): return self.model.compute_logits(hidden_states, self.lm_head, sampling_metadata, spec_step_idx) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index ef96d272adfb5..9f1ee36366fdd 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.mamba.ops.ssd_combined import ( 
mamba_chunk_scan_combined) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -932,7 +931,6 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -1030,14 +1028,6 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index b8733fa5e6129..6a5b540fc8174 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -26,7 +26,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -391,7 +390,6 @@ class Step3TextForCausalLM(nn.Module, SupportsPP): ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() @@ -413,14 +411,6 @@ class Step3TextForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: qkv_params_mapping = [ diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 2ba5f94ea3b88..c2940f8e44450 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from itertools import product from math import ceil, sqrt from typing import Any, Literal, Optional, TypedDict, Union @@ -24,7 +23,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY 
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -897,13 +895,6 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - @property def device(self): return next(self.parameters()).device @@ -1069,13 +1060,6 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): skip_prefixes = [] From 72dd1595b466383c8c9c9e39cb519814df1a105f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 19:57:46 -0700 Subject: [PATCH 065/185] [CI] Skip tests failing on main (#25326) Signed-off-by: Woosuk Kwon --- .../entrypoints/openai/test_completion_with_prompt_embeds.py | 1 + tests/models/quantization/test_fp8.py | 5 ++++- tests/models/test_oot_registration.py | 1 + tests/quantization/test_compressed_tensors.py | 5 ++++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 176c1825530e4..9c62595ad280b 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -60,6 +60,7 @@ def create_dummy_embeds(num_tokens: int = 5) -> str: return base64.b64encode(buffer.getvalue()).decode('utf-8') +@pytest.mark.skip("This test is skipped because it is flaky.") @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_completions_with_prompt_embeds( diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index 97dd4d6135ac4..bb8ae741b6149 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -32,7 +32,7 @@ from ..utils import check_logprobs_close # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN"]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. 
@pytest.mark.parametrize("tensor_parallel_size", [1]) @@ -57,6 +57,9 @@ def test_models( pytest.skip( f"{kv_cache_dtype} is currently not supported on ROCm/HIP.") + if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None): + pytest.skip(f"{kv_cache_dtype} is not supported on this platform.") + with monkeypatch.context() as m: m.setenv("TOKENIZERS_PARALLELISM", 'true') m.setenv(STR_BACKEND_ENV_VAR, backend) diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index cb30d77c4f0ea..9b376f2a260ac 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -63,6 +63,7 @@ def test_oot_registration_embedding( image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") +@pytest.mark.skip(reason="This test is skipped because it failed on V1.") @create_new_process_for_each_test() def test_oot_registration_multimodal( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b7949a488ad05..c0ab3fbb10622 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -357,6 +357,9 @@ def test_compressed_tensors_fp8(vllm_runner): assert output +@pytest.mark.skipif( + not current_platform.is_kv_cache_dtype_supported("fp8", None), + reason="FP8 KV cache is not supported on this device.") @pytest.mark.skipif(not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform.") def test_compressed_tensors_kv_cache(vllm_runner): @@ -738,4 +741,4 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt, with vllm_runner(model, enforce_eager=True) as llm: perplexity = llm.generate_prompt_perplexity([prompt])[0] print(perplexity) - assert perplexity <= exp_perplexity \ No newline at end of file + assert perplexity <= exp_perplexity From c99db8c8ddd50c0d30c14057012bd27b42d32b2d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 19:58:26 -0700 Subject: [PATCH 066/185] [V0 Deprecation] Remove V0 core (#25321) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 3 - .github/CODEOWNERS | 2 - pyproject.toml | 2 - .../backends/differential_flash_attn.py | 12 +- .../backends/dual_chunk_flash_attn.py | 10 +- vllm/attention/backends/flash_attn.py | 12 +- vllm/attention/backends/mla/common.py | 13 +- vllm/attention/backends/placeholder_attn.py | 11 +- vllm/attention/backends/rocm_aiter_mla.py | 7 +- vllm/attention/backends/utils.py | 9 +- vllm/core/__init__.py | 0 vllm/core/block/__init__.py | 0 vllm/core/block/block_table.py | 399 ---- vllm/core/block/common.py | 371 --- vllm/core/block/cpu_gpu_block_allocator.py | 439 ---- vllm/core/block/interfaces.py | 319 --- vllm/core/block/naive_block.py | 466 ---- vllm/core/block/prefix_caching_block.py | 1135 --------- vllm/core/block/utils.py | 28 - vllm/core/block_manager.py | 523 ----- vllm/core/evictor.py | 157 -- vllm/core/interfaces.py | 139 -- vllm/core/placeholder_block_space_manager.py | 103 - vllm/core/scheduler.py | 2028 ---------------- vllm/engine/protocol.py | 8 +- vllm/v1/engine/async_llm.py | 6 +- vllm/worker/cache_engine.py | 145 -- vllm/worker/model_runner.py | 2031 ----------------- vllm/worker/worker.py | 666 ------ 29 files changed, 24 insertions(+), 9020 deletions(-) delete mode 100644 vllm/core/__init__.py delete mode 100644 vllm/core/block/__init__.py delete mode 100644 vllm/core/block/block_table.py delete mode 100644 
vllm/core/block/common.py delete mode 100644 vllm/core/block/cpu_gpu_block_allocator.py delete mode 100644 vllm/core/block/interfaces.py delete mode 100644 vllm/core/block/naive_block.py delete mode 100644 vllm/core/block/prefix_caching_block.py delete mode 100644 vllm/core/block/utils.py delete mode 100644 vllm/core/block_manager.py delete mode 100644 vllm/core/evictor.py delete mode 100644 vllm/core/interfaces.py delete mode 100644 vllm/core/placeholder_block_space_manager.py delete mode 100644 vllm/core/scheduler.py delete mode 100644 vllm/worker/cache_engine.py delete mode 100644 vllm/worker/model_runner.py delete mode 100644 vllm/worker/worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1e7ce6ef0a665..9d38e571324b4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -148,7 +148,6 @@ steps: num_gpus: 4 source_file_dependencies: - vllm/distributed/ - - vllm/core/ - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events @@ -867,8 +866,6 @@ steps: - tests/distributed/ - vllm/compilation - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py - tests/v1/test_async_llm_dp.py - tests/v1/test_external_lb_dp.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f58256d38b9d1..37bd0ace98a97 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -4,10 +4,8 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention @LucasWilkinson /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/fused_moe @mgoin /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 diff --git a/pyproject.toml b/pyproject.toml index fe55461db00be..f43ae69e00bdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,6 @@ line-length = 80 "vllm/_version.py" = ["ALL"] # Python 3.8 typing - skip V0 code "vllm/attention/**/*.py" = ["UP006", "UP035"] -"vllm/core/**/*.py" = ["UP006", "UP035"] "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] @@ -117,7 +116,6 @@ files = [ "vllm/*.py", "vllm/assets", "vllm/entrypoints", - "vllm/core", "vllm/inputs", "vllm/logging_utils", "vllm/multimodal", diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py index a7d0e3afb517f..87a4558e377d0 100644 --- a/vllm/attention/backends/differential_flash_attn.py +++ b/vllm/attention/backends/differential_flash_attn.py @@ -4,7 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch from einops import rearrange @@ -34,9 +34,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad from vllm.vllm_flash_attn import 
(flash_attn_varlen_func, flash_attn_with_kvcache) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - logger = init_logger(__name__) @@ -329,7 +326,7 @@ class DifferentialFlashAttentionMetadata(AttentionMetadata): class DifferentialFlashAttentionMetadataBuilder( AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]): - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner self.sliding_window = input_builder.sliding_window @@ -350,9 +347,8 @@ class DifferentialFlashAttentionMetadataBuilder( self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. 2. block table. diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index 85957bea1e26d..de47bb8ebd8fb 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -4,7 +4,7 @@ """ import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch import torch.distributed @@ -22,9 +22,6 @@ from vllm.utils import async_tensor_h2d from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache, sparse_attn_func) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - logger = init_logger(__name__) @@ -224,9 +221,8 @@ class DualChunkFlashAttentionMetadataBuilder(FlashAttentionMetadataBuilder): super().prepare() self.orig_seq_lens: List[int] = [] - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): super()._add_seq_group(inter_data, chunked_prefill_enabled, prefix_cache_hit) for prompt_len, seq_len in zip(inter_data.prompt_lens, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 78c768f92d3c2..edb3afb4aa075 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -4,7 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import torch @@ -31,9 +31,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - logger = init_logger(__name__) @@ -312,7 +309,7 @@ class FlashAttentionMetadata(AttentionMetadata): class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner self.sliding_window = input_builder.sliding_window @@ -332,9 +329,8 @@ class FlashAttentionMetadataBuilder( 
self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. 2. block table. diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 789393eb39a73..826b63e1ccda8 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -193,8 +193,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, - Type, TypeVar) +from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar import torch @@ -233,9 +232,6 @@ except ImportError: except ImportError: flash_attn_varlen_func = None -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - is_hip = current_platform.is_rocm() @@ -638,7 +634,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): """ BLOCK_TABLE_EXTENDER: list[list[int]] = [] - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner self.sliding_window = input_builder.sliding_window @@ -668,9 +664,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. 2. block table. 
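[Editor's note, not part of the patch] The hunks above and below repeat one mechanical change across the attention backends: the TYPE_CHECKING-only import of ModelInputForGPUBuilder (defined in the deleted vllm/worker/model_runner.py) is removed, and the builder argument loses its string annotation so nothing references the deleted module. A minimal runnable sketch of the resulting pattern follows; DummyInputBuilder and ExampleMetadataBuilder are hypothetical stand-ins, not names from the patch.

# Illustrative sketch only. The builder parameter is untyped, mirroring the
# pattern applied to the real metadata builders in this patch.
class DummyInputBuilder:
    """Hypothetical stand-in for the removed ModelInputForGPUBuilder."""
    runner = None
    sliding_window = None

class ExampleMetadataBuilder:
    def __init__(self, input_builder):
        # No import of the deleted module is needed because the
        # parameter carries no type annotation.
        self.input_builder = input_builder
        self.runner = input_builder.runner
        self.sliding_window = input_builder.sliding_window

builder = ExampleMetadataBuilder(DummyInputBuilder())

[End of editor's note]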
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index e630a6c6de8c4..f82d28938f45c 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -4,7 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import torch @@ -13,9 +13,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadataBuilder) from vllm.attention.backends.utils import CommonAttentionState from vllm.multimodal import MultiModalPlaceholderMap - -if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder) from vllm.utils import async_tensor_h2d # Placeholder attention backend for models like Mamba and pooling models that @@ -204,7 +201,7 @@ class PlaceholderAttentionMetadata(AttentionMetadata): class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner @@ -220,9 +217,7 @@ class PlaceholderAttentionMetadataBuilder( self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. """ diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index a2e9710437d95..587d08858b92b 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Type, Union +from typing import Optional, Type, Union import torch @@ -19,9 +19,6 @@ from vllm.attention.backends.utils import (compute_slot_mapping, from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd, get_aiter_mla_metadata) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - def is_aiter_mla_enabled() -> bool: return envs.VLLM_ROCM_USE_AITER \ @@ -110,7 +107,7 @@ class AiterMLAMetadata(MLACommonMetadata): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): BLOCK_TABLE_EXTENDER: list[list[int]] = [[]] - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): super().__init__(input_builder) assert self.block_size == 1, "AITER MLA requires only block size 1." diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 7b6c426b0f851..289cfa2177438 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -35,9 +35,6 @@ PAD_SLOT_ID = -1 # if we have at least this many elements. Could be tuned further. 
_COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256 -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - def is_block_tables_empty(block_tables: Union[None, Dict]): """ @@ -129,7 +126,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): _metadata_cls: Type[TAttentionMetadata] - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner @@ -149,9 +146,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool): is_prompt = inter_data.is_prompt block_tables = inter_data.block_tables diff --git a/vllm/core/__init__.py b/vllm/core/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/core/block/__init__.py b/vllm/core/block/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py deleted file mode 100644 index 444bb25f2830a..0000000000000 --- a/vllm/core/block/block_table.py +++ /dev/null @@ -1,399 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from typing import List, Optional - -from vllm.core.block.common import BlockList -from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator -from vllm.utils import Device, cdiv, chunk_list - - -class BlockTable: - """A class to manage blocks for a specific sequence. - - The BlockTable maps a sequence of tokens to a list of blocks, where each - block represents a contiguous memory allocation for a portion of the - sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is - responsible for allocating and freeing memory for the blocks. - - Args: - block_size (int): The maximum number of tokens that can be stored in a - single block. - block_allocator (DeviceAwareBlockAllocator): The block allocator used to - manage memory for the blocks. - _blocks (Optional[List[Block]], optional): An optional list of existing - blocks to initialize the BlockTable with. If not provided, an empty - BlockTable is created. - max_block_sliding_window (Optional[int], optional): The number of - blocks to keep around for each sequence. If None, all blocks - are kept (eg., when sliding window is not used). - It should at least fit the sliding window size of the model. - - Attributes: - _block_size (int): The maximum number of tokens that can be stored in a - single block. - _allocator (DeviceAwareBlockAllocator): The block allocator used to - manage memory for the blocks. - _blocks (Optional[List[Block]]): The list of blocks managed by this - BlockTable. - _num_full_slots (int): The number of tokens currently stored in the - blocks. 
- """ - - def __init__( - self, - block_size: int, - block_allocator: DeviceAwareBlockAllocator, - _blocks: Optional[List[Block]] = None, - max_block_sliding_window: Optional[int] = None, - ): - self._block_size = block_size - self._allocator = block_allocator - if _blocks is None: - _blocks = [] - self._blocks: BlockList = BlockList(_blocks) - - self._max_block_sliding_window = max_block_sliding_window - self._num_full_slots = self._get_num_token_ids() - - @staticmethod - def get_num_required_blocks(token_ids: List[int], - block_size: int, - num_lookahead_slots: int = 0) -> int: - """Calculates the minimum number of blocks required to store a given - sequence of token IDs along with any look-ahead slots that may be - required (like in multi-step + chunked-prefill). - - This assumes worst-case scenario, where every block requires a new - allocation (e.g. ignoring prefix caching). - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - block_size (int): The maximum number of tokens that can be stored in - a single block. - num_lookahead_slots (int): look-ahead slots that the sequence may - require. - - Returns: - int: The minimum number of blocks required to store the given - sequence of token IDs along with any required look-ahead slots. - """ - return cdiv(len(token_ids) + num_lookahead_slots, block_size) - - def allocate(self, - token_ids: List[int], - device: Device = Device.GPU, - extra_hash: Optional[int] = None) -> None: - """Allocates memory blocks for storing the given sequence of token IDs. - - This method allocates the required number of blocks to store the given - sequence of token IDs. - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - device (Device, optional): The device on which the blocks should be - allocated. Defaults to Device.GPU. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefixcaching block. - """ - assert not self._is_allocated - assert token_ids - blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device, - extra_hash=extra_hash) - self.update(blocks) - self._num_full_slots = len(token_ids) - - def update(self, blocks: List[Block]) -> None: - """Resets the table to the newly provided blocks - (with their corresponding block ids) - """ - self._blocks.update(blocks) - - def append_token_ids(self, - token_ids: List[int], - num_lookahead_slots: int = 0, - num_computed_slots: Optional[int] = None, - extra_hash: Optional[int] = None) -> None: - """Appends a sequence of token IDs to the existing blocks in the - BlockTable. - - This method appends the given sequence of token IDs to the existing - blocks in the BlockTable. If there is not enough space in the existing - blocks, new blocks are allocated using the `ensure_num_empty_slots` - method to accommodate the additional tokens. - - The token IDs are divided into chunks of size `block_size` (except for - the first chunk, which may be smaller), and each chunk is appended to a - separate block. - - Args: - token_ids (List[int]): The sequence of token IDs to be appended. - num_computed_slots (Optional[int]): The number of KV cache slots - that are already filled (computed). - When sliding window is enabled, this is used to compute how many - blocks to drop at the front of the sequence. - Without sliding window, None can be passed. - Without chunked prefill, it should be the same as - _num_full_slots. 
- extra_hash (Optional[int]): The hash value of additional - factors such as adapters that influence the block, apart - from the token_ids. - """ - assert self._is_allocated, "no blocks have been allocated" - assert len(self._blocks) > 0 - - # Drop blocks that are no longer needed due to sliding window - if self._max_block_sliding_window is not None: - null_block = self._allocator.allocate_or_get_null_block() - assert num_computed_slots is not None - end_block_idx = (num_computed_slots // - self._block_size) - self._max_block_sliding_window - for idx in range(0, end_block_idx): - b = self._blocks[idx] - if b is not null_block: - self._allocator.free(b) - self._blocks[idx] = null_block - - # Ensure there are enough empty slots for the new tokens plus - # lookahead slots - self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + - num_lookahead_slots, - extra_hash=extra_hash) - - # Update the blocks with the new tokens - first_block_idx = self._num_full_slots // self._block_size - token_blocks = self._chunk_token_blocks_for_append(token_ids) - - for i, token_block in enumerate(token_blocks): - self._blocks.append_token_ids(first_block_idx + i, token_block) - - self._num_full_slots += len(token_ids) - - def ensure_num_empty_slots(self, - num_empty_slots: int, - extra_hash: Optional[int] = None) -> None: - """Ensures that the BlockTable has at least the specified number of - empty slots available. - - This method checks if the BlockTable has enough empty slots (i.e., - available space) to accommodate the requested number of tokens. If not, - it allocates additional blocks on the GPU to ensure that the required - number of empty slots is available. - - Args: - num_empty_slots (int): The minimum number of empty slots required. - extra_hash (Optional[int]): The hash value of additional - factors such as adapters that influence the block, apart - from the token_ids. - """ - # Currently the block table only supports - # appending tokens to GPU blocks. - device = Device.GPU - assert self._is_allocated - - if self._num_empty_slots >= num_empty_slots: - return - - slots_to_allocate = num_empty_slots - self._num_empty_slots - blocks_to_allocate = cdiv(slots_to_allocate, self._block_size) - - for _ in range(blocks_to_allocate): - assert len(self._blocks) > 0 - self._blocks.append( - self._allocator.allocate_mutable_block( - prev_block=self._blocks[-1], - device=device, - extra_hash=extra_hash)) - - def fork(self) -> "BlockTable": - """Creates a new BlockTable instance with a copy of the blocks from the - current instance. - - This method creates a new BlockTable instance with the same block size, - block allocator, and a copy of the blocks from the current instance. The - new BlockTable has its own independent set of blocks, but shares the - same underlying memory allocation with the original BlockTable. - - Returns: - BlockTable: A new BlockTable instance with a copy of the blocks from - the current instance. - """ - assert self._is_allocated - assert len(self._blocks) > 0 - forked_blocks = self._allocator.fork(self._blocks[-1]) - return BlockTable( - block_size=self._block_size, - block_allocator=self._allocator, - _blocks=forked_blocks, - max_block_sliding_window=self._max_block_sliding_window, - ) - - def free(self) -> None: - """Frees the memory occupied by the blocks in the BlockTable. - - This method iterates over all the blocks in the `_blocks` list and calls - the `free` method of the `_allocator` object to release the memory - occupied by each block. 
After freeing all the blocks, the `_blocks` list - is set to `None`. - """ - for block in self.blocks: - self._allocator.free(block) - self._blocks.reset() - - @property - def physical_block_ids(self) -> List[int]: - """Returns a list of physical block indices for the blocks in the - BlockTable. - - This property returns a list of integers, where each integer represents - the physical block index of a corresponding block in the `_blocks` list. - The physical block index is a unique identifier for the memory location - occupied by the block. - - Returns: - List[int]: A list of physical block indices for the blocks in the - BlockTable. - """ - return self._blocks.ids() - - def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: - """Get the number of "unseen" tokens in the sequence. - - Unseen tokens are tokens in the sequence corresponding to this block - table, but are not yet appended to this block table. - - Args: - sequence_token_ids (List[int]): The list of token ids in the - sequence. - - Returns: - List[int]: The postfix of sequence_token_ids that has not yet been - appended to the block table. - """ - - # Since the block table is append-only, the unseen token ids are the - # ones after the appended ones. - return sequence_token_ids[self.num_full_slots:] - - def _allocate_blocks_for_token_ids( - self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> List[Block]: - blocks: List[Block] = [] - - block_token_ids = [] - tail_token_ids = [] - for cur_token_ids in chunk_list(token_ids, self._block_size): - if len(cur_token_ids) == self._block_size: - block_token_ids.append(cur_token_ids) - else: - tail_token_ids.append(cur_token_ids) - - if block_token_ids: - blocks.extend( - self._allocator.allocate_immutable_blocks( - prev_block, - block_token_ids=block_token_ids, - device=device, - extra_hash=extra_hash)) - prev_block = blocks[-1] - - if tail_token_ids: - assert len(tail_token_ids) == 1 - cur_token_ids = tail_token_ids[0] - - block = self._allocator.allocate_mutable_block( - prev_block=prev_block, device=device, extra_hash=extra_hash) - block.append_token_ids(cur_token_ids) - - blocks.append(block) - - return blocks - - def _get_all_token_ids(self) -> List[int]: - # NOTE: This function is O(seq_len); use sparingly. - token_ids: List[int] = [] - - if not self._is_allocated: - return token_ids - - for block in self.blocks: - token_ids.extend(block.token_ids) - - return token_ids - - def _get_num_token_ids(self) -> int: - res = 0 - for block in self.blocks: - res += len(block.token_ids) - - return res - - @property - def _is_allocated(self) -> bool: - return len(self._blocks) > 0 - - @property - def blocks(self) -> List[Block]: - return self._blocks.list() - - @property - def _num_empty_slots(self) -> int: - assert self._is_allocated - return len(self._blocks) * self._block_size - self._num_full_slots - - @property - def num_full_slots(self) -> int: - """Returns the total number of tokens currently stored in the - BlockTable. - - Returns: - int: The total number of tokens currently stored in the BlockTable. - """ - return self._num_full_slots - - def get_num_blocks_touched_by_append_slots( - self, token_ids: List[int], num_lookahead_slots: int) -> int: - """Determine how many blocks will be "touched" by appending the token - ids. - - This is required for the scheduler to determine whether a sequence can - continue generation, or if it must be preempted. 
- """ - # Math below is equivalent to: - # all_token_ids = token_ids + [-1] * num_lookahead_slots - # token_blocks = self._chunk_token_blocks_for_append(all_token_ids) - # return len(token_blocks) - - num_token_ids = len(token_ids) + num_lookahead_slots - first_chunk_size = self._block_size - (self._num_full_slots % - self._block_size) - num_token_blocks = (1 + math.ceil( - (num_token_ids - first_chunk_size) / self._block_size)) - return num_token_blocks - - def _chunk_token_blocks_for_append( - self, token_ids: List[int]) -> List[List[int]]: - """Split the token ids into block-sized chunks so they can be easily - appended to blocks. The first such "token block" may have less token ids - than the block size, since the last allocated block may be partially - full. - - If no token ids are provided, then no chunks are returned. - """ - - if not token_ids: - return [] - - first_chunk_size = self._block_size - (self._num_full_slots % - self._block_size) - token_blocks = [token_ids[:first_chunk_size]] - token_blocks.extend( - chunk_list(token_ids[first_chunk_size:], self._block_size)) - return token_blocks diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py deleted file mode 100644 index a337007a9eaa6..0000000000000 --- a/vllm/core/block/common.py +++ /dev/null @@ -1,371 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections import deque -from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple - -from vllm.core.block.interfaces import Block, BlockAllocator - -BlockId = int -RefCount = int - - -class RefCounterProtocol(Protocol): - - def incr(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - def decr(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - def get(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - -class RefCounter(RefCounterProtocol): - """A class for managing reference counts for a set of block indices. - - The RefCounter class maintains a dictionary that maps block indices to their - corresponding reference counts. It provides methods to increment, decrement, - and retrieve the reference count for a given block index. - - Args: - all_block_indices (Iterable[BlockId]): An iterable of block indices - to initialize the reference counter with. - """ - - def __init__(self, all_block_indices: Iterable[BlockId]): - deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, RefCount] = { - index: 0 - for index in deduped - } - - def incr(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - pre_incr_refcount = self._refcounts[block_id] - - assert pre_incr_refcount >= 0 - - post_incr_refcount = pre_incr_refcount + 1 - self._refcounts[block_id] = post_incr_refcount - return post_incr_refcount - - def decr(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - refcount = self._refcounts[block_id] - - assert refcount > 0 - refcount -= 1 - - self._refcounts[block_id] = refcount - - return refcount - - def get(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - return self._refcounts[block_id] - - def as_readonly(self) -> "ReadOnlyRefCounter": - return ReadOnlyRefCounter(self) - - -class ReadOnlyRefCounter(RefCounterProtocol): - """A read-only view of the RefCounter class. - - The ReadOnlyRefCounter class provides a read-only interface to access the - reference counts maintained by a RefCounter instance. 
It does not allow - modifications to the reference counts. - - Args: - refcounter (RefCounter): The RefCounter instance to create a read-only - view for. - """ - - def __init__(self, refcounter: RefCounter): - self._refcounter = refcounter - - def incr(self, block_id: BlockId) -> RefCount: - raise ValueError("Incr not allowed") - - def decr(self, block_id: BlockId) -> RefCount: - raise ValueError("Decr not allowed") - - def get(self, block_id: BlockId) -> RefCount: - return self._refcounter.get(block_id) - - -class CopyOnWriteTracker: - """A class for tracking and managing copy-on-write operations for blocks. - - The CopyOnWriteTracker class maintains a mapping of source block indices to - their corresponding copy-on-write destination block indices. It works in - conjunction with a RefCounter. - - Args: - refcounter (RefCounter): The reference counter used to track block - reference counts. - """ - - def __init__(self, refcounter: RefCounterProtocol): - self._copy_on_writes: List[Tuple[BlockId, BlockId]] = [] - self._refcounter = refcounter - - def is_appendable(self, block: Block) -> bool: - """Checks if the block is shared or not. If shared, then it cannot - be appended and needs to be duplicated via copy-on-write - """ - block_id = block.block_id - if block_id is None: - return True - - refcount = self._refcounter.get(block_id) - return refcount <= 1 - - def record_cow(self, src_block_id: Optional[BlockId], - trg_block_id: Optional[BlockId]) -> None: - """Records a copy-on-write operation from source to target block id - Args: - src_block_id (BlockId): The source block id from which to copy - the data - trg_block_id (BlockId): The target block id to which the data - is copied - """ - assert src_block_id is not None - assert trg_block_id is not None - self._copy_on_writes.append((src_block_id, trg_block_id)) - - def clear_cows(self) -> List[Tuple[BlockId, BlockId]]: - """Clears the copy-on-write tracking information and returns the current - state. - - This method returns a list mapping source block indices to - destination block indices for the current copy-on-write operations. - It then clears the internal tracking information. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices for the - current copy-on-write operations. - """ - cows = self._copy_on_writes - self._copy_on_writes = [] - return cows - - -class BlockPool: - """Used to pre-allocate block objects, in order to avoid excessive python - object allocations/deallocations. - The pool starts from "pool_size" objects and will increase to more objects - if necessary - - Note that multiple block objects may point to the same physical block id, - which is why this pool is needed, so that it will be easier to support - prefix caching and more complicated sharing of physical blocks. 
- """ - - def __init__(self, block_size: int, create_block: Block.Factory, - allocator: BlockAllocator, pool_size: int): - self._block_size = block_size - self._create_block = create_block - self._allocator = allocator - self._pool_size = pool_size - assert self._pool_size >= 0 - - self._free_ids: Deque[int] = deque(range(self._pool_size)) - self._pool = [] - for i in range(self._pool_size): - self._pool.append( - self._create_block(prev_block=None, - token_ids=[], - block_size=self._block_size, - allocator=self._allocator, - block_id=None, - extra_hash=None)) - - def increase_pool(self): - """Doubles the internal pool size - """ - cur_pool_size = self._pool_size - new_pool_size = cur_pool_size * 2 - self._pool_size = new_pool_size - - self._free_ids += deque(range(cur_pool_size, new_pool_size)) - - for i in range(cur_pool_size, new_pool_size): - self._pool.append( - self._create_block(prev_block=None, - token_ids=[], - block_size=self._block_size, - allocator=self._allocator, - block_id=None, - extra_hash=None)) - - def init_block(self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - physical_block_id: Optional[int], - extra_hash: Optional[int] = None) -> Block: - if len(self._free_ids) == 0: - self.increase_pool() - assert len(self._free_ids) > 0 - - pool_id = self._free_ids.popleft() - - block = self._pool[pool_id] - block.__init__( # type: ignore[misc] - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - allocator=block._allocator, # type: ignore[attr-defined] - block_id=physical_block_id, - extra_hash=extra_hash) - block.pool_id = pool_id # type: ignore[attr-defined] - return block - - def free_block(self, block: Block) -> None: - self._free_ids.appendleft(block.pool_id) # type: ignore[attr-defined] - - -class BlockList: - """This class is an optimization to allow fast-access to physical - block ids. 
It maintains a block id list that is updated with the - block list and this avoids the need to reconstruct the block id - list on every iteration of the block manager - """ - - def __init__(self, blocks: List[Block]): - self._blocks: List[Block] = [] - self._block_ids: List[int] = [] - - self.update(blocks) - - def _add_block_id(self, block_id: Optional[BlockId]) -> None: - assert block_id is not None - self._block_ids.append(block_id) - - def _update_block_id(self, block_index: int, - new_block_id: Optional[BlockId]) -> None: - assert new_block_id is not None - self._block_ids[block_index] = new_block_id - - def update(self, blocks: List[Block]): - self._blocks = blocks - - # Cache block ids for fast query - self._block_ids = [] - for block in self._blocks: - self._add_block_id(block.block_id) - - def append_token_ids(self, block_index: int, token_ids: List[int]) -> None: - block = self._blocks[block_index] - prev_block_id = block.block_id - - block.append_token_ids(token_ids) - - # CoW or promotion may update the internal block_id - if prev_block_id != block.block_id: - self._update_block_id(block_index, block.block_id) - - def append(self, new_block: Block): - self._blocks.append(new_block) - self._add_block_id(new_block.block_id) - - def __len__(self) -> int: - return len(self._blocks) - - def __getitem__(self, block_index: int) -> Block: - return self._blocks[block_index] - - def __setitem__(self, block_index: int, new_block: Block) -> None: - self._blocks[block_index] = new_block - self._update_block_id(block_index, new_block.block_id) - - def reset(self): - self._blocks = [] - self._block_ids = [] - - def list(self) -> List[Block]: - return self._blocks - - def ids(self) -> List[int]: - return self._block_ids - - -@dataclass -class CacheMetricData: - """A utility dataclass to maintain cache metric. - To avoid overflow, we maintain the hit rate in block granularity, so that - we can maintain a single hit rate for n_completed_block x block_size, - and calculate the real time hit rate by the following: - BS = The number of queries per block. - nB = The number of completed blocks. - HR = hit rate of (nB x BS) queries. - Q = current number of queries (< BS). - H = current number of hits (< BS). - hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS) - """ - num_completed_blocks: int = 0 - completed_block_cache_hit_rate: float = 0.0 - num_incompleted_block_queries: int = 0 - num_incompleted_block_hit: int = 0 - block_size: int = 1000 - - def query(self, hit: bool): - self.num_incompleted_block_queries += 1 - self.num_incompleted_block_hit += 1 if hit else 0 - - # When a block is completed, update the cache hit rate - # and reset the incomplete numbers. 
- if self.num_incompleted_block_queries == self.block_size: - hit_rate = (self.num_incompleted_block_hit / - self.num_incompleted_block_queries) - self.completed_block_cache_hit_rate = ( - self.completed_block_cache_hit_rate * self.num_completed_blocks - + hit_rate) / (self.num_completed_blocks + 1) - self.num_incompleted_block_queries = 0 - self.num_incompleted_block_hit = 0 - self.num_completed_blocks += 1 - - def get_hit_rate(self): - incomplete_ratio = self.num_incompleted_block_queries / self.block_size - total_blocks = self.num_completed_blocks + incomplete_ratio - if total_blocks == 0: - return 0.0 - - completed_block_hit, incompleted_block_hit = 0.0, 0.0 - if self.num_completed_blocks > 0: - completed_block_hit = (self.completed_block_cache_hit_rate * - self.num_completed_blocks) - if self.num_incompleted_block_queries > 0: - incompleted_hit_rate = (self.num_incompleted_block_hit / - self.num_incompleted_block_queries) - incompleted_block_hit = (incompleted_hit_rate * incomplete_ratio) - return (completed_block_hit + incompleted_block_hit) / total_blocks - - -def get_all_blocks_recursively(last_block: Block) -> List[Block]: - """Retrieves all the blocks in a sequence starting from the last block. - - This function recursively traverses the sequence of blocks in reverse order, - starting from the given last block, and returns a list of all the blocks in - the sequence. - - Args: - last_block (Block): The last block in the sequence. - - Returns: - List[Block]: A list of all the blocks in the sequence, in the order they - appear. - """ - - def recurse(block: Block, lst: List[Block]) -> None: - if block.prev_block is not None: - recurse(block.prev_block, lst) - lst.append(block) - - all_blocks: List[Block] = [] - recurse(last_block, all_blocks) - return all_blocks diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py deleted file mode 100644 index 92bc5e157e148..0000000000000 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ /dev/null @@ -1,439 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Dict, FrozenSet, List, Optional, Tuple - -from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, - DeviceAwareBlockAllocator) -from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator -from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device - - -class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): - """A block allocator that can allocate blocks on both CPU and GPU memory. - - This class implements the `DeviceAwareBlockAllocator` interface and provides - functionality for allocating and managing blocks of memory on both CPU and - GPU devices. - - The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU - blocks, and allows for allocation, deallocation, forking, and swapping of - blocks across these memory pools. - """ - - @staticmethod - def create( - allocator_type: str, - num_gpu_blocks: int, - num_cpu_blocks: int, - block_size: int, - ) -> DeviceAwareBlockAllocator: - """Creates a CpuGpuBlockAllocator instance with the specified - configuration. - - This static method creates and returns a CpuGpuBlockAllocator instance - based on the provided parameters. It initializes the CPU and GPU block - allocators with the specified number of blocks, block size, and - allocator type. 
- - Args: - allocator_type (str): The type of block allocator to use for CPU - and GPU blocks. Currently supported values are "naive" and - "prefix_caching". - num_gpu_blocks (int): The number of blocks to allocate for GPU - memory. - num_cpu_blocks (int): The number of blocks to allocate for CPU - memory. - block_size (int): The size of each block in number of tokens. - - Returns: - DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the - specified configuration. - - Notes: - - The block IDs are assigned contiguously, with GPU block IDs coming - before CPU block IDs. - """ - reserved_blocks = 0 - block_ids = list( - range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) - num_gpu_blocks -= reserved_blocks - gpu_block_ids = block_ids[:num_gpu_blocks] - cpu_block_ids = block_ids[num_gpu_blocks:] - - if allocator_type == "naive": - gpu_allocator: BlockAllocator = NaiveBlockAllocator( - create_block=NaiveBlock, # type: ignore - num_blocks=num_gpu_blocks, - block_size=block_size, - block_ids=gpu_block_ids, - ) - - cpu_allocator: BlockAllocator = NaiveBlockAllocator( - create_block=NaiveBlock, # type: ignore - num_blocks=num_cpu_blocks, - block_size=block_size, - block_ids=cpu_block_ids, - ) - elif allocator_type == "prefix_caching": - gpu_allocator = PrefixCachingBlockAllocator( - num_blocks=num_gpu_blocks, - block_size=block_size, - block_ids=gpu_block_ids, - ) - - cpu_allocator = PrefixCachingBlockAllocator( - num_blocks=num_cpu_blocks, - block_size=block_size, - block_ids=cpu_block_ids, - ) - else: - raise ValueError(f"Unknown allocator type {allocator_type=}") - - return CpuGpuBlockAllocator( - cpu_block_allocator=cpu_allocator, - gpu_block_allocator=gpu_allocator, - ) - - def __init__(self, cpu_block_allocator: BlockAllocator, - gpu_block_allocator: BlockAllocator): - assert not ( - cpu_block_allocator.all_block_ids - & gpu_block_allocator.all_block_ids - ), "cpu and gpu block allocators can't have intersection of block ids" - - self._allocators = { - Device.CPU: cpu_block_allocator, - Device.GPU: gpu_block_allocator, - } - - self._swap_mapping: Dict[int, int] = {} - self._null_block: Optional[Block] = None - - self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} - for _, allocator in self._allocators.items(): - for block_id in allocator.all_block_ids: - self._block_ids_to_allocator[block_id] = allocator - - def allocate_or_get_null_block(self) -> Block: - if self._null_block is None: - self._null_block = NullBlock( - self.allocate_mutable_block(None, Device.GPU)) - return self._null_block - - def allocate_mutable_block(self, - prev_block: Optional[Block], - device: Device, - extra_hash: Optional[int] = None) -> Block: - """Allocates a new mutable block on the specified device. - - Args: - prev_block (Optional[Block]): The previous block to in the sequence. - Used for prefix hashing. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - Block: The newly allocated mutable block. - """ - return self._allocators[device].allocate_mutable_block( - prev_block, extra_hash=extra_hash) - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device, - extra_hash: Optional[int] = None) -> List[Block]: - """Allocates a new group of immutable blocks with the provided block - token IDs on the specified device. 
- - Args: - prev_block (Optional[Block]): The previous block in the sequence. - Used for prefix hashing. - block_token_ids (List[int]): The list of block token IDs to be - stored in the new blocks. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - List[Block]: The newly allocated list of immutable blocks - containing the provided block token IDs. - """ - return self._allocators[device].allocate_immutable_blocks( - prev_block, block_token_ids, extra_hash=extra_hash) - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> Block: - """Allocates a new immutable block with the provided token IDs on the - specified device. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. - Used for prefix hashing. - token_ids (List[int]): The list of token IDs to be stored in the new - block. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - Block: The newly allocated immutable block containing the provided - token IDs. - """ - return self._allocators[device].allocate_immutable_block( - prev_block, token_ids, extra_hash=extra_hash) - - def free(self, block: Block) -> None: - """Frees the memory occupied by the given block. - - Args: - block (Block): The block to be freed. - """ - # Null block should never be freed - if isinstance(block, NullBlock): - return - block_id = block.block_id - assert block_id is not None - allocator = self._block_ids_to_allocator[block_id] - allocator.free(block) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. - - Returns: - List[Block]: A new list of blocks that shares the same memory as the - original sequence. - """ - # do not attempt to fork the null block - assert not isinstance(last_block, NullBlock) - block_id = last_block.block_id - assert block_id is not None - allocator = self._block_ids_to_allocator[block_id] - return allocator.fork(last_block) - - def get_num_free_blocks(self, device: Device) -> int: - """Returns the number of free blocks available on the specified device. - - Args: - device (Device): The device for which to query the number of free - blocks. AssertionError is raised if None is passed. - - Returns: - int: The number of free blocks available on the specified device. - """ - return self._allocators[device].get_num_free_blocks() - - def get_num_total_blocks(self, device: Device) -> int: - return self._allocators[device].get_num_total_blocks() - - def get_physical_block_id(self, device: Device, absolute_id: int) -> int: - """Returns the zero-offset block id on certain device given the - absolute block id. - - Args: - device (Device): The device for which to query relative block id. - absolute_id (int): The absolute block id for the block in - whole allocator. - - Returns: - int: The zero-offset block id on certain device. 
- """ - return self._allocators[device].get_physical_block_id(absolute_id) - - def swap(self, blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: - """Execute the swap for the given blocks from source_device - on to dest_device, save the current swap mapping and append - them to the accumulated `self._swap_mapping` for each - scheduling move. - - Args: - blocks: List of blocks to be swapped. - src_device (Device): Device to swap the 'blocks' from. - dst_device (Device): Device to swap the 'blocks' to. - - Returns: - Dict[int, int]: Swap mapping from source_device - on to dest_device. - """ - src_block_ids = [block.block_id for block in blocks] - self._allocators[src_device].swap_out(blocks) - self._allocators[dst_device].swap_in(blocks) - dst_block_ids = [block.block_id for block in blocks] - - current_swap_mapping: Dict[int, int] = {} - for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids): - if src_block_id is not None and dst_block_id is not None: - self._swap_mapping[src_block_id] = dst_block_id - current_swap_mapping[src_block_id] = dst_block_id - return current_swap_mapping - - def get_num_full_blocks_touched(self, blocks: List[Block], - device: Device) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out the given blocks on to the 'device'. - - Args: - blocks: List of blocks to be swapped. - device (Device): Device to swap the 'blocks' on. - - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks on to the 'device'. - Non full blocks are ignored when deciding the number - of blocks to touch. - """ - return self._allocators[device].get_num_full_blocks_touched(blocks) - - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - """Clears the copy-on-write (CoW) state and returns the mapping of - source to destination block IDs. - - Returns: - List[Tuple[int, int]]: A list mapping source block IDs to - destination block IDs. - """ - # CoW only supported on GPU - device = Device.GPU - return self._allocators[device].clear_copy_on_writes() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, only use for prefix caching.""" - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].mark_blocks_as_accessed(block_ids, now) - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - """Mark blocks as accessed, only use for prefix caching.""" - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].mark_blocks_as_computed(block_ids) - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].get_common_computed_block_ids( - computed_seq_block_ids) - - @property - def all_block_ids(self) -> FrozenSet[int]: - return frozenset(self._block_ids_to_allocator.keys()) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. 
-1 means not supported or disabled.""" - assert device in self._allocators - return self._allocators[device].get_prefix_cache_hit_rate() - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for specified or all devices.""" - if device: - return self._allocators[device].reset_prefix_cache() - success = True - for allocator in self._allocators.values(): - success = success and allocator.reset_prefix_cache() - return success - - def get_and_reset_swaps(self) -> List[Tuple[int, int]]: - """Returns and clears the mapping of source to destination block IDs. - Will be called after every swapping operations for now, and after every - schedule when BlockManagerV2 become default. Currently not useful. - - Returns: - List[Tuple[int, int]]: A mapping of source to destination block IDs. - """ - mapping = self._swap_mapping.copy() - self._swap_mapping.clear() - return list(mapping.items()) - - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - device: Device = Device.GPU, - ) -> List[int]: - return self._allocators[device].find_cached_blocks_prefix(block_hashes) - - -class NullBlock(Block): - """ - Null blocks are used as a placeholders for KV cache blocks that have - been dropped due to sliding window. - This implementation just wraps an ordinary block and prevents it from - being modified. It also allows for testing if a block is NullBlock - via isinstance(). - """ - - def __init__(self, proxy: Block): - super().__init__() - self._proxy = proxy - - def append_token_ids(self, token_ids: List[BlockId]): - raise ValueError("null block should not be modified") - - @property - def block_id(self): - return self._proxy.block_id - - @block_id.setter - def block_id(self, value: Optional[BlockId]): - raise ValueError("null block should not be modified") - - @property - def token_ids(self) -> List[BlockId]: - return self._proxy.token_ids - - @property - def num_tokens_total(self) -> int: - raise NotImplementedError( - "num_tokens_total is not used for null block") - - @property - def num_empty_slots(self) -> BlockId: - return self._proxy.num_empty_slots - - @property - def is_full(self): - return self._proxy.is_full - - @property - def prev_block(self): - return self._proxy.prev_block - - @property - def extra_hash(self): - return None - - @property - def computed(self): - return self._proxy.computed - - @computed.setter - def computed(self, value): - self._proxy.computed = value - - @property - def last_accessed(self) -> float: - return self._proxy.last_accessed - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - self._proxy.last_accessed = last_accessed_ts - - @property - def content_hash(self): - return self._proxy.content_hash diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py deleted file mode 100644 index 1a05881f7c005..0000000000000 --- a/vllm/core/block/interfaces.py +++ /dev/null @@ -1,319 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple - -from vllm.utils import Device - -BlockId = int - - -class Block(ABC): - - @abstractmethod - def append_token_ids(self, token_ids: List[int]) -> None: - pass - - @property - @abstractmethod - def block_id(self) -> Optional[int]: - pass - - @block_id.setter - @abstractmethod - def block_id(self, value: Optional[int]) -> None: - """NOTE: Do not use this API outside Block.""" - 
self._block_id = value - - @property - @abstractmethod - def token_ids(self) -> List[int]: - pass - - @property - @abstractmethod - def num_tokens_total(self) -> int: - """The number of tokens till the current block (inclusive) - """ - pass - - @property - @abstractmethod - def num_empty_slots(self) -> int: - pass - - @property - @abstractmethod - def is_full(self) -> bool: - pass - - @property - @abstractmethod - def prev_block(self) -> Optional["Block"]: - pass - - @property - @abstractmethod - def extra_hash(self) -> Optional[int]: - return None - - @property - @abstractmethod - def computed(self) -> bool: - raise NotImplementedError - - @computed.setter - @abstractmethod - def computed(self, value) -> bool: - """Should be only used by PrefixCacingAllocator""" - raise NotImplementedError - - @property - @abstractmethod - def last_accessed(self) -> float: - raise NotImplementedError - - @last_accessed.setter - @abstractmethod - def last_accessed(self, last_accessed_ts: float): - raise NotImplementedError - - class Factory(Protocol): - - @abstractmethod - def __call__( - self, - prev_block: Optional["Block"], - token_ids: List[int], - block_size: int, - allocator: "BlockAllocator", - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ) -> "Block": - pass - - @property - @abstractmethod - def content_hash(self) -> Optional[int]: - """Return the content-based hash of the current block, or None if it is - not yet defined or not supported. - - For the content-based hash to be defined, the current block must be - full. - """ - return None - - -class BlockAllocator(ABC): - - @abstractmethod - def allocate_mutable_block(self, prev_block: Optional[Block], - extra_hash: Optional[int]) -> Block: - pass - - @abstractmethod - def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int]) -> Block: - pass - - @abstractmethod - def allocate_immutable_blocks(self, prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int]) -> List[Block]: - pass - - @abstractmethod - def free(self, block: Block) -> None: - pass - - @abstractmethod - def fork(self, last_block: Block) -> List[Block]: - pass - - @abstractmethod - def get_num_total_blocks(self) -> int: - pass - - @abstractmethod - def get_num_free_blocks(self) -> int: - pass - - @abstractmethod - def get_physical_block_id(self, absolute_id: int) -> int: - pass - - @abstractmethod - def swap_out(self, blocks: List[Block]) -> None: - pass - - @abstractmethod - def swap_in(self, blocks: List[Block]) -> None: - pass - - @property - @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: - pass - - @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - pass - - @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - pass - - @abstractmethod - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """NOTE: This should not be used besides Block""" - pass - - @abstractmethod - def promote_to_immutable_block(self, block: Block) -> BlockId: - """NOTE: This should not be used besides Block""" - pass - - @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - pass - - @abstractmethod - def 
get_prefix_cache_hit_rate(self) -> float: - """Prefix cache hit rate. -1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self) -> bool: - """Reset prefix cache.""" - pass - - class NoFreeBlocksError(ValueError): - pass - - @abstractmethod - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - ) -> List[int]: - pass - - -class DeviceAwareBlockAllocator(ABC): - - @abstractmethod - def allocate_mutable_block(self, - prev_block: Optional[Block], - device: Device, - extra_hash: Optional[int] = None) -> Block: - pass - - @abstractmethod - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> Block: - pass - - @abstractmethod - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device, - extra_hash: Optional[int] = None, - ) -> List[Block]: - pass - - @abstractmethod - def get_num_free_blocks(self, device: Device) -> int: - pass - - @abstractmethod - def get_num_total_blocks(self, device: Device) -> int: - pass - - @abstractmethod - def free(self, block: Block) -> None: - pass - - @abstractmethod - def fork(self, last_block: Block) -> List[Block]: - pass - - @property - @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: - pass - - @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - pass - - @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - pass - - @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block], - device: Device) -> int: - pass - - @abstractmethod - def swap(self, blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: - pass - - @abstractmethod - def get_physical_block_id(self, device: Device, absolute_id: int) -> int: - pass - - @abstractmethod - def allocate_or_get_null_block(self) -> Block: - """ - Null blocks are used as a placeholders for KV cache blocks that have - been dropped due to sliding window. - There is at most one null block per allocator. - """ - pass - - @abstractmethod - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. 
-1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache.""" - pass - - @abstractmethod - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - device: Device = Device.GPU, - ) -> List[int]: - pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py deleted file mode 100644 index ae876d131eb66..0000000000000 --- a/vllm/core/block/naive_block.py +++ /dev/null @@ -1,466 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections import deque -from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union - -from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, - get_all_blocks_recursively) -from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device - -Refcount = int - - -class NaiveBlockAllocator(BlockAllocator): - """A simple block allocator that manages blocks of memory without prefix - caching. - - Args: - create_block (Block.Factory): A factory function for creating new - blocks. This is used when a NaiveBlockAllocator is composed within - a prefix caching allocator -- the naive block allocator must - construct prefix caching blocks (but shouldn't know anything else - about them). - num_blocks (int): The total number of blocks to manage. - block_size (int): The size of each block in tokens. - block_ids (Optional[Iterable[int]], optional): An optional iterable of - block IDs. If not provided, block IDs will be assigned sequentially - from 0 to num_blocks - 1. - """ - - def __init__( - self, - create_block: Block.Factory, - num_blocks: int, - block_size: int, - block_ids: Optional[Iterable[int]] = None, - block_pool: Optional[BlockPool] = None, - ): - if block_ids is None: - block_ids = range(num_blocks) - - self._free_block_indices: Deque[BlockId] = deque(block_ids) - self._all_block_indices = frozenset(block_ids) - assert len(self._all_block_indices) == num_blocks - - self._refcounter = RefCounter( - all_block_indices=self._free_block_indices) - self._block_size = block_size - - self._cow_tracker = CopyOnWriteTracker( - refcounter=self._refcounter.as_readonly()) - - if block_pool is None: - extra_factor = 4 - # Pre-allocate "num_blocks * extra_factor" block objects. - # The "* extra_factor" is a buffer to allow more block objects - # than physical blocks - self._block_pool = BlockPool(self._block_size, create_block, self, - num_blocks * extra_factor) - else: - # In this case, the block pool is provided by the caller, - # which means that there is most likely a need to share - # a block pool between allocators - self._block_pool = block_pool - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a new immutable block with the given token IDs, linked to - the previous block. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - token_ids (List[int]): The token IDs to be stored in the new block. - - Returns: - Block: The newly allocated immutable block. 
- """ - assert device is None - block = self.allocate_mutable_block(prev_block=prev_block) - block.append_token_ids(token_ids) - return block - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: - assert device is None - num_blocks = len(block_token_ids) - - block_ids = [] - for i in range(num_blocks): - block_ids.append(self._allocate_block_id()) - - blocks = [] - for i in range(num_blocks): - prev_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block_token_ids[i], - block_size=self._block_size, - physical_block_id=block_ids[i]) - blocks.append(prev_block) - - return blocks - - def allocate_mutable_block(self, - prev_block: Optional[Block], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a new mutable block, linked to the previous block. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. - """ - assert device is None - block_id = self._allocate_block_id() - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=[], - block_size=self._block_size, - physical_block_id=block_id) - return block - - def _allocate_block_id(self) -> BlockId: - if not self._free_block_indices: - raise BlockAllocator.NoFreeBlocksError() - - block_id = self._free_block_indices.popleft() - self._refcounter.incr(block_id) - return block_id - - def _free_block_id(self, block: Union[Block, BlockId]) -> None: - if isinstance(block, Block): - block_id = block.block_id - block.block_id = None - else: - block_id = block - assert block_id is not None - - refcount = self._refcounter.decr(block_id) - if refcount == 0: - self._free_block_indices.appendleft(block_id) - - def free(self, block: Block, keep_block_object: bool = False) -> None: - # Release the physical block id - self._free_block_id(block) - - # Release the block object - if not keep_block_object: - self._block_pool.free_block(block) - - def free_block_id(self, block_id: BlockId) -> None: - self._free_block_id(block_id) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. - - Returns: - List[Block]: The new sequence of blocks that shares the same memory - as the original sequence. - """ - source_blocks = get_all_blocks_recursively(last_block) - - forked_blocks: List[Block] = [] - prev_block = None - for block in source_blocks: - - # Increment refcount for each block. - assert block.block_id is not None - refcount = self._refcounter.incr(block.block_id) - assert refcount != 1, "can't fork freed block" - - forked_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block.token_ids, - block_size=self._block_size, - physical_block_id=block.block_id) - - forked_blocks.append(forked_block) - prev_block = forked_blocks[-1] - - return forked_blocks - - def get_num_free_blocks(self) -> int: - return len(self._free_block_indices) - - def get_num_total_blocks(self) -> int: - return len(self._all_block_indices) - - def get_physical_block_id(self, absolute_id: int) -> int: - """Returns the zero-offset block id on certain block allocator - given the absolute block id. 
- - Args: - absolute_id (int): The absolute block id for the block - in whole allocator. - - Returns: - int: The zero-offset block id on certain device. - """ - return sorted(self._all_block_indices).index(absolute_id) - - @property - def refcounter(self): - return self._refcounter - - @property - def all_block_ids(self) -> FrozenSet[int]: - return self._all_block_indices - - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """Performs a copy-on-write operation on the given block if it is not - appendable. - - Args: - block (Block): The block to check for copy-on-write. - - Returns: - BlockId: The block index of the new block if a copy-on-write - operation was performed, or the original block index if - no copy-on-write was necessary. - """ - src_block_id = block.block_id - assert src_block_id is not None - - if self._cow_tracker.is_appendable(block): - return src_block_id - - self._free_block_id(block) - trg_block_id = self._allocate_block_id() - - self._cow_tracker.record_cow(src_block_id, trg_block_id) - - return trg_block_id - - def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: - """Returns the copy-on-write source->destination mapping and clears it. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices. - """ - return self._cow_tracker.clear_cows() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, used in prefix caching. - - Since the naive allocator does not implement prefix caching, we do - nothing. - """ - pass - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - """Mark blocks as computed, used in prefix caching. - - Since the naive allocator does not implement prefix caching, we do - nothing. - """ - pass - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - """Determine blocks that can be skipped in prefill. - - Since the naive allocator does not support prefix caching, always return - an empty list. - """ - return [] - - def promote_to_immutable_block(self, block: Block) -> BlockId: - raise NotImplementedError("There is no promotion for naive blocks") - - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out. - - Args: - blocks: List of blocks to be swapped. - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks. Non full blocks are ignored - when deciding the number of blocks to touch. - """ - # NOTE: for naive block, we use set to eliminate common blocks among - # seqs, also we compare the empty slots in the mutable blocks with - # lookahead slots to get the number of unique new block that are - # needed. - old_block_set = set() - for block in blocks: - if block.is_full: - old_block_set.add(block) - return len(old_block_set) - - def swap_out(self, blocks: List[Block]) -> None: - for block in blocks: - self._free_block_id(block) - - def swap_in(self, blocks: List[Block]) -> None: - for block in blocks: - # Here we allocate either immutable or mutable block and then - # extract its block_id. 
Note that the block object is released - # and the block_id is assigned to "block" to allow reusing the - # existing "block" object - if block.is_full: - tmp_block = self.allocate_immutable_block( - prev_block=block.prev_block, token_ids=block.token_ids) - else: - tmp_block = self.allocate_mutable_block( - prev_block=block.prev_block) - tmp_block.append_token_ids(block.token_ids) - - block_id = tmp_block.block_id - tmp_block.block_id = None - self._block_pool.free_block(tmp_block) - - block.block_id = block_id # Assign block_id - - def get_prefix_cache_hit_rate(self) -> float: - return -1 - - def reset_prefix_cache(self) -> bool: - """No prefix cache for naive block allocator.""" - return True - - def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: - # Not applicable for naive block allocator. - return [] - - -class NaiveBlock(Block): - """An implementation of the Block class that does not support prefix - caching. - - The NaiveBlock class represents a block of token IDs with a fixed size. It - provides methods for appending token IDs to the block and manages copy-on - -write operations when necessary. - - Args: - prev_block (Block): The previous block in the sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. - block_size (int): The maximum number of token IDs that can be stored in - the block. - allocator (BlockAllocator): The block allocator associated with this - block. - block_id (Optional[int], optional): The physical block index - of this block. Defaults to None, which means no allocation has been - made. - _cow_target (Optional[Block], optional): The copy-on-write target block. - If not provided, it defaults to self. - """ - - def __init__(self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - _cow_target: Optional[Block] = None, - extra_hash: Optional[int] = None): - self._token_ids: List[int] = [] - self._block_size = block_size - self._prev_block = prev_block - self._block_id = block_id - self._allocator = allocator - self._cow_target = _cow_target if _cow_target is not None else self - - self._append_token_ids_no_cow(token_ids) - - def append_token_ids(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block and performs a - copy-on-write if necessary. - - Args: - token_ids (Optional[List[int]]): The token IDs to be appended - to the block. - """ - self._append_token_ids_no_cow(token_ids) - - if self._block_id is not None: - self._block_id = (self._allocator.cow_block_if_not_appendable( - self._cow_target)) - - def _append_token_ids_no_cow(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block - - Args: - token_ids (List[int]): The token IDs to be appended to the block. 
- """ - if len(token_ids) == 0: - return - - assert len(token_ids) <= self.num_empty_slots - - self._token_ids.extend(token_ids) - - @property - def computed(self) -> bool: - raise NotImplementedError - - @computed.setter - def computed(self, value) -> None: - raise NotImplementedError - - @property - def last_accessed(self) -> float: - raise NotImplementedError - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - raise NotImplementedError - - @property - def block_id(self) -> Optional[int]: - return self._block_id - - @block_id.setter - def block_id(self, value: Optional[int]) -> None: - self._block_id = value - - @property - def is_full(self) -> bool: - return self.num_empty_slots == 0 - - @property - def num_empty_slots(self) -> int: - return self._block_size - len(self.token_ids) - - @property - def token_ids(self) -> List[int]: - return self._token_ids - - @property - def num_tokens_total(self) -> int: - raise NotImplementedError( - "num_tokens_total is not used for naive block") - - @property - def block_size(self) -> int: - return self._block_size - - @property - def prev_block(self) -> Optional["Block"]: - return self._prev_block - - @property - def extra_hash(self): - return None - - @property - def content_hash(self) -> Optional[int]: - return None diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py deleted file mode 100644 index a21d69323abbc..0000000000000 --- a/vllm/core/block/prefix_caching_block.py +++ /dev/null @@ -1,1135 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Token blocks.""" -import sys -from bisect import bisect_left -from os.path import commonprefix -from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set, - Tuple) - -from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker, - get_all_blocks_recursively) -from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device, - DeviceAwareBlockAllocator) -from vllm.core.block.naive_block import (BlockPool, NaiveBlock, - NaiveBlockAllocator) -from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor -from vllm.logger import init_logger -from vllm.sequence import Sequence - -PrefixHash = int - -# By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME -# so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME, -# then we know this block hasn't been accessed yet. -_DEFAULT_LAST_ACCESSED_TIME = -1 - -logger = init_logger(__name__) - - -class BlockTracker: - """Used to track the status of a block inside the prefix caching allocator - """ - __slots__ = ("active", "last_accessed", "computed") - - def reset(self): - self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME - self.computed: bool = False - - def __init__(self): - self.active: bool = False - self.reset() - - def enable(self): - assert not self.active - self.active = True - self.reset() - - def disable(self): - assert self.active - self.active = False - self.reset() - - -class PrefixCachingBlockAllocator(BlockAllocator): - """A block allocator that implements prefix caching. - - The PrefixCachingBlockAllocator maintains a cache of blocks based on their - content hash. It reuses blocks with the same content hash to avoid redundant - memory allocation. The allocator also supports copy-on-write operations. - - Args: - num_blocks (int): The total number of blocks to manage. - block_size (int): The size of each block in tokens. 
- block_ids (Optional[Iterable[int]], optional): An optional iterable of - block IDs. If not provided, block IDs will be assigned sequentially - from 0 to num_blocks - 1. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - # Implements Block.Factory. - def __init__( - self, - num_blocks: int, - block_size: int, - block_ids: Optional[Iterable[int]] = None, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU, - ): - if block_ids is None: - block_ids = range(num_blocks) - - self._block_size = block_size - - # A mapping of prefix hash to block index. All blocks which have a - # prefix hash will be in this dict, even if they have refcount 0. - self._cached_blocks: Dict[PrefixHash, BlockId] = {} - - # A list of immutable block IDs that have been touched by scheduler - # and should be marked as computed after an entire batch of sequences - # are scheduled. - self._touched_blocks: Set[BlockId] = set() - - # Used to track status of each physical block id - self._block_tracker: Dict[BlockId, BlockTracker] = {} - for block_id in block_ids: - self._block_tracker[block_id] = BlockTracker() - - # Pre-allocate "num_blocks * extra_factor" block objects. - # The "* extra_factor" is a buffer to allow more block objects - # than physical blocks - extra_factor = 4 - self._block_pool = BlockPool(self._block_size, self._create_block, - self, num_blocks * extra_factor) - - # An allocator for blocks that do not have prefix hashes. - self._hashless_allocator = NaiveBlockAllocator( - create_block=self._create_block, # type: ignore - num_blocks=num_blocks, - block_size=block_size, - block_ids=block_ids, - block_pool=self._block_pool, # Share block pool here - ) - - # Evictor used to maintain how we want to handle those computed blocks - # if we find memory pressure is high. - self.eviction_policy = eviction_policy - self.evictor: Evictor = make_evictor(self.eviction_policy) - - # We share the refcounter between allocators. This allows us to promote - # blocks originally allocated in the hashless allocator to immutable - # blocks. - self._refcounter = self._hashless_allocator.refcounter - - self._cow_tracker = CopyOnWriteTracker( - refcounter=self._refcounter.as_readonly()) - - self.metric_data = CacheMetricData() - - def _create_block( - self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ) -> Block: - # Bind block to self. - allocator = self - - return PrefixCachingBlock( - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=allocator, - computed=computed, - extra_hash=extra_hash, - ) - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates an immutable block with the given token IDs, reusing cached - blocks if possible. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. - token_ids (List[int]): The token IDs to be stored in the block.
- - Returns: - Block: The allocated immutable block. - """ - assert device is None - assert_prefix_caching_block_or_none(prev_block) - - # First, try to create a block that points to cached data - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=token_ids, - block_size=self._block_size, - physical_block_id=None, - extra_hash=extra_hash) - assert block.content_hash is not None - - cached_block_id = self._cached_blocks.get(block.content_hash, None) - if cached_block_id is not None: - self.metric_data.query(hit=True) - block.block_id = cached_block_id - self._incr_refcount_cached_block(block) - return block - self.metric_data.query(hit=False) - self._block_pool.free_block(block) - - # No cached block => Allocate a new block - block = self.allocate_mutable_block(prev_block, extra_hash=extra_hash) - block.append_token_ids(token_ids) - return block - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: - blocks = [] - for token_ids in block_token_ids: - prev_block = self.allocate_immutable_block(prev_block=prev_block, - token_ids=token_ids, - device=device, - extra_hash=extra_hash) - blocks.append(prev_block) - return blocks - - def allocate_mutable_block(self, - prev_block: Optional[Block], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a mutable block. If there are no free blocks, this will - evict unused cached blocks. - - Args: - prev_block (Block): The previous block in the sequence. - None is not allowed unlike it is super class. - - Returns: - Block: The allocated mutable block. - """ - assert device is None - assert_prefix_caching_block_or_none(prev_block) - - block_id = self._allocate_block_id() - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=[], - block_size=self._block_size, - physical_block_id=block_id, - extra_hash=extra_hash) - assert not block.computed - assert block.content_hash is None - return block - - def _incr_refcount_cached_block(self, block: Block) -> None: - # Set this block to be "computed" since it is pointing to a - # cached block id (which was already computed) - block.computed = True - - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.incr(block_id) - if refcount == 1: - # In case a cached block was evicted, restore its tracking - if block_id in self.evictor: - self.evictor.remove(block_id) - - self._track_block_id(block_id, computed=True) - - def _decr_refcount_cached_block(self, block: Block) -> None: - # Ensure this is immutable/cached block - assert block.content_hash is not None - - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.decr(block_id) - if refcount > 0: - block.block_id = None - return - else: - assert refcount == 0 - - # No longer used - assert block.content_hash in self._cached_blocks - - # Add the cached block to the evictor - # (This keeps the cached block around so it can be reused) - self.evictor.add(block_id, block.content_hash, block.num_tokens_total, - self._block_tracker[block_id].last_accessed) - - # Stop tracking the block - self._untrack_block_id(block_id) - - block.block_id = None - - def _decr_refcount_hashless_block(self, block: Block) -> None: - block_id = block.block_id - assert block_id is not None - - # We may have a fork case where block is shared, - # in which case, we cannot remove it from tracking - refcount = 
self._refcounter.get(block_id) - if refcount == 1: - self._untrack_block_id(block_id) - - # Decrement refcount of the block_id, but do not free the block object - # itself (will be handled by the caller) - self._hashless_allocator.free(block, keep_block_object=True) - - def _allocate_block_id(self) -> BlockId: - """First tries to allocate a block id from the hashless allocator, - and if there are no blocks, then tries to evict an unused cached block. - """ - hashless_block_id = self._maybe_allocate_hashless_block_id() - if hashless_block_id is not None: - return hashless_block_id - - evicted_block_id = self._maybe_allocate_evicted_block_id() - if evicted_block_id is not None: - return evicted_block_id - - # No block available in hashless allocator, nor in unused cache blocks. - raise BlockAllocator.NoFreeBlocksError() - - def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]: - try: - # Allocate mutable block and extract its block_id - block = self._hashless_allocator.allocate_mutable_block( - prev_block=None) - block_id = block.block_id - self._block_pool.free_block(block) - - self._track_block_id(block_id, computed=False) - return block_id - except BlockAllocator.NoFreeBlocksError: - return None - - def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]: - if self.evictor.num_blocks == 0: - return None - - # Here we get an evicted block, which is only added - # into evictor if its ref counter is 0 - # and since its content would be changed, we need - # to remove it from _cached_blocks's tracking list - block_id, content_hash_to_evict = self.evictor.evict() - - # Sanity checks - assert content_hash_to_evict in self._cached_blocks - _block_id = self._cached_blocks[content_hash_to_evict] - assert self._refcounter.get(_block_id) == 0 - assert _block_id == block_id - - self._cached_blocks.pop(content_hash_to_evict) - - self._refcounter.incr(block_id) - self._track_block_id(block_id, computed=False) - - return block_id - - def _free_block_id(self, block: Block) -> None: - """Decrements the refcount of the block. The block may be in two - possible states: (1) immutable/cached or (2) mutable/hashless. - In the first case, the refcount is decremented directly and the block - may be possibly added to the evictor. In other case, hashless - allocator free(..) with keep_block_object=True is called to only free - the block id (since the block object may be reused by the caller) - """ - block_id = block.block_id - assert block_id is not None, "Freeing unallocated block is undefined" - - if block.content_hash is not None: - # Immutable: This type of block is always cached, and we want to - # keep it in the evictor for future reuse - self._decr_refcount_cached_block(block) - else: - # Mutable: This type of block is not cached, so we release it - # directly to the hashless allocator - self._decr_refcount_hashless_block(block) - - assert block.block_id is None - - def free(self, block: Block, keep_block_object: bool = False) -> None: - """Release the block (look at free_block_id(..) docs) - """ - # Release the physical block index - self._free_block_id(block) - - # Release the block object to the pool - if not keep_block_object: - self._block_pool.free_block(block) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. 
- - Returns: - List[Block]: The new sequence of blocks that shares the same memory - as the original sequence. - """ - source_blocks = get_all_blocks_recursively(last_block) - - forked_blocks: List[Block] = [] - prev_block = None - for block in source_blocks: - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.incr(block_id) - assert refcount != 1, "can't fork free'd block_id = {}".format( - block_id) - - forked_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block.token_ids, - block_size=self._block_size, - physical_block_id=block_id, - extra_hash=block.extra_hash) - - forked_blocks.append(forked_block) - prev_block = forked_blocks[-1] - - return forked_blocks - - def get_num_free_blocks(self, device: Optional[Device] = None) -> int: - assert device is None - # The number of free blocks is the number of hashless free blocks - # plus the number of blocks evictor could free from its list. - return self._hashless_allocator.get_num_free_blocks( - ) + self.evictor.num_blocks - - def get_num_total_blocks(self) -> int: - return self._hashless_allocator.get_num_total_blocks() - - def get_physical_block_id(self, absolute_id: int) -> int: - """Returns the zero-offset block id on certain block allocator - given the absolute block id. - - Args: - absolute_id (int): The absolute block id for the block - in whole allocator. - - Returns: - int: The zero-offset block id on certain device. - """ - return sorted(self.all_block_ids).index(absolute_id) - - @property - def all_block_ids(self) -> FrozenSet[int]: - return self._hashless_allocator.all_block_ids - - def get_prefix_cache_hit_rate(self) -> float: - return self.metric_data.get_hit_rate() - - def reset_prefix_cache(self) -> bool: - """Reset prefix cache. This function may be used in RLHF - flows to invalidate prefix caching after the weights are updated, - or used for resetting prefix caching status for benchmarking. - - Returns: - bool: True if the prefix cache is successfully reset, - False otherwise. - """ - num_used_blocks = (self.get_num_total_blocks() - - self.get_num_free_blocks()) - if num_used_blocks > 0: - logger.warning( - "Failed to reset prefix cache because some " - "blocks (%d) are not freed yet", num_used_blocks) - return False - - # Free all blocks in the evictor. - while (block_id := - self._maybe_allocate_evicted_block_id()) is not None: - self._hashless_allocator.free_block_id(block_id) - - # Should not have any cached blocks because all blocks are evicted. - assert not self._cached_blocks - - # Reset the evictor. - self.evictor = make_evictor(self.eviction_policy) - - # Reset the block tracker. - for block_id in self._block_tracker: - self._block_tracker[block_id] = BlockTracker() - - # Reset the metrics. - self.metric_data = CacheMetricData() - - logger.info("Successfully reset prefix cache") - return True - - def is_block_cached(self, block: Block) -> bool: - assert block.content_hash is not None - return block.content_hash in self._cached_blocks - - def promote_to_immutable_block(self, block: Block) -> BlockId: - """Once a mutable block is full, it can be promoted to an immutable - block. This means that its content can be referenced by future blocks - having the same prefix. - - Note that if we already have a cached block with the same content, we - will replace the newly-promoted block's mapping with the existing cached - block id. - - Args: - block: The mutable block to be promoted.
- - Returns: - BlockId: Either the original block index, or the block index of - the previously cached block matching the same content. - """ - # Ensure block can be promoted - assert block.content_hash is not None - assert block.block_id is not None - assert self._refcounter.get(block.block_id) > 0 - - if block.content_hash not in self._cached_blocks: - # No cached content hash => Set this block as cached. - # Note that this block cannot be marked as computed yet - # because other sequences in the same batch cannot reuse - # this block. - self._cached_blocks[block.content_hash] = block.block_id - # Mark this block as touched so that it can be marked as - # computed after the entire batch of sequences are scheduled. - self._touched_blocks.add(block.block_id) - return block.block_id - - # Reuse the cached content hash - self._decr_refcount_hashless_block(block) - block.block_id = self._cached_blocks[block.content_hash] - - # Increment refcount of the cached block and (possibly) restore - # it from the evictor. - # Note that in this case, the block is marked as computed - self._incr_refcount_cached_block(block) - - return block.block_id - - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """Performs a copy-on-write operation on the given block if it is not - appendable. - - Args: - block (Block): The block to check for copy-on-write. - - Returns: - BlockId: The block index of the new block if a copy-on-write - operation was performed, or the original block index if - no copy-on-write was necessary. - """ - src_block_id = block.block_id - assert src_block_id is not None - - if self._cow_tracker.is_appendable(block): - return src_block_id - - self._free_block_id(block) - trg_block_id = self._allocate_block_id() - - self._cow_tracker.record_cow(src_block_id, trg_block_id) - - return trg_block_id - - def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: - """Returns the copy-on-write source->destination mapping and clears it. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices. - """ - return self._cow_tracker.clear_cows() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, used in prefix caching. - - If the block is added into evictor, we need to update corresponding - info in evictor's metadata. - """ - - for block_id in block_ids: - if self._block_tracker[block_id].active: - self._block_tracker[block_id].last_accessed = now - elif block_id in self.evictor: - self.evictor.update(block_id, now) - else: - raise ValueError( - "Mark block as accessed which is not belonged to GPU") - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - # Mark all touched blocks as computed. 
- for block_id in self._touched_blocks: - self._block_tracker[block_id].computed = True - self._touched_blocks.clear() - - def _track_block_id(self, block_id: Optional[BlockId], - computed: bool) -> None: - assert block_id is not None - self._block_tracker[block_id].enable() - self._block_tracker[block_id].computed = computed - - def _untrack_block_id(self, block_id: Optional[BlockId]) -> None: - assert block_id is not None - self._block_tracker[block_id].disable() - - def block_is_computed(self, block_id: int) -> bool: - if self._block_tracker[block_id].active: - return self._block_tracker[block_id].computed - else: - return block_id in self.evictor - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - """Return the block ids that are common for a given sequence group. - - Only those blocks that are immutable and already marked as - computed are taken into consideration. - """ - - # NOTE We exclude the last block to avoid the case where the entire - # prompt is cached. This would cause erroneous behavior in model - # runner. - - # It returns a list of int although type annotation says list of string. - if len(computed_seq_block_ids) == 1: - return computed_seq_block_ids[0] - - return commonprefix([ - ids for ids in computed_seq_block_ids # type: ignore - if ids - ]) - - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out. - - Args: - blocks: List of blocks to be swapped. - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks. Non full blocks are ignored - when deciding the number of blocks to touch. - """ - num_touched_blocks: int = 0 - for block in blocks: - # If the block has a match in the cache and the cached - # block is not referenced, then we still count it as a - # touched block - if block.is_full and (not self.is_block_cached(block) or \ - (block.content_hash is not None and \ - self._cached_blocks[block.content_hash] in \ - self.evictor)): - num_touched_blocks += 1 - return num_touched_blocks - - def swap_out(self, blocks: List[Block]) -> None: - """Execute the swap out actions. Basically just free the - given blocks. - - Args: - blocks: List of blocks to be swapped out. - """ - for block in blocks: - self._free_block_id(block) - - def swap_in(self, blocks: List[Block]) -> None: - """Execute the swap in actions. Change the block id from - old allocator to current allocator for each block to finish - the block table update. - - Args: - blocks: List of blocks to be swapped in. - """ - for block in blocks: - # Here we allocate either immutable or mutable block and then - # extract its block_id. Note that the block object is released - # and the block_id is assigned to "block" to allow reusing the - # existing "block" object - if block.is_full: - tmp_block = self.allocate_immutable_block( - prev_block=block.prev_block, - token_ids=block.token_ids, - extra_hash=block.extra_hash) - else: - tmp_block = self.allocate_mutable_block( - prev_block=block.prev_block, extra_hash=block.extra_hash) - tmp_block.append_token_ids(block.token_ids) - - block_id = tmp_block.block_id - self._block_pool.free_block(tmp_block) - - block.block_id = block_id # Assign block_id - - def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: - """ - Given a list of block hashes, return the prefix of the block hashes that - are all cached.
- - Since a block's block hash includes the hashes of all previous blocks, - and we only allocate/deallocate blocks in the entire sequence, so if a - block is cached, then all previous blocks are also cached. With this - property, we can use binary search to find the prefix of cached blocks. - - Args: - block_hashes (List[int]): The list of block hashes. - - Returns: - List[int]: The prefix of the `block_hashes` that are cached. - """ - - def _block_is_cached(block_hash: PrefixHash) -> bool: - if block_hash not in self._cached_blocks: - return False - - cached_block_id = self._cached_blocks[block_hash] - # We only consider the blocks that are marked as computed. - return self.block_is_computed(cached_block_id) - - def _bisect_left(a, x, key: Callable[[PrefixHash], bool]) -> int: - - # python <= 3.10 don't have the key argument - if sys.version_info < (3, 10): - a = [key(e) for e in a] - return bisect_left(a, x) - else: - return bisect_left(a, x, key=key) - - # Look for the first block that's not cached, and returns the prefix - # i.e. blocks that are cached. - idx = _bisect_left(block_hashes, - True, - key=lambda x: not _block_is_cached(x)) - return block_hashes[:idx] - - -class PrefixCachingBlock(Block): - """A block implementation that supports prefix caching. - - The PrefixCachingBlock class represents a block of token IDs with prefix - caching capabilities. It wraps a NaiveBlock internally and provides - additional functionality for content hashing and promoting immutable blocks - with the prefix caching allocator. - - Args: - prev_block (Optional[PrefixCachingBlock]): The previous block in the - sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. - block_size (int): The maximum number of token IDs that can be stored in - the block. - allocator (BlockAllocator): The prefix - caching block allocator associated with this block. - block_id (Optional[int], optional): The physical block index - of this block. Defaults to None. - extra_hash (Optional[int]): The hash value of additional factors - such as adapters that influence the block, apart from the token_ids. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - def __init__( - self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ): - assert isinstance(allocator, PrefixCachingBlockAllocator), ( - "Currently this class is only tested with " - "PrefixCachingBlockAllocator. 
Got instead allocator = {}".format( - allocator)) - assert_prefix_caching_block_or_none(prev_block) - - self._prev_block = prev_block - self._cached_content_hash: Optional[int] = None - self._cached_num_tokens_total: int = 0 - self._allocator = allocator - self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME - self._computed = computed - self._extra_hash = extra_hash - - # On the first time, we create the block object, and next we only - # reinitialize it - if hasattr(self, "_block"): - self._block.__init__( # type: ignore[has-type] - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=self._allocator) - else: - self._block = NaiveBlock(prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=self._allocator) - - self._update_num_tokens_total() - - def _update_num_tokens_total(self): - """Incrementally computes the number of tokens that there is - till the current block (included) - """ - res = 0 - - # Add all previous blocks - if self._prev_block is not None: - res += self._prev_block.num_tokens_total - - # Add current block - res += len(self.token_ids) - - self._cached_num_tokens_total = res - - @property - def computed(self) -> bool: - return self._computed - - @computed.setter - def computed(self, value) -> None: - self._computed = value - - @property - def last_accessed(self) -> float: - return self._last_accessed - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - self._last_accessed = last_accessed_ts - - def append_token_ids(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block and registers the block as - immutable if the block becomes full. - - Args: - token_ids (List[int]): The token IDs to be appended to the block. - """ - # Ensure this is mutable block (not promoted) - assert self.content_hash is None - assert not self.computed - - if len(token_ids) == 0: - return - - # Ensure there are input tokens - assert token_ids, "Got token_ids = {}".format(token_ids) - - # Naive block handles CoW. - self._block.append_token_ids(token_ids) - self._update_num_tokens_total() - - # If the content hash is present, then the block can be made immutable. - # Register ourselves with the allocator, potentially replacing the - # physical block index. - if self.content_hash is not None: - self.block_id = self._allocator.promote_to_immutable_block(self) - - @property - def block_id(self) -> Optional[int]: - return self._block.block_id - - @block_id.setter - def block_id(self, value) -> None: - self._block.block_id = value - - @property - def is_full(self) -> bool: - return self._block.is_full - - @property - def num_empty_slots(self) -> int: - return self._block.num_empty_slots - - @property - def num_tokens_total(self) -> int: - return self._cached_num_tokens_total - - @property - def block_size(self) -> int: - return self._block.block_size - - @property - def token_ids(self) -> List[int]: - return self._block.token_ids - - @property - def prev_block(self) -> Optional[Block]: - return self._prev_block - - @property - def extra_hash(self) -> Optional[int]: - return self._extra_hash - - @property - def content_hash(self) -> Optional[int]: - """Return the content-based hash of the current block, or None if it is - not yet defined. - - For the content-based hash to be defined, the current block must be - full. - """ - # If the hash is already computed, return it. 
- if self._cached_content_hash is not None: - return self._cached_content_hash - - # We cannot compute a hash for the current block because it is not full. - if not self.is_full: - return None - - is_first_block = self._prev_block is None - prev_block_hash = ( - self._none_hash if is_first_block else - self._prev_block.content_hash # type: ignore - ) - - # Previous block exists but does not yet have a hash. - # Return no hash in this case. - if prev_block_hash == self._none_hash and not is_first_block: - return None - - self._cached_content_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block, - prev_block_hash, - cur_block_token_ids=self.token_ids, - extra_hash=self._extra_hash) - return self._cached_content_hash - - @classmethod - def hash_block_tokens(cls, - is_first_block: bool, - prev_block_hash: Optional[int], - cur_block_token_ids: List[int], - extra_hash: Optional[int] = None) -> int: - """Computes a hash value corresponding to the contents of a block and - the contents of the preceding block(s). The hash value is used for - prefix caching. - - Parameters: - - is_first_block (bool): A flag indicating if the block is the first in - the sequence. - - prev_block_hash (Optional[int]): The hash of the previous block. None - if this is the first block. - - cur_block_token_ids (List[int]): A list of token ids in the current - block. The current block is assumed to be full. - - extra_hash (Optional[int]): The hash value of additional factors - such as adapters that influence the block, apart from the token_ids. - - Returns: - - int: The computed hash value for the block. - """ - if is_first_block and prev_block_hash is None: - prev_block_hash = cls._none_hash - return hash((is_first_block, prev_block_hash, *cur_block_token_ids, - extra_hash)) - - -class ComputedBlocksTracker: - """ - Tracks the computed blocks for each sequence. - - Internally, it maintains a map from sequence id to the list of block hashes - for the sequence. We cache the hashes of the full blocks for each sequence, - and make sure the hash is calculated in the same way as the allocator. - When a sequence is being decoded, we also update the sequence's hash - accordingly and incrementally. - - From the sequence hash, with prefix caching enabled, we could also calculate - the number of cached tokens for the sequence by looking up the number of - cached block hashes in the allocator. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - def __init__( - self, - allocator: DeviceAwareBlockAllocator, - block_size: int, - enable_caching: bool, - ): - self._allocator = allocator - self._block_size = block_size - self._enable_caching = enable_caching - - # A map from seq_id to the list of block hashes for the - # sequence. This is so that we don't have to recompute the block hashes - # for the sequence when we need to check if the sequence is cached. - # Note a block that's not full will not have its hash calculated and - # recorded. - self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {} - - # A map from seq_id to the number of tokens that are cached for the - # sequence. 
- # We need this so that a sequence in continuous prefill doesn't - # accidentally see its cached token count change. See comments in - # `get_num_cached_tokens` for more details. - self._seq_id_to_num_tokens_computed: Dict[int, int] = {} - - def _update_seq_hashes(self, seq: Sequence) -> None: - """Incrementally update the sequence's block hashes and record them.""" - assert self._enable_caching - - block_hashes_recorded = self._seq_id_to_blocks_hashes.get( - seq.seq_id, []) - cur_num_blocks_recorded = len(block_hashes_recorded) - token_ids = seq.get_token_ids() - assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, ( - f"The sequence has {len(token_ids)} tokens, but" - f" already recorded {cur_num_blocks_recorded} blocks. " - "This should not happen since we assume blocks are " - "only appended other than recomputation. When the sequence is " - "recomputed, we should have removed the info of the old blocks.") - # Update the computed block hashes for the sequence. Since only full - # blocks are considered as "computed", we take floor here. - num_computed_blocks = len(token_ids) // self._block_size - - # We need to know the hash of the previous block to compute the hash of - # the current block so that blocks could be uniquely identified across - # sequences of prefixes. - prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else - block_hashes_recorded[-1]) - # Only update the computed block hashes for the new blocks - for i in range(cur_num_blocks_recorded, num_computed_blocks): - assert len(token_ids) >= (i + 1) * self._block_size - block_token_ids = token_ids[i * self._block_size:(i + 1) * - self._block_size] - - # NOTE: If there are any factors affecting the block besides - # token_ids, they should be added as input to extra_hash. - extra_hash = seq.extra_hash() - - # This has to be kept in sync with the allocator's hash - # calculation. - block_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block=prev_block_hash == self._none_hash, - prev_block_hash=prev_block_hash, - cur_block_token_ids=block_token_ids, - extra_hash=extra_hash, - ) - block_hashes_recorded.append(block_hash) - prev_block_hash = block_hash - - self._seq_id_to_blocks_hashes[seq.seq_id] = block_hashes_recorded - - def get_num_cached_tokens(self, seq: Sequence) -> int: - if not self._enable_caching: - return 0 - - # We always try to update the sequence hashes on the fly. - # This is to ensure that we don't miss any cached tokens for the - # sequence during decode. - # This routine should only update hash for any new blocks too. - self._update_seq_hashes(seq) - - num_computed_tokens_prev = self._seq_id_to_num_tokens_computed.get( - seq.seq_id, None) - - # TODO(rickyx): This hack could be removed once we mark blocks as - # computed correctly with chunked prefills. - if num_computed_tokens_prev is not None and seq.is_prefill(): - # For a sequence that is still in prefill, we don't - # recompute the number of cached tokens. - # This also handles correctly chunked prefill since currently - # we mark blocks as computed even if the sequence is still partially - # prefilled. So a continuously prefilled sequence should not - # see its cached token count change while running. - return num_computed_tokens_prev - - block_hashes = self._seq_id_to_blocks_hashes[seq.seq_id] - - # This is O(logN), where N is the number of blocks. 
- num_cached_blocks = len( - self._allocator.find_cached_blocks_prefix(block_hashes)) - num_cached_tokens = num_cached_blocks * self._block_size - self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens - return num_cached_tokens - - def remove_seq(self, seq_id: int) -> None: - """Stop tracking the sequence.""" - if not self._enable_caching: - return - assert seq_id in self._seq_id_to_blocks_hashes - del self._seq_id_to_blocks_hashes[seq_id] - - assert seq_id in self._seq_id_to_num_tokens_computed - del self._seq_id_to_num_tokens_computed[seq_id] - - -class LastAccessBlocksTracker: - """Manages the last access time of the tracked sequences, in order to allow - an efficient update of allocator's block last access times - """ - - def __init__(self, allocator): - self._allocator = allocator - self._seq_last_access: Dict[int, Optional[float]] = {} - - def add_seq(self, seq_id: int) -> None: - """Start tracking seq_id - """ - assert seq_id not in self._seq_last_access - self._seq_last_access[seq_id] = None - - def remove_seq(self, seq_id: int) -> None: - """Stop tracking seq_id - """ - assert seq_id in self._seq_last_access - del self._seq_last_access[seq_id] - - def update_last_access(self, seq_id: int, time: float) -> None: - assert seq_id in self._seq_last_access - self._seq_last_access[seq_id] = time - - def update_seq_blocks_last_access(self, seq_id: int, - block_ids: List[int]) -> None: - assert seq_id in self._seq_last_access - - ts = self._seq_last_access[seq_id] - - if ts is None: - # No last access was recorded, no need to update. - return - - self._allocator.mark_blocks_as_accessed(block_ids, ts) - - -def assert_prefix_caching_block_or_none(block: Optional[Block]): - if block is None: - return - assert isinstance(block, - PrefixCachingBlock), "Got block = {}".format(block) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py deleted file mode 100644 index e933c6ee7c8bd..0000000000000 --- a/vllm/core/block/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Block manager utils.""" -from vllm.sequence import SequenceGroup -from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) - - -def check_no_caching_or_swa_for_blockmgr_encdec( - block_mgr, seq_group: SequenceGroup) -> None: - ''' - Enforce that prefix caching & sliding-window attention (SWA) - are currently unsupported *specifically* for encoder/decoder models. - - Raises NotImplementedError if unsupported scenario is detected. 
- - Arguments: - - * block_mgr: BlockSpaceManager instance - * seq_group: SequenceGroup passed to block_mgr - ''' - - if seq_group.is_encoder_decoder(): - if block_mgr.max_block_sliding_window is not None: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) - - if block_mgr.enable_caching: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py deleted file mode 100644 index cbfa4d7ff3c4c..0000000000000 --- a/vllm/core/block_manager.py +++ /dev/null @@ -1,523 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A block manager that manages token blocks.""" -from typing import Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple - -from vllm.core.block.block_table import BlockTable -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.interfaces import Block -from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, - LastAccessBlocksTracker) -from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device - -SeqId = int -EncoderSeqId = str - - -class SelfAttnBlockSpaceManager(BlockSpaceManager): - """BlockSpaceManager which manages the allocation of KV cache. - - It owns responsibility for allocation, swapping, allocating memory for - autoregressively-generated tokens, and other advanced features such as - prefix caching, forking/copy-on-write, and sliding-window memory allocation. - - This class implements the design described in - https://github.com/vllm-project/vllm/pull/3492. - - Lookahead slots - The block manager has the notion of a "lookahead slot". These are slots - in the KV cache that are allocated for a sequence. Unlike the other - allocated slots, the content of these slots is undefined -- the worker - may use the memory allocations in any way. - - In practice, a worker could use these lookahead slots to run multiple - forward passes for a single scheduler invocation. Each successive - forward pass would write KV activations to the corresponding lookahead - slot. This allows low inter-token latency use-cases, where the overhead - of continuous batching scheduling is amortized over >1 generated tokens. - - Speculative decoding uses lookahead slots to store KV activations of - proposal tokens. - - See https://github.com/vllm-project/vllm/pull/3250 for more information - on lookahead scheduling. - - Args: - block_size (int): The size of each memory block. - num_gpu_blocks (int): The number of memory blocks allocated on GPU. - num_cpu_blocks (int): The number of memory blocks allocated on CPU. - watermark (float, optional): The threshold used for memory swapping. - Defaults to 0.01. - sliding_window (Optional[int], optional): The size of the sliding - window. Defaults to None. - enable_caching (bool, optional): Flag indicating whether caching is - enabled. Defaults to False. 
- """ - - def __init__( - self, - block_size: int, - num_gpu_blocks: int, - num_cpu_blocks: int, - watermark: float = 0.01, - sliding_window: Optional[int] = None, - enable_caching: bool = False, - ) -> None: - self.block_size = block_size - self.num_total_gpu_blocks = num_gpu_blocks - self.num_total_cpu_blocks = num_cpu_blocks - - self.sliding_window = sliding_window - # max_block_sliding_window is the max number of blocks that need to be - # allocated - self.max_block_sliding_window = None - if sliding_window is not None: - # +1 here because // rounds down - num_blocks = sliding_window // block_size + 1 - # +1 here because the last block may not be full, - # and so the sequence stretches one more block at the beginning - # For example, if sliding_window is 3 and block_size is 4, - # we may need 2 blocks when the second block only holds 1 token. - self.max_block_sliding_window = num_blocks + 1 - - self.watermark = watermark - assert watermark >= 0.0 - - self.enable_caching = enable_caching - - self.watermark_blocks = int(watermark * num_gpu_blocks) - - self.block_allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching" if enable_caching else "naive", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - block_size=block_size, - ) - - self.block_tables: Dict[SeqId, BlockTable] = {} - self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} - - self._computed_blocks_tracker = ComputedBlocksTracker( - self.block_allocator, self.block_size, self.enable_caching) - self._last_access_blocks_tracker = LastAccessBlocksTracker( - self.block_allocator) - - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - # FIXME(woosuk): Here we assume that all sequences in the group share - # the same prompt. This may not be true for preempted sequences. - - check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) - - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = BlockTable.get_num_required_blocks( - seq.get_token_ids(), - block_size=self.block_size, - num_lookahead_slots=num_lookahead_slots, - ) - - if seq_group.is_encoder_decoder(): - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - num_required_blocks += BlockTable.get_num_required_blocks( - encoder_seq.get_token_ids(), - block_size=self.block_size, - ) - - if self.max_block_sliding_window is not None: - num_required_blocks = min(num_required_blocks, - self.max_block_sliding_window) - - num_free_gpu_blocks = self.block_allocator.get_num_free_blocks( - device=Device.GPU) - - # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks - < self.watermark_blocks): - return AllocStatus.NEVER - if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: - return AllocStatus.OK - else: - return AllocStatus.LATER - - def _allocate_sequence(self, seq: Sequence) -> BlockTable: - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - max_block_sliding_window=self.max_block_sliding_window, - ) - if seq.get_token_ids(): - # NOTE: If there are any factors affecting the block besides - # token_ids, they should be added as input to extra_hash. - extra_hash = seq.extra_hash() - - # Add blocks to the block table only if the sequence is non empty. 
- block_table.allocate(token_ids=seq.get_token_ids(), - extra_hash=extra_hash) - - return block_table - - def allocate(self, seq_group: SequenceGroup) -> None: - - # Allocate self-attention block tables for decoder sequences - waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) - assert not (set(seq.seq_id for seq in waiting_seqs) - & self.block_tables.keys()), "block table already exists" - - # NOTE: Here we assume that all sequences in the group have the same - # prompt. - seq = waiting_seqs[0] - block_table: BlockTable = self._allocate_sequence(seq) - self.block_tables[seq.seq_id] = block_table - - # Track seq - self._last_access_blocks_tracker.add_seq(seq.seq_id) - - # Assign the block table for each sequence. - for seq in waiting_seqs[1:]: - self.block_tables[seq.seq_id] = block_table.fork() - - # Track seq - self._last_access_blocks_tracker.add_seq(seq.seq_id) - - # Allocate cross-attention block table for encoder sequence - # - # NOTE: Here we assume that all sequences in the group have the same - # encoder prompt. - request_id = seq_group.request_id - - assert (request_id - not in self.cross_block_tables), \ - "block table already exists" - - check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) - - if seq_group.is_encoder_decoder(): - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - block_table = self._allocate_sequence(encoder_seq) - self.cross_block_tables[request_id] = block_table - - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - """Determine if there is enough space in the GPU KV cache to continue - generation of the specified sequence group. - - We use a worst-case heuristic: assume each touched block will require a - new allocation (either via CoW or new block). We can append slots if the - number of touched blocks is less than the number of free blocks. - - "Lookahead slots" are slots that are allocated in addition to the slots - for known tokens. The contents of the lookahead slots are not defined. - This is used by speculative decoding when speculating future tokens. - """ - - num_touched_blocks = 0 - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - block_table = self.block_tables[seq.seq_id] - - num_touched_blocks += ( - block_table.get_num_blocks_touched_by_append_slots( - token_ids=block_table.get_unseen_token_ids( - seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots, - )) - - num_free_gpu_blocks = self.block_allocator.get_num_free_blocks( - Device.GPU) - return num_touched_blocks <= num_free_gpu_blocks - - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - - block_table = self.block_tables[seq.seq_id] - - block_table.append_token_ids( - token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots, - num_computed_slots=seq.data.get_num_computed_tokens(), - extra_hash=seq.extra_hash(), - ) - # Return any new copy-on-writes. - new_cows = self.block_allocator.clear_copy_on_writes() - return new_cows - - def free(self, seq: Sequence) -> None: - seq_id = seq.seq_id - - if seq_id not in self.block_tables: - # Already freed or haven't been scheduled yet. 
- return - - # Update seq block ids with the latest access time - self._last_access_blocks_tracker.update_seq_blocks_last_access( - seq_id, self.block_tables[seq.seq_id].physical_block_ids) - - # Untrack seq - self._last_access_blocks_tracker.remove_seq(seq_id) - self._computed_blocks_tracker.remove_seq(seq_id) - - # Free table/blocks - self.block_tables[seq_id].free() - del self.block_tables[seq_id] - - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - seq_id = seq.seq_id - self._computed_blocks_tracker.remove_seq(seq_id) - - def free_cross(self, seq_group: SequenceGroup) -> None: - request_id = seq_group.request_id - if request_id not in self.cross_block_tables: - # Already freed or hasn't been scheduled yet. - return - self.cross_block_tables[request_id].free() - del self.cross_block_tables[request_id] - - def get_block_table(self, seq: Sequence) -> List[int]: - block_ids = self.block_tables[seq.seq_id].physical_block_ids - return block_ids # type: ignore - - def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: - request_id = seq_group.request_id - assert request_id in self.cross_block_tables - block_ids = self.cross_block_tables[request_id].physical_block_ids - assert all(b is not None for b in block_ids) - return block_ids # type: ignore - - def access_all_blocks_in_seq(self, seq: Sequence, now: float): - if self.enable_caching: - # Record the latest access time for the sequence. The actual update - # of the block ids is deferred to the sequence free(..) call, since - # only during freeing of block ids, the blocks are actually added to - # the evictor (which is when the most updated time is required) - # (This avoids expensive calls to mark_blocks_as_accessed(..)) - self._last_access_blocks_tracker.update_last_access( - seq.seq_id, now) - - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - # If prefix caching is enabled, mark immutable blocks as computed - # right after they have been scheduled (for prefill). This assumes - # the scheduler is synchronous so blocks are actually computed when - # scheduling the next batch. - self.block_allocator.mark_blocks_as_computed([]) - - def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: - """Determine which blocks for which we skip prefill. - - With prefix caching we can skip prefill for previously-generated blocks. - Currently, the attention implementation only supports skipping cached - blocks if they are a contiguous prefix of cached blocks. - - This method determines which blocks can be safely skipped for all - sequences in the sequence group. - """ - computed_seq_block_ids = [] - for seq in seqs: - all_blocks = self.block_tables[seq.seq_id].physical_block_ids - num_cached_tokens = ( - self._computed_blocks_tracker.get_num_cached_tokens(seq)) - assert num_cached_tokens % self.block_size == 0 - num_cached_blocks = num_cached_tokens // self.block_size - computed_block_ids = all_blocks[:num_cached_blocks] - computed_seq_block_ids.append(computed_block_ids) - - # NOTE(sang): This assumes seq_block_ids doesn't contain any None. - return self.block_allocator.get_common_computed_block_ids( - computed_seq_block_ids) # type: ignore - - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - if parent_seq.seq_id not in self.block_tables: - # Parent sequence has either been freed or never existed. 
- return - src_block_table = self.block_tables[parent_seq.seq_id] - self.block_tables[child_seq.seq_id] = src_block_table.fork() - - # Track child seq - self._last_access_blocks_tracker.add_seq(child_seq.seq_id) - - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - """Returns the AllocStatus for the given sequence_group - with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap in. - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. - - Returns: - AllocStatus: The AllocStatus for the given sequence group. - """ - return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, - num_lookahead_slots) - - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - """Returns the block id mapping (from CPU to GPU) generated by - swapping in the given seq_group with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap in. - - Returns: - List[Tuple[int, int]]: The mapping of swapping block from CPU - to GPU. - """ - physical_block_id_mapping = [] - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - blocks = self.block_tables[seq.seq_id].blocks - if len(blocks) == 0: - continue - - seq_swap_mapping = self.block_allocator.swap(blocks=blocks, - src_device=Device.CPU, - dst_device=Device.GPU) - - # Refresh the block ids of the table (post-swap) - self.block_tables[seq.seq_id].update(blocks) - - seq_physical_block_id_mapping = { - self.block_allocator.get_physical_block_id( - Device.CPU, cpu_block_id): - self.block_allocator.get_physical_block_id( - Device.GPU, gpu_block_id) - for cpu_block_id, gpu_block_id in seq_swap_mapping.items() - } - - physical_block_id_mapping.extend( - list(seq_physical_block_id_mapping.items())) - - return physical_block_id_mapping - - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - """Returns whether we can swap out the given sequence_group - with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap out. - - Returns: - bool: Whether it's possible to swap out current sequence group. - """ - alloc_status = self._can_swap(seq_group, Device.CPU, - SequenceStatus.RUNNING) - return alloc_status == AllocStatus.OK - - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - """Returns the block id mapping (from GPU to CPU) generated by - swapping out the given sequence_group with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap out. - - Returns: - List[Tuple[int, int]]: The mapping of swapping block from - GPU to CPU. 
- """ - physical_block_id_mapping = [] - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - blocks = self.block_tables[seq.seq_id].blocks - if len(blocks) == 0: - continue - - seq_swap_mapping = self.block_allocator.swap(blocks=blocks, - src_device=Device.GPU, - dst_device=Device.CPU) - - # Refresh the block ids of the table (post-swap) - self.block_tables[seq.seq_id].update(blocks) - - seq_physical_block_id_mapping = { - self.block_allocator.get_physical_block_id( - Device.GPU, gpu_block_id): - self.block_allocator.get_physical_block_id( - Device.CPU, cpu_block_id) - for gpu_block_id, cpu_block_id in seq_swap_mapping.items() - } - - physical_block_id_mapping.extend( - list(seq_physical_block_id_mapping.items())) - - return physical_block_id_mapping - - def get_num_free_gpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.GPU) - - def get_num_free_cpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.CPU) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return self.block_allocator.get_prefix_cache_hit_rate(device) - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return self.block_allocator.reset_prefix_cache(device) - - def _can_swap(self, - seq_group: SequenceGroup, - device: Device, - status: SequenceStatus, - num_lookahead_slots: int = 0) -> AllocStatus: - """Returns the AllocStatus for swapping in/out the given sequence_group - on to the 'device'. - - Args: - seq_group (SequenceGroup): The sequence group to swap in/out. - device (Device): device to swap the 'seq_group' on. - status (SequenceStatus): The status of sequence which is needed - for action. RUNNING for swap out and SWAPPED for swap in - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. - - Returns: - AllocStatus: The AllocStatus for swapping in/out the given - sequence_group on to the 'device'. - """ - # First determine the number of blocks that will be touched by this - # swap. Then verify if there are available blocks in the device - # to perform the swap. - num_blocks_touched = 0 - blocks: List[Block] = [] - for seq in seq_group.get_seqs(status=status): - block_table = self.block_tables[seq.seq_id] - if block_table.blocks is not None: - # Compute the number blocks to touch for the tokens to be - # appended. This does NOT include the full blocks that need - # to be touched for the swap. - num_blocks_touched += \ - block_table.get_num_blocks_touched_by_append_slots( - block_table.get_unseen_token_ids(seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots) - blocks.extend(block_table.blocks) - # Compute the number of full blocks to touch and add it to the - # existing count of blocks to touch. - num_blocks_touched += self.block_allocator.get_num_full_blocks_touched( - blocks, device=device) - - watermark_blocks = 0 - if device == Device.GPU: - watermark_blocks = self.watermark_blocks - - if self.block_allocator.get_num_total_blocks( - device) < num_blocks_touched: - return AllocStatus.NEVER - elif self.block_allocator.get_num_free_blocks( - device) - num_blocks_touched >= watermark_blocks: - return AllocStatus.OK - else: - return AllocStatus.LATER - - def get_num_cached_tokens(self, seq: Sequence) -> int: - """Get the number of tokens in blocks that are already computed and - cached in the block manager for the sequence. 
- """ - return self._computed_blocks_tracker.get_num_cached_tokens(seq) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py deleted file mode 100644 index 85ff6bc9ca610..0000000000000 --- a/vllm/core/evictor.py +++ /dev/null @@ -1,157 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import heapq -from abc import ABC, abstractmethod -from typing import Dict, List, Tuple - - -class EvictionPolicy(enum.Enum): - """Enum for eviction policy used by make_evictor to instantiate the correct - Evictor subclass. - """ - LRU = enum.auto() - - -class Evictor(ABC): - """The Evictor subclasses should be used by the BlockAllocator class to - handle eviction of freed Blocks. - """ - - @abstractmethod - def __init__(self): - pass - - @abstractmethod - def __contains__(self, block_id: int) -> bool: - pass - - @abstractmethod - def evict(self) -> Tuple[int, int]: - """Runs the eviction algorithm and returns the evicted block's - content hash along with physical block id along with physical block id - """ - pass - - @abstractmethod - def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - """Adds block to the evictor, making it a candidate for eviction""" - pass - - @abstractmethod - def update(self, block_id: int, last_accessed: float): - """Update corresponding block's access time in metadata""" - pass - - @abstractmethod - def remove(self, block_id: int): - """Remove a given block id from the cache.""" - pass - - @property - @abstractmethod - def num_blocks(self) -> int: - pass - - -class BlockMetaData: - """Data structure for storing key data describe cached block, so that - evictor could use to make its decision which one to choose for eviction - - Here we use physical block id as the dict key, as there maybe several - blocks with the same content hash, but their physical id is unique. - """ - - def __init__(self, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - self.content_hash = content_hash - self.num_hashed_tokens = num_hashed_tokens - self.last_accessed = last_accessed - - -class LRUEvictor(Evictor): - """Evicts in a least-recently-used order using the last_accessed timestamp - that's recorded in the Block. If there are multiple blocks with - the same last_accessed time, then the one with the largest num_hashed_tokens - will be evicted. If two blocks each have the lowest last_accessed time and - highest num_hashed_tokens value, then one will be chosen arbitrarily - """ - - # CLEANUP_THRESHOLD determines the maximum allowable size of the priority - # queue relative to the free table size. When this threshold is exceeded, - # a cleanup operation is triggered to reduce memory usage. - CLEANUP_THRESHOLD = 50 - - def __init__(self): - self.free_table: Dict[int, BlockMetaData] = {} - self.priority_queue = [] - - def __contains__(self, block_id: int) -> bool: - return block_id in self.free_table - - def evict(self) -> Tuple[int, int]: - if len(self.free_table) == 0: - raise ValueError("No usable cache memory left") - - while self.priority_queue: - # We do not remove outdated entries from the priority queue at the - # time of updating the last_accessed timestamp. Instead, outdated - # entries are filtered out here during eviction. Outdated entries - # would either not in the free table, or have older last accessed - # time. 
- last_accessed, _, block_id, content_hash = heapq.heappop( - self.priority_queue) - if (block_id in self.free_table and - self.free_table[block_id].last_accessed == last_accessed): - self.free_table.pop(block_id) - return block_id, content_hash - - raise ValueError("No usable cache memory left") - - def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - self.free_table[block_id] = BlockMetaData(content_hash, - num_hashed_tokens, - last_accessed) - heapq.heappush( - self.priority_queue, - (last_accessed, -num_hashed_tokens, block_id, content_hash)) - self._cleanup_if_necessary() - - def update(self, block_id: int, last_accessed: float): - self.free_table[block_id].last_accessed = last_accessed - - def _cleanup_if_necessary(self): - if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len( - self.free_table): - self._cleanup() - - def _cleanup(self): - new_priority_queue: List[Tuple[float, int, int, int]] = [] - - for block_id, block in self.free_table.items(): - new_priority_queue.append( - (block.last_accessed, -block.num_hashed_tokens, block_id, - block.content_hash)) - heapq.heapify(new_priority_queue) - - self.priority_queue = new_priority_queue - - def remove(self, block_id: int): - if block_id not in self.free_table: - raise ValueError( - "Attempting to remove block that's not in the evictor") - self.free_table.pop(block_id) - - @property - def num_blocks(self) -> int: - return len(self.free_table) - - -def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: - if eviction_policy == EvictionPolicy.LRU: - return LRUEvictor() - else: - raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py deleted file mode 100644 index 69b9169ddd8a9..0000000000000 --- a/vllm/core/interfaces.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -from abc import ABC, abstractmethod -from typing import List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple - -from vllm.sequence import Sequence, SequenceGroup -from vllm.utils import Device - - -class AllocStatus(enum.Enum): - """Result for BlockSpaceManager.can_allocate - - 1. Ok: seq_group can be allocated now. - 2. Later: seq_group cannot be allocated. - The capacity of allocator is larger than seq_group required. - 3. Never: seq_group can never be allocated. - The seq_group is too large to allocated in GPU. 
- """ - OK = enum.auto() - LATER = enum.auto() - NEVER = enum.auto() - - -class BlockSpaceManager(ABC): - - @staticmethod - def get_block_space_manager_class(version: str): - version = version.lower() - - if version == "selfattn": - from vllm.core.block_manager import SelfAttnBlockSpaceManager - return SelfAttnBlockSpaceManager - - if version == "placeholder": - from vllm.core.placeholder_block_space_manager import ( - PlaceholderBlockSpaceManager) - return PlaceholderBlockSpaceManager - - raise ValueError(f"Unknown version {version=}") - - @abstractmethod - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - pass - - @abstractmethod - def allocate(self, seq_group: SequenceGroup) -> None: - pass - - @abstractmethod - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - pass - - @abstractmethod - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - pass - - @abstractmethod - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - pass - - @abstractmethod - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - pass - - @abstractmethod - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def free(self, seq: Sequence) -> None: - pass - - @abstractmethod - def get_block_table(self, seq: Sequence) -> List[int]: - pass - - @abstractmethod - def get_num_free_gpu_blocks(self) -> int: - pass - - @abstractmethod - def get_num_free_cpu_blocks(self) -> int: - pass - - @abstractmethod - def access_all_blocks_in_seq( - self, - seq: Sequence, - access_time: float, - ) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: - pass - - @abstractmethod - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - pass - - @abstractmethod - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. -1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for specified or all devices.""" - pass - - @abstractmethod - def get_num_cached_tokens(self, seq: Sequence) -> int: - pass - - @abstractmethod - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - pass \ No newline at end of file diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py deleted file mode 100644 index 679515924e85d..0000000000000 --- a/vllm/core/placeholder_block_space_manager.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Tuple - -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.sequence import Sequence, SequenceGroup -from vllm.utils import Device - - -class PlaceholderBlockSpaceManager(BlockSpaceManager): - """A version of BlockSpaceManager for use in environments - where block management is not required. - For example: pooling models or attention-free models like Mamba. 
- - This class provides the same interface as BlockSpaceManager, but its - methods perform no actions or return simple values like True in specific - actions. It's designed to be used in scenarios where the overhead of - block management is unnecessary, such as in an embedding environment. - """ - - def __init__( - self, - **kwargs, - ) -> None: - pass - - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - # Always return OK for dummy purposes - return AllocStatus.OK - - def allocate(self, seq_group: SequenceGroup) -> None: - # No actual allocation logic needed - pass - - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - return True - - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - return [] - - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - pass - - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - return AllocStatus.OK - - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - return None # type: ignore - - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - return True - - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - return None # type: ignore - - def free(self, seq: Sequence) -> None: - # No operation on free - return - - def get_block_table(self, seq: Sequence) -> List[int]: - return None # type: ignore - - def get_num_free_gpu_blocks(self) -> int: - return 1 - - def get_num_free_cpu_blocks(self) -> int: - return 1 - - def access_all_blocks_in_seq( - self, - seq: Sequence, - access_time: float, - ) -> None: - pass - - def get_common_computed_block_ids(self, - seq_group: List[Sequence]) -> List[int]: - return [] - - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - pass - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return -1 - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return True - - def get_num_cached_tokens(self, seq: Sequence) -> int: - return 0 - - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - return diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py deleted file mode 100644 index 92ebad778ea4b..0000000000000 --- a/vllm/core/scheduler.py +++ /dev/null @@ -1,2028 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import os -import random -import time -from collections import deque -from dataclasses import dataclass, field -from typing import Callable, Deque, Dict, Iterable, List, Optional -from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union - -from vllm.config import CacheConfig, SchedulerConfig -from vllm.config.lora import LoRAConfig -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sequence import (Sequence, SequenceData, SequenceGroup, - SequenceGroupBase, SequenceGroupMetadata, - SequenceGroupMetadataDelta, SequenceStage, - SequenceStatus) -from vllm.utils import Device, PyObjectCache - -logger = init_logger(__name__) - -# Test-only. If configured, decode is preempted with -# ARTIFICIAL_PREEMPTION_PROB% probability. 
-ENABLE_ARTIFICIAL_PREEMPT = bool( - os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa -ARTIFICIAL_PREEMPTION_PROB = 0.5 -ARTIFICIAL_PREEMPTION_MAX_CNT = 500 - - -class PreemptionMode(enum.Enum): - """Preemption modes. - - 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory - and swap them back in when the sequences are resumed. - 2. Recomputation: Discard the blocks of the preempted sequences and - recompute them when the sequences are resumed, treating the sequences as - new prompts. - """ - - SWAP = enum.auto() - RECOMPUTE = enum.auto() - - -@dataclass -class SchedulingBudget: - """The available slots for scheduling. - - TODO(sang): Right now, the budget is request_id-aware meaning it can ignore - budget update from the same request_id. It is because in normal scheduling - path, we update RUNNING num_seqs ahead of time, meaning it could be - updated more than once when scheduling RUNNING requests. Since this won't - happen if we only have chunked prefill scheduling, we can remove this - feature from the API when chunked prefill is enabled by default. - """ - - token_budget: int - max_num_seqs: int - _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) - _request_ids_num_curr_seqs: Set[str] = field(default_factory=set) - # Number of cached tokens in the batch. - _num_cached_tokens: int = 0 - # Number of actual non-cached tokens in the batch. - _num_batched_tokens: int = 0 - _num_curr_seqs: int = 0 - - def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int): - # We allow num_new_tokens to be 0 when the entire sequence has - # been cached. - assert num_new_tokens >= 0 - assert num_new_seqs != 0 - return (self.num_batched_tokens + num_new_tokens <= self.token_budget - and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs) - - def remaining_token_budget(self): - return self.token_budget - self.num_batched_tokens - - def add_num_batched_tokens(self, - req_id: str, - num_batched_tokens: int, - num_cached_tokens: int = 0): - if req_id in self._request_ids_num_batched_tokens: - return - assert num_cached_tokens >= 0 - assert num_batched_tokens >= 0 - - self._request_ids_num_batched_tokens.add(req_id) - self._num_batched_tokens += num_batched_tokens - self._num_cached_tokens += num_cached_tokens - - def subtract_num_batched_tokens(self, req_id: str, - num_batched_tokens: int): - if req_id in self._request_ids_num_batched_tokens: - self._request_ids_num_batched_tokens.remove(req_id) - self._num_batched_tokens -= num_batched_tokens - - def add_num_seqs(self, req_id: str, num_curr_seqs: int): - if req_id in self._request_ids_num_curr_seqs: - return - - self._request_ids_num_curr_seqs.add(req_id) - self._num_curr_seqs += num_curr_seqs - - def subtract_num_seqs(self, req_id: str, num_curr_seqs: int): - if req_id in self._request_ids_num_curr_seqs: - self._request_ids_num_curr_seqs.remove(req_id) - self._num_curr_seqs -= num_curr_seqs - - @property - def num_batched_tokens(self): - return self._num_batched_tokens - - @property - def num_curr_seqs(self): - return self._num_curr_seqs - - @property - def num_cached_tokens(self): - return self._num_cached_tokens - - -@dataclass -class ScheduledSequenceGroup: - # A sequence group that's scheduled. - seq_group: SequenceGroup - # The total chunk size (number of tokens) to process for next iteration. - # 1 for decoding. Same as prompt tokens for prefill, but if prefill is - # chunked, it can be smaller than that. 
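# Minimal sketch (illustrative only) of the admission check performed by
# SchedulingBudget.can_schedule() above: a request fits if it keeps both the
# batched-token count and the sequence count within their caps. The function
# below flattens the dataclass state into plain arguments for the example.
def fits_budget(num_batched_tokens: int, num_curr_seqs: int,
                num_new_tokens: int, num_new_seqs: int,
                token_budget: int, max_num_seqs: int) -> bool:
    # Zero new tokens is allowed (fully cached sequence), zero new seqs is not.
    assert num_new_tokens >= 0 and num_new_seqs > 0
    return (num_batched_tokens + num_new_tokens <= token_budget
            and num_curr_seqs + num_new_seqs <= max_num_seqs)


# A 2048-token budget with 1500 tokens already batched still admits a
# 512-token chunk while the sequence cap is not exhausted.
print(fits_budget(1500, 8, 512, 1, token_budget=2048, max_num_seqs=16))  # True
print(fits_budget(1500, 8, 600, 1, token_budget=2048, max_num_seqs=16))  # False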
- token_chunk_size: int - - -@dataclass -class SchedulerOutputs: - """The scheduling decision made from a scheduler.""" - - # Scheduled sequence groups. - scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup] - # Number of prefill groups scheduled. - num_prefill_groups: int - # Total number of batched tokens. - num_batched_tokens: int - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, int]] - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, int]] - # Blocks to copy. Source to dest block. - blocks_to_copy: List[Tuple[int, int]] - # Sequence groups that are going to be ignored. - ignored_seq_groups: List[SequenceGroup] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - # The number of requests in the running queue - running_queue_size: int - preempted: int - - def __post_init__(self): - # Swap in and swap out should never happen at the same time. - assert not (self.blocks_to_swap_in and self.blocks_to_swap_out) - - self.num_loras: int = len(self.lora_requests) - if self.num_loras > 0: - self._sort_by_lora_ids() - - def is_empty(self) -> bool: - # NOTE: We do not consider the ignored sequence groups. - return (not self.scheduled_seq_groups and not self.blocks_to_swap_in - and not self.blocks_to_swap_out and not self.blocks_to_copy) - - def _sort_by_lora_ids(self): - assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups) - - def key_fn(group: ScheduledSequenceGroup): - key = (group.seq_group.lora_int_id, group.seq_group.request_id) - if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups): - # Sort sequence groups so that all prefills come before all - # decodes as required by chunked prefill. - return (not group.seq_group.is_prefill(), *key) - return key - - self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, - key=key_fn) - - @property - def lora_requests(self) -> Set[LoRARequest]: - return { - g.seq_group.lora_request - for g in self.scheduled_seq_groups - if g.seq_group.lora_request is not None - } - - -@dataclass -class SchedulerRunningOutputs: - """The requests that are scheduled from a running queue. - - Could contain prefill (prefill that's chunked) or decodes. If there's not - enough memory, it can be preempted (for recompute) or swapped out. - """ - - # Selected sequences that are running and in a decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] - # Selected sequences that are running and in a prefill phase. - # I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] - # The preempted sequences. - preempted: List[SequenceGroup] - # Sequences that are swapped out. - swapped_out: List[SequenceGroup] - # The blocks to swap out. - blocks_to_swap_out: List[Tuple[int, int]] - # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - - # Optimization for fast-access to seq_group lists - decode_seq_groups_list: List[SequenceGroup] - prefill_seq_groups_list: List[SequenceGroup] - - @classmethod - def create_empty(cls) -> "SchedulerRunningOutputs": - return SchedulerRunningOutputs( - decode_seq_groups=[], - prefill_seq_groups=[], - preempted=[], - swapped_out=[], - blocks_to_swap_out=[], - blocks_to_copy=[], - num_lookahead_slots=0, - decode_seq_groups_list=[], - prefill_seq_groups_list=[], - ) - - -@dataclass -class SchedulerSwappedInOutputs: - """The requests that are scheduled from a swap queue. 
- - Could contain prefill (prefill that's chunked) or decodes. - """ - - # Selected sequences that are going to be swapped in and is in a - # decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] - # Selected sequences that are going to be swapped in and in a prefill - # phase. I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] - # The blocks to swap in. - blocks_to_swap_in: List[Tuple[int, int]] - # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - # Infeasible sequence groups. - infeasible_seq_groups: List[SequenceGroup] - - @classmethod - def create_empty(cls) -> "SchedulerSwappedInOutputs": - return SchedulerSwappedInOutputs( - decode_seq_groups=[], - prefill_seq_groups=[], - blocks_to_swap_in=[], - blocks_to_copy=[], - num_lookahead_slots=0, - infeasible_seq_groups=[], - ) - - -@dataclass -class SchedulerPrefillOutputs: - """The requests that are scheduled from a waiting queue. - - Could contain a fresh prefill requests or preempted requests that need - to be recomputed from scratch. - """ - - # Selected sequences for prefill. - seq_groups: List[ScheduledSequenceGroup] - # Ignored sequence groups. - ignored_seq_groups: List[SequenceGroup] - num_lookahead_slots: int - - @classmethod - def create_empty(cls) -> "SchedulerPrefillOutputs": - return SchedulerPrefillOutputs( - seq_groups=[], - ignored_seq_groups=[], - num_lookahead_slots=0, - ) - - -def seq_group_metadata_builder(): - return SequenceGroupMetadata(request_id="", - is_prompt=False, - seq_data={}, - sampling_params=None, - block_tables={}) - - -def scheduler_running_outputs_builder(): - return SchedulerRunningOutputs(decode_seq_groups=[], - prefill_seq_groups=[], - preempted=[], - swapped_out=[], - blocks_to_swap_out=[], - blocks_to_copy=[], - num_lookahead_slots=0, - prefill_seq_groups_list=[], - decode_seq_groups_list=[]) - - -def scheduled_seq_group_builder(): - return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup), - token_chunk_size=0) - # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0) - - -@dataclass -class PartialPrefillMetadata: - """Holds information about the partial prefills that are currently running - during a single iteration of the Scheduler. - When chunked prefill is enabled, we allow a certain number of seqs to be - partially prefilled during each iteration. Having multiple partial prefills - in flight allows us to minimize TTFT and avoid decode starvation in cases - where a single sequence group with a very large prompt blocks the queue for - too many iterations. - The number of long prefill requests is limited so that smaller - requests may jump the queue in front of them and get to the decode - phase faster. 
- """ - - # A minimum bound on the total number of prefills to be scheduled during - # this iteration - schedulable_prefills: int - - # The number of long prefill requests currently running - long_prefills: int - - scheduler_config: SchedulerConfig - - def can_schedule(self, seq_group: SequenceGroup) -> bool: - """When concurrent partial prefills are enabled, - we limit the number of long requests and only accept - shorter requests from the queue while running them - concurrently""" - return not (seq_group.first_seq.get_num_new_tokens() - > self.scheduler_config.long_prefill_token_threshold - and self.long_prefills - >= self.scheduler_config.max_long_partial_prefills - and self.scheduler_config.max_num_partial_prefills > 1) - - def maybe_increment_partial_prefills(self, - seq_group: SequenceGroup) -> None: - # When a new prefill is scheduled, we need to know if it is a - # long request - if (seq_group.first_seq.get_num_new_tokens() - > self.scheduler_config.long_prefill_token_threshold): - self.long_prefills += 1 - - @classmethod - def from_queues( - cls, - running: Deque[SequenceGroup], - waiting: Deque[SequenceGroup], - scheduler_config: SchedulerConfig, - ) -> "PartialPrefillMetadata": - """Create a PartialPrefillMetadata object from the current state of - the scheduler's queues. - This accounts for the currently running prefill requests, and peeks into - the waiting queue to see if there are more prefills to potentially be - scheduled during this iteration.""" - prefills = 0 - long_prefills = 0 - - waiting_long_prefills = 0 - - for sg in running: - if sg.first_seq.data.stage == SequenceStage.PREFILL: - prefills += 1 - if (sg.first_seq.get_num_new_tokens() - > scheduler_config.long_prefill_token_threshold): - long_prefills += 1 - - for sg in waiting: - # Don't bother looping through the rest of the queue if we know - # there are already at - # least max_partial_prefills requests to fill - if prefills >= scheduler_config.max_num_partial_prefills: - break - - # Don't count long requests from the waiting queue if we aren't - # going to schedule them anyway - if (sg.first_seq.get_num_new_tokens() - > scheduler_config.long_prefill_token_threshold): - if (long_prefills + waiting_long_prefills - >= scheduler_config.max_long_partial_prefills): - continue - waiting_long_prefills += 1 - prefills += 1 - - # NB: long_prefills and waiting_long_prefills are tracked separately. - # We don't account for the waiting requests here because we need to use - # this metadata to track how many have actually been scheduled. - return PartialPrefillMetadata( - schedulable_prefills=min( - prefills, scheduler_config.max_num_partial_prefills), - long_prefills=long_prefills, - scheduler_config=scheduler_config, - ) - - -class Scheduler: - - def __init__( - self, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - lora_config: Optional[LoRAConfig], - pipeline_parallel_size: int = 1, - output_proc_callback: Optional[Callable] = None, - ) -> None: - self.scheduler_config = scheduler_config - self.cache_config = cache_config - # Note for LoRA scheduling: the current policy is extremely - # simple and NOT fair. It can lead to starvation of some - # LoRAs. This should be improved in the future. 
- self.lora_config = lora_config - - version = "selfattn" - if (self.scheduler_config.runner_type == "pooling" - or self.cache_config.is_attention_free): - version = "placeholder" - - BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class( - version) - - num_gpu_blocks = cache_config.num_gpu_blocks - if num_gpu_blocks: - num_gpu_blocks //= pipeline_parallel_size - - num_cpu_blocks = cache_config.num_cpu_blocks - if num_cpu_blocks: - num_cpu_blocks //= pipeline_parallel_size - - # Create the block space manager. - self.block_manager = BlockSpaceManagerImpl( - block_size=self.cache_config.block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - sliding_window=self.cache_config.sliding_window, - enable_caching=self.cache_config.enable_prefix_caching, - ) - - # Sequence groups in the WAITING state. - # Contain new prefill or preempted requests. - self.waiting: Deque[SequenceGroup] = deque() - # Sequence groups in the RUNNING state. - # Contain decode requests. - self.running: Deque[SequenceGroup] = deque() - # Sequence groups in the SWAPPED state. - # Contain decode requests that are swapped out. - self.swapped: Deque[SequenceGroup] = deque() - # Sequence groups finished requests ids since last step iteration. - # It lets the model know that any state associated with these requests - # can and must be released after the current step. - # This is used to evict the finished requests from the Mamba cache. - self._finished_requests_ids: List[str] = list() - # Time at previous scheduling step - self.prev_time = 0.0 - # Did we schedule a prompt at previous step? - self.prev_prompt = False - # Latency of the last prompt step - self.last_prompt_latency = 0.0 - # preemption mode, RECOMPUTE or SWAP - self.user_specified_preemption_mode = scheduler_config.preemption_mode - - # The following field is test-only. It is used to inject artificial - # preemption. - self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT - self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT - if self.enable_artificial_preemption - else 0) - self.num_cumulative_preemption: int = 0 - - # Used to cache python objects - self._seq_group_metadata_cache: List[PyObjectCache] = [] - self._scheduler_running_outputs_cache: List[PyObjectCache] = [] - self._scheduled_seq_group_cache: List[PyObjectCache] = [] - - # For async output processing, we need to swap cache buffers between - # iterations. I.e. since the output processing is lagged one step, - # we cannot reuse the cached objects immediately when the schedule() - # is called again, but only when schedule() is called the second time. - self.output_proc_callback = output_proc_callback - self.use_async_output_proc = self.output_proc_callback is not None - self.num_cache_iters = 2 if self.use_async_output_proc else 1 - - self.cache_id = 0 - for i in range(self.num_cache_iters): - self._seq_group_metadata_cache.append( - PyObjectCache(seq_group_metadata_builder)) - self._scheduler_running_outputs_cache.append( - PyObjectCache(scheduler_running_outputs_builder)) - self._scheduled_seq_group_cache.append( - PyObjectCache(scheduled_seq_group_builder)) - - # For async postprocessor, the extra decode run cannot be done - # when the request reaches max_model_len. 
In this case, the request - # will be stopped during schedule() call and added to this stop list - # for processing and deallocation by the free_finished_seq_groups() - self._async_stopped: List[SequenceGroup] = [] - - # List with the chunk sizes to hand out to each sequence depending - # on how many partial prefills are running. This is slightly faster than - # running an integer division every time a prefill is scheduled. - # This splits the budget evenly among all prefills. - self.partial_prefill_budget_lookup_list = [0] * ( - self.scheduler_config.max_num_partial_prefills + 1) - self.partial_prefill_budget_lookup_list[0] = ( - scheduler_config.max_num_batched_tokens) - for i in range(1, self.scheduler_config.max_num_partial_prefills + 1): - self.partial_prefill_budget_lookup_list[i] = ( - scheduler_config.max_num_batched_tokens // i) - - @property - def next_cache_id(self): - return (self.cache_id + 1) % self.num_cache_iters - - @property - def lora_enabled(self) -> bool: - return bool(self.lora_config) - - @property - def num_decoding_tokens_per_seq(self) -> int: - """The number of new tokens.""" - return 1 - - def add_seq_group(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the waiting queue. - self.waiting.append(seq_group) - - def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the running queue. - # Only for testing purposes. - self.running.append(seq_group) - - def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the swapped queue. - # Only for testing purposes. - self.swapped.append(seq_group) - - def abort_seq_group( - self, - request_id: Union[str, Iterable[str]], - seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None, - ) -> None: - """Aborts a sequence group with the given ID. - - Check if the sequence group with the given ID - is present in any of the state queue. - If present, remove the sequence group from the state queue. - Also, if any of the sequences in the sequence group is not finished, - free the sequence with status `FINISHED_ABORTED`. - Otherwise, do nothing. - - Args: - request_id: The ID(s) of the sequence group to abort. - seq_id_to_seq_group: helper for groups with n>1 - """ - if isinstance(request_id, str): - request_id = (request_id, ) - request_ids = set(request_id) - seq_id_to_seq_group = seq_id_to_seq_group or {} - for state_queue in [self.waiting, self.running, self.swapped]: - aborted_groups: List[SequenceGroup] = [] - for seq_group in state_queue: - # When n>1, seq_group.request_id looks like - # foo_parallel_sample_0, while request_ids is just foo, and we - # should resolve it as real_request_id to match. - if seq_group.request_id in seq_id_to_seq_group: - real_request_id = seq_id_to_seq_group[ - seq_group.request_id].group_id - else: - real_request_id = seq_group.request_id - if real_request_id in request_ids: - # Appending aborted group into pending list. - aborted_groups.append(seq_group) - # We can't remove real_request_id in request_ids here, - # because there may be other seq groups sharing the same - # real_request_id - for aborted_group in aborted_groups: - # Remove the sequence group from the state queue. - state_queue.remove(aborted_group) - # Remove the aborted request from the Mamba cache. 
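# Illustrative reconstruction of how the partial_prefill_budget_lookup_list
# built in Scheduler.__init__ above splits max_num_batched_tokens evenly across
# concurrent partial prefills, precomputed so the scheduler avoids an integer
# division every time a prefill is scheduled. The helper name is hypothetical.
from typing import List


def build_partial_prefill_budgets(max_num_batched_tokens: int,
                                  max_num_partial_prefills: int) -> List[int]:
    budgets = [0] * (max_num_partial_prefills + 1)
    budgets[0] = max_num_batched_tokens
    for i in range(1, max_num_partial_prefills + 1):
        budgets[i] = max_num_batched_tokens // i
    return budgets


# With a 2048-token batch budget and up to 4 partial prefills, each prefill
# gets 2048, 1024, 682, or 512 tokens depending on how many are in flight.
print(build_partial_prefill_budgets(2048, 4))  # [2048, 2048, 1024, 682, 512]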
- self._finished_requests_ids.append(aborted_group.request_id) - for seq in aborted_group.get_seqs(): - if seq.is_finished(): - continue - seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq) - if aborted_group.request_id in seq_id_to_seq_group: - del seq_id_to_seq_group[aborted_group.request_id] - - self._free_seq_group_cross_attn_blocks(aborted_group) - - def _free_seq_group_cross_attn_blocks( - self, - seq_group: SequenceGroup, - ) -> None: - """ - Free a sequence group from a cross-attention block table. - Has no effect on decoder-only models. - """ - if seq_group.is_encoder_decoder(): - self.block_manager.free_cross(seq_group) - - def has_unfinished_seqs(self) -> bool: - return (len(self.waiting) != 0 or len(self.running) != 0 - or len(self.swapped) != 0) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return self.block_manager.get_prefix_cache_hit_rate(device) - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return self.block_manager.reset_prefix_cache(device) - - def get_num_unfinished_seq_groups(self) -> int: - return len(self.waiting) + len(self.running) + len(self.swapped) - - def get_and_reset_finished_requests_ids(self) -> List[str]: - """Flushes the list of request ids of previously finished seq_groups.""" - finished_requests_ids = self._finished_requests_ids - self._finished_requests_ids = list() - return finished_requests_ids - - def _schedule_running( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> SchedulerRunningOutputs: - """Schedule sequence groups that are running. - - Running queue should include decode and chunked prefill requests. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any decodes are preempted. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any decodes are preempted. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - partial_prefill_metadata: information about the partial prefills - that are currently running - - Returns: - SchedulerRunningOutputs. - """ - ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[ - self.cache_id].get_object() - ret.blocks_to_swap_out.clear() - ret.blocks_to_copy.clear() - ret.decode_seq_groups.clear() - ret.prefill_seq_groups.clear() - ret.preempted.clear() - ret.swapped_out.clear() - - ret.num_lookahead_slots = self._get_num_lookahead_slots( - is_prefill=False, enable_chunking=enable_chunking) - - ret.decode_seq_groups_list.clear() - ret.prefill_seq_groups_list.clear() - - # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out - blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy - - decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups - prefill_seq_groups: List[ - ScheduledSequenceGroup] = ret.prefill_seq_groups - preempted: List[SequenceGroup] = ret.preempted - swapped_out: List[SequenceGroup] = ret.swapped_out - - running_queue = self.running - assert len(self._async_stopped) == 0 - while running_queue: - seq_group = running_queue[0] - # We discard the cached tokens info here because we don't need it - # for running sequence: - # 1. 
If a sequence is running with chunked prefill, the cached - # tokens info was already used for the first prefill. - # 2. If a sequence is running with non-chunked prefill, then - # there it's a decoding sequence, and the cached tokens info is - # irrelevant. - num_uncached_new_tokens, _ = \ - self._get_num_new_uncached_and_cached_tokens( - seq_group, - SequenceStatus.RUNNING, - enable_chunking, - budget, - partial_prefill_metadata, - ) - - num_running_tokens = num_uncached_new_tokens - if num_running_tokens == 0: - # No budget => Stop - break - - running_queue.popleft() - - # With async postprocessor, an extra decode run is done - # to process the final tokens. The check below avoids this extra - # decode run when the model max len is reached, in order to avoid - # a memory overflow. - if (self.use_async_output_proc and seq_group.seqs[0].get_len() - > self.scheduler_config.max_model_len): - self._async_stopped.append(seq_group) - continue - - # NOTE(woosuk): Preemption happens only when there is no available - # slot to keep all the sequence groups in the RUNNING state. - while not self._can_append_slots(seq_group, enable_chunking): - budget.subtract_num_batched_tokens(seq_group.request_id, - num_running_tokens) - num_running_seqs = seq_group.get_max_num_running_seqs() - budget.subtract_num_seqs(seq_group.request_id, - num_running_seqs) - - if (curr_loras is not None and seq_group.lora_int_id > 0 - and seq_group.lora_int_id in curr_loras): - curr_loras.remove(seq_group.lora_int_id) - - # Determine victim sequence - cont_loop = True - if running_queue: - # Preempt the lowest-priority sequence group. - victim_seq_group = running_queue.pop() - else: - # No other sequence group can be preempted. - # Preempt the current sequence group. - # Note: This is also where we stop this loop - # (since there is nothing else to preempt) - victim_seq_group = seq_group - cont_loop = False - - # With async postprocessor, before preempting a sequence - # we need to ensure it has no pending async postprocessor - do_preempt = True - if self.use_async_output_proc: - assert self.output_proc_callback is not None - self.output_proc_callback( - request_id=victim_seq_group.request_id) - - # It may be that the async pending "victim_seq_group" - # becomes finished, in which case we simply free it. - if victim_seq_group.is_finished(): - self._free_finished_seq_group(victim_seq_group) - do_preempt = False - - # Do preemption - if do_preempt: - preempted_mode = self._preempt(victim_seq_group, - blocks_to_swap_out) - if preempted_mode == PreemptionMode.RECOMPUTE: - preempted.append(victim_seq_group) - else: - swapped_out.append(victim_seq_group) - - if not cont_loop: - break - else: - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - is_prefill = seq_group.is_prefill() - - scheduled_seq_group: ScheduledSequenceGroup = ( - self._scheduled_seq_group_cache[ - self.cache_id].get_object()) - scheduled_seq_group.seq_group = seq_group - if is_prefill: - scheduled_seq_group.token_chunk_size = num_running_tokens - prefill_seq_groups.append(scheduled_seq_group) - ret.prefill_seq_groups_list.append(seq_group) - else: - scheduled_seq_group.token_chunk_size = 1 - decode_seq_groups.append(scheduled_seq_group) - ret.decode_seq_groups_list.append(seq_group) - - budget.add_num_batched_tokens(seq_group.request_id, - num_running_tokens) - # OPTIMIZATION: Note that get_max_num_running_seqs is - # expensive. 
For the default scheduling chase where - # enable_chunking is False, num_seqs are updated before running - # this method, so we don't have to update it again here. - if enable_chunking: - num_running_seqs = seq_group.get_max_num_running_seqs() - budget.add_num_seqs(seq_group.request_id, num_running_seqs) - if curr_loras is not None and seq_group.lora_int_id > 0: - curr_loras.add(seq_group.lora_int_id) - - self._scheduler_running_outputs_cache[self.next_cache_id].reset() - self._scheduled_seq_group_cache[self.next_cache_id].reset() - - return ret - - def _schedule_swapped( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - ) -> SchedulerSwappedInOutputs: - """Schedule sequence groups that are swapped out. - - It schedules swapped requests as long as it fits `budget` and - curr_loras <= max_lora from the scheduling config. The input arguments - `budget` and `curr_loras` are updated based on scheduled seq_groups. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are swapped in. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any requests are swapped in. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - - Returns: - SchedulerSwappedInOutputs. - """ - # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_in: List[Tuple[int, int]] = [] - blocks_to_copy: List[Tuple[int, int]] = [] - decode_seq_groups: List[ScheduledSequenceGroup] = [] - prefill_seq_groups: List[ScheduledSequenceGroup] = [] - infeasible_seq_groups: List[SequenceGroup] = [] - - swapped_queue = self.swapped - - leftover_swapped: Deque[SequenceGroup] = deque() - while swapped_queue: - seq_group = swapped_queue[0] - - # If the sequence group cannot be swapped in, stop. - is_prefill = seq_group.is_prefill() - alloc_status = self.block_manager.can_swap_in( - seq_group, - self._get_num_lookahead_slots(is_prefill, enable_chunking)) - if alloc_status == AllocStatus.LATER: - break - elif alloc_status == AllocStatus.NEVER: - logger.warning( - "Failing the request %s because there's not enough kv " - "cache blocks to run the entire sequence.", - seq_group.request_id, - ) - for seq in seq_group.get_seqs(): - seq.status = SequenceStatus.FINISHED_IGNORED - infeasible_seq_groups.append(seq_group) - swapped_queue.popleft() - continue - - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - assert curr_loras is not None - assert self.lora_config is not None - if (lora_int_id > 0 and (lora_int_id not in curr_loras) - and len(curr_loras) >= self.lora_config.max_loras): - # We don't have a space for another LoRA, so - # we ignore this request for now. - leftover_swapped.appendleft(seq_group) - swapped_queue.popleft() - continue - - # The total number of sequences in the RUNNING state should not - # exceed the maximum number of sequences. 
- num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens_uncached, num_new_tokens_cached = ( - self._get_num_new_uncached_and_cached_tokens( - seq_group, SequenceStatus.SWAPPED, enable_chunking, - budget)) - - if num_new_tokens_uncached == 0 or not budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - ): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.SWAPPED) - break - - if lora_int_id > 0 and curr_loras is not None: - curr_loras.add(lora_int_id) - swapped_queue.popleft() - self._swap_in(seq_group, blocks_to_swap_in) - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - if is_prefill: - prefill_seq_groups.append( - ScheduledSequenceGroup( - seq_group, - token_chunk_size=num_new_tokens_uncached + - num_new_tokens_cached, - )) - else: - decode_seq_groups.append( - ScheduledSequenceGroup(seq_group, token_chunk_size=1)) - budget.add_num_batched_tokens( - seq_group.request_id, - num_batched_tokens=num_new_tokens_uncached, - num_cached_tokens=num_new_tokens_cached, - ) - budget.add_num_seqs(seq_group.request_id, num_new_seqs) - - swapped_queue.extendleft(leftover_swapped) - - return SchedulerSwappedInOutputs( - decode_seq_groups=decode_seq_groups, - prefill_seq_groups=prefill_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_copy=blocks_to_copy, - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=False, enable_chunking=enable_chunking), - infeasible_seq_groups=infeasible_seq_groups, - ) - - def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: - if self.scheduler_config.chunked_prefill_enabled: - prompt_limit = self.scheduler_config.max_model_len - else: - prompt_limit = min( - self.scheduler_config.max_model_len, - self.scheduler_config.max_num_batched_tokens, - ) - - # Model is fine tuned with long context. Return the fine tuned max_len. - if seq_group.lora_request and seq_group.lora_request.long_lora_max_len: - assert prompt_limit <= seq_group.lora_request.long_lora_max_len - return seq_group.lora_request.long_lora_max_len - else: - return prompt_limit - - def _get_priority(self, - seq_group: SequenceGroup) -> Tuple[Optional[int], float]: - """Get the priority of the sequence group. - Highest preference to user-defined priority, followed by arrival time. - Args: - seq_group: The sequence group input. - Returns: - The priority of the sequence group. - """ - return seq_group.priority, seq_group.arrival_time - - def _schedule_priority_preemption( - self, - budget: SchedulingBudget, - ) -> int: - """Sorts waiting and running queue. Also, force preempt requests - from the running queue if their priority is lower. - Priority-based preemption is used with the priority policy. - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are scheduled. - Returns: - A count of priority-based preemptions. 
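# Sketch of the prompt-length cap computed by _get_prompt_limit() above: with
# chunked prefill the whole model context is allowed, otherwise the prompt must
# also fit into a single batch, and a long-context LoRA can raise the cap.
# The standalone function and its arguments are illustrative only.
from typing import Optional


def prompt_limit(max_model_len: int, max_num_batched_tokens: int,
                 chunked_prefill_enabled: bool,
                 long_lora_max_len: Optional[int] = None) -> int:
    limit = (max_model_len if chunked_prefill_enabled else
             min(max_model_len, max_num_batched_tokens))
    if long_lora_max_len is not None:
        # The fine-tuned long-context limit is expected to dominate.
        assert limit <= long_lora_max_len
        return long_lora_max_len
    return limit


print(prompt_limit(8192, 2048, chunked_prefill_enabled=False))  # 2048
print(prompt_limit(8192, 2048, chunked_prefill_enabled=True))   # 8192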
- """ - - waiting_queue = self.waiting - - running_queue = deque(sorted(self.running, key=self._get_priority)) - - blocks_to_swap_out: List[Tuple[int, int]] = [] - force_preemption_count = 0 - - if waiting_queue: - seq_group = waiting_queue.popleft() - num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens_uncached, _ = \ - self._get_num_new_uncached_and_cached_tokens( - seq_group, SequenceStatus.WAITING, False, budget) - - # Only preempt if priority inversion exists - while running_queue and self._get_priority( - running_queue[-1]) > self._get_priority(seq_group): - # Only preempt if waiting sequence cannot be allocated - can_allocate = self.block_manager.can_allocate(seq_group) - if (num_new_tokens_uncached > 0 - and can_allocate == AllocStatus.OK - and budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - )): - break - - # Adjust budget to remove the victim sequence group - vseq_group = running_queue.pop() - num_running_tokens_uncached, _ = ( - self._get_num_new_uncached_and_cached_tokens( - vseq_group, SequenceStatus.RUNNING, False, budget)) - budget.subtract_num_batched_tokens( - vseq_group.request_id, num_running_tokens_uncached) - num_running_seqs = vseq_group.get_max_num_running_seqs() - budget.subtract_num_seqs(vseq_group.request_id, - num_running_seqs) - - # Preempt out the victim sequence group - self._preempt(vseq_group, blocks_to_swap_out) - waiting_queue.appendleft(vseq_group) - force_preemption_count += 1 - # Put the sequence back into the waiting queue - waiting_queue.appendleft(seq_group) - - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - - waiting_queue = deque(sorted(waiting_queue, key=self._get_priority)) - - self.waiting = waiting_queue - self.running = running_queue - return force_preemption_count - - def _schedule_prefills( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> SchedulerPrefillOutputs: - """Schedule sequence groups that are in prefill stage. - - Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE - as a new prefill (that starts from beginning -> most recently generated - tokens). - - It schedules waiting requests as long as it fits `budget` and - curr_loras <= max_lora from the scheduling config. The input arguments - `budget` and `curr_loras` are updated based on scheduled seq_groups. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are scheduled. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any requests are scheduled. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - partial_prefill_metadata: information about the partial prefills - that are currently running - - Returns: - SchedulerPrefillOutputs. 
- """ - if budget.remaining_token_budget() == 0: - # Do nothing: Can't add any more prefill anyway - return SchedulerPrefillOutputs( - seq_groups=[], - ignored_seq_groups=[], - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=True, enable_chunking=enable_chunking), - ) - ignored_seq_groups: List[SequenceGroup] = [] - seq_groups: List[ScheduledSequenceGroup] = [] - using_prompt_embeds: bool = False - - waiting_queue = self.waiting - - leftover_waiting_sequences: Deque[SequenceGroup] = deque() - while self._passed_delay(time.time()) and waiting_queue: - seq_group = waiting_queue[0] - - waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) - assert len(waiting_seqs) == 1, ( - "Waiting sequence group should have only one prompt " - "sequence.") - if (partial_prefill_metadata is not None - and not partial_prefill_metadata.can_schedule(seq_group)): - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - num_new_tokens_uncached, num_new_tokens_cached = ( - self._get_num_new_uncached_and_cached_tokens( - seq_group, - SequenceStatus.WAITING, - enable_chunking, - budget, - partial_prefill_metadata=partial_prefill_metadata, - )) - num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached - - if not enable_chunking: - num_prompt_tokens = waiting_seqs[0].get_len() - assert num_new_tokens == num_prompt_tokens - - prompt_limit = self._get_prompt_limit(seq_group) - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.FINISHED_IGNORED) - ignored_seq_groups.append(seq_group) - waiting_queue.popleft() - continue - - num_lookahead_slots: int = 0 - - # If the sequence group cannot be allocated, stop. - can_allocate = self.block_manager.can_allocate( - seq_group, num_lookahead_slots=num_lookahead_slots) - if can_allocate == AllocStatus.LATER: - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - elif can_allocate == AllocStatus.NEVER: - logger.warning( - "Input prompt (%d tokens) + lookahead slots (%d) is " - "too long and exceeds the capacity of block_manager", - num_new_tokens, - num_lookahead_slots, - ) - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.FINISHED_IGNORED) - ignored_seq_groups.append(seq_group) - waiting_queue.popleft() - continue - - # We cannot mix sequence groups that use prompt embeds and - # those that do not. - if len(seq_groups) == 0: - using_prompt_embeds = seq_group.uses_prompt_embeds() - if using_prompt_embeds != seq_group.uses_prompt_embeds(): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - assert curr_loras is not None - assert self.lora_config is not None - if (self.lora_enabled and lora_int_id > 0 - and lora_int_id not in curr_loras - and len(curr_loras) >= self.lora_config.max_loras): - # We don't have a space for another LoRA, so - # we ignore this request for now. 
- self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - - if (budget.num_batched_tokens - >= self.scheduler_config.max_num_batched_tokens): - # We've reached the budget limit - since there might be - # continuous prefills in the running queue, we should break - # to avoid scheduling any new prefills. - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - - num_new_seqs = seq_group.get_max_num_running_seqs() - if num_new_tokens_uncached == 0 or not budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - ): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - - # Can schedule this request. - if curr_loras is not None and lora_int_id > 0: - curr_loras.add(lora_int_id) - waiting_queue.popleft() - self._allocate_and_set_running(seq_group) - - if partial_prefill_metadata is not None: - partial_prefill_metadata.maybe_increment_partial_prefills( - seq_group) - - seq_groups.append( - ScheduledSequenceGroup(seq_group=seq_group, - token_chunk_size=num_new_tokens)) - budget.add_num_batched_tokens( - seq_group.request_id, - num_batched_tokens=num_new_tokens_uncached, - num_cached_tokens=num_new_tokens_cached, - ) - budget.add_num_seqs(seq_group.request_id, num_new_seqs) - - # Queue requests that couldn't be scheduled. - waiting_queue.extendleft(leftover_waiting_sequences) - if len(seq_groups) > 0: - self.prev_prompt = True - - return SchedulerPrefillOutputs( - seq_groups=seq_groups, - ignored_seq_groups=ignored_seq_groups, - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=True, enable_chunking=enable_chunking), - ) - - def _schedule_default(self) -> SchedulerOutputs: - """Schedule queued requests. - - The current policy is designed to optimize the throughput. First, - it batches as many prefill requests as possible. And it schedules - decodes. If there's a pressure on GPU memory, decode requests can - be swapped or preempted. - """ - # Include running requests to the budget. - budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) - # Make sure we include num running seqs before scheduling prefill, - # so that we don't schedule beyond max_num_seqs for prefill. - for seq_group in self.running: - budget.add_num_seqs(seq_group.request_id, - seq_group.get_max_num_running_seqs()) - curr_loras = (set( - seq_group.lora_int_id for seq_group in self.running - if seq_group.lora_int_id > 0) if self.lora_enabled else None) - - prefills = SchedulerPrefillOutputs.create_empty() - running_scheduled = SchedulerRunningOutputs.create_empty() - swapped_in = SchedulerSwappedInOutputs.create_empty() - - # If any requests are swapped, prioritized swapped requests. - if not self.swapped: - prefills = self._schedule_prefills(budget, - curr_loras, - enable_chunking=False) - - if len(prefills.seq_groups - ) == 0 and self.scheduler_config.policy == "priority": - self._schedule_priority_preemption(budget) - - # Don't schedule decodes if prefills are scheduled. - # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running - # only contains decode requests, not chunked prefills. - if len(prefills.seq_groups) == 0: - running_scheduled = self._schedule_running(budget, - curr_loras, - enable_chunking=False) - - # If any sequence group is preempted, do not swap in any sequence - # group. 
because it means there's no slot for new running requests. - if (len(running_scheduled.preempted) + - len(running_scheduled.swapped_out) == 0): - swapped_in = \ - self._schedule_swapped(budget, curr_loras) - - assert (budget.num_batched_tokens - <= self.scheduler_config.max_num_batched_tokens) - assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs - - # Update waiting requests. - self.waiting.extendleft(running_scheduled.preempted) - # Update new running requests. - if len(prefills.seq_groups) > 0: - self.running.extend([s.seq_group for s in prefills.seq_groups]) - - self.running.extend(running_scheduled.decode_seq_groups_list) - - if len(swapped_in.decode_seq_groups) > 0: - self.running.extend( - [s.seq_group for s in swapped_in.decode_seq_groups]) - - # Update swapped requests. - self.swapped.extend(running_scheduled.swapped_out) - preempted = len(running_scheduled.preempted) + len( - running_scheduled.swapped_out) - - # There should be no prefill from running queue because this policy - # doesn't allow chunked prefills. - assert len(running_scheduled.prefill_seq_groups) == 0 - assert len(swapped_in.prefill_seq_groups) == 0 - - # Merge lists - num_prefill_groups = len(prefills.seq_groups) - ignored_seq_groups_for_embeds = list[SequenceGroup]() - if num_prefill_groups > 0: - scheduled_seq_groups = prefills.seq_groups - scheduled_seq_groups.extend(running_scheduled.decode_seq_groups) - ignored_seq_groups_for_embeds.clear() - else: - scheduled_seq_groups = running_scheduled.decode_seq_groups - if len(scheduled_seq_groups) > 0: - using_prompt_embeds = scheduled_seq_groups[ - 0].seq_group.uses_prompt_embeds() - ignored_seq_groups_for_embeds.clear() - indices_ignored = list[int]() - for i, schedule_seq_group in enumerate(scheduled_seq_groups): - if using_prompt_embeds !=\ - schedule_seq_group.seq_group.uses_prompt_embeds(): - ignored_seq_groups_for_embeds.append( - schedule_seq_group.seq_group) - indices_ignored.append(i) - if len(ignored_seq_groups_for_embeds) > 0: - scheduled_seq_groups = [ - group for i, group in enumerate(scheduled_seq_groups) - if i not in indices_ignored - ] - else: - ignored_seq_groups_for_embeds.clear() - - scheduled_seq_groups.extend(swapped_in.decode_seq_groups) - - blocks_to_copy = running_scheduled.blocks_to_copy - blocks_to_copy.extend(swapped_in.blocks_to_copy) - - ignored_seq_groups = prefills.ignored_seq_groups - ignored_seq_groups.extend(ignored_seq_groups_for_embeds) - ignored_seq_groups.extend(swapped_in.infeasible_seq_groups) - - return SchedulerOutputs( - scheduled_seq_groups=scheduled_seq_groups, - num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens + - budget.num_cached_tokens, - blocks_to_swap_in=swapped_in.blocks_to_swap_in, - blocks_to_swap_out=running_scheduled.blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ignored_seq_groups=ignored_seq_groups, - num_lookahead_slots=running_scheduled.num_lookahead_slots, - running_queue_size=len(self.running), - preempted=preempted, - ) - - def _schedule_chunked_prefill(self) -> SchedulerOutputs: - """Schedule queued requests. - - Chunked prefill allows to chunk prefill requests, batch them together - with decode requests. This policy 1. schedule as many decoding requests - as possible. 2. schedule chunked prefill requests that are not - finished. 3. schedule swapped request. 4. schedule new prefill - requests. 
- - The policy can sustain the high GPU utilization because it can put - prefill and decodes requests to the same batch, while it improves - inter token latency because decodes requests don't need to be blocked - by prefill requests. - """ - budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) - curr_loras: Set[int] = set() - - prefills = SchedulerPrefillOutputs.create_empty() - swapped_in = SchedulerSwappedInOutputs.create_empty() - - # Create partial prefill metadata - partial_prefill_metadata = PartialPrefillMetadata.from_queues( - running=self.running, - waiting=self.waiting, - scheduler_config=self.scheduler_config, - ) - - # Decoding should be always scheduled first by fcfs. - running_scheduled = self._schedule_running( - budget, - curr_loras, - enable_chunking=True, - partial_prefill_metadata=partial_prefill_metadata, - ) - - # Schedule swapped out requests. - # If preemption happens, it means we don't have space for swap-in. - if len(running_scheduled.preempted) + len( - running_scheduled.swapped_out) == 0: - swapped_in = self._schedule_swapped(budget, curr_loras) - - prefills = self._schedule_prefills( - budget, - curr_loras, - enable_chunking=True, - partial_prefill_metadata=partial_prefill_metadata, - ) - - assert (budget.num_batched_tokens - <= self.scheduler_config.max_num_batched_tokens) - assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs - - # Update waiting requests. - self.waiting.extendleft(running_scheduled.preempted) - - # Update new running requests. - # By default, vLLM scheduler prioritizes prefills. - # Once chunked prefill is enabled, - # the policy is changed to prioritize decode requests. - self.running.extend( - [s.seq_group for s in swapped_in.decode_seq_groups]) - self.running.extend( - [s.seq_group for s in swapped_in.prefill_seq_groups]) - self.running.extend( - [s.seq_group for s in running_scheduled.decode_seq_groups]) - # Because multiple prefills may be running concurrently, we need to - # make sure that prefills which are scheduled to finish are listed - # before those that won't. This is so that on the next scheduling - # iteration when they have transitioned to the decode stage, they are - # properly prioritized over sequences that are still in the prefill - # stage. - self.running.extend( - self._order_finishing_prefills_first( - running_scheduled.prefill_seq_groups)) - self.running.extend([s.seq_group for s in prefills.seq_groups]) - - # Update swapped requests. - self.swapped.extend(running_scheduled.swapped_out) - # Put prefills first due to Attention backend ordering assumption. 
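For intuition about the decode-first policy described above, here is a toy calculation (all numbers are hypothetical) of how a single step can serve every running decode and still spend the leftover token budget on a prefill chunk.

```python
# Toy illustration of mixing decodes and a prefill chunk in one chunked-prefill
# step under a single token budget (all numbers are made up).
token_budget = 512
num_running_decodes = 8          # each decode consumes exactly one new token
prefill_remaining = 3000         # uncomputed prompt tokens of a waiting request

decode_tokens = num_running_decodes * 1
prefill_chunk = min(prefill_remaining, token_budget - decode_tokens)

print(decode_tokens, prefill_chunk)   # 8 504
# At this budget the prefill needs about six more steps to finish its prompt,
# but the decodes keep emitting a token every step instead of waiting behind it.
```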
- scheduled_seq_groups = (prefills.seq_groups + - running_scheduled.prefill_seq_groups + - swapped_in.prefill_seq_groups + - running_scheduled.decode_seq_groups + - swapped_in.decode_seq_groups) - num_prefill_groups = (len(prefills.seq_groups) + - len(swapped_in.prefill_seq_groups) + - len(running_scheduled.prefill_seq_groups)) - return SchedulerOutputs( - scheduled_seq_groups=scheduled_seq_groups, - num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens + - budget.num_cached_tokens, - blocks_to_swap_in=swapped_in.blocks_to_swap_in, - blocks_to_swap_out=running_scheduled.blocks_to_swap_out, - blocks_to_copy=running_scheduled.blocks_to_copy + - swapped_in.blocks_to_copy, - ignored_seq_groups=prefills.ignored_seq_groups + - swapped_in.infeasible_seq_groups, - num_lookahead_slots=0, - running_queue_size=len(self.running), - preempted=(len(running_scheduled.preempted) + - len(running_scheduled.swapped_out)), - ) - - def _order_finishing_prefills_first( - self, scheduled_prefill_seqs: List[ScheduledSequenceGroup] - ) -> List[SequenceGroup]: - """Returns a list of prefilling SequenceGroups where sequences that are - scheduled to finish prefilling are listed first""" - finishing = [ - s.seq_group for s in scheduled_prefill_seqs - if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size - ] - not_finishing = [ - s.seq_group for s in scheduled_prefill_seqs - if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size - ] - return finishing + not_finishing - - def _schedule(self) -> SchedulerOutputs: - """Schedule queued requests.""" - if self.scheduler_config.chunked_prefill_enabled: - return self._schedule_chunked_prefill() - else: - return self._schedule_default() - - def _can_append_slots(self, seq_group: SequenceGroup, - enable_chunking: bool) -> bool: - """Determine whether or not we have enough space in the KV cache to - continue generation of the sequence group. - """ - # It is True only for testing case to trigger artificial preemption. - if (self.enable_artificial_preemption - and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB - and self.artificial_preempt_cnt > 0): - self.artificial_preempt_cnt -= 1 - return False - - is_prefill = seq_group.is_prefill() - num_lookahead_slots = self._get_num_lookahead_slots( - is_prefill, enable_chunking) - - return self.block_manager.can_append_slots( - seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) - - def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: - # async_output_proc is allowed only when we have a single sequence - # in the sequence group - no_single_seq = seq_group.sampling_params is None or ( - seq_group.sampling_params.n == 1) - return no_single_seq - - def schedule( - self - ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]: - # Schedule sequence groups. - # This function call changes the internal states of the scheduler - # such as self.running, self.swapped, and self.waiting. - scheduler_start_time = time.perf_counter() - - scheduler_outputs: SchedulerOutputs = self._schedule() - now = time.time() - - if not self.cache_config.enable_prefix_caching: - common_computed_block_nums = [] - - allow_async_output_proc: bool = self.use_async_output_proc - - # Create input data structures. 
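The `_order_finishing_prefills_first` helper shown a little earlier is just a stable partition: prefills whose scheduled chunk covers all of their remaining uncomputed tokens are listed first. A standalone sketch over plain tuples follows; the flat `(request_id, uncomputed_tokens, token_chunk_size)` shape is an assumption made for the example, not the real `ScheduledSequenceGroup`.

```python
# Stable partition: prefills that finish their prompt this step come first,
# the still-partial ones after (tuple layout is a simplification).
def order_finishing_prefills_first(scheduled):
    finishing = [s for s in scheduled if s[1] == s[2]]
    not_finishing = [s for s in scheduled if s[1] != s[2]]
    return finishing + not_finishing


scheduled = [("a", 800, 512), ("b", 512, 512), ("c", 100, 100)]
print(order_finishing_prefills_first(scheduled))
# [('b', 512, 512), ('c', 100, 100), ('a', 800, 512)]
```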
- seq_group_metadata_list: List[SequenceGroupMetadata] = [] - for i, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - seq_group = scheduled_seq_group.seq_group - token_chunk_size = scheduled_seq_group.token_chunk_size - seq_group.maybe_set_first_scheduled_time(now) - - seq_group_metadata = self._seq_group_metadata_cache[ - self.cache_id].get_object() - seq_group_metadata.seq_data.clear() - seq_group_metadata.block_tables.clear() - - # seq_id -> SequenceData - seq_data: Dict[int, SequenceData] = {} - # seq_id -> physical block numbers - block_tables: Dict[int, List[int]] = {} - - if seq_group.is_encoder_decoder(): - # Encoder associated with SequenceGroup - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - encoder_seq_data = encoder_seq.data - # Block table for cross-attention - # Also managed at SequenceGroup level - cross_block_table = self.block_manager.get_cross_block_table( - seq_group) - else: - encoder_seq_data = None - cross_block_table = None - - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq_id = seq.seq_id - seq_data[seq_id] = seq.data - block_tables[seq_id] = self.block_manager.get_block_table(seq) - self.block_manager.access_all_blocks_in_seq(seq, now) - - if self.cache_config.enable_prefix_caching: - common_computed_block_nums = ( - self.block_manager.get_common_computed_block_ids( - seq_group.get_seqs(status=SequenceStatus.RUNNING))) - - do_sample = True - is_prompt = seq_group.is_prefill() - # We should send the metadata to workers when the first prefill - # is sent. Subsequent requests could be chunked prefill or decode. - is_first_prefill = False - if is_prompt: - seqs = seq_group.get_seqs() - # Prefill has only 1 sequence. - assert len(seqs) == 1 - num_computed_tokens = seqs[0].data.get_num_computed_tokens() - is_first_prefill = num_computed_tokens == 0 - # In the next iteration, all prompt tokens are not computed. - # It means the prefill is chunked, and we don't need sampling. - # NOTE: We use get_len instead of get_prompt_len because when - # a sequence is preempted, prefill includes previous generated - # output tokens. - if (token_chunk_size + num_computed_tokens - < seqs[0].data.get_len()): - do_sample = False - - # It assumes the scheduled_seq_groups is ordered by - # prefill < decoding. - if is_first_prefill or not self.scheduler_config.send_delta_data: - seq_group_metadata = SequenceGroupMetadata( - request_id=seq_group.request_id, - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=seq_group.sampling_params, - block_tables=block_tables, - do_sample=do_sample, - pooling_params=seq_group.pooling_params, - token_chunk_size=token_chunk_size, - lora_request=seq_group.lora_request, - computed_block_nums=common_computed_block_nums, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - state=seq_group.state, - # `multi_modal_data` will only be present for the 1st comm - # between engine and worker. - # the subsequent comms can still use delta, but - # `multi_modal_data` will be None. - multi_modal_data=(seq_group.multi_modal_data - if scheduler_outputs.num_prefill_groups - > 0 else None), - multi_modal_placeholders=( - seq_group.multi_modal_placeholders - if scheduler_outputs.num_prefill_groups > 0 else None), - ) - else: - # When SPMD mode is enabled, we only send delta data except for - # the first request to reduce serialization cost. 
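The `do_sample` logic above is the handshake between chunked prefill and sampling: a chunk that still leaves prompt tokens uncomputed must not produce an output token. A minimal sketch of that predicate, with illustrative numbers:

```python
# Sampling is skipped while a prefill is still mid-prompt (numbers are illustrative).
def should_sample(num_computed_tokens: int, token_chunk_size: int,
                  seq_len: int) -> bool:
    # Sample only when the scheduled chunk reaches the end of the sequence;
    # otherwise this step is an intermediate prefill chunk.
    return token_chunk_size + num_computed_tokens >= seq_len


print(should_sample(num_computed_tokens=0, token_chunk_size=512, seq_len=2000))     # False
print(should_sample(num_computed_tokens=1536, token_chunk_size=464, seq_len=2000))  # True
```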
- seq_data_delta = {} - for id, data in seq_data.items(): - seq_data_delta[id] = data.get_delta_and_reset() - seq_group_metadata = SequenceGroupMetadataDelta( - seq_data_delta, - seq_group.request_id, - block_tables, - is_prompt, - do_sample=do_sample, - token_chunk_size=token_chunk_size, - computed_block_nums=common_computed_block_nums, - ) - seq_group_metadata_list.append(seq_group_metadata) - - if allow_async_output_proc: - allow_async_output_proc = self._allow_async_output_proc( - seq_group) - - # Now that the batch has been created, we can assume all blocks in the - # batch will have been computed before the next scheduling invocation. - # This is because the engine assumes that a failure in model execution - # will crash the vLLM instance / will not retry. - for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: - self.block_manager.mark_blocks_as_computed( - scheduled_seq_group.seq_group, - scheduled_seq_group.token_chunk_size) - - self._seq_group_metadata_cache[self.next_cache_id].reset() - - scheduler_time = time.perf_counter() - scheduler_start_time - # Add this to scheduler time to all the sequences that are currently - # running. This will help estimate if the scheduler is a significant - # component in the e2e latency. - for seq_group in self.running: - if seq_group is not None and seq_group.metrics is not None: - if seq_group.metrics.scheduler_time is not None: - seq_group.metrics.scheduler_time += scheduler_time - else: - seq_group.metrics.scheduler_time = scheduler_time - - # Move to next cache (if exists) - self.cache_id = self.next_cache_id - - # Return results - return (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) - - def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: - self.block_manager.fork(parent_seq, child_seq) - - def free_seq(self, seq: Sequence) -> None: - """Free a sequence from a block table.""" - self.block_manager.free(seq) - - def remove_seq_from_computed_blocks_tracker( - self, seq_group: SequenceGroup, - status: Optional[SequenceStatus]) -> None: - seqs = seq_group.get_seqs(status=status) - for seq in seqs: - self._remove_seq_from_computed_blocks_tracker(seq) - - def _remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - """ - Free a sequence computed blocks tracker _seq_id_to_blocks_hashes - and _seq_id_to_num_tokens_computed. - """ - self.block_manager.remove_seq_from_computed_blocks_tracker(seq) - - def _free_finished_seqs(self, seq_group: SequenceGroup) -> None: - """Free finished seqs in a sequence group.""" - for seq in seq_group.get_seqs(): - if seq.is_finished(): - self.free_seq(seq) - - def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None: - if seq_group.is_finished(): - # Free cross-attention block table, if it exists - self._free_seq_group_cross_attn_blocks(seq_group) - - # Add the finished requests to the finished requests list. - # This list will be used to update the Mamba cache in the - # next step. 
- self._finished_requests_ids.append(seq_group.request_id) - - # Free finished seqs - self._free_finished_seqs(seq_group) - - def free_finished_seq_groups(self) -> None: - remaining: Deque[SequenceGroup] = deque() - for seq_group in self.running: - self._free_finished_seq_group(seq_group) - if not seq_group.is_finished(): - remaining.append(seq_group) - - self.running = remaining - - # Handle async stopped sequence groups - # (ones that reached max model len) - if self._async_stopped: - for seq_group in self._async_stopped: - self._free_seq_group_cross_attn_blocks(seq_group) - self._finished_requests_ids.append(seq_group.request_id) - - # Free finished seqs - self._free_finished_seqs(seq_group) - - self._async_stopped.clear() - - def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: - self.block_manager.allocate(seq_group) - for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): - seq.status = SequenceStatus.RUNNING - - def _append_slots( - self, - seq_group: SequenceGroup, - blocks_to_copy: List[Tuple[int, int]], - enable_chunking: bool = False, - ) -> None: - """Appends new slots to the sequences in the given sequence group. - - Args: - seq_group (SequenceGroup): The sequence group containing the - sequences to append slots to. - blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two - ints, the first int is the source block index, and the second - int is the destination block index. This list is updated with - the new source and destination block indices for the appended - slots. - enable_chunking (bool): True if chunked prefill is enabled. - """ - is_prefill: bool = seq_group.is_prefill() - num_lookahead_slots: int = self._get_num_lookahead_slots( - is_prefill, enable_chunking) - - seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING - for seq in seq_group.get_seqs(status=seq_status): - cows = self.block_manager.append_slots(seq, num_lookahead_slots) - if len(cows) > 0: - blocks_to_copy.extend(cows) - - def _preempt(self, seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: - # If preemption mode is not specified, we determine the mode as follows: - # We use recomputation by default since it incurs lower overhead than - # swapping. However, when the sequence group has multiple sequences - # (e.g., beam search), recomputation is not currently supported. In - # such a case, we use swapping instead. - # FIXME(woosuk): This makes our scheduling policy a bit bizarre. - # As swapped sequences are prioritized over waiting sequences, - # sequence groups with multiple sequences are implicitly prioritized - # over sequence groups with a single sequence. - # TODO(woosuk): Support recomputation for sequence groups with multiple - # sequences. This may require a more sophisticated CUDA kernel. - if self.user_specified_preemption_mode is None: - if seq_group.get_max_num_running_seqs() == 1: - preemption_mode = PreemptionMode.RECOMPUTE - else: - preemption_mode = PreemptionMode.SWAP - - elif self.user_specified_preemption_mode == "swap": - preemption_mode = PreemptionMode.SWAP - else: - preemption_mode = PreemptionMode.RECOMPUTE - - if self.num_cumulative_preemption % 50 == 0: - logger.warning( - "Sequence group %s is preempted by %s mode because there is " - "not enough KV cache space. This can affect the end-to-end " - "performance. Increase gpu_memory_utilization or " - "tensor_parallel_size to provide more KV cache memory. 
" - "total_num_cumulative_preemption=%d", - seq_group.request_id, - preemption_mode, - self.num_cumulative_preemption + 1, - ) - self.num_cumulative_preemption += 1 - - if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group) - elif preemption_mode == PreemptionMode.SWAP: - self._preempt_by_swap(seq_group, blocks_to_swap_out) - else: - raise AssertionError("Invalid preemption mode.") - return preemption_mode - - def _preempt_by_recompute( - self, - seq_group: SequenceGroup, - ) -> None: - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - assert len(seqs) == 1 - for seq in seqs: - seq.status = SequenceStatus.WAITING - self.free_seq(seq) - seq.reset_state_for_recompute() - self._free_seq_group_cross_attn_blocks(seq_group) - - def _preempt_by_swap( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - ) -> None: - self._swap_out(seq_group, blocks_to_swap_out) - - def _swap_in( - self, - seq_group: SequenceGroup, - blocks_to_swap_in: List[Tuple[int, int]], - ) -> None: - mapping = self.block_manager.swap_in(seq_group) - blocks_to_swap_in.extend(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - seq.status = SequenceStatus.RUNNING - - def _swap_out( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - ) -> None: - if not self.block_manager.can_swap_out(seq_group): - # FIXME(woosuk): Abort the sequence group instead of aborting the - # entire engine. - raise RuntimeError( - "Aborted due to the lack of CPU swap space. Please increase " - "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group) - blocks_to_swap_out.extend(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq.status = SequenceStatus.SWAPPED - - def _passed_delay(self, now: float) -> bool: - if self.prev_prompt: - self.last_prompt_latency = now - self.prev_time - self.prev_time, self.prev_prompt = now, False - # Delay scheduling prompts to let waiting queue fill up - if self.scheduler_config.delay_factor > 0 and self.waiting: - earliest_arrival_time = min( - [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ((now - earliest_arrival_time) - > (self.scheduler_config.delay_factor * - self.last_prompt_latency) or not self.running) - else: - passed_delay = True - return passed_delay - - def _get_num_lookahead_slots(self, is_prefill: bool, - enable_chunking: bool) -> int: - """The number of slots to allocate per sequence per step, beyond known - token ids. Speculative decoding uses these slots to store KV activations - of tokens which may or may not be accepted. - """ - return 0 - - def _get_num_new_uncached_and_cached_tokens( - self, - seq_group: SequenceGroup, - status: SequenceStatus, - enable_chunking: bool, - budget: SchedulingBudget, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> Tuple[int, int]: - """ - Returns the number of new uncached and cached tokens to schedule for a - given sequence group that's in a given `status`. - - The API could chunk the number of tokens to compute based on `budget` - if `enable_chunking` is True. If a sequence group has multiple - sequences (e.g., running beam search), it means it is in decoding - phase, so chunking doesn't happen. - - Returns (0, 0) if the new token cannot be computed due to token budget. - - The cached tokens's blocks are already computed, and the attention - backend will reuse the cached blocks rather than recomputing them. 
So - the scheduler could schedule these cached tokens "for free". - - Args: - seq_group: The sequence group to get the number of new tokens to - schedule. - status: The status of the sequences to get the number of new tokens - to schedule. - enable_chunking: Whether to chunk the number of tokens to compute. - budget: The budget to chunk the number of tokens to compute. - partial_prefill_metadata: information about the partial prefills - that are currently running - - - Returns: - A tuple of two ints. The first int is the number of new uncached - tokens to schedule. The second int is the number of cached tokens. - If no more new tokens can be scheduled, returns (0, 0). - """ - num_cached_new_tokens = 0 - num_uncached_new_tokens = 0 - - seqs = seq_group.get_seqs(status=status) - # Compute the number of new uncached and cached tokens for - # each sequence. - for seq in seqs: - if not seq.is_prefill(): - # Decode sequences should always just have 1 uncached token - # TODO(rickyx): Actually is this still correct for multi-step? - num_uncached_new_tokens += 1 - continue - - num_computed_tokens_seq = seq.get_num_computed_tokens() - all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq - if not self.cache_config.enable_prefix_caching: - # If prefix caching is not enabled, all new tokens are uncached. - num_uncached_new_tokens += all_num_new_tokens_seq - continue - - # NOTE: the cache token might be currently in a block that's in an - # evictor meaning that it's not yet allocated. However, we don't - # exclude such tokens in the cache count because it will be - # guaranteed to be allocated later if the sequence can be allocated. - num_cached_tokens_seq = self.block_manager.get_num_cached_tokens( - seq) - - # Sanity check. - if num_cached_tokens_seq < num_computed_tokens_seq: - # This should only happen with chunked prefill, and - # the seq is still in prefill. The `num_cached_tokens_seq` - # is the value we calculated on scheduling the first prefill. - # For subsequent continuous prefill steps, we cached the - # number of cache tokens for the sequence so the cached token - # count could be less than the number of computed tokens. - # See comments on `ComputedBlocksTracker` for more details. - assert ( - seq.is_prefill() and seq.status == SequenceStatus.RUNNING - and self.scheduler_config.chunked_prefill_enabled - ), ("Number of cached tokens should not be less than the " - "number of computed tokens for a sequence that's still " - f"in prefill. But there are {num_cached_tokens_seq} cached " - f"tokens and {num_computed_tokens_seq} computed tokens " - f"for sequence {seq.seq_id}.") - - num_cached_new_tokens_seq = max( - 0, num_cached_tokens_seq - num_computed_tokens_seq) - num_uncached_new_tokens_seq = (all_num_new_tokens_seq - - num_cached_new_tokens_seq) - - num_uncached_new_tokens += num_uncached_new_tokens_seq - num_cached_new_tokens += num_cached_new_tokens_seq - - if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0: - # For a fully cached hit sequence, we actually need to recompute the - # last token. So we need at least 1 uncached token to schedule. - # See ModelRunner._compute_for_prefix_cache_hit for more details. - num_uncached_new_tokens = 1 - num_cached_new_tokens -= 1 - - if enable_chunking and len(seqs) == 1: - # Chunk if a running request cannot fit in the given budget. - # If number of seq > 1, it means it is doing beam search - # in a decode phase. Do not chunk. 
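The cached-versus-uncached accounting above reduces to a few integer comparisons plus one special case: a fully cached prompt must still leave one uncached token so there is something to schedule and recompute. A simplified standalone sketch for a single prefill sequence (it ignores decode sequences and budget-driven chunking, which the deleted method also handles):

```python
# Split new prompt tokens into cached vs. uncached for one prefill sequence.
# Simplified sketch; not the full method from the diff above.
def split_new_tokens(seq_len: int, num_computed: int, num_cached: int):
    all_new = seq_len - num_computed
    cached_new = max(0, num_cached - num_computed)
    uncached_new = all_new - cached_new
    if uncached_new == 0 and cached_new > 0:
        # Full prefix-cache hit: the last token still has to be recomputed
        # so at least one uncached token remains to schedule.
        uncached_new, cached_new = 1, cached_new - 1
    return uncached_new, cached_new


print(split_new_tokens(seq_len=1000, num_computed=0, num_cached=768))   # (232, 768)
print(split_new_tokens(seq_len=1000, num_computed=0, num_cached=1000))  # (1, 999)
```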
- num_uncached_new_tokens = self._chunk_new_tokens_to_schedule( - self.scheduler_config, - self.cache_config, - budget, - self._get_prompt_limit(seq_group), - num_uncached_new_tokens, - self.partial_prefill_budget_lookup_list, - partial_prefill_metadata, - ) - - return num_uncached_new_tokens, num_cached_new_tokens - - @staticmethod - def _chunk_new_tokens_to_schedule( - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - budget: SchedulingBudget, - prompt_limit: int, - num_new_tokens: int, - partial_prefill_budget_lookup_list: List[int], - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> int: - """ - Chunks the number of new tokens to schedule based on the budget when - chunked prefill is enabled. - - Args: - scheduler_config: The scheduler config. - cache_config: The cache config. - budget: The budget to chunk the number of tokens to compute. - prompt_limit: The maximum number of tokens allowed in a prompt. - num_new_tokens: The number of new tokens to schedule. - - Returns: - The number of new tokens to schedule after chunking. - """ - remaining_token_budget = budget.remaining_token_budget() - - # Get the number of tokens to allocate to this prefill slot - prefill_slot_budget = ( - remaining_token_budget if partial_prefill_metadata is None else - partial_prefill_budget_lookup_list[ - partial_prefill_metadata.schedulable_prefills]) - - if cache_config.enable_prefix_caching: - # When prefix caching is enabled and we're partially prefilling - # a sequence, we always allocate a number of new tokens that is - # divisible by the block size to avoid partial block matching. - block_size = cache_config.block_size - # Don't exceed either the total budget or slot budget. - # Take min of those and get the next lowest multiple of the - # block size: - remaining_token_budget = ( - min(remaining_token_budget, prefill_slot_budget) // - block_size) * block_size - # NB: In the case where num_new_tokens < budget, we are - # finishing prefill for this sequence, so we do not need to - # allocate a full block. - - num_new_tokens = min(num_new_tokens, remaining_token_budget, - prefill_slot_budget) - - return num_new_tokens diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index c345f17e6614f..e828ac04364ff 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -7,13 +7,11 @@ from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import ModelConfig, VllmConfig -from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.plugins.io_processors.interface import IOProcessor from vllm.pooling_params import PoolingParams @@ -266,11 +264,7 @@ class EngineClient(ABC): ... @abstractmethod - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[list[SamplerOutput]] = None, - ) -> None: + async def do_log_stats(self) -> None: ... 
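Returning to the `_chunk_new_tokens_to_schedule` helper deleted just above: with prefix caching enabled, a partial prefill chunk is rounded down to a multiple of the block size so that partially filled blocks never have to be matched against the cache. A simplified sketch with illustrative numbers:

```python
# Chunk a prefill to the remaining budget, rounding down to the block size
# when prefix caching is enabled (simplified; numbers are illustrative).
def chunk_new_tokens(num_new_tokens: int, remaining_budget: int,
                     slot_budget: int, block_size: int,
                     enable_prefix_caching: bool) -> int:
    if enable_prefix_caching:
        remaining_budget = (min(remaining_budget, slot_budget)
                            // block_size) * block_size
    return min(num_new_tokens, remaining_budget, slot_budget)


# 500 tokens of budget with 16-token blocks -> schedule 496 tokens this step.
print(chunk_new_tokens(2000, remaining_budget=500, slot_budget=500,
                       block_size=16, enable_prefix_caching=True))   # 496
# Without prefix caching the chunk simply takes the whole remaining budget.
print(chunk_new_tokens(2000, remaining_budget=500, slot_budget=500,
                       block_size=16, enable_prefix_caching=False))  # 500
```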
@abstractmethod diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 73165c7e4c0ad..757baecea9ce0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -601,11 +601,7 @@ class AsyncLLM(EngineClient): async def is_tracing_enabled(self) -> bool: return self.observability_config.otlp_traces_endpoint is not None - async def do_log_stats( - self, - scheduler_outputs=None, - model_output=None, - ) -> None: + async def do_log_stats(self) -> None: if self.logger_manager: self.logger_manager.log() diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py deleted file mode 100644 index 530907012f704..0000000000000 --- a/vllm/worker/cache_engine.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""CacheEngine class for managing the KV cache.""" -from typing import List - -import torch - -from vllm.attention import get_attn_backend -from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig -from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, - get_dtype_size, is_pin_memory_available) - -logger = init_logger(__name__) - - -class CacheEngine: - """Manages the KV cache. - - This class is responsible for initializing and managing the GPU and CPU KV - caches. It also provides methods for performing KV cache operations, such - as swapping and copying. - """ - - def __init__( - self, - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - device_config: DeviceConfig, - ) -> None: - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - self.device_config = device_config - - self.head_size = model_config.get_head_size() - # Models like Jamba, have mixed typed layers, E.g Mamba - self.num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - self.num_gpu_blocks = cache_config.num_gpu_blocks - if self.num_gpu_blocks: - self.num_gpu_blocks //= parallel_config.pipeline_parallel_size - self.num_cpu_blocks = cache_config.num_cpu_blocks - if self.num_cpu_blocks: - self.num_cpu_blocks //= parallel_config.pipeline_parallel_size - - if cache_config.cache_dtype == "auto": - self.dtype = model_config.dtype - else: - self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - # Get attention backend. - self.attn_backend = get_attn_backend(self.head_size, - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - model_config.is_attention_free, - use_mla=model_config.use_mla) - - # Initialize the cache. 
- self.gpu_cache = self._allocate_kv_cache( - self.num_gpu_blocks, self.device_config.device_type) - self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") - - def _allocate_kv_cache( - self, - num_blocks: int, - device: str, - ) -> List[torch.Tensor]: - """Allocates KV cache on the specified device.""" - kv_cache_generic_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: List[torch.Tensor] = [] - try: - kv_cache_stride_order = self.attn_backend.get_kv_cache_stride_order( - ) - except (AttributeError, NotImplementedError): - kv_cache_stride_order = tuple(range(len(kv_cache_generic_shape))) - - # The allocation respects the backend-defined stride order to ensure - # the semantic remains consistent for each backend. We first obtain the - # generic kv cache shape and then permute it according to the stride - # order which could result in a non-contiguous tensor. - kv_cache_allocation_shape = tuple(kv_cache_generic_shape[i] - for i in kv_cache_stride_order) - - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache = torch.zeros( - kv_cache_allocation_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device).permute(*kv_cache_stride_order) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append(layer_kv_cache) - return kv_cache - - def swap_in(self, src_to_dst: torch.Tensor) -> None: - for i in range(self.num_attention_layers): - self.attn_backend.swap_blocks(self.cpu_cache[i], self.gpu_cache[i], - src_to_dst) - - def swap_out(self, src_to_dst: torch.Tensor) -> None: - for i in range(self.num_attention_layers): - self.attn_backend.swap_blocks(self.gpu_cache[i], self.cpu_cache[i], - src_to_dst) - - def copy(self, src_to_dsts: torch.Tensor) -> None: - self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) - - @staticmethod - def get_cache_block_size( - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - - if cache_config.cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - key_cache_entry = num_heads * head_size - - # For MLA there is no value cache, since the latent vector - # is joint keys and values. 
- value_cache_entry = key_cache_entry if not model_config.use_mla else 0 - total = num_attention_layers * cache_config.block_size * \ - (key_cache_entry + value_cache_entry) - - dtype_size = get_dtype_size(dtype) - return dtype_size * total diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py deleted file mode 100644 index bab89586b0f2c..0000000000000 --- a/vllm/worker/model_runner.py +++ /dev/null @@ -1,2031 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import gc -import inspect -import itertools -import time -import weakref -from contextlib import contextmanager -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, - Tuple, Type, TypeVar, Union) - -import numpy as np -import torch -import torch.distributed -import torch.nn as nn -from tqdm.auto import tqdm - -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.attention.backends.abstract import AttentionState -from vllm.attention.backends.utils import CommonAttentionState -from vllm.compilation.counter import compilation_counter -from vllm.config import CompilationLevel, VllmConfig -from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.distributed.kv_transfer import get_kv_transfer_group -from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, - graph_capture) -from vllm.forward_context import get_forward_context, set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata, SamplingMetadataCache -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - get_sampler) -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import (supports_lora, supports_mrope, - supports_multimodal) -from vllm.model_executor.models.utils import set_cpu_offload_max_bytes -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap, - MultiModalRegistry) -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, - async_tensor_h2d, flatten_2d_lists, - is_pin_memory_available, supports_dynamo, - weak_ref_tensor) -from vllm.worker.model_runner_base import ( - InputProcessingError, ModelRunnerBase, ModelRunnerInputBase, - ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -LORA_WARMUP_RANK = 8 - -_NUM_WARMUP_ITERS = 2 - -TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") - -# For now, bump up cache limits for recompilations during CUDA graph warmups. 
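The `get_cache_block_size` arithmetic removed above doubles as a back-of-the-envelope formula for KV-cache sizing. A standalone sketch, using Llama-2-7B-like figures (32 layers, 32 KV heads, head size 128, fp16, 16-token blocks) purely for illustration:

```python
# Bytes consumed by one KV-cache block (key + value), following the deleted
# formula; the example figures approximate a Llama-2-7B-style config.
def cache_block_bytes(block_size: int, num_layers: int, num_kv_heads: int,
                      head_size: int, dtype_bytes: int,
                      use_mla: bool = False) -> int:
    key_entry = num_kv_heads * head_size
    value_entry = 0 if use_mla else key_entry   # MLA stores a joint latent, no V cache
    return dtype_bytes * num_layers * block_size * (key_entry + value_entry)


per_block = cache_block_bytes(block_size=16, num_layers=32, num_kv_heads=32,
                              head_size=128, dtype_bytes=2)
print(per_block)        # 8388608 bytes = 8 MiB per 16-token block
print(per_block // 16)  # 524288 bytes = 512 KiB per cached token
```

For these figures that works out to roughly 512 KiB of KV cache per token, which is what makes the `num_gpu_blocks` accounting above so consequential for how many concurrent sequences fit.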
-torch._dynamo.config.cache_size_limit = 128 -torch._dynamo.config.accumulated_cache_size_limit = 128 - - -@dataclass(frozen=True) -class ModelInputForGPU(ModelRunnerInputBase): - """ - This base class contains metadata needed for the base model forward pass - but not metadata for possible additional steps, e.g., sampling. Model - runners that run additional steps should subclass this method to add - additional fields. - """ - input_tokens: Optional[torch.Tensor] = None - inputs_embeds: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None - finished_requests_ids: Optional[List[str]] = None - virtual_engine: int = 0 - async_callback: Optional[Callable] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - previous_hidden_states: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForGPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForGPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - # Exclude `async_callback` to be able to pickle this object - def __getstate__(self): - state = self.__dict__.copy() - del state["async_callback"] - return state - - # TODO: What happens when we depickle this object? - # How can we update this callback to properly pass it to the engine? - def __setstate__(self, state): - self.__dict__.update(state) - self.__dict__.update({'async_callback': None}) - - -@dataclass(frozen=True) -class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - # Used for speculative decoding. We do not broadcast it because it is only - # used by the driver worker. 
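The `__getstate__`/`__setstate__` pair above is a standard Python idiom for keeping an object picklable when one attribute (here the async callback) cannot be serialized: drop it on dump, restore a placeholder on load. A self-contained sketch of the idiom, with a hypothetical `StepInput` class standing in for the real model-input dataclass:

```python
# Drop a non-picklable callback when pickling, restore a default on unpickling.
import pickle
from dataclasses import dataclass, field
from typing import Callable, Optional


@dataclass
class StepInput:
    tokens: list = field(default_factory=list)
    async_callback: Optional[Callable] = None  # e.g. a closure; not picklable

    def __getstate__(self):
        state = self.__dict__.copy()
        del state["async_callback"]
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.__dict__["async_callback"] = None


inp = StepInput(tokens=[1, 2, 3], async_callback=lambda: None)
restored = pickle.loads(pickle.dumps(inp))
print(restored.tokens, restored.async_callback)  # [1, 2, 3] None
```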
- is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForGPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): - """Build ModelInputForGPU from SequenceGroupMetadata.""" - - # Note: ideally we would be using a dataclass(kw_only=True) - # here, so that this can be subclassed easily, - # but kw_only is not supported in python<3.10. - class InterDataForSeqGroup: - """Intermediate data for the current sequence group.""" - - def simple_reinit(self): - self.input_tokens[0].clear() # type: ignore - self.inputs_embeds = None # type: ignore - self.input_positions[0].clear() # type: ignore - self.mrope_input_positions = None # type: ignore - self.seq_lens[0] = 0 # type: ignore - self.orig_seq_lens[0] = 0 # type: ignore - self.prompt_lens[0] = 0 # type: ignore - self.query_lens[0] = 0 # type: ignore - self.context_lens[0] = 0 # type: ignore - self.curr_sliding_window_blocks[0] = 0 # type: ignore - self.lora_index_mapping.clear() # type: ignore - self.lora_prompt_mapping.clear() # type: ignore - self.lora_requests.clear() # type: ignore - - def __init__( - self, - *, - # From sequence group metadata. - request_id: str, - seq_ids: List[int], - is_prompt: bool, - block_tables: Optional[Dict[int, List[int]]], - computed_block_nums: List[int], - n_seqs: int = 0, - - # Input tokens and positions. - input_tokens: Optional[List[List[int]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - input_positions: Optional[List[List[int]]] = None, - mrope_input_positions: Optional[List[List[List[int]]]] = None, - - # The sequence length (may be capped to the sliding window). - seq_lens: Optional[List[int]] = None, - # The original sequence length (before applying sliding window). - # This is used to compute slot mapping. - orig_seq_lens: Optional[List[int]] = None, - # This is used in the dual-chunk flash attention backend. - prompt_lens: Optional[List[int]] = None, - # The query length. - query_lens: Optional[List[int]] = None, - # The number of tokens that are already computed. - context_lens: Optional[List[int]] = None, - # The current sliding window block. - curr_sliding_window_blocks: Optional[List[int]] = None, - - # LoRA inputs. - lora_index_mapping: Optional[List[List[int]]] = None, - lora_prompt_mapping: Optional[List[List[int]]] = None, - lora_requests: Optional[Set[LoRARequest]] = None, - - # Multi-modal inputs. 
- multi_modal_kwargs: Optional[MultiModalKwargs] = None, - multi_modal_placeholder_maps: Optional[Dict[ - str, MultiModalPlaceholderMap]] = None, - - # Whether the prefix cache is hit (prefill only). - prefix_cache_hit: bool = False, - reinit: bool = False, - reinit_use_defaults: bool = False, - encoder_seq_len: int = 0, - ): - if reinit: - assert len(self.seq_ids) == len(seq_ids) # type: ignore - for i, seq_id in enumerate(seq_ids): - self.seq_ids[i] = seq_id # type: ignore - else: - self.seq_ids = seq_ids - - self.request_id = request_id - self.is_prompt = is_prompt - self.block_tables = block_tables - self.computed_block_nums = computed_block_nums - self.n_seqs = n_seqs - self.encoder_seq_len = encoder_seq_len - - if reinit: - if len(self.seq_ids) == 1 and reinit_use_defaults: - self.simple_reinit() - else: - if input_tokens: - self.input_tokens = input_tokens - else: - for seq_id in range(len(self.seq_ids)): - self.input_tokens[seq_id].clear() - - self.inputs_embeds = inputs_embeds - - if input_positions: - self.input_positions = input_positions - else: - for seq_id in range(len(self.seq_ids)): - self.input_positions[seq_id].clear() - - self.mrope_input_positions = None - - if seq_lens: - self.seq_lens = seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.seq_lens[seq_id] = 0 - - if orig_seq_lens: - self.orig_seq_lens = orig_seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.orig_seq_lens[seq_id] = 0 - - if prompt_lens: - self.prompt_lens = prompt_lens - else: - for seq_id in range(len(self.seq_ids)): - self.prompt_lens[seq_id] = 0 - - if query_lens: - self.query_lens = query_lens - else: - for seq_id in range(len(self.seq_ids)): - self.query_lens[seq_id] = 0 - - if context_lens: - self.context_lens = context_lens - else: - for seq_id in range(len(self.seq_ids)): - self.context_lens[seq_id] = 0 - - if curr_sliding_window_blocks: - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks - else: - for seq_id in range(len(self.seq_ids)): - self.curr_sliding_window_blocks[seq_id] = 0 - - if lora_index_mapping: - self.lora_index_mapping = lora_index_mapping - else: - self.lora_index_mapping.clear() - - if lora_prompt_mapping: - self.lora_prompt_mapping = lora_prompt_mapping - else: - self.lora_prompt_mapping.clear() - - if lora_requests: - self.lora_requests = lora_requests - else: - self.lora_requests.clear() - - else: - self.input_tokens = input_tokens or [] - self.inputs_embeds = inputs_embeds - self.input_positions = input_positions or [] - self.mrope_input_positions = mrope_input_positions or None - self.seq_lens = seq_lens or [] - self.orig_seq_lens = orig_seq_lens or [] - self.prompt_lens = prompt_lens or [] - self.query_lens = query_lens or [] - self.context_lens = context_lens or [] - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks or [] - - self.lora_index_mapping = lora_index_mapping or [] - self.lora_prompt_mapping = lora_prompt_mapping or [] - self.lora_requests = lora_requests or set() - - self.multi_modal_kwargs = multi_modal_kwargs - self.multi_modal_placeholder_maps = multi_modal_placeholder_maps - self.prefix_cache_hit = prefix_cache_hit - - self.n_seqs = len(self.seq_ids) - - if not reinit: - self.__post_init__() - - def __post_init__(self): - self.n_seqs = len(self.seq_ids) - - self.input_tokens = [[] for _ in range(self.n_seqs)] - self.input_positions = [[] for _ in range(self.n_seqs)] - self.mrope_input_positions = None - self.seq_lens = [0] * self.n_seqs - self.orig_seq_lens = [0] * self.n_seqs - self.prompt_lens 
= [0] * self.n_seqs - self.query_lens = [0] * self.n_seqs - self.context_lens = [0] * self.n_seqs - self.curr_sliding_window_blocks = [0] * self.n_seqs - - self.lora_index_mapping = [] - self.lora_prompt_mapping = [] - - def __repr__(self) -> str: - return (f"InterDataForSeqGroup(" - f"request_id={self.request_id}, " - f"seq_ids={self.seq_ids}, " - f"is_prompt={self.is_prompt}, " - f"block_tables={self.block_tables}, " - f"computed_block_nums={self.computed_block_nums}, " - f"n_seqs={self.n_seqs}, " - f"input_tokens={self.input_tokens}, " - f"inputs_embeds.shape=" - f"{getattr(self.inputs_embeds, 'shape', None)}, " - f"input_positions={self.input_positions}, " - f"mrope_input_positions={self.mrope_input_positions}, " - f"seq_lens={self.seq_lens}, " - f"orig_seq_lens={self.orig_seq_lens}, " - f"query_lens={self.query_lens}, " - f"context_lens={self.context_lens}, " - f"multi_modal_kwargs={self.multi_modal_kwargs}") - - def gen_inter_data_builder(self, num_seqs: int): - return lambda: ModelInputForGPUBuilder.InterDataForSeqGroup( - request_id="", - seq_ids=[0] * num_seqs, - is_prompt=True, - block_tables=None, - computed_block_nums=[]) - - def init_cached_inter_data(self, *args, **kwargs): - assert len(args) == 0 - assert "seq_ids" in kwargs - seq_ids = kwargs["seq_ids"] - num_seqs = len(seq_ids) - - # The inter-data cache is per model_runner - inter_data_cache = self.runner.inter_data_cache - if num_seqs not in inter_data_cache: - inter_data_cache[num_seqs] = PyObjectCache( - self.gen_inter_data_builder(num_seqs)) - - obj = inter_data_cache[num_seqs].get_object() - obj.__init__(*args, **kwargs) - return obj - - def reset_cached_inter_data(self): - for cache in self.runner.inter_data_cache.values(): - cache.reset() - - def __init__(self, - runner: "GPUModelRunnerBase", - finished_requests_ids: Optional[List[str]] = None): - super().__init__() - # Compute functions for each sequence in a sequence group. - # WARNING: The order of the functions matters! - self.per_seq_compute_fns = [ - self._compute_lens, - self._compute_for_prefix_cache_hit, - self._compute_for_sliding_window, - self._compute_lora_input, - ] - # Compute functions for each sequence group. - # WARNING: The order of the functions matters! - self.per_seq_group_compute_fns = [ - self._compute_multi_modal_input, - ] - - self.runner = runner - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.scheduler_config = self.runner.scheduler_config - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.enable_lora = self.runner.lora_config is not None - - # Attention metadata inputs. - if self.attn_backend is not None: - # spec decode (e.g. Medusa) does not have atten backend - self.attn_metadata_builder = self.attn_backend.get_builder_cls()( - weakref.proxy(self)) - - # Engine/Model configurations. - self.chunked_prefill_enabled = ( - self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled) - if self.sliding_window is not None: - self.sliding_window_blocks = ( - self.sliding_window + self.block_size - 1) // self.block_size - self.block_aligned_sliding_window = \ - self.sliding_window_blocks * self.block_size - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.finished_requests_ids = finished_requests_ids - - # if the current batch is decode-only. - # will be set to False if there is any non-decode request. 
- self.decode_only = True - - # Intermediate data (data in CPU before going to GPU) for - # the current sequence group. - self.inter_data_list: List[ - ModelInputForGPUBuilder.InterDataForSeqGroup] = [] - - self.attn_metadata_builder.prepare() - - def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Compute context length, sequence length and tokens - for the given sequence data. - """ - seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] - token_chunk_size = seq_group_metadata.token_chunk_size - - # Compute context length (the number of tokens that are - # already computed) and sequence length (total number of tokens). - - seq_len = seq_data.get_len() - if inter_data.is_prompt: - context_len = seq_data.get_num_computed_tokens() - seq_len = min(seq_len, context_len + token_chunk_size) - elif self.runner.model_config.is_encoder_decoder: - context_len = seq_len - 1 - else: - context_len = seq_data.get_num_computed_tokens() - - # Compute tokens. - if seq_data.prompt_embeds is None: - tokens = seq_data.get_token_ids()[context_len:seq_len] - prompt_embeds = None - else: - tokens = [0] * (seq_len - context_len) - prompt_embeds = seq_data.get_token_embeddings( - )[context_len:seq_len] - - inter_data.seq_lens[seq_idx] = seq_len - inter_data.orig_seq_lens[seq_idx] = seq_len - inter_data.prompt_lens[seq_idx] = seq_data.get_prompt_len() - inter_data.context_lens[seq_idx] = context_len - inter_data.input_tokens[seq_idx].extend(tokens) - inter_data.inputs_embeds = prompt_embeds - inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) - inter_data.query_lens[seq_idx] = seq_len - context_len - - if seq_data.mrope_position_delta is not None: - if inter_data.mrope_input_positions is None: - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - - inter_data.mrope_input_positions[ - seq_idx] = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - - def _compute_for_prefix_cache_hit( - self, inter_data: InterDataForSeqGroup, seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Check if hit prefix cache (i.e., some blocks are already computed). - If hit, update input tokens and positions to only compute the - remaining blocks. - """ - computed_block_nums = inter_data.computed_block_nums - - # Note that prefix caching does not support sliding window. - prefix_cache_hit = (computed_block_nums is not None - and len(computed_block_nums) > 0 - and self.sliding_window is None - and inter_data.is_prompt) - inter_data.prefix_cache_hit = prefix_cache_hit - - if not prefix_cache_hit: - return - - assert computed_block_nums is not None - # The cache hit prompt tokens in this sequence. Note that - # this may be larger than the sequence length if chunked - # prefill is enabled. - prefix_cache_len = len(computed_block_nums) * self.block_size - seq_group_metadata.seq_data[inter_data.seq_ids[ - seq_idx]].update_num_cached_tokens(prefix_cache_len) - - # The number of so far computed prompt tokens in this sequence. - context_len = inter_data.context_lens[seq_idx] - # The total number of prompt tokens in this sequence. - # When chunked prefill is enabled, this is the token number of - # computed chunks + current chunk. - seq_len = inter_data.seq_lens[seq_idx] - if prefix_cache_len <= context_len: - # We already passed the cache hit region, - # so do normal computation. - pass - elif context_len < prefix_cache_len < seq_len: - # Partial hit. 
Compute the missing part. - uncomputed_start = prefix_cache_len - context_len - inter_data.input_tokens[seq_idx] = inter_data.input_tokens[ - seq_idx][uncomputed_start:] - inter_data.input_positions[seq_idx] = inter_data.input_positions[ - seq_idx][uncomputed_start:] - context_len = prefix_cache_len - - inter_data.context_lens[seq_idx] = context_len - inter_data.query_lens[ - seq_idx] = inter_data.seq_lens[seq_idx] - context_len - elif seq_len <= prefix_cache_len: - # Full hit. Only compute the last token to avoid - # erroneous behavior. FIXME: Ideally we should directly - # mark all tokens as computed in the scheduler and do not - # schedule this sequence, so this case should not happen. - inter_data.input_tokens[seq_idx] = inter_data.input_tokens[ - seq_idx][-1:] - inter_data.input_positions[seq_idx] = inter_data.input_positions[ - seq_idx][-1:] - inter_data.query_lens[seq_idx] = 1 - inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1 - - def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Update seq_len and curr_sliding_window_block for the given - sequence data (only required by decoding) if sliding window is enabled. - """ - curr_sliding_window_block = 0 - sliding_seq_len = inter_data.seq_lens[seq_idx] - if not inter_data.is_prompt and self.sliding_window is not None: - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. - curr_sliding_window_block = self.sliding_window_blocks - # number of elements in last block - suff_len = inter_data.seq_lens[seq_idx] % self.block_size - sliding_seq_len = min(inter_data.seq_lens[seq_idx], - self.block_aligned_sliding_window + suff_len) - if suff_len > 0: - curr_sliding_window_block += 1 - - inter_data.curr_sliding_window_blocks[ - seq_idx] = curr_sliding_window_block - inter_data.seq_lens[seq_idx] = sliding_seq_len - - def _compute_lora_input(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """If LoRA is enabled, compute LoRA index and prompt mapping.""" - if not self.enable_lora: - return - - lora_id = seq_group_metadata.lora_int_id - if lora_id > 0: - inter_data.lora_requests.add(seq_group_metadata.lora_request) - query_len = inter_data.query_lens[seq_idx] - inter_data.lora_index_mapping.append([lora_id] * query_len) - sampling_params = seq_group_metadata.sampling_params - if sampling_params and sampling_params.prompt_logprobs is not None: - inter_data.lora_prompt_mapping.append([lora_id] * query_len) - elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: - inter_data.lora_prompt_mapping.append([lora_id]) - else: - inter_data.lora_prompt_mapping.append([]) - - def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): - """If multi-modal data is given, add it to the input.""" - # NOTE: mm_kwargs only includes the subset of multi-modal items that - # intersect with the current prefill positions. - positions = inter_data.input_positions[0] - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, - range(positions[0], positions[0] + len(positions))) - - # M-RoPE requires mrope_positions even for plain text; return early - # when mm_kwargs is empty only if inter_data.is_prompt is False. 
- if not mm_kwargs and not inter_data.is_prompt: - return - - inter_data.multi_modal_kwargs = mm_kwargs - inter_data.multi_modal_placeholder_maps = placeholder_maps - - # special processing for mrope position deltas. - if self.runner.model_config.uses_mrope: - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", - None) - - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) - hf_config = self.runner.model_config.hf_config - - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - for seq_idx in range(inter_data.n_seqs): - seq_data = seq_group_metadata.seq_data[ - inter_data.seq_ids[seq_idx]] - token_ids = seq_data.get_token_ids() - - if supports_mrope(self.runner.model): - mrope_input_positions, mrope_position_delta = \ - self.runner.model.get_mrope_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - mrope_input_positions = mrope_input_positions.tolist() - else: - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - - seq_data.mrope_position_delta = mrope_position_delta - inter_data.mrope_input_positions[ - seq_idx] = mrope_input_positions - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - """Add a sequence group to the builder.""" - seq_ids = seq_group_metadata.seq_data.keys() - n_seqs = len(seq_ids) - is_prompt = seq_group_metadata.is_prompt - - if is_prompt: - assert n_seqs == 1 - self.decode_only = False - - encoder_seq_len = 0 - - if self.runner.model_config.is_encoder_decoder: - encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() - - inter_data = self.init_cached_inter_data( - request_id=seq_group_metadata.request_id, - seq_ids=seq_ids, - is_prompt=is_prompt, - block_tables=seq_group_metadata.block_tables, - computed_block_nums=seq_group_metadata.computed_block_nums, - reinit=True, - reinit_use_defaults=True, - encoder_seq_len=encoder_seq_len) - - self.inter_data_list.append(inter_data) - - for seq_idx in range(n_seqs): - for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) - for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) - - def _use_captured_graph(self, - batch_size: int, - decode_only: bool, - max_decode_seq_len: int, - max_encoder_seq_len: int = 0) -> bool: - return (decode_only and not self.runner.model_config.enforce_eager - and max_decode_seq_len <= self.runner.max_seq_len_to_capture - and max_encoder_seq_len <= self.runner.max_seq_len_to_capture - and batch_size <= self.runner.max_batchsize_to_capture) - - def _get_cuda_graph_pad_size(self, - num_seqs: int, - max_decode_seq_len: int, - max_encoder_seq_len: int = 0) -> int: - """ - Determine the number of padding sequences required for running 
in - CUDA graph mode. Returns -1 if CUDA graphs cannot be used. - - In the multi-step + chunked-prefill case, only the first step - has Prefills (if any). The rest of the steps are guaranteed to be all - decodes. In this case, we set up the padding as if all the sequences - are decodes so we may run all steps except the first step in CUDA graph - mode. - - Args: - num_seqs (int): Number of sequences scheduled to run. - max_decode_seq_len (int): Greatest of all the decode sequence - lengths. Used only in checking the viablility of using - CUDA graphs. - max_encoder_seq_len (int, optional): Greatest of all the encode - sequence lengths. Defaults to 0. Used only in checking the - viability of using CUDA graphs. - Returns: - int: Returns the determined number of padding sequences. If - CUDA graphs is not viable, returns -1. - """ - decode_only = self.decode_only - if not decode_only: - # Early exit so we can treat num_seqs as the batch_size below. - return -1 - - # batch_size out of this function refers to the number of input - # tokens being scheduled. This conflation of num_seqs as batch_size - # is valid as this is a decode-only case. - batch_size = num_seqs - if not self._use_captured_graph(batch_size, decode_only, - max_decode_seq_len, - max_encoder_seq_len): - return -1 - - graph_batch_size = self.runner.vllm_config.pad_for_cudagraph( - batch_size) - assert graph_batch_size >= batch_size - return graph_batch_size - batch_size - - def build(self) -> ModelInputForGPU: - """Finalize the builder intermediate data and - create on-device tensors. - """ - # Combine and flatten intermediate data. - input_tokens = list[int]() - inputs_embeds_list = list[torch.Tensor]() - for inter_data in self.inter_data_list: - for cur_input_tokens in inter_data.input_tokens: - input_tokens.extend(cur_input_tokens) - if inter_data.inputs_embeds is not None: - inputs_embeds_list.append( - inter_data.inputs_embeds.to( - dtype=self.runner.model_config.dtype, - device=self.runner.device)) - inputs_embeds: Optional[torch.Tensor] - if len(inputs_embeds_list) == 0: - inputs_embeds = None - else: - inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to( - dtype=self.runner.model_config.dtype, - device=self.runner.device) - assert len(inputs_embeds) == len(input_tokens) - - if not input_tokens and inputs_embeds is None: - # This may happen when all prefill requests hit - # prefix caching and there is no decode request. 
- return self.model_input_cls() - - mrope_input_positions: Optional[List[List[int]]] = None - if any(inter_data.mrope_input_positions is not None - for inter_data in self.inter_data_list): - mrope_input_positions = [[] for _ in range(3)] - for idx in range(3): - for inter_data in self.inter_data_list: - msections = inter_data.mrope_input_positions - if msections is None: - for _seq_input_positions in inter_data.input_positions: - mrope_input_positions[idx].extend( - _seq_input_positions) - else: - for _seq_mrope_input_positions in msections: - mrope_input_positions[idx].extend( - _seq_mrope_input_positions[idx]) - input_positions = None - else: - input_positions = [] - for inter_data in self.inter_data_list: - for cur_input_positions in inter_data.input_positions: - input_positions.extend(cur_input_positions) - - seq_lens = [] - query_lens = [] - max_decode_seq_len = 0 - max_encoder_seq_len = 0 - for inter_data in self.inter_data_list: - seq_lens.extend(inter_data.seq_lens) - query_lens.extend(inter_data.query_lens) - if not inter_data.is_prompt: - max_decode_seq_len = max(max_decode_seq_len, - max(inter_data.seq_lens)) - if self.runner.model_config.is_encoder_decoder: - max_encoder_seq_len = max(max_encoder_seq_len, - inter_data.encoder_seq_len) - - # Mapping from request IDs to sequence IDs. Used for Jamba models - # that manages the cache by itself. - request_ids_to_seq_ids = { - data.request_id: data.seq_ids - for data in self.inter_data_list - } - - cuda_graph_pad_size = self._get_cuda_graph_pad_size( - num_seqs=len(seq_lens), - max_decode_seq_len=max_decode_seq_len, - max_encoder_seq_len=max_encoder_seq_len) - - batch_size = len(input_tokens) - if cuda_graph_pad_size != -1: - # If cuda graph can be used, pad tensors accordingly. - # See `capture_model` API for more details. - # vLLM uses cuda graph only for decoding requests. - batch_size += cuda_graph_pad_size - - # Tokens and positions. - if cuda_graph_pad_size: - input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) - assert self.runner.device is not None - input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, - self.runner.device, - self.runner.pin_memory) - - if mrope_input_positions is not None: - for idx in range(3): - mrope_input_positions[idx].extend( - itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(mrope_input_positions, - torch.long, - self.runner.device, - self.runner.pin_memory) - else: - input_positions.extend(itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(input_positions, - torch.long, - self.runner.device, - self.runner.pin_memory) - # Sequence and query lengths. - if cuda_graph_pad_size: - seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) - - # Attention metadata. - attn_metadata = self.attn_metadata_builder.build( - seq_lens, query_lens, cuda_graph_pad_size, batch_size) - - # LoRA data. 
- lora_requests = set() - lora_mapping = None - if self.enable_lora: - lora_requests = set(r for data in self.inter_data_list - for r in data.lora_requests) - lora_index_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_index_mapping) - for inter_data in self.inter_data_list - ]) - if cuda_graph_pad_size: - lora_index_mapping.extend( - itertools.repeat(0, cuda_graph_pad_size)) - lora_prompt_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_prompt_mapping) - for inter_data in self.inter_data_list - ]) - - lora_mapping = LoRAMapping( - **dict(index_mapping=lora_index_mapping, - prompt_mapping=lora_prompt_mapping, - is_prefill=not self.decode_only)) - - # Multi-modal data. - multi_modal_kwargs_list = [ - data.multi_modal_kwargs for data in self.inter_data_list - if data.multi_modal_kwargs is not None - ] - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return self.model_input_cls( - input_tokens=input_tokens_tensor, - inputs_embeds=inputs_embeds, - input_positions=input_positions_tensor, - attn_metadata=attn_metadata, - seq_lens=seq_lens, - query_lens=query_lens, - lora_mapping=lora_mapping, - lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, - request_ids_to_seq_ids=request_ids_to_seq_ids, - finished_requests_ids=self.finished_requests_ids) - - -class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): - """ - Helper class for shared methods between GPU model runners. - """ - _model_input_cls: Type[TModelInputForGPU] - _builder_cls: Type[ModelInputForGPUBuilder] - builder: ModelInputForGPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - - ModelRunnerBase.__init__(self, vllm_config) - model_config = self.model_config - cache_config = self.cache_config - - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = \ - self.vllm_config.compilation_config.max_capture_size - - # - self.graph_runners: List[Dict[Tuple[int, bool], CUDAGraphRunner]] = [ - {} for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.graph_memory_pool: Optional[Tuple[ - int, int]] = None # Set during graph capture. - - self.has_inner_state = model_config.has_inner_state - - self.in_profile_run = False - - # When using CUDA graph, the input block tables must be padded to - # max_seq_len_to_capture. However, creating the block table in - # Python can be expensive. To optimize this, we cache the block table - # in numpy and only copy the actual input content at every iteration. - # The shape of the cached block table will be - # (max batch size to capture, max seq len to capture / block size). 
- self.graph_block_tables = np.zeros( - (self.max_batchsize_to_capture, self.get_max_block_per_batch()), - dtype=np.int32) - - self.cross_layer_shared_graph_block_tables = np.zeros( - (self.max_batchsize_to_capture, self.get_max_block_per_batch()), - dtype=np.int32) - - # Attention-free but stateful models like Mamba need a placeholder attn - # backend, as the attention metadata is needed to manage internal state. - # However we must bypass attention selection altogether for some models - # used for speculative decoding to avoid a divide-by-zero in - # model_config.get_head_size() - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) - needs_attn_backend = (num_attn_heads != 0 - or self.model_config.is_attention_free) - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) if needs_attn_backend else None - if self.attn_backend: - self.attn_state = self.attn_backend.get_state_cls()( - weakref.proxy(self)) - else: - self.attn_state = CommonAttentionState(weakref.proxy(self)) - - # Multi-modal data support - self.input_registry = input_registry - self.mm_registry = mm_registry - - # Lazy initialization - self.model: nn.Module # Set after load_model - # Set after load_model. - self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - self.sampler = get_sampler() - - set_cpu_offload_max_bytes( - int(self.cache_config.cpu_offload_gb * 1024**3)) - - # Used to cache python objects - self.inter_data_cache: Dict[int, PyObjectCache] = {} - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceGroupToSample object. In Pipeline-Parallel, we have - # more than 1 Scheduler, resulting in a potential back-to-back - # prepare_model_inputs() call. This clobbers the cached - # SequenceGroupToSample objects, as we reset the cache during - # every prepare_model_inputs() call. - self.sampling_metadata_cache: SamplingMetadataCache = \ - SamplingMetadataCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - if hasattr(self, "_builder_cls"): - # multi-step model runner does not have `_builder_cls` - self.builder = self._builder_cls(weakref.proxy(self)) - - def load_model(self) -> None: - logger.info("Starting to load model %s...", self.model_config.model) - with DeviceMemoryProfiler(self.device) as m: - time_before_load = time.perf_counter() - self.model = get_model(vllm_config=self.vllm_config) - if self.lora_config: - assert supports_lora( - self.model - ), f"{self.model.__class__.__name__} does not support LoRA yet." 
- - if supports_multimodal(self.model): - logger.warning( - "Regarding multimodal models, vLLM currently " - "only supports adding LoRA to language model.") - - self.lora_manager = LRUCacheWorkerLoRAManager( - self.vllm_config, - self.device, - self.model.embedding_modules, - self.model.embedding_padding_modules, - ) - - self.model = self.lora_manager.create_lora_manager(self.model) - time_after_load = time.perf_counter() - - self.model_memory_usage = m.consumed_memory - logger.info("Model loading took %.4f GiB and %.6f seconds", - self.model_memory_usage / GiB_bytes, - time_after_load - time_before_load) - - - if self.vllm_config.compilation_config.level ==\ - CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - backend = self.vllm_config.compilation_config.init_backend( - self.vllm_config) - compilation_counter.dynamo_as_is_count += 1 - self.model = torch.compile(self.model, - fullgraph=True, - backend=backend) - - def get_model(self) -> nn.Module: - return self.model - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - from vllm.model_executor.model_loader import ShardedStateLoader - ShardedStateLoader.save_model( - self.model, - path, - pattern=pattern, - max_size=max_size, - ) - - def save_tensorized_model( - self, - tensorizer_config: TensorizerConfig, - ) -> None: - from vllm.model_executor.model_loader import TensorizerLoader - TensorizerLoader.save_model( - self.model, - tensorizer_config=tensorizer_config, - model_config=self.model_config, - ) - - def get_max_block_per_batch(self) -> int: - block_size = self.block_size - return (self.max_seq_len_to_capture + block_size - 1) // block_size - - def _prepare_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None - ) -> TModelInputForGPU: - """Helper method to prepare the model input based on a given sequence - group. Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - - If cuda graph is required, this API automatically pads inputs. 
- """ - self.builder.prepare(finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - try: - self.builder.add_seq_group(seq_group_metadata) - except Exception as e: - # Raise an exception that tracks the ID of the bad request - raise InputProcessingError(seq_group_metadata.request_id, - str(e)) from e - - self.builder.reset_cached_inter_data() - - return self.builder.build() # type: ignore - - @contextmanager - def set_in_profile_run(self): - self.in_profile_run = True - try: - yield - finally: - self.in_profile_run = False - - @torch.inference_mode() - def profile_run(self) -> None: - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - self._dummy_run(max_num_batched_tokens, max_num_seqs) - - def _add_dummy_loras(self, num_loras: int) -> list[LoRARequest]: - assert num_loras > 0 - assert self.lora_manager is not None - - dummy_lora_requests: list[LoRARequest] = [] - with self.lora_manager.dummy_lora_cache(): - for idx in range(num_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - return dummy_lora_requests - - def _remove_dummy_loras(self): - # Remove dummy loras. - assert self.lora_manager is not None - self.remove_all_loras() - - def _dummy_run(self, - max_num_batched_tokens: int, - max_num_seqs: int = 1) -> None: - with self.set_in_profile_run(): - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = \ - SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - - # This represents the maximum number of different requests - # that will have unique loras, and therefore the max amount of - # memory consumption. Create dummy lora request copies from the - # lora request passed in, which contains a lora from the lora - # warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - dummy_lora_requests = self._add_dummy_loras( - self.lora_config.max_loras) - assert len(dummy_lora_requests) == self.lora_config.max_loras - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the - # total number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, - # which needs to be accounted for when calculating the GPU blocks - # for vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. 
" - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data. - multi_modal_placeholders, - ) - seqs.append(seq) - - # Run the model with the dummy inputs. - num_layers = self.model_config.get_num_layers(self.parallel_config) - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - # it is important to create tensors inside the loop, rather than - # multiplying the list, to avoid Dynamo from treating them as - # tensor aliasing. - kv_caches = [ - torch.tensor([], dtype=torch.float32, device=self.device) - for _ in range(num_layers) - ] - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = \ - self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - - # Disable KV Scale Calculation for dummy data during profile run - if model_input.attn_metadata is not None: - model_input.attn_metadata.enable_kv_scales_calculation = False - - self.execute_model(model_input, kv_caches, intermediate_tensors) - torch.cuda.synchronize() - if self.lora_config: - self._remove_dummy_loras() - - return - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - @torch.inference_mode() - def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> int: - """Cuda graph capture a model and return cudagraph memory - consumption in bytes. - - Note that CUDA graph's performance gain is negligible if number - of batched tokens are larger than 200. 
And since CUDA graph - requires fixed sized tensors, supporting large/variable batch - size requires high GPU memory overhead. Thus, vLLM only captures - decoding requests. Mixed batch (chunked prefill + decoding) or - prefill requests are not captured. - - Since it is used for decoding-only, it assumes there's only 1 token - per sequence in the batch. - """ - assert not self.model_config.enforce_eager - logger.info("Capturing cudagraphs for decoding. This may lead to " - "unexpected consequences if the model is not static. To " - "run the model in eager mode, set 'enforce_eager=True' or " - "use '--enforce-eager' in the CLI. " - "If out-of-memory error occurs during cudagraph capture," - " consider decreasing `gpu_memory_utilization` or " - "switching to eager mode. You can also reduce the " - "`max_num_seqs` as needed to decrease memory usage.") - start_time = time.perf_counter() - start_free_gpu_memory = torch.cuda.mem_get_info()[0] - - # Prepare dummy inputs. These will be reused for all batch sizes. - max_batch_size = self.max_batchsize_to_capture - input_tokens = torch.zeros(max_batch_size, - dtype=torch.long, - device=self.device) - input_positions = torch.zeros(max_batch_size, - dtype=torch.long, - device=self.device) - inputs_embeds = torch.zeros( - (max_batch_size, self.model_config.get_hidden_size()), - dtype=self.model_config.dtype, - device=self.device) - if self.model_config.uses_mrope: - input_positions = torch.tile(input_positions, - (3, 1)).cuda(device=self.device) - # Prepare dummy previous_hidden_states only if needed by the model. - # This is used by draft models such as EAGLE. - previous_hidden_states = None - if "previous_hidden_states" in inspect.signature( - self.model.forward).parameters: - previous_hidden_states = torch.empty( - [max_batch_size, - self.model_config.get_hidden_size()], - dtype=self.model_config.dtype, - device=self.device) - - intermediate_inputs = None - if not get_pp_group().is_first_rank: - intermediate_inputs = self.model.make_empty_intermediate_tensors( - batch_size=max_batch_size, - dtype=self.model_config.dtype, - device=self.device) - - dummy_lora_id: Optional[int] = None - dummy_lora_request: LoRARequest = [] - if self.lora_config: - # The goal is to capture the LoRA kernels in cuda graphs. - # for this purpose, as single dummy lora is sufficient. - dummy_lora_requests = self._add_dummy_loras(num_loras=1) - assert len(dummy_lora_requests) == 1 - dummy_lora_request = dummy_lora_requests[0] - dummy_lora_id = dummy_lora_request.lora_int_id - - with self.attn_state.graph_capture(max_batch_size), graph_capture( - self.device) as graph_capture_context: - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. - for virtual_engine in range( - self.parallel_config.pipeline_parallel_size): - # We need to not only iterate over batch sizes, but also whether - # to use inputs_embeds or not, hence we use the cartesian - # product. 
- cudagraph_capture_sizes = self.vllm_config.compilation_config\ - .cudagraph_capture_sizes - cudagraph_inputs_embeds = (( - True, False) if self.model_config.enable_prompt_embeds else - (False, )) - compilation_cases = itertools.product( - cudagraph_capture_sizes, - cudagraph_inputs_embeds, - ) - # Only rank 0 should print progress bar during capture - if get_tensor_model_parallel_rank() == 0: - compilation_cases = tqdm( - list(compilation_cases), - disable=not self.load_config.use_tqdm_on_load, - desc="Capturing CUDA graph shapes") - for batch_size, use_inputs_embeds in compilation_cases: - attn_metadata = ( - self.attn_state.graph_capture_get_metadata_for_batch( - batch_size, - is_encoder_decoder_model=self.model_config. - is_encoder_decoder)) - # Disable KV Scale Calculation for graph capture - attn_metadata.enable_kv_scales_calculation = False - if self.lora_config: - lora_mapping = LoRAMapping( - **dict(index_mapping=[dummy_lora_id] * batch_size, - prompt_mapping=[dummy_lora_id] * batch_size, - is_prefill=False)) - self.set_active_loras(set([dummy_lora_request]), - lora_mapping) - - graph_runner = CUDAGraphRunner( - self.model, self.attn_backend.get_name(), - self.attn_state.graph_clone(batch_size), - self.model_config.is_encoder_decoder) - - capture_inputs = { - "input_ids": - input_tokens[:batch_size], - "inputs_embeds": - inputs_embeds[:batch_size] - if use_inputs_embeds else None, - "positions": - input_positions[..., :batch_size], - "intermediate_inputs": - intermediate_inputs[:batch_size] - if intermediate_inputs is not None else None, - "kv_caches": - kv_caches[virtual_engine], - "attn_metadata": - attn_metadata, - "memory_pool": - self.graph_memory_pool, - "stream": - graph_capture_context.stream - } - if previous_hidden_states is not None: - capture_inputs[ - "previous_hidden_states"] = previous_hidden_states[: - batch_size] - - if self.has_inner_state: - # Only used by Mamba-based models CUDA graph atm (Jamba) - capture_inputs.update({ - "seqlen_agnostic_capture_inputs": - self.model.get_seqlen_agnostic_capture_inputs( - batch_size) - }) - if self.model_config.is_encoder_decoder: - # add the additional inputs to capture for - # encoder-decoder models. - self._update_inputs_to_capture_for_enc_dec_model( - capture_inputs) - - with set_forward_context(attn_metadata, self.vllm_config, - virtual_engine): - graph_runner.capture(**capture_inputs) - self.graph_memory_pool = graph_runner.graph.pool() - self.graph_runners[virtual_engine][( - batch_size, use_inputs_embeds)] = graph_runner - - if self.lora_config: - self._remove_dummy_loras() - - end_time = time.perf_counter() - end_free_gpu_memory = torch.cuda.mem_get_info()[0] - elapsed_time = end_time - start_time - cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory - # This usually takes < 10 seconds. - logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", - elapsed_time, cuda_graph_size / GiB_bytes) - return cuda_graph_size - - def _update_inputs_to_capture_for_enc_dec_model(self, - capture_inputs: Dict[str, - Any]): - """ - Updates the set of input tensors needed for CUDA graph capture in an - encoder-decoder model. - - This method modifies the provided `capture_inputs` dictionary by - adding tensors specific to encoder-decoder specific models that - need to be captured for CUDA Graph replay. - """ - # During the decode phase encoder_input_ids and encoder_positions are - # unset. Do the same thing for graph capture. 
- capture_inputs["encoder_input_ids"] = torch.tensor([], - dtype=torch.long, - device=self.device) - capture_inputs["encoder_positions"] = torch.tensor([], - dtype=torch.long, - device=self.device) - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - -class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): - """ - GPU model runner with sampling step. - """ - _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( - ModelInputForGPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForGPUWithSamplingMetadata: - model_input = \ - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelInputForGPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - - If cuda graph is required, this API automatically pads inputs. - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - if get_pp_group().is_last_rank: - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, model_input.seq_lens, - model_input.query_lens, self.device, self.pin_memory, - generators, self.sampling_metadata_cache) - else: - sampling_metadata = None - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in ModelRunner") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - self.attn_state.begin_forward(model_input) - - # Currently cuda graph is only supported by the decode phase. - assert model_input.attn_metadata is not None - prefill_meta = model_input.attn_metadata.prefill_metadata - decode_meta = model_input.attn_metadata.decode_metadata - # TODO(andoorve): We can remove this once all - # virtual engines share the same kv cache. 
- virtual_engine = model_input.virtual_engine - previous_hidden_states = kwargs.get("previous_hidden_states") - if prefill_meta is None and decode_meta.use_cuda_graph: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - use_inputs_embeds = model_input.inputs_embeds is not None - model_executable = self.graph_runners[virtual_engine][( - graph_batch_size, use_inputs_embeds)] - if previous_hidden_states is not None: - previous_hidden_states = torch.cat([ - previous_hidden_states, - torch.empty([ - graph_batch_size - previous_hidden_states.shape[0], - *previous_hidden_states.shape[1:] - ], - dtype=previous_hidden_states.dtype, - device=previous_hidden_states.device) - ]) - else: - model_executable = self.model - - # Receive KV cache in distributed KV cache transfer setting - # In disagg prefill setting, it will also recv hidden states and bypass - # model forwarding - # In KV cache database setting, it will change the model input so that - # we can skip prefilling on tokens that successfully received KV caches - # NOTE: The receive operation is blocking - bypass_model_exec = False - if self.need_recv_kv(model_input, kv_caches): - hidden_or_intermediate_states, bypass_model_exec, model_input = \ - get_kv_transfer_group().recv_kv_caches_and_hidden_states( - # model is used to know which layer the current worker - # is working on, so that we can receive KV for only those - # layers. - model_executable, - model_input, - kv_caches=kv_caches - ) - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - model_kwargs = {} - if previous_hidden_states is not None: - model_kwargs["previous_hidden_states"] = previous_hidden_states - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start = torch.cuda.Event(enable_timing=True) - model_forward_end = torch.cuda.Event(enable_timing=True) - model_forward_start.record() - - if not bypass_model_exec: - with set_forward_context(model_input.attn_metadata, - self.vllm_config, virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=model_input.inputs_embeds, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **seqlen_agnostic_kwargs, - **model_kwargs, - ) - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.record() - - # Sending KV cache in distributed KV cache transfer setting - # NOTE: the send operation is non-blocking - if self.need_send_kv(model_input, kv_caches): - get_kv_transfer_group().send_kv_caches_and_hidden_states( - # model_executable is used to know which layer the current - # worker is working on, so that we can send KV for only those - # layers. - model_executable, - model_input, - kv_caches, - hidden_or_intermediate_states, - ) - - # Compute the logits in the last pipeline stage. 
- if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors["model_forward_time"] = ( - torch.tensor(model_forward_time + orig_model_forward_time)) - return hidden_or_intermediate_states - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - if self.is_driver_worker: - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - assert isinstance(self.sampler, Sampler) - orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor - if model_input.inputs_embeds is not None: - self.sampler.include_gpu_probs_tensor = True - - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the - # latency from the start time of the driver worker to the end - # time of the driver worker. The model forward time will then - # end up covering the communication time as well. 
- output.model_forward_time = (orig_model_forward_time + - model_forward_time) - - if model_input.inputs_embeds is not None: - if self.is_driver_worker: - sampled_token_ids = [] - valid_outputs = [] - for sequence_group_output in output.outputs: - if len(sequence_group_output.samples) == 0: - continue - assert len(sequence_group_output.samples) == 1 - valid_outputs.append(sequence_group_output) - sampled_token_ids.append( - sequence_group_output.samples[0].output_token) - sampled_token_ids = torch.tensor(sampled_token_ids).to( - self.device) - sampled_token_ids = broadcast_tensor_dict( - {"sampled_token_ids": - sampled_token_ids})["sampled_token_ids"] - else: - sampled_token_ids = broadcast_tensor_dict( - )["sampled_token_ids"] - if len(sampled_token_ids) > 0: - sampled_token_embeds = \ - self.model.get_input_embeddings(sampled_token_ids) - if self.is_driver_worker: - self.sampler.include_gpu_probs_tensor = \ - orig_include_gpu_probs - for i, sequence_group_output in enumerate(valid_outputs): - sequence_group_output.samples[0].output_embed = \ - sampled_token_embeds[i] - - if not self.is_driver_worker: - return [] - - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert model_input.sampling_metadata is not None - indices = model_input.sampling_metadata.selected_token_indices - if model_input.is_prompt: - hidden_states = hidden_or_intermediate_states.index_select( - 0, indices) - output.prefill_hidden_states = hidden_or_intermediate_states - elif decode_meta.use_cuda_graph: - hidden_states = hidden_or_intermediate_states[:len(indices)] - else: - hidden_states = hidden_or_intermediate_states - - output.hidden_states = hidden_states - - return [output] - - def need_recv_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor]) -> bool: - """Check if we need to receive kv-cache from the other worker. - We need to receive KV when - 1. current vLLM instance is KV cache consumer/decode vLLM instance - 2. this batch is not a profiling run - 3. this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - if model_input.attn_metadata is None: - raise ValueError("model_input.attn_metadata cannot be None") - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_consumer and ( - not is_profile_run) and is_prefill_run - - def need_send_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor]) -> bool: - """Check if we need to send kv-cache to the other worker. - We need to send KV when - 1. current vLLM instance is KV cache producer/prefill vLLM instance - 2. this batch is not a profiling run - 3. 
this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - if model_input.attn_metadata is None: - raise ValueError("model_input.attn_metadata cannot be None") - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_producer and ( - not is_profile_run) and is_prefill_run - - -# NOTE: this is nn.Module so the profiler can properly capture/group -# kernels calls made within the graph -class CUDAGraphRunner(nn.Module): - - def __init__(self, model: nn.Module, backend_name: str, - attn_state: AttentionState, is_encoder_decoder_model: bool): - super().__init__() - self.model = model - self.backend_name = backend_name - self.attn_state = attn_state - - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} - - self._graph: Optional[torch.cuda.CUDAGraph] = None - self._is_encoder_decoder_model = is_encoder_decoder_model - - @property - def graph(self): - assert self._graph is not None - return self._graph - - def capture( - self, - input_ids: torch.Tensor, - inputs_embeds: Optional[torch.Tensor], - positions: torch.Tensor, - intermediate_inputs: Optional[IntermediateTensors], - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - memory_pool: Optional[Tuple[int, int]], - stream: torch.cuda.Stream, - **kwargs, - ): - assert self._graph is None - # Run the model a few times without capturing the graph. - # This is to make sure that the captured graph does not include the - # kernel launches for initial benchmarking (e.g., Triton autotune). - # Note one iteration is not enough for torch.compile - for _ in range(_NUM_WARMUP_ITERS): - self.model( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - positions=positions, - intermediate_tensors=intermediate_inputs, - **kwargs, - ) - # Wait for the warm up operations to finish before proceeding with - # Graph Capture. - torch.cuda.synchronize() - # Capture the graph. - self._graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - output_hidden_or_intermediate_states = self.model( - input_ids=input_ids, - **({ - "inputs_embeds": inputs_embeds, - } if inputs_embeds is not None else {}), - positions=positions, - intermediate_tensors=intermediate_inputs, - **kwargs, - ) - - if isinstance(output_hidden_or_intermediate_states, torch.Tensor): - hidden_or_intermediate_states = weak_ref_tensor( - output_hidden_or_intermediate_states) - elif isinstance(output_hidden_or_intermediate_states, - IntermediateTensors): - hidden_or_intermediate_states = IntermediateTensors( - tensors={ - key: weak_ref_tensor(value) - for key, value in - output_hidden_or_intermediate_states.tensors.items() - }) - - del output_hidden_or_intermediate_states - # make sure `output_hidden_or_intermediate_states` is deleted - # in the graph's memory pool - gc.collect() - torch.cuda.synchronize() - - # Save the input and output buffers. 
- self.input_buffers = { - "input_ids": - input_ids, - **({ - "inputs_embeds": inputs_embeds, - } if inputs_embeds is not None else {}), - "positions": - positions, - "kv_caches": - kv_caches, - **self.attn_state.get_graph_input_buffers( - attn_metadata, self._is_encoder_decoder_model), - **kwargs, - } - if intermediate_inputs is not None: - self.input_buffers.update(intermediate_inputs.tensors) - if get_pp_group().is_last_rank: - self.output_buffers = { - "hidden_states": hidden_or_intermediate_states - } - else: - self.output_buffers = hidden_or_intermediate_states - - def forward( - self, - input_ids: torch.Tensor, - inputs_embeds: Optional[torch.Tensor], - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors], - **kwargs, - ) -> torch.Tensor: - attn_metadata: AttentionMetadata = get_forward_context().attn_metadata - - # Copy the input tensors to the input buffers. - self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) - if positions is not None: - # in some case like MLA, it will reuse positions in metadata - # but truncate them to the original size - # so the shape is not padded, we need to copy partial only - self.input_buffers["positions"][:positions.shape[0]].copy_( - positions, non_blocking=True) - if inputs_embeds is not None: - self.input_buffers["inputs_embeds"][:inputs_embeds.shape[0]].copy_( - inputs_embeds, non_blocking=True) - - if self.backend_name != "NO_ATTENTION": - self.input_buffers["slot_mapping"].copy_( - attn_metadata.slot_mapping, non_blocking=True) - - self.attn_state.prepare_graph_input_buffers( - self.input_buffers, attn_metadata, self._is_encoder_decoder_model) - - if "seqlen_agnostic_capture_inputs" in self.input_buffers: - self.model.copy_inputs_before_cuda_graphs(self.input_buffers, - **kwargs) - - if "previous_hidden_states" in self.input_buffers: - self.input_buffers["previous_hidden_states"].copy_( - kwargs["previous_hidden_states"], non_blocking=True) - - if intermediate_tensors is not None: - for key in intermediate_tensors.tensors: - if key != "model_execute_time" and key != "model_forward_time": - self.input_buffers[key].copy_(intermediate_tensors[key], - non_blocking=True) - if self._is_encoder_decoder_model: - self.input_buffers["encoder_input_ids"].copy_( - kwargs['encoder_input_ids'], non_blocking=True) - self.input_buffers["encoder_positions"].copy_( - kwargs['encoder_positions'], non_blocking=True) - - # Run the graph. - self.graph.replay() - # Return the output tensor. 
- if get_pp_group().is_last_rank: - return self.output_buffers["hidden_states"] - - return self.output_buffers diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py deleted file mode 100644 index 12047bc390737..0000000000000 --- a/vllm/worker/worker.py +++ /dev/null @@ -1,666 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A GPU worker class.""" -import gc -import os -from contextlib import nullcontext -from typing import Dict, List, Optional, Set, Tuple, Type, Union - -import torch -import torch.distributed - -import vllm.envs as envs -from vllm.attention.layer import Attention -from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.device_allocator.cumem import CuMemAllocator -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment, - set_custom_all_reduce) -from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.platforms import current_platform -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, - memory_profiling) -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class Worker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single GPU. The worker is responsible for - maintaining the KV cache and executing the model on the GPU. In case of - distributed inference, each worker is assigned a partition of the model. 
- """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, - ) -> None: - WorkerBase.__init__(self, vllm_config) - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_config = self.speculative_config - model_config = self.model_config - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.hf_config.model_type == - model_config.hf_config.model_type) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ("medusa", - "mlp_speculator", - "eagle", - "deepseek_mtp", - "glm4_moe_mtp", - "mimo_mtp", - "ernie_mtp", - "qwen3_next_mtp")) \ - else {"return_hidden_states": True} - - self.model_runner: GPUModelRunnerBase = ModelRunner( - vllm_config=self.vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - **speculative_args, - ) - if model_runner_cls is not None: - self.model_runner = model_runner_cls(self.model_runner) - - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - self.gpu_cache: Optional[List[List[torch.Tensor]]] = None - self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} - - # Buffers saved before sleep - self._sleep_saved_buffers: Dict[str, torch.Tensor] = {} - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) - else: - self.profiler = None - - def start_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.start() - - def stop_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.stop() - # only print profiler results on rank 0 - if self.local_rank == 0: - print(self.profiler.key_averages().table( - sort_by="self_cuda_time_total")) - - def sleep(self, level: int = 1) -> None: - free_bytes_before_sleep = torch.cuda.mem_get_info()[0] - - # Save the buffers before level 2 sleep - if level == 2: - model = self.model_runner.model - self._sleep_saved_buffers = { - name: buffer.cpu().clone() - for name, buffer in model.named_buffers() - } - - allocator = CuMemAllocator.get_instance() - allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) - free_bytes_after_sleep, total = torch.cuda.mem_get_info() - freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep - used_bytes = total - free_bytes_after_sleep - assert freed_bytes >= 0, "Memory usage increased after sleeping." 
- logger.info( - "Sleep mode freed %.2f GiB memory, " - "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, - used_bytes / GiB_bytes) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - allocator = CuMemAllocator.get_instance() - allocator.wake_up(tags=tags) - - # Restore the buffers after level 2 sleep - if len(self._sleep_saved_buffers): - model = self.model_runner.model - for name, buffer in model.named_buffers(): - if name in self._sleep_saved_buffers: - buffer.data.copy_(self._sleep_saved_buffers[name].data) - self._sleep_saved_buffers = {} - - def init_device(self) -> None: - if self.device_config.device.type == "cuda": - # torch.distributed.all_reduce does not free the input tensor until - # the synchronization point. This causes the memory usage to grow - # as the number of all_reduce calls increases. This env var disables - # this behavior. - # Related issue: - # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - - # This env var set by Ray causes exceptions with graph building. - os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.device = torch.device(f"cuda:{self.local_rank}") - torch.cuda.set_device(self.device) - - _check_if_gpu_supports_dtype(self.model_config.dtype) - gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - self.baseline_snapshot = MemorySnapshot() - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - init_worker_distributed_environment(self.vllm_config, self.rank, - self.distributed_init_method, - self.local_rank) - # Set random seed. - set_random_seed(self.model_config.seed) - - def load_model(self): - if self.vllm_config.model_config.enable_sleep_mode: - allocator = CuMemAllocator.get_instance() - assert allocator.get_current_usage() == 0, ( - "Sleep mode can only be " - "used for one instance per process.") - context = allocator.use_memory_pool(tag="weights") - else: - context = nullcontext() - with context: - self.model_runner.load_model() - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - self.model_runner.save_sharded_state( - path, - pattern=pattern, - max_size=max_size, - ) - - def save_tensorized_model( - self, - tensorizer_config: TensorizerConfig, - ) -> None: - self.model_runner.save_tensorized_model( - tensorizer_config=tensorizer_config, ) - - @torch.inference_mode() - def determine_available_kv_cache_memory(self, - total_gpu_memory: int) -> float: - if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes: - # still need a profile run which compiles the model for - # max_num_batched_tokens - self.model_runner.profile_run() - - GiB = lambda b: b / GiB_bytes - msg = ( - f"Initial free memory " - f"{GiB(self.baseline_snapshot.free_memory):.2f} " - f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for " - "KV Cache as specified by kv_cache_memory_bytes config and " - "skipped memory profiling. This does does not respect the " - "gpu_memory_utilization config. Only use kv_cache_memory_bytes " - "config when you want manual control of KV cache memory " - "size. 
If OOM'ed, check the difference of initial free " - "memory between the current run and the previous run " - "where kv_cache_memory_bytes is suggested and update it " - "correspondingly.") - logger.info(msg) - return self.cache_config.kv_cache_memory_bytes - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - with memory_profiling( - self.baseline_snapshot, - weights_memory=self.model_runner.model_memory_usage) as result: - self.model_runner.profile_run() - - self.non_torch_memory = result.non_torch_increase - self.peak_activation_memory = result.torch_peak_increase - - self._assert_memory_footprint_increased_during_profiling() - - self.requested_memory = total_gpu_memory * \ - self.cache_config.gpu_memory_utilization - - self.available_kv_cache_memory = (self.requested_memory - - result.non_kv_cache_memory) - - msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n" - "the current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(self.requested_memory / GiB_bytes):.2f}GiB\n" - "model weights take " - f"{(result.weights_memory / GiB_bytes):.2f}GiB;" - " non_torch_memory takes " - f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" - " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(self.available_kv_cache_memory / GiB_bytes):.2f}GiB.") - - logger.info(msg) - return self.available_kv_cache_memory - - @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculates the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - - Tip: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() - available_kv_cache_memory = self.determine_available_kv_cache_memory( - total_gpu_memory) - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - cache_block_size = self.get_cache_block_size_bytes() - if cache_block_size == 0: - num_gpu_blocks = 0 - num_cpu_blocks = 0 - else: - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - - # Final cleanup - gc.collect() - - return num_gpu_blocks, num_cpu_blocks - - def _assert_memory_footprint_increased_during_profiling(self): - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - free_gpu_memory, total = torch.cuda.mem_get_info() - cuda_memory = total - free_gpu_memory - assert self.baseline_snapshot.cuda_memory < cuda_memory, ( - "Error in memory profiling. " - f"Initial used memory {self.baseline_snapshot.cuda_memory}, " - f"currently used memory {cuda_memory}. 
" - f"This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks. - - This also warms up the model, which may record CUDA graphs. - """ - raise_if_cache_size_invalid( - num_gpu_blocks, self.cache_config.block_size, - self.cache_config.is_attention_free, - self.model_config.max_model_len, - self.parallel_config.pipeline_parallel_size) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - if self.vllm_config.model_config.enable_sleep_mode: - allocator = CuMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") - else: - context = nullcontext() - with context: - self._init_cache_engine() - self._warm_up_model() - - def _init_cache_engine(self): - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.gpu_cache = [ - self.cache_engine[ve].gpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - - # Layer pairings for cross-layer KV sharing. - # If an Attention layer `layer_name` is in the keys of this dict, it - # means this layer will perform attention using the keys and values - # from the KV cache of `shared_kv_cache_layers[layer_name]`. - shared_kv_cache_layers: dict[str, str] = {} - - attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) - - for layer_name, attn_module in attn_layers.items(): - if (kv_tgt_layer := - attn_module.kv_sharing_target_layer_name) is not None: - # The layer doesn't need its own KV cache and will use that of - # the target layer. We skip creating a KVCacheSpec for it, so - # that KV cache management logic will act as this layer does - # not exist, and doesn't allocate KV cache for the layer. This - # enables the memory saving of cross-layer kv sharing, allowing - # a given amount of memory to accommodate longer context lengths - # or enable more requests to be processed simultaneously. - shared_kv_cache_layers[layer_name] = kv_tgt_layer - - bind_kv_cache(self.compilation_config.static_forward_context, - self.gpu_cache, shared_kv_cache_layers) - - def _warm_up_model(self) -> None: - # warm up sizes that are not in cudagraph capture sizes, - # but users still want to compile for better performance, - # e.g. for the max-num-batched token size in chunked prefill. - warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() - if not self.model_config.enforce_eager: - warmup_sizes = [ - x for x in warmup_sizes if x not in - self.vllm_config.compilation_config.cudagraph_capture_sizes - ] - for size in sorted(warmup_sizes, reverse=True): - logger.info("Compile and warming up model for size %d", size) - self.model_runner._dummy_run(size) - - cuda_graph_memory_bytes = 0 - if not self.model_config.enforce_eager: - cuda_graph_memory_bytes = self.model_runner.capture_model( - self.gpu_cache) - - if (self.cache_config.kv_cache_memory_bytes is None - and hasattr(self, "peak_activation_memory")): - # Suggests optimal kv cache memory size if we rely on - # memory_profiling to guess the kv cache memory size which - # provides peak_activation_memory and a few other memory - # consumption. 
`memory_profiling` does not consider - # CUDAGraph memory size and may not utilize all gpu memory. - # Users may want fine-grained control to specify kv cache - # memory size. - GiB = lambda b: round(b / GiB_bytes, 2) - non_kv_cache_memory = (self.model_runner.model_memory_usage + - self.peak_activation_memory + - self.non_torch_memory + - cuda_graph_memory_bytes) - - # empirically observed that the memory profiling may - # slightly underestimate the memory consumption. - # So leave a small buffer (=150MiB) to avoid OOM. - redundancy_buffer_memory = 150 * (1 << 20) - kv_cache_memory_bytes_to_gpu_limit = ( - self.baseline_snapshot.free_memory - non_kv_cache_memory - - redundancy_buffer_memory) - kv_cache_memory_bytes_to_requested_limit = ( - int(self.requested_memory) - non_kv_cache_memory - - redundancy_buffer_memory) - - msg = ( - f"Free memory on device " - f"({GiB(self.baseline_snapshot.free_memory)}/" - f"{GiB(self.baseline_snapshot.total_memory)} GiB) on startup. " - f"Desired GPU memory utilization is " - f"({self.cache_config.gpu_memory_utilization}, " - f"{GiB(self.requested_memory)} GiB). " - f"Actual usage is {GiB(self.model_runner.model_memory_usage)} " - f"GiB for weight, {GiB(self.peak_activation_memory)} GiB " - f"for peak activation, {GiB(self.non_torch_memory)} GiB " - f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} " - f"GiB for CUDAGraph memory. Replace gpu_memory_utilization " - f"config with `--kv-cache-memory=" - f"{kv_cache_memory_bytes_to_requested_limit}` to fit into " - f"requested memory, or `--kv-cache-memory=" - f"{kv_cache_memory_bytes_to_gpu_limit}` to fully " - f"utilize gpu memory. Current kv cache memory in use is " - f"{int(self.available_kv_cache_memory)} bytes.") - logger.info(msg) - - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - set_random_seed(self.model_config.seed) - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.gpu_cache - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_steps = execute_model_req.num_steps - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, - dtype=torch.int64).view(-1, 2) - - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - num_steps=num_steps, - ) - - @torch.inference_mode() - def execute_worker(self, worker_input: WorkerInput) -> None: - virtual_engine = worker_input.virtual_engine - # Issue cache operations. 
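# Illustrative sketch (not from the vLLM source): prepare_worker_input above
# packs the scheduler's block mappings into N x 2 int64 tensors of
# (source_block, destination_block) pairs. A minimal reproduction of that
# shaping, with made-up block numbers:
import torch

blocks_to_swap_in = [(0, 7), (3, 9)]  # CPU -> GPU block pairs (example values)
swap_in = torch.tensor(blocks_to_swap_in, device="cpu",
                       dtype=torch.int64).view(-1, 2)
assert swap_in.shape == (2, 2)
src_blocks, dst_blocks = swap_in[:, 0], swap_in[:, 1]
# An empty mapping still yields a well-formed (0, 2) tensor:
empty = torch.tensor([], device="cpu", dtype=torch.int64).view(-1, 2)
assert empty.shape == (0, 2)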
- if (worker_input.blocks_to_swap_in is not None - and worker_input.blocks_to_swap_in.numel() > 0): - self.cache_engine[virtual_engine].swap_in( - worker_input.blocks_to_swap_in) - if (worker_input.blocks_to_swap_out is not None - and worker_input.blocks_to_swap_out.numel() > 0): - self.cache_engine[virtual_engine].swap_out( - worker_input.blocks_to_swap_out) - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - - def _get_cached_seq_group_metadata( - self, - seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]], - finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: - """Return a list of cached Sequence Group Metadata after updating its - state. - - It is used because scheduler only sends delta to workers to reduce - the data payload size. The function also cleans up cache based on - a given `finished_request_ids`. - """ - new_seq_group_metadata_list = [] - for metadata_or_delta in seq_group_metadata_list: - request_id = metadata_or_delta.request_id - if request_id not in self._seq_group_metadata_cache: - # The first prefill. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[request_id] = metadata_or_delta - else: - # The first prefill is already cached. - if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): - self._seq_group_metadata_cache[request_id].apply_delta( - metadata_or_delta) - else: - # If metadata snapshot is sent again, it is - # preempted. Reset the cache because we need to start - # from scratch. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[ - request_id] = metadata_or_delta - - new_seq_group_metadata_list.append( - self._seq_group_metadata_cache[request_id]) - - # Clean up finished ids - for finished_id in finished_request_ids: - del self._seq_group_metadata_cache[finished_id] - - return new_seq_group_metadata_list - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[List[SamplerOutput]]: - if execute_model_req is not None: - new_seq_group_metadata_list = self._get_cached_seq_group_metadata( - execute_model_req.seq_group_metadata_list, - execute_model_req.finished_requests_ids) - - execute_model_req.seq_group_metadata_list = ( - new_seq_group_metadata_list) - output = super()._execute_model_spmd(execute_model_req, - intermediate_tensors) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. 
- """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - -def init_worker_distributed_environment( - vllm_config: VllmConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, -) -> None: - """Initialize the distributed environment.""" - parallel_config = vllm_config.parallel_config - set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) - - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, - current_platform.dist_backend) - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.decode_context_parallel_size) - - ensure_kv_transfer_initialized(vllm_config) - - -def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): - # Check if the GPU supports the dtype. - if torch_dtype == torch.bfloat16: # noqa: SIM102 - if not current_platform.has_device_capability(80): - capability = current_platform.get_device_capability() - gpu_name = current_platform.get_device_name() - - if capability is None: - compute_str = "does not have a compute capability" - else: - version_str = capability.as_version_str() - compute_str = f"has compute capability {version_str}" - - raise ValueError( - "Bfloat16 is only supported on GPUs with compute capability " - f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the " - "`dtype` flag in CLI, for example: --dtype=half.") - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, - max_model_len, pipeline_parallel_size) -> None: - if is_attention_free and num_gpu_blocks != 0: - raise ValueError("No memory should be allocated for the cache blocks " - f"for an attention-free model, but {num_gpu_blocks} " - "blocks are allocated.") - if not is_attention_free and num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size) - if not is_attention_free and max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") From 62b38dc8322f9f31fff7c680028b0b88306e1c39 Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Sat, 20 Sep 2025 20:29:12 -0700 Subject: [PATCH 067/185] [Doc] improve test-pipeline.yaml documentation (#25305) Signed-off-by: Huamin Li <3ericli@gmail.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9d38e571324b4..fe4796b35786c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -6,24 +6,28 @@ # to generate the final pipeline yaml file. # Documentation -# label(str): the name of the test. emoji allowed. -# fast_check(bool): whether to run this on each commit on fastcheck pipeline. -# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline. 
-# fast_check_only(bool): run this test on fastcheck pipeline only -# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. +# label(str): the name of the test. emojis allowed. +# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. +# fast_check_only(bool): run this test on the fastcheck pipeline only +# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. +# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests). # command(str): the single command to run for tests. incompatible with commands. -# commands(list): the list of commands to run for test. incompatbile with command. -# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] -# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100 -# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4. -# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, -# in this case, commands must be specified. the first command runs on first host, the second +# commands(list): the list of commands to run for the test. incompatible with command. +# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] +# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 +# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. +# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, +# in this case, commands must be specified. the first command runs on the first host, the second # command runs on the second host. -# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests -# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run. +# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. +# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB +# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. +# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests +# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. # When adding a test -# - If the test belong to an existing group, add it there +# - If the test belongs to an existing group, add it there # - If the test is short, add to any existing step # - If the test takes more than 10min, then it is okay to create a new step. # Note that all steps execute in parallel. 
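The `parallelism` field documented above leans on Buildkite's built-in job sharding: the step is launched `parallelism` times and each run sees its own job index and the total job count. As a rough illustration only (not taken from vLLM's actual test scripts), a runner could split a test list across those shards as follows; the round-robin split and the file names are assumptions made for this sketch.

```python
import os

# Set by Buildkite for steps that declare `parallelism`.
job_index = int(os.environ.get("BUILDKITE_PARALLEL_JOB", "0"))
job_count = int(os.environ.get("BUILDKITE_PARALLEL_JOB_COUNT", "1"))

all_tests = ["test_a.py", "test_b.py", "test_c.py", "test_d.py"]  # placeholders

# Round-robin split: shard i runs every job_count-th test starting at index i.
my_shard = [t for i, t in enumerate(all_tests) if i % job_count == job_index]
print(f"shard {job_index}/{job_count} runs: {my_shard}")
```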
From 1cd885bd540c3765e4c2d378b3167102dfa26ab5 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 20:49:09 -0700 Subject: [PATCH 068/185] [V0 Deprecation] Remove V0 model runner base & simplify worker base (#25328) Signed-off-by: Woosuk Kwon --- vllm/attention/backends/abstract.py | 15 +- vllm/attention/backends/utils.py | 8 +- vllm/worker/model_runner_base.py | 307 ---------------------- vllm/worker/worker_base.py | 387 +--------------------------- 4 files changed, 11 insertions(+), 706 deletions(-) delete mode 100644 vllm/worker/model_runner_base.py diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index dfde67e1713c5..ab7ef2112b083 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -4,19 +4,14 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, - Protocol, Set, Tuple, Type, TypeVar) +from typing import (Any, Dict, Generic, List, Optional, Protocol, Set, Tuple, + Type, TypeVar) import torch from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey from vllm.multimodal import MultiModalPlaceholderMap -if TYPE_CHECKING: - from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerInputBuilderBase) - class AttentionType: """ @@ -170,7 +165,7 @@ class AttentionState(ABC, Generic[T]): lifetime of the model runner.""" @abstractmethod - def __init__(self, runner: "ModelRunnerBase"): + def __init__(self, runner: Any): ... @abstractmethod @@ -210,7 +205,7 @@ class AttentionState(ABC, Generic[T]): ... @abstractmethod - def begin_forward(self, model_input: "ModelRunnerInputBase") -> None: + def begin_forward(self, model_input) -> None: """Prepare state for forward pass.""" ... 
@@ -219,7 +214,7 @@ class AttentionMetadataBuilder(ABC, Generic[T]): """Abstract class for attention metadata builders.""" @abstractmethod - def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None: + def __init__(self, input_builder) -> None: """Create the builder, remember some configuration and parameters.""" raise NotImplementedError diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 289cfa2177438..b28e6a4237cb6 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -5,8 +5,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, - TypeVar, Union) +from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union import numpy as np import torch @@ -21,9 +20,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad logger = init_logger(__name__) -if TYPE_CHECKING: - from vllm.worker.model_runner_base import ModelRunnerBase - # Error string(s) for encoder/decoder # unsupported attention scenarios STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported " @@ -286,7 +282,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): class CommonAttentionState(AttentionState): - def __init__(self, runner: "ModelRunnerBase"): + def __init__(self, runner): self.runner = runner self._is_graph_capturing = False diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py deleted file mode 100644 index 1008b743619a4..0000000000000 --- a/vllm/worker/model_runner_base.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from abc import ABC, abstractmethod -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar) - -import torch -import torch.nn as nn - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.models.interfaces import supports_transcription -from vllm.model_executor.models.interfaces_base import is_text_generation_model -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.tasks import GenerationTask, SupportedTask - -if TYPE_CHECKING: - from vllm.attention import AttentionMetadata - from vllm.attention.backends.abstract import AttentionBackend - from vllm.model_executor import SamplingMetadata - -logger = init_logger(__name__) - -T = TypeVar('T', bound="BroadcastableModelInput") - - -def _add_attn_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], - attn_metadata: Optional["AttentionMetadata"]) -> None: - """ - Helper method to update tensor_dict with broadcastable - AttentionMetadata fields. - """ - if attn_metadata is not None: - tensor_dict.update(attn_metadata.asdict_zerocopy()) - - -def _init_attn_metadata_from_tensor_dict( - attn_backend: "AttentionBackend", - tensor_dict: Dict[str, Any], -) -> Dict[str, Any]: - """ - Helper method to initialize AttentionMetadata based on an - AttentionBackend and broadcastable AttentionMetadata fields. - """ - # Extract the fields used to create AttentionMetadata. 
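# Illustrative sketch (not from the vLLM source): the helper above rebuilds an
# AttentionMetadata object from a flat broadcast dict by walking
# dataclasses.fields(). The same pattern with a toy dataclass; Meta is a
# stand-in invented for this example, not a vLLM type.
import dataclasses

@dataclasses.dataclass
class Meta:
    num_prefills: int
    num_decode_tokens: int

def init_from_tensor_dict(tensor_dict: dict) -> dict:
    kwargs = {}
    for field in dataclasses.fields(Meta):
        if field.name in tensor_dict:
            kwargs[field.name] = tensor_dict.pop(field.name)
    tensor_dict["attn_metadata"] = Meta(**kwargs)
    return tensor_dict

d = init_from_tensor_dict({"num_prefills": 2, "num_decode_tokens": 8, "other": 1})
# Consumed keys are popped; only the untouched key and the rebuilt object remain.
assert set(d) == {"other", "attn_metadata"}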
- valid_attn_kwargs = {} - for field in dataclasses.fields(attn_backend.get_metadata_cls()): - if field.name in tensor_dict: - if field.name == "input_positions": - valid_attn_kwargs[field.name] = tensor_dict[field.name] - else: - valid_attn_kwargs[field.name] = tensor_dict.pop(field.name) - - attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) - tensor_dict["attn_metadata"] = attn_metadata - return tensor_dict - - -def _init_sampling_metadata_from_tensor_dict( # type: ignore - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Helper method to initialize SamplingMetadata based on broadcastable - SamplingMetadata fields. - """ - from vllm.model_executor import SamplingMetadata - - selected_token_indices = tensor_dict.pop("selected_token_indices", None) - # An empty SamplingMetadata to signal that the worker should skip - # sampling. - if selected_token_indices is not None: - tensor_dict["sampling_metadata"] = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, - ) - return tensor_dict - - -def _add_sampling_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], - sampling_metadata: Optional["SamplingMetadata"]) -> None: - """ - Helper method to update tensor_dict with broadcastable - SamplingMetadata fields. - """ - if sampling_metadata is not None: - tensor_dict["selected_token_indices"] = ( - sampling_metadata.selected_token_indices) - - -def _init_frozen_model_input_from_tensor_dict( - frozen_model_input_cls: Type["ModelRunnerInputBase"], - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Helper method to initialize a frozen ModelInput based on broadcastable - """ - valid_tensor_kwargs = {} - for field in dataclasses.fields(frozen_model_input_cls): - val = tensor_dict.pop(field.name, None) - if val is not None: - valid_tensor_kwargs[field.name] = val - - frozen_model_input = frozen_model_input_cls(**valid_tensor_kwargs) - tensor_dict["frozen_model_input"] = frozen_model_input - return tensor_dict - - -class BroadcastableModelInput(ABC): - - @abstractmethod - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - """ - Extract broadcastable fields. Override for fields that require some - custom deserialization. - """ - raise NotImplementedError - - @classmethod - @abstractmethod - def from_broadcasted_tensor_dict( - cls: Type[T], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> T: - """ - Pop fields from the given tensor_dict and populate a new instance of - BroadcastableModelInput. - """ - raise NotImplementedError - - -@dataclasses.dataclass(frozen=True) -class ModelRunnerInputBase(BroadcastableModelInput): - """Local inputs to each worker's model runner. May contain - device-specific data. Different worker backends may have different methods - of converting from the global ExecuteModelRequest produced by the LLM - engine to the worker-local ModelRunnerInputBase objects. - - Model runners that support multi-GPU execution should define a - ModelRunnerInputBase subclass, add their required fields, and specify how to - serialize/deserialize a ModelInput for broadcast between workers. - """ - pass - - -class ModelRunnerInputBuilderBase(ABC, Generic[T]): - """A builder to create ModelRunnerInputBase objects. 
- """ - - @abstractmethod - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - raise NotImplementedError - - @abstractmethod - def add_seq_group(self, seq_group_metadata): - """TBA""" - raise NotImplementedError - - @abstractmethod - def build(self, *args, **kwargs) -> T: - """Build metadata with on-device tensors.""" - raise NotImplementedError - - -class ModelRunnerBase(ABC, Generic[T]): - """ - Model runner interface that abstracts a particular hardware and/or type of - model. Model execution may communicate data with model runners in other - processes, but it should not include control plane metadata communication. - - Each ModelRunnerBase subclass should define a corresponding - ModelRunnerInputBase subclass. - """ - - def __init__( - self, - vllm_config: VllmConfig, - ) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.observability_config = vllm_config.observability_config - - # Map of request_id -> generator used for seeded random sampling - generators: Dict[str, torch.Generator] = {} - - @abstractmethod - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> T: - """ - Make an instance of a ModelRunnerInputBase from the broadcasted tensor - dict. - """ - raise NotImplementedError - - @abstractmethod - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> T: - """ - Prepare the inputs to ModelRunnerBase.execute_model from an execution - request. This method may move data to the worker's local device. It is - not allowed to communicate with other workers or devices. - """ - raise NotImplementedError - - @abstractmethod - def get_model(self) -> nn.Module: - raise NotImplementedError - - def get_supported_generation_tasks(self) -> list[GenerationTask]: - model = self.get_model() - supported_tasks = list[GenerationTask]() - - if is_text_generation_model(model): - supported_tasks.append("generate") - - if supports_transcription(model): - if model.supports_transcription_only: - return ["transcription"] - - supported_tasks.append("transcription") - - return supported_tasks - - def get_supported_tasks(self) -> tuple[SupportedTask, ...]: - tasks = list[SupportedTask]() - - if self.model_config.runner_type == "generate": - tasks.extend(self.get_supported_generation_tasks()) - - return tuple(tasks) - - def execute_model( - self, - model_input: T, - kv_caches: Optional[List[torch.Tensor]], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[List[SamplerOutput]]: - """ - Execute the model on the given input. - """ - raise NotImplementedError - - def get_generators(self, finished_request_ids: Optional[List[str]] = None): - """ - Return dict of per-request generators used for random sampling. 
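# Illustrative sketch (not from the vLLM source): the `generators` map above
# keeps one seeded torch.Generator per request id so that seeded sampling is
# reproducible, and drops entries for finished requests. A minimal standalone
# version; the request id and seed are made up for this example.
import torch

generators: dict[str, torch.Generator] = {}
g = generators.setdefault("req-0", torch.Generator().manual_seed(1234))
token = torch.randint(0, 50_000, (1,), generator=g)  # seeded draw
generators.pop("req-0", None)  # dropped once the request finishes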
- """ - - # Clean up generators from completed requests - if finished_request_ids: - for request_id in finished_request_ids: - self.generators.pop(request_id, None) - - return self.generators - - -class ModelRunnerWrapperBase: - """ - The whole point of this class is to lazily initialize the model_runner. - """ - - def __init__( - self, - model_runner: ModelRunnerBase, - ) -> None: - self.model_runner: ModelRunnerBase = model_runner - - def __getattr__(self, attr): - return getattr(self.model_runner, attr) - - -class InputProcessingError(Exception): - """This exception is raised when an error occurs preparing the inputs for - a single sequence group. - This allows the engine to gracefully handle errors with a single sequence - group without having to fail the entire batch. - """ - - def __init__(self, request_id, message): - """request_id is the id of the offending sequence group""" - self.request_id = request_id - self.message = message - super().__init__(self.message) - - def __str__(self): - return "Failed to prepare inputs for sequence group with request id: " \ - f"{self.request_id}, Error: {self.message}" diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index d0a56f6ff4637..eaab976bf7f75 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,31 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses import os -import time -from abc import abstractmethod -from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, Type, - TypeVar, Union) +from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar, + Union) import cloudpickle -import torch import torch.nn as nn -from vllm.config import (ObservabilityConfig, VllmConfig, - set_current_vllm_config) -from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group +from vllm.config import VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, IntermediateTensors +from vllm.sequence import ExecuteModelRequest from vllm.utils import (enable_trace_function_call_for_thread, resolve_obj_by_qualname, run_method, update_environment_variables, warn_for_unimplemented_methods) -from vllm.worker.model_runner_base import (BroadcastableModelInput, - ModelRunnerBase, - ModelRunnerInputBase) logger = init_logger(__name__) @@ -141,356 +132,6 @@ class WorkerBase: return -class DelegateWorkerBase(WorkerBase): - """ - A class that delegates all methods to another WorkerBase instance. This is - useful for creating a WorkerBase that wraps another WorkerBase instance, - e.g. speculative decoding. 
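# Illustrative sketch (not from the vLLM source): ModelRunnerWrapperBase above,
# and DelegateWorkerBase below, forward anything they do not define themselves
# to a wrapped object via __getattr__. The toy classes here exist only for
# this example.
class Inner:
    value = 42

    def load(self) -> str:
        return "loaded"

class Wrapper:
    def __init__(self, inner: Inner) -> None:
        self.inner = inner

    def load(self) -> str:
        return "wrapped " + self.inner.load()  # explicit override wins

    def __getattr__(self, attr):
        # Only reached when normal attribute lookup fails.
        return getattr(self.inner, attr)

w = Wrapper(Inner())
assert w.load() == "wrapped loaded"
assert w.value == 42  # delegated to the wrapped object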
- """ - worker: WorkerBase - - def __init__( - self, - *args, - **kwargs, - ) -> None: - vllm_config: VllmConfig = kwargs.get("vllm_config") - cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls) - self.worker = cls(*args, **kwargs) - - def init_device(self) -> None: - self.worker.init_device() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - return self.worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def load_model(self) -> None: - """Load model onto target device.""" - self.worker.load_model() - - def get_model(self) -> nn.Module: - return self.worker.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: - return self.worker.execute_model(execute_model_req) - - def get_cache_block_size_bytes(self) -> int: - return self.worker.get_cache_block_size_bytes() - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.worker.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.worker.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.worker.list_loras() - - def __getattr__(self, attr): - return getattr(self.worker, attr) - - -class LoRANotSupportedWorkerBase(WorkerBase): - """Partial implementation of WorkerBase that raises exceptions when LoRA - methods are invoked. - """ - - def add_lora(self, lora_request: LoRARequest) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def remove_lora(self, lora_id: int) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def pin_lora(self, lora_id: int) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def list_loras(self) -> Set[int]: - raise ValueError(f"{type(self)} does not support LoRA") - - -@dataclasses.dataclass(frozen=True) -class WorkerInput: - """Local inputs to each worker. May contain device-specific data. These - fields should be broadcastable to other workers. - """ - - num_seq_groups: Optional[int] = None - blocks_to_swap_in: Optional[torch.Tensor] = None - blocks_to_swap_out: Optional[torch.Tensor] = None - blocks_to_copy: Optional[torch.Tensor] = None - virtual_engine: int = 0 - num_steps: int = 1 - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type["WorkerInput"], - tensor_dict: Dict[str, Any], - ) -> "WorkerInput": - """ - Pop fields from the given tensor_dict and populate a new instance of - WorkerInput. - """ - return cls( - num_seq_groups=tensor_dict.pop("num_seq_groups"), - blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"), - blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"), - blocks_to_copy=tensor_dict.pop("blocks_to_copy"), - virtual_engine=tensor_dict["virtual_engine"], - num_steps=tensor_dict.pop("num_steps"), - ) - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - """ - Extract broadcastable fields. 
- """ - tensor_dict = { - "num_seq_groups": self.num_seq_groups, - "blocks_to_swap_in": self.blocks_to_swap_in, - "blocks_to_swap_out": self.blocks_to_swap_out, - "blocks_to_copy": self.blocks_to_copy, - "virtual_engine": self.virtual_engine, - "num_steps": self.num_steps, - } - - return tensor_dict - - -class LocalOrDistributedWorkerBase(WorkerBase): - """ - Partial implementation of WorkerBase that has a default `execute_model` - definition to perform metadata transfer between workers when in distributed - mode. Subclasses of this interface should use model runners that inherit - from ModelRunnerBase, and should only need to implement worker-local logic. - If custom control plane logic is needed to transfer metadata, or if the - model runner cannot inherit from ModelRunnerBase, use WorkerBase instead. - """ - is_driver_worker: bool - model_runner: ModelRunnerBase - observability_config: Optional[ObservabilityConfig] = None - - @property - @abstractmethod - def do_metadata_broadcast(self) -> bool: - """ - Used by the default `execute_model` to check whether broadcast is - needed to transfer request inputs from the driver worker to other - workers in the TP group. If WorkerBase subclass only supports - single-worker execution, then this method should return False. - """ - raise NotImplementedError - - @property - @abstractmethod - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - """ - Gets the list of kv caches to pass to the worker's model runner. Each - element in the list is a kv cache corresponding to a particular virtual - engine (PP stream). Used by the default `execute_model`. If the worker's - model runner does not follow the ModelRunnerBase interface, then inherit - from WorkerBase instead. - """ - raise NotImplementedError - - @abstractmethod - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - """ - Prepare the inputs to WorkerBase.execute_worker from an execution - request. This method may move data to the worker's local device. It is - not allowed to communicate with other workers or devices. - """ - raise NotImplementedError - - @abstractmethod - def execute_worker(self, worker_input: WorkerInput) -> None: - """ - Process an execution request. - """ - raise NotImplementedError - - def _get_worker_input_from_broadcast( - self - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ Get the worker input from the broadcasted tensor dict. """ - assert self.do_metadata_broadcast - assert not self.is_driver_worker - broadcast_data = broadcast_tensor_dict(src=0) - if not broadcast_data: - return None - - worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data) - model_input = ( - self.model_runner.make_model_input_from_broadcasted_tensor_dict( - broadcast_data)) - - kwargs = extract_previous_hidden_states(broadcast_data) - - return model_input, worker_input, kwargs - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ Get the driver input and broadcast it to other workers. 
""" - assert self.is_driver_worker - - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - kwargs = extract_previous_hidden_states(execute_model_req) - - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_data.update(kwargs) - broadcast_tensor_dict(broadcast_data, src=0) - - if execute_model_req.async_callback: - model_input = dataclasses.replace( # type: ignore - model_input, - async_callback=execute_model_req.async_callback) - - return model_input, worker_input, kwargs - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ - Prepare the inputs to ModelRunner and workers. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. - broadcast_tensor_dict({}, src=0) - return None - return self._get_driver_input_and_broadcast(execute_model_req) - else: - return self._get_worker_input_from_broadcast() - - def get_model(self) -> nn.Module: - return self.model_runner.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[List[SamplerOutput]]: - """Executes at least one model step on the given sequences, unless no - sequences are provided.""" - start_time = time.perf_counter() - - inputs = self.prepare_input(execute_model_req) - if inputs is None: - return None - - model_input, worker_input, kwargs = inputs - num_steps = worker_input.num_steps - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. 
- if worker_input.num_seq_groups == 0: - return [] - - intermediate_tensors = None - orig_model_execute_time = 0.0 - if not get_pp_group().is_first_rank: - intermediate_tensors = IntermediateTensors( - get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - orig_model_execute_time = intermediate_tensors.tensors.get( - "model_execute_time", torch.tensor(0)).item() - - output = self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - num_steps=num_steps, - **kwargs, - ) - - model_execute_time = time.perf_counter() - start_time - if not get_pp_group().is_last_rank: - # output is IntermediateTensors - assert isinstance(output, IntermediateTensors) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - output.tensors["model_execute_time"] = torch.tensor( - model_execute_time + orig_model_execute_time) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) - return [None] - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time - and output is not None): - for o in output: - o.model_execute_time = (orig_model_execute_time + - model_execute_time) - - # output is List[SamplerOutput] - return output - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None - ) -> Optional[List[SamplerOutput]]: - """ - Execute model in Single Program Multiple Data (SPMD) fashion. - All workers take the same request, prepare the input and - execute the model. - """ - assert execute_model_req is not None, ( - "_execute_model_spmd() requires each worker to take in an " - "ExecuteModelRequest") - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list)) - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. - if worker_input.num_seq_groups == 0: - return [] - - kwargs = extract_previous_hidden_states(execute_model_req) - - return self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - **kwargs, - ) - - class WorkerWrapperBase: """ This class represents one process in an executor/engine. It is responsible @@ -636,23 +277,3 @@ class WorkerWrapperBase: def __getattr__(self, attr): return getattr(self.worker, attr) - - -def extract_previous_hidden_states( - data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ - Dict[str, torch.Tensor]: - """If data contains previous_hidden_states, extract it. This returns a dict - which can be used directly as additional kwargs in any following - execute_model calls. This is used in draft models like EAGLE.""" - output = {} - - # When called from non-driver worker, data is dict but when called from - # driver worker, data is ExecuteModelRequest. 
- if isinstance(data, dict): - if "previous_hidden_states" in data: - output["previous_hidden_states"] = data["previous_hidden_states"] - elif data.previous_hidden_states is not None: - output["previous_hidden_states"] = data.previous_hidden_states\ - .hidden_states - - return output From 035fd2bd2cd2fb70f5834f5ca6c2ea30cdae9187 Mon Sep 17 00:00:00 2001 From: Wenlong Wang Date: Sat, 20 Sep 2025 20:55:10 -0700 Subject: [PATCH 069/185] [Multi Modal][Performance] Fused Q,K's apply_rope in more models (#25005) Signed-off-by: wwl2755 Co-authored-by: Roger Wang --- vllm/model_executor/models/ernie45_vl.py | 15 +++++++++------ vllm/model_executor/models/glm4_1v.py | 17 ++++++++++------- vllm/model_executor/models/qwen2_vl.py | 16 ++++++++++------ 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 3396c67f42b7b..0d4aced93ca1c 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -234,8 +234,9 @@ class Ernie4_5_VisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: # from vllm_flash_attn.flash_attn_interface import ( @@ -261,8 +262,8 @@ class Ernie4_5_VisionAttention(nn.Module): causal=False) context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] @@ -281,6 +282,8 @@ class Ernie4_5_VisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -291,8 +294,8 @@ class Ernie4_5_VisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index cbf327ce02b6b..308b0cb602bc9 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -315,8 +315,10 @@ class Glm4vVisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.attn_backend == _Backend.FLASH_ATTN: # from vllm_flash_attn.flash_attn_interface import ( @@ -341,8 +343,8 @@ class Glm4vVisionAttention(nn.Module): ) context_layer = rearrange(output, - "(b s) ... 
-> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] @@ -361,6 +363,8 @@ class Glm4vVisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -371,9 +375,8 @@ class Glm4vVisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 7f361678ba72e..dd4e7731e0b08 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -377,8 +377,10 @@ class Qwen2VisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: if self.attn_backend == _Backend.ROCM_AITER_FA: @@ -402,8 +404,8 @@ class Qwen2VisionAttention(nn.Module): causal=False) context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. 
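# Illustrative sketch (not from the vLLM source): the change above applies
# rotary embeddings once over torch.cat([q, k], dim=0) and then splits the
# result, instead of calling the kernel twice. toy_rope below is a stand-in
# for apply_rotary_pos_emb_vision (the real kernel differs); what matters is
# that it acts independently along the leading batch dimension, so the fused
# call matches the two separate calls.
import torch

def toy_rope(x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    # x: [batch, seq, heads, head_dim], freqs: [seq, head_dim // 2]
    cos = freqs.cos()[None, :, None, :].repeat_interleave(2, dim=-1)
    sin = freqs.sin()[None, :, None, :].repeat_interleave(2, dim=-1)
    x1, x2 = x[..., 0::2], x[..., 1::2]
    rotated = torch.stack((-x2, x1), dim=-1).flatten(-2)
    return x * cos + rotated * sin

b, s, h, d = 2, 5, 3, 8
q, k = torch.randn(b, s, h, d), torch.randn(b, s, h, d)
freqs = torch.randn(s, d // 2)

q_sep, k_sep = toy_rope(q, freqs), toy_rope(k, freqs)
q_fused, k_fused = torch.chunk(
    toy_rope(torch.cat([q, k], dim=0), freqs), 2, dim=0)
assert torch.allclose(q_sep, q_fused) and torch.allclose(k_sep, k_fused)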
outputs = [] @@ -422,6 +424,8 @@ class Qwen2VisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -432,8 +436,8 @@ class Qwen2VisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output From 12dbd834cf33b539a50d65516e44327caff9d824 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 21:10:48 -0700 Subject: [PATCH 070/185] [V0 Deprecation] Remove from_seq_group methods (#25330) Signed-off-by: Woosuk Kwon --- vllm/multimodal/base.py | 122 +------------------------ vllm/outputs.py | 195 +--------------------------------------- 2 files changed, 2 insertions(+), 315 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index ef8f1b2e17b47..e0edb3e883ed6 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,14 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod -from collections.abc import Sequence from pathlib import Path -from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar - -if TYPE_CHECKING: - from vllm.sequence import SequenceGroupMetadata - -from .inputs import MultiModalKwargs, PlaceholderRange +from typing import Generic, NamedTuple, TypeVar _T = TypeVar("_T") @@ -53,120 +47,6 @@ class MultiModalPlaceholderMap: self.dest_ranges = [] self.dest_len = 0 - @classmethod - def from_seq_group( - cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]: - """ - Returns the multi-modal items that intersect with the portion of a - prompt (``seq_group``) represented by ``positions``, as well as a - ``MultiModalPlaceholderMap`` that relates the multi-modal embedding - vectors to their corresponding placeholders. - - Examples: - - ``` - Prompt: |AAAA BBBB What's in these images?| - Positions: |.................................| - - images = [A, B] - src_ranges = [(0, 4), (4, 8)] - dest_ranges = [(0, 4), (5, 9)] - - Prompt: |AAAA BBBB What's in these images?| - Positions: | ..... | - - images = [A, B] - src_ranges = [(2, 4), (4, 6)] - dest_ranges = [(0, 2), (3, 5)] - - Prompt: |AAAA BBBB What's in these images?| - Positions: | ......... 
| - - images = [B] - src_ranges = [(0, 4)] - dest_ranges = [(0, 4)] - - Prompt: |AAAA BBBB What's in these images?| - Positions: | .......................| - - images = [] - src_ranges = [] - dest_ranges = [] - ``` - """ - seq_mm_data = seq_group.multi_modal_data - seq_mm_placeholders = seq_group.multi_modal_placeholders - - if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs(), {} - - placeholder_maps = dict[str, MultiModalPlaceholderMap]() - - for modality, placeholders in seq_mm_placeholders.items(): - placeholder_map = MultiModalPlaceholderMap() - - if positions: - placeholder_map.append_items_from_seq_group( - positions, - # Dummy, since we don't care about intersecting items - [None] * len(placeholders), - placeholders, - ) - - placeholder_maps[modality] = placeholder_map - - return seq_mm_data, placeholder_maps - - def append_items_from_seq_group( - self, - positions: range, - multi_modal_items: list[_T], - multi_modal_placeholders: Sequence[PlaceholderRange], - ) -> list[_T]: - """ - Adds the multi-modal items that intersect ```positions`` to this - placeholder map and returns the intersecting items. - """ - intersecting_items = [] - - if len(multi_modal_items) != len(multi_modal_placeholders): - raise ValueError( - "Multi-modal placeholders and items must have the same length." - ) - for placeholder_dict, mm_item in zip(multi_modal_placeholders, - multi_modal_items): - placeholder = range( - placeholder_dict.offset, - placeholder_dict.offset + placeholder_dict.length, - ) - intersection = range( - max(positions.start, placeholder.start), - min(positions.stop, placeholder.stop), - ) - - if not intersection: - # Skip this multi-modal item. - continue - - token_embedding_range = range( - intersection.start - positions.start, - intersection.stop - positions.start, - ) - - multimodal_embedding_range = range( - intersection.start - placeholder.start + self.src_len, - intersection.stop - placeholder.start + self.src_len, - ) - - intersecting_items.append(mm_item) - self.dest_ranges.append(token_embedding_range) - self.src_ranges.append(multimodal_embedding_range) - self.src_len += len(placeholder) - - self.dest_len += len(positions) - return intersecting_items - def extend(self, other: "MultiModalPlaceholderMap"): """ Adds the placeholders from another ``MultiModalPlaceholderMap`` to this diff --git a/vllm/outputs.py b/vllm/outputs.py index 64bcfd472f2ad..4d8206bb2d830 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time from collections.abc import MutableSequence from collections.abc import Sequence as GenericSequence from dataclasses import dataclass @@ -14,9 +13,7 @@ from vllm.logger import init_logger from vllm.logprobs import PromptLogprobs, SampleLogprobs from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict -from vllm.sampling_params import RequestOutputKind -from vllm.sequence import (RequestMetrics, SequenceGroup, SequenceGroupBase, - SequenceStatus) +from vllm.sequence import RequestMetrics logger = init_logger(__name__) @@ -171,170 +168,6 @@ class RequestOutput: else: self.outputs.append(next_completion) - @classmethod - def from_seq_group( - cls, seq_group: SequenceGroup, use_cache: bool, - seq_id_to_seq_group: dict[str, SequenceGroupBase] - ) -> Optional["RequestOutput"]: - finished = seq_group.is_finished() - - if seq_group.request_id in seq_id_to_seq_group: - group: 
SequenceGroupBase = seq_id_to_seq_group[ - seq_group.request_id] - assembled_seq_group = group.maybe_assemble_group(seq_group) - if finished: - group.finish_seq(seq_group) - if assembled_seq_group is None: - return None - - # clear finished seq in seq_id_to_seq_group - if len(group.to_be_finished) == 0: - for sub_request_id in list(group.seq_id_to_index.keys()): - if sub_request_id in seq_id_to_seq_group: - del seq_id_to_seq_group[sub_request_id] - - return cls.from_seq_group(assembled_seq_group, use_cache, - seq_id_to_seq_group) - - sampling_params = seq_group.sampling_params - if sampling_params is None: - raise ValueError( - "Sampling parameters are missing for a CompletionRequest.") - - if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and ( - not finished): - return None - - # Init cache (if needed) - if use_cache and seq_group.cached_request_output is None: - seq_group.cached_request_output = RequestOutput( # type: ignore - request_id="", - prompt=None, - prompt_token_ids=[], - prompt_logprobs=None, - outputs=[], - finished=False) - - top_n_seqs = seq_group.get_seqs() - - # Create the outputs. - # NOTE: We need omit logprobs here explicitly because the sequence - # always has the logprobs of the sampled tokens even if the - # logprobs are not requested. - include_logprobs = sampling_params.logprobs is not None - text_buffer_length = sampling_params.output_text_buffer_length - delta = sampling_params.output_kind == RequestOutputKind.DELTA - - outputs = [] - include_prompt = True - # num_cached_tokens should be the same for all the sequences - num_cached_tokens = None - for i, seq in enumerate(top_n_seqs): - output_text = seq.get_output_text_to_return( - text_buffer_length, delta) - - output_token_ids = seq.get_output_token_ids_to_return(delta) - num_output_tokens = 1 if isinstance(output_token_ids, - int) else len(output_token_ids) - num_cached_tokens = seq.data.get_num_cached_tokens() - - output_logprobs = seq.output_logprobs if include_logprobs else None - - if delta: - # Slice logprobs delta if applicable - if output_logprobs: - # num_output_tokens can be 0 when n > 1 and request finishes - # before the others - if num_output_tokens > 0: - output_logprobs = output_logprobs[-num_output_tokens:] - else: - output_logprobs = None - # Don't include prompt if this is after the first output - # containing decode token ids - if include_prompt and seq.get_output_len() > num_output_tokens: - include_prompt = False - - if use_cache: - # Get cached output object - cached_outputs = seq_group.cached_request_output.outputs # type: ignore - if i >= len(cached_outputs): - cached_outputs.append( - CompletionOutput(index=i, - text="", - token_ids=[], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - stop_reason=None)) - output = cached_outputs[i] - - # Init cached output object - assert output.index == i - output.text = output_text - - if isinstance(output_token_ids, int): - output.token_ids.clear() - output.token_ids.append(output_token_ids) - else: - output.token_ids = output_token_ids - - output.cumulative_logprob = seq.get_cumulative_logprob() \ - if include_logprobs else None - output.logprobs = output_logprobs - output.finish_reason = SequenceStatus.get_finished_reason( - seq.status) - output.stop_reason = seq.stop_reason - - else: - output = CompletionOutput( - top_n_seqs.index(seq), output_text, [output_token_ids] - if isinstance(output_token_ids, int) else output_token_ids, - seq.get_cumulative_logprob() if include_logprobs else None, - output_logprobs, - 
SequenceStatus.get_finished_reason(seq.status), - seq.stop_reason) - - outputs.append(output) - - # Every sequence in the sequence group should have the same prompt. - if include_prompt: - prompt = seq_group.prompt - prompt_token_ids = seq_group.prompt_token_ids - encoder_prompt = seq_group.encoder_prompt - encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids - prompt_logprobs = seq_group.prompt_logprobs - else: - prompt = None - prompt_token_ids = None - encoder_prompt = None - encoder_prompt_token_ids = None - prompt_logprobs = None - finished_time = time.time() if finished else None - seq_group.set_finished_time(finished_time) - - init_kwargs = { - "request_id": seq_group.request_id, - "prompt": prompt, - "prompt_token_ids": prompt_token_ids, - "prompt_logprobs": prompt_logprobs, - "outputs": outputs, - "finished": finished, - "metrics": seq_group.metrics, - "lora_request": seq_group.lora_request, - "encoder_prompt": encoder_prompt, - "encoder_prompt_token_ids": encoder_prompt_token_ids, - "num_cached_tokens": num_cached_tokens, - "multi_modal_placeholders": seq_group.multi_modal_placeholders - } - - if use_cache: - request_output = seq_group.cached_request_output - request_output.__init__(**init_kwargs) # type: ignore - else: - request_output = cls(**init_kwargs) # type: ignore - - return request_output - def __repr__(self) -> str: return (f"RequestOutput(request_id={self.request_id}, " f"prompt={self.prompt!r}, " @@ -371,19 +204,6 @@ class PoolingRequestOutput(Generic[_O]): self.finished = finished self.outputs = outputs - @staticmethod - def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": - pooled_data = seq_group.pooled_data - assert pooled_data is not None - - data = pooled_data.to(dtype=torch.float32, device="cpu") - output = PoolingOutput(data) - prompt_token_ids = seq_group.prompt_token_ids - finished = seq_group.is_finished() - - return PoolingRequestOutput(seq_group.request_id, output, - prompt_token_ids, finished) - def __repr__(self): return (f"{type(self).__name__}(request_id={self.request_id!r}, " f"outputs={self.outputs!r}, " @@ -391,19 +211,6 @@ class PoolingRequestOutput(Generic[_O]): f"finished={self.finished})") -class RequestOutputFactory: - - @staticmethod - def create(seq_group: SequenceGroup, - seq_id_to_seq_group: dict[str, SequenceGroupBase], - use_cache: bool = False): - if seq_group.pooled_data is not None: - return PoolingRequestOutput.from_seq_group(seq_group) - else: - return RequestOutput.from_seq_group(seq_group, use_cache, - seq_id_to_seq_group) - - @dataclass class EmbeddingOutput: """The output data of one embedding output of a request. 
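The bookkeeping removed in this patch reduces to plain `range` intersection arithmetic. A minimal standalone sketch of that arithmetic follows; the position and placeholder numbers are made up for illustration and are not taken from any real request:

```python
# Intersect the chunk of the prompt being scheduled with one multimodal
# placeholder, the way the removed append_items_from_seq_group did.
positions = range(3, 12)                    # token positions scheduled this step
placeholder = range(5, 5 + 4)               # offset=5, length=4
intersection = range(max(positions.start, placeholder.start),
                     min(positions.stop, placeholder.stop))

print(list(intersection))                   # [5, 6, 7, 8]
print(bool(intersection))                   # True -> the item overlaps this chunk

# Offsets relative to the scheduled chunk (the "dest" side of the map):
dest = range(intersection.start - positions.start,
             intersection.stop - positions.start)
print(dest)                                 # range(2, 6)
```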
From 7ed82d1974837957a3bfb6d576b9cffba24d31ae Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 20 Sep 2025 21:26:35 -0700 Subject: [PATCH 071/185] [V0 Deprecation] Remove V0 MP executor (#25329) Signed-off-by: Woosuk Kwon --- vllm/executor/mp_distributed_executor.py | 244 -------------------- vllm/executor/multiproc_worker_utils.py | 279 ----------------------- vllm/v1/executor/multiproc_executor.py | 40 +++- 3 files changed, 33 insertions(+), 530 deletions(-) delete mode 100644 vllm/executor/mp_distributed_executor.py delete mode 100644 vllm/executor/multiproc_worker_utils.py diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py deleted file mode 100644 index 136dca54e6e52..0000000000000 --- a/vllm/executor/mp_distributed_executor.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -from typing import Any, Callable, List, Optional, Union - -import cloudpickle - -from vllm.executor.executor_base import DistributedExecutorBase -from vllm.executor.multiproc_worker_utils import ( - ProcessWorkerWrapper, ResultHandler, WorkerMonitor, - set_multiprocessing_worker_envs) -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, - get_distributed_init_method, get_ip, get_open_port, - make_async, run_method, update_environment_variables) -from vllm.worker.worker_base import WorkerWrapperBase - -logger = init_logger(__name__) - - -class MultiprocessingDistributedExecutor(DistributedExecutorBase): - """Python multiprocessing-based distributed executor""" - - uses_ray: bool = False - - def _check_cuda(self) -> None: - """Check that the number of GPUs is sufficient for the parallel - configuration. Separate from _init_executor to reduce the number of - indented blocks. - """ - parallel_config = self.parallel_config - world_size = parallel_config.world_size - tensor_parallel_size = parallel_config.tensor_parallel_size - - cuda_device_count = cuda_device_count_stateless() - # Use confusing message for more common TP-only case. - if tensor_parallel_size > cuda_device_count: - raise RuntimeError( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - if world_size > cuda_device_count: - raise RuntimeError( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") - - # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers - if "CUDA_VISIBLE_DEVICES" not in os.environ: - update_environment_variables({ - "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) - }) - - def _init_executor(self) -> None: - - from vllm.platforms import current_platform - if current_platform.is_cuda_alike(): - self._check_cuda() - - # Create the parallel GPU workers. - world_size = self.parallel_config.world_size - tensor_parallel_size = self.parallel_config.tensor_parallel_size - - # Set multiprocessing envs that are common to V0 and V1 - set_multiprocessing_worker_envs(self.parallel_config) - - # Multiprocessing-based executor does not support multi-node setting. - # Since it only works for single node, we can use the loopback address - # 127.0.0.1 for communication. 
- distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port()) - - self.workers: List[ProcessWorkerWrapper] = [] - # This is the list of workers that are rank 0 of each TP group EXCEPT - # global rank 0. These are the workers that will broadcast to the - # rest of the workers. - self.tp_driver_workers: List[ProcessWorkerWrapper] = [] - # This is the list of workers that are not drivers and not the first - # worker in a TP group. These are the workers that will be - # broadcasted to. - self.non_driver_workers: List[ProcessWorkerWrapper] = [] - - if world_size == 1: - self.worker_monitor = None - else: - result_handler = ResultHandler() - for rank in range(1, world_size): - worker = ProcessWorkerWrapper(result_handler, - WorkerWrapperBase, - self.vllm_config, rank) - self.workers.append(worker) - if rank % tensor_parallel_size == 0: - self.tp_driver_workers.append(worker) - else: - self.non_driver_workers.append(worker) - - self.worker_monitor = WorkerMonitor(self.workers, result_handler) - result_handler.start() - self.worker_monitor.start() - - # Set up signal handlers to shut down the executor cleanly - # sometimes gc does not work well - - self.driver_worker = WorkerWrapperBase(self.vllm_config, 0) - - all_kwargs = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - for i in range(world_size): - local_rank = i - rank = i - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=(not self.parallel_config) - or (rank % self.parallel_config.tensor_parallel_size == 0), - ) - all_kwargs.append(kwargs) - self._run_workers("init_worker", all_kwargs) - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - self.driver_exec_model = make_async(self.driver_worker.execute_model) - self.pp_locks: Optional[List[asyncio.Lock]] = None - - def shutdown(self): - if (worker_monitor := getattr(self, "worker_monitor", - None)) is not None: - worker_monitor.close() - - def _driver_execute_model( - self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: - """Run execute_model in the driver worker. - - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - return self.driver_worker.execute_model(execute_model_req) - - def _run_workers( - self, - method: Union[str, Callable], - *args, - async_run_tensor_parallel_workers_only: bool = False, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> List[Any]: - """Runs the given method on all workers. - - Args: - async_run_tensor_parallel_workers_only: If True the method will be - run only in the remote TP workers, not the driver worker. - It will also be run asynchronously and return a list of futures - rather than blocking on the results. - """ - if isinstance(method, str): - sent_method = method - else: - sent_method = cloudpickle.dumps(method) - del method - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - if async_run_tensor_parallel_workers_only: - # Run only non-driver workers and just return futures. - return [ - worker.execute_method(sent_method, *args, **kwargs) - for worker in self.non_driver_workers - ] - - # Start all remote workers first. 
- worker_outputs = [ - worker.execute_method(sent_method, *args, **kwargs) - for worker in self.workers - ] - - driver_worker_output = run_method(self.driver_worker, sent_method, - args, kwargs) - - # Get the results of the workers. - return [driver_worker_output - ] + [output.get() for output in worker_outputs] - - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - if self.worker_monitor is not None and not self.worker_monitor.is_alive( - ): - raise RuntimeError("Worker processes are not running") - - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - for result in parallel_worker_tasks: - result.get() - - async def _driver_execute_model_async( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - if not self.tp_driver_workers: - return await self.driver_exec_model(execute_model_req) - - if self.pp_locks is None: - # This locks each pipeline parallel stage so multiple virtual - # engines can't execute on the same stage at the same time - # We create the locks here to avoid creating them in the constructor - # which uses a different asyncio loop. - self.pp_locks = [ - asyncio.Lock() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - tasks = [ - asyncio.create_task( - _run_task_with_lock(self.driver_exec_model, self.pp_locks[0], - execute_model_req)) - ] - for pp_rank, driver_worker in enumerate(self.tp_driver_workers, - start=1): - tasks.append( - asyncio.create_task( - _run_task_with_lock(driver_worker.execute_method_async, - self.pp_locks[pp_rank], - "execute_model", execute_model_req))) - results = await asyncio.gather(*tasks) - - # Only the last PP stage has the final results. 
- return results[-1] - - async def _start_worker_execution_loop(self): - coros = [ - worker.execute_method_async("start_worker_execution_loop") - for worker in self.non_driver_workers - ] - return await asyncio.gather(*coros) diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py deleted file mode 100644 index 48b3479ed7997..0000000000000 --- a/vllm/executor/multiproc_worker_utils.py +++ /dev/null @@ -1,279 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -import threading -import uuid -from dataclasses import dataclass -from multiprocessing import Queue -from multiprocessing.connection import wait -from multiprocessing.process import BaseProcess -from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union - -import torch - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context, - run_method) - -logger = init_logger(__name__) - -T = TypeVar('T') - -_TERMINATE = "TERMINATE" # sentinel - -JOIN_TIMEOUT_S = 2 - - -@dataclass -class Result(Generic[T]): - """Result of task dispatched to worker""" - - task_id: uuid.UUID - value: Optional[T] = None - exception: Optional[BaseException] = None - - -class ResultFuture(threading.Event, Generic[T]): - """Synchronous future for non-async case""" - - def __init__(self): - super().__init__() - self.result: Optional[Result[T]] = None - - def set_result(self, result: Result[T]): - self.result = result - self.set() - - def get(self) -> T: - self.wait() - assert self.result is not None - if self.result.exception is not None: - raise self.result.exception - return self.result.value # type: ignore[return-value] - - -def _set_future_result(future: Union[ResultFuture, asyncio.Future], - result: Result): - if isinstance(future, ResultFuture): - future.set_result(result) - return - loop = future.get_loop() - if not loop.is_closed(): - if result.exception is not None: - loop.call_soon_threadsafe(future.set_exception, result.exception) - else: - loop.call_soon_threadsafe(future.set_result, result.value) - - -class ResultHandler(threading.Thread): - """Handle results from all workers (in background thread)""" - - def __init__(self) -> None: - super().__init__(daemon=True) - self.result_queue = get_mp_context().Queue() - self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {} - - def run(self): - for result in iter(self.result_queue.get, _TERMINATE): - future = self.tasks.pop(result.task_id) - _set_future_result(future, result) - # Ensure that all waiters will receive an exception - for task_id, future in self.tasks.items(): - _set_future_result( - future, - Result(task_id=task_id, - exception=ChildProcessError("worker died"))) - - def close(self): - self.result_queue.put(_TERMINATE) - - -class WorkerMonitor(threading.Thread): - """Monitor worker status (in background thread)""" - - def __init__(self, workers: List['ProcessWorkerWrapper'], - result_handler: ResultHandler): - super().__init__(daemon=True) - self.workers = workers - self.result_handler = result_handler - self._close = False - - def run(self) -> None: - # Blocks until any worker exits - dead_sentinels = wait([w.process.sentinel for w in self.workers]) - if not self._close: - self._close = True - - # Kill / cleanup all workers - for worker in self.workers: - process = worker.process - if process.sentinel in dead_sentinels: - 
process.join(JOIN_TIMEOUT_S) - if process.exitcode is not None and process.exitcode != 0: - logger.error("Worker %s pid %s died, exit code: %s", - process.name, process.pid, process.exitcode) - # Cleanup any remaining workers - if logger: - logger.info("Killing local vLLM worker processes") - for worker in self.workers: - worker.kill_worker() - # Must be done after worker task queues are all closed - self.result_handler.close() - - for worker in self.workers: - worker.process.join(JOIN_TIMEOUT_S) - - def close(self): - if self._close: - return - self._close = True - logger.info("Terminating local vLLM worker processes") - for worker in self.workers: - worker.terminate_worker() - # Must be done after worker task queues are all closed - self.result_handler.close() - - -class ProcessWorkerWrapper: - """Local process wrapper for vllm.worker.Worker, - for handling single-node multi-GPU tensor parallel.""" - - def __init__(self, result_handler: ResultHandler, - worker_factory: Callable[[VllmConfig, int], Any], - vllm_config: VllmConfig, rank: int) -> None: - self.mp = get_mp_context() - self._task_queue = self.mp.Queue() - self.result_queue = result_handler.result_queue - self.tasks = result_handler.tasks - self.process: BaseProcess = self.mp.Process( # type: ignore[attr-defined] - target=_run_worker_process, - name="VllmWorkerProcess", - kwargs=dict( - worker_factory=worker_factory, - task_queue=self._task_queue, - result_queue=self.result_queue, - vllm_config=vllm_config, - rank=rank, - ), - daemon=True) - - self.process.start() - - def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], - method: Union[str, bytes], args, kwargs): - task_id = uuid.uuid4() - self.tasks[task_id] = future - try: - self._task_queue.put((task_id, method, args, kwargs)) - except SystemExit: - raise - except BaseException as e: - del self.tasks[task_id] - raise ChildProcessError("worker died") from e - - def execute_method(self, method: Union[str, bytes], *args, **kwargs): - future: ResultFuture = ResultFuture() - self._enqueue_task(future, method, args, kwargs) - return future - - async def execute_method_async(self, method: Union[str, bytes], *args, - **kwargs): - future = asyncio.get_running_loop().create_future() - self._enqueue_task(future, method, args, kwargs) - return await future - - def terminate_worker(self): - try: - self._task_queue.put(_TERMINATE) - except ValueError: - self.process.kill() - self._task_queue.close() - - def kill_worker(self): - self._task_queue.close() - self.process.kill() - - -def _run_worker_process( - worker_factory: Callable[[VllmConfig, int], Any], - task_queue: Queue, - result_queue: Queue, - vllm_config: VllmConfig, - rank: int, -) -> None: - """Worker process event loop""" - - # Add process-specific prefix to stdout and stderr - process_name = get_mp_context().current_process().name - decorate_logs(process_name) - - # Initialize worker - worker = worker_factory(vllm_config, rank) - del worker_factory - - # Accept tasks from the engine in task_queue - # and return task output in result_queue - logger.info("Worker ready; awaiting tasks") - try: - for items in iter(task_queue.get, _TERMINATE): - output = None - exception = None - task_id, method, args, kwargs = items - try: - output = run_method(worker, method, args, kwargs) - except SystemExit: - raise - except KeyboardInterrupt: - break - except BaseException as e: - logger.exception( - "Exception in worker %s while processing method %s.", - process_name, method) - exception = e - result_queue.put( - 
Result(task_id=task_id, value=output, exception=exception)) - except KeyboardInterrupt: - pass - except Exception: - logger.exception("Worker failed") - - # Flush TunableOp results when TunableOp is enabled and - # online (in situ) tuning is enabled. - # Offline tuning API (record_untuned_is_enabled()) only - # available in PyTorch 2.6 or later. - if torch.cuda.is_available(): - import torch.cuda.tunable as tunable - if (tunable.is_enabled() and tunable.tuning_is_enabled() - and not tunable.record_untuned_is_enabled()): - tunable.write_file() - - logger.info("Worker exiting") - - -def set_multiprocessing_worker_envs(parallel_config): - """ Set up environment variables that should be used when there are workers - in a multiprocessing environment. This should be called by the parent - process before worker processes are created""" - - _maybe_force_spawn() - - # Configure thread parallelism if OMP_NUM_THREADS isn't set - # - # Helps to avoid CPU contention. The default of spawning a thread per - # core combined with multiprocessing for each GPU can have a negative - # impact on performance. The contention is amplified when running in a - # container where CPU limits can cause throttling. - default_omp_num_threads = 1 - if "OMP_NUM_THREADS" not in os.environ and ( - current_parallelism := - torch.get_num_threads()) > default_omp_num_threads: - logger.warning( - "Reducing Torch parallelism from %d threads to %d to avoid " - "unnecessary CPU contention. Set OMP_NUM_THREADS in the " - "external environment to tune this value as needed.", - current_parallelism, default_omp_num_threads) - os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) - torch.set_num_threads(default_omp_num_threads) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 3aa373f12b609..2aa732f34bcc8 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing +import os import pickle import queue import signal @@ -19,6 +20,7 @@ from threading import Thread from typing import Any, Callable, Optional, Union, cast import cloudpickle +import torch import vllm.envs as envs from vllm.config import VllmConfig @@ -28,14 +30,12 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, get_pp_group, get_tp_group) -from vllm.executor.multiproc_worker_utils import ( - set_multiprocessing_worker_envs) from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import worker_receiver_cache_from_config -from vllm.utils import (decorate_logs, get_distributed_init_method, - get_loopback_ip, get_mp_context, get_open_port, - set_process_title) +from vllm.utils import (_maybe_force_spawn, decorate_logs, + get_distributed_init_method, get_loopback_ip, + get_mp_context, get_open_port, set_process_title) from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.executor.utils import get_and_update_mm_cache @@ -67,8 +67,8 @@ class MultiprocExecutor(Executor): f"tensor_parallel_size ({tensor_parallel_size}) x pipeline" f"_parallel_size ({pp_parallel_size}). 
") - # Set multiprocessing envs that are common to V0 and V1 - set_multiprocessing_worker_envs(self.parallel_config) + # Set multiprocessing envs + set_multiprocessing_worker_envs() # Multiprocessing-based executor does not support multi-node setting. # Since it only works for single node, we can use the loopback address @@ -698,3 +698,29 @@ class WorkerProc: process_name += f"_EP{ep_rank}" set_process_title(name=process_name) decorate_logs(process_name) + + +def set_multiprocessing_worker_envs(): + """ Set up environment variables that should be used when there are workers + in a multiprocessing environment. This should be called by the parent + process before worker processes are created""" + + _maybe_force_spawn() + + # Configure thread parallelism if OMP_NUM_THREADS isn't set + # + # Helps to avoid CPU contention. The default of spawning a thread per + # core combined with multiprocessing for each GPU can have a negative + # impact on performance. The contention is amplified when running in a + # container where CPU limits can cause throttling. + default_omp_num_threads = 1 + if "OMP_NUM_THREADS" not in os.environ and ( + current_parallelism := + torch.get_num_threads()) > default_omp_num_threads: + logger.warning( + "Reducing Torch parallelism from %d threads to %d to avoid " + "unnecessary CPU contention. Set OMP_NUM_THREADS in the " + "external environment to tune this value as needed.", + current_parallelism, default_omp_num_threads) + os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) + torch.set_num_threads(default_omp_num_threads) From cf56cf78b47e5f9b6a81ce0d50a94f9291922315 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 21 Sep 2025 13:08:07 +0800 Subject: [PATCH 072/185] [V1] Add sliding window support to Flex Attention backend (#24089) Signed-off-by: Isotr0py --- tests/v1/attention/test_attention_backends.py | 204 +++++++++++++----- vllm/v1/attention/backends/flex_attention.py | 90 ++++++-- 2 files changed, 227 insertions(+), 67 deletions(-) diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 0b7e103beca63..8a4fc15791b08 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -1,15 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for v1 attention backends without GPUModelRunner dependency.""" +from functools import partial +from typing import Optional, Union import pytest import torch +from torch.nn.attention.flex_attention import create_block_mask, flex_attention from tests.v1.attention.utils import (BatchSpec, _Backend, create_common_attn_metadata, create_standard_kv_cache_spec, create_vllm_config, get_attention_backend) +from vllm.config import ModelConfig +from vllm.platforms import current_platform from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, set_kv_cache_layout) @@ -183,13 +188,19 @@ class MockAttentionLayer: self._v_scale_float = 1.0 -def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, - layer_names: list[str], vllm_config, - device: torch.device, - common_attn_metadata: CommonAttentionMetadata, - query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor) -> torch.Tensor: +def run_attention_backend( + backend: _Backend, + kv_cache_spec: FullAttentionSpec, + layer_names: list[str], + vllm_config, + device: 
torch.device, + common_attn_metadata: CommonAttentionMetadata, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + sliding_window: Optional[int] = None, +) -> torch.Tensor: """Run attention computation using the specified backend's AttentionImpl.""" # Handle special case for FLEX_ATTENTION_SLOW @@ -253,7 +264,7 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, scale=scale, num_kv_heads=num_kv_heads, alibi_slopes=None, - sliding_window=None, + sliding_window=sliding_window, kv_cache_dtype="auto", ) @@ -275,13 +286,16 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, return output -@pytest.mark.parametrize("batch_spec_name", [ - "small_decode", "small_prefill", "mixed_small", "medium_decode", - "medium_prefill", "mixed_medium", "large_decode", "large_prefill", - "single_decode", "single_prefill" -]) -@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) -def test_backend_correctness(batch_spec_name: str, model: str): +def _test_backend_correctness( + batch_spec: BatchSpec, + model: str, + backend_to_test: list[Union[_Backend, str]], + mask_mod, + *, + block_size: int = 16, + atol: float = 1e-2, + rtol: float = 1e-2, +): """ Test that all backends produce similar outputs to a reference implementation using torch.nn.functional.scaled_dot_product_attention. @@ -297,9 +311,10 @@ def test_backend_correctness(batch_spec_name: str, model: str): simulated paged KV cache. 5. Comparing the vLLM backend's output to the ground-truth SDPA output. """ - batch_spec = BATCH_SPECS[batch_spec_name] + current_platform.seed_everything(42) vllm_config = create_vllm_config(model_name=model, max_model_len=max(batch_spec.seq_lens), + block_size=block_size, num_gpu_blocks=8192) device = torch.device("cuda:0") @@ -314,6 +329,7 @@ def test_backend_correctness(batch_spec_name: str, model: str): num_kv_heads = vllm_config.model_config.get_num_kv_heads( vllm_config.parallel_config) head_size = vllm_config.model_config.get_head_size() + sliding_window = vllm_config.model_config.get_sliding_window() dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) block_size = vllm_config.cache_config.block_size scale = 1.0 / (head_size**0.5) @@ -361,22 +377,21 @@ def test_backend_correctness(batch_spec_name: str, model: str): # Create causal mask: query token i attends to positions 0 to # (context_len + i) kv_len = s_len - offset = context_len - attn_mask = torch.full((q_len, kv_len), - float('-inf'), - device=device, - dtype=dtype) - for i in range(q_len): - attn_mask[i, :offset + i + 1] = 0.0 - sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( - q_sdpa_in, - k_sdpa_in, - v_sdpa_in, - attn_mask=attn_mask, - scale=scale, - enable_gqa=True) - # Convert back to (L, H, D) + final_mask_mod = partial(mask_mod, context_len=context_len) + block_mask = create_block_mask(final_mask_mod, + B=None, + H=None, + Q_LEN=q_len, + KV_LEN=kv_len, + device=device) + sdpa_out_i = flex_attention(q_sdpa_in, + k_sdpa_in, + v_sdpa_in, + block_mask=block_mask, + scale=scale, + enable_gqa=True) + all_sdpa_outputs.append(sdpa_out_i.transpose(1, 2).squeeze(0)) # Inputs for vLLM backends are just the new tokens @@ -412,7 +427,7 @@ def test_backend_correctness(batch_spec_name: str, model: str): # 4. 
Run vLLM backends and compare # Note: flex_attention has known Triton kernel compatibility issues # with test infrastructures - for backend_name in BACKENDS_TO_TEST: + for backend_name in backend_to_test: # FlashAttentionm + FlexAttention: # [2, num_blocks, block_size, num_kv_heads, head_size] # FlashInfer: @@ -427,12 +442,19 @@ def test_backend_correctness(batch_spec_name: str, model: str): 2, 3).contiguous().transpose(2, 3) set_kv_cache_layout("HND") - backend_output = run_attention_backend(backend_name, kv_cache_spec, - ["placeholder"], vllm_config, - device, common_attn_metadata, - query_vllm, key_vllm, - value_vllm, - kv_cache_for_backend) + backend_output = run_attention_backend( + backend_name, + kv_cache_spec, + ["placeholder"], + vllm_config, + device, + common_attn_metadata, + query_vllm, + key_vllm, + value_vllm, + kv_cache_for_backend, + sliding_window=sliding_window, + ) # Check shape and dtype consistency assert backend_output.shape == sdpa_output.shape, ( @@ -446,18 +468,102 @@ def test_backend_correctness(batch_spec_name: str, model: str): f"[{backend_name}] produced non-finite values") # Check numerical similarity - rtol = 1e-2 - atol = 5e-3 + def error_msg(msg: str, backend_name: str): + return (f"[{backend_name}] output differs from SDPA baseline. " + f"{msg}") - max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item() - max_rel_diff = torch.max( - torch.abs(backend_output - sdpa_output) / - torch.abs(sdpa_output)).item() - all_close = torch.allclose(backend_output, + torch.testing.assert_close(backend_output, sdpa_output, rtol=rtol, - atol=atol) + atol=atol, + msg=partial(error_msg, + backend_name=backend_name)) - assert all_close, ( - f"[{backend_name}] output differs from SDPA baseline. " - f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") \ No newline at end of file + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_small", "medium_decode", + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" +]) +@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) +def test_causal_backend_correctness(batch_spec_name: str, model: str): + """Test backend's correctness with causal attention.""" + + def causal_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + *, + context_len: int, + ): + return (q_idx + context_len) >= kv_idx + + batch_spec = BATCH_SPECS[batch_spec_name] + LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION] + if is_torch_equal_or_newer("2.9.0.dev0") else []) + SMALL_BLOCK_BACKENDS = [ + x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS + ] + _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS, + causal_mask_mod) + + # Fast FlexAttention needs to run with block_size=128 + if LARGE_BLOCK_BACKENDS: + _test_backend_correctness(batch_spec, + model, + LARGE_BLOCK_BACKENDS, + causal_mask_mod, + block_size=128) + + +SLIDING_WINDOW_BACKENDS_TO_TEST = [ + _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION, + _Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW" +] + + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_medium", "large_decode", + "large_prefill" +]) +@pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"]) +def test_sliding_window_backend_correctness(batch_spec_name: str, model: str): + """Test backend's correctness with sliding window attention.""" + + def sliding_window_mask_mod( + b: torch.Tensor, + h: 
torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + *, + context_len: int, + sliding_window: int, + ): + causal_mask = q_idx + context_len >= kv_idx + window_mask = q_idx + context_len - kv_idx < sliding_window + return causal_mask & window_mask + + batch_spec = BATCH_SPECS[batch_spec_name] + model_config = ModelConfig(model=model, + max_model_len=max(batch_spec.seq_lens)) + sliding_window = model_config.get_sliding_window() + sliding_window_mask_mod_fn = partial(sliding_window_mask_mod, + sliding_window=sliding_window) + + LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION] + if is_torch_equal_or_newer("2.9.0.dev0") else []) + SMALL_BLOCK_BACKENDS = [ + x for x in SLIDING_WINDOW_BACKENDS_TO_TEST + if x not in LARGE_BLOCK_BACKENDS + ] + _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS, + sliding_window_mask_mod_fn) + + # Fast FlexAttention needs to run with block_size=128 + if LARGE_BLOCK_BACKENDS: + _test_backend_correctness(batch_spec, + model, + LARGE_BLOCK_BACKENDS, + sliding_window_mask_mod_fn, + block_size=128) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 662d3984554ad..c3358bfa74e91 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -9,7 +9,7 @@ import torch import torch._dynamo.decorators import torch.nn.functional as F from torch.nn.attention.flex_attention import (BlockMask, _mask_mod_signature, - _score_mod_signature, + _score_mod_signature, and_masks, create_block_mask, flex_attention) @@ -292,6 +292,7 @@ class FlexAttentionMetadata: q_block_size: int = 16 kv_block_size: int = 16 transformed_score_mod: Optional[_score_mod_signature] = None + sliding_window: Optional[int] = None def _convert_physical_to_logical( self, @@ -380,6 +381,53 @@ class FlexAttentionMetadata: return final_mask_mod + def get_sliding_window_mask_mod(self) -> _mask_mod_signature: + """Creates the sliding window mask_mod function for FlexAttention. + + Note that the sliding window mask here is bidirectional, we need + to mask it with the bidirectional/causal mask for encoder/decoder. + """ + + if self.sliding_window is None: + raise ValueError( + "sliding_window must be set for sliding window attention") + + def sliding_window_mask_mod(b: torch.Tensor, h: torch.Tensor, + q_idx: torch.Tensor, kv_idx: torch.Tensor): + return torch.abs(q_idx - kv_idx) < self.sliding_window + + def final_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + physical_kv_idx: torch.Tensor, + ) -> torch.Tensor: + (is_valid, logical_q_idx, + logical_kv_idx) = self._convert_physical_to_logical( + self.doc_ids, q_idx, physical_kv_idx) + return torch.where( + is_valid, + sliding_window_mask_mod(b, h, logical_q_idx, logical_kv_idx), + False, + ) + + return final_mask_mod if self.causal else sliding_window_mask_mod + + def get_mask_mod(self): + # Stage-1: initialize the base mask_mod + # (causal mask for decoder or bidirectional mask for encoder) + if self.causal: + mask_mod = self.get_causal_mask_mod() + else: + mask_mod = self.get_bidirectional_mask_mod() + # stage-2: add external mask_mod for special attention during + # forwarding runtime to create the combined mask_mod. 
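The composition that `get_mask_mod` builds can be reproduced with the public FlexAttention helpers in a few lines. A minimal sketch, assuming a PyTorch build with FlexAttention support; the window size, tensor shapes and device handling are illustrative only:

```python
# Causal AND sliding-window mask mods combined with and_masks, then used to
# build a BlockMask and run flex_attention.
import torch
from torch.nn.attention.flex_attention import (and_masks, create_block_mask,
                                               flex_attention)

WINDOW = 4
device = "cuda" if torch.cuda.is_available() else "cpu"

def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

def sliding_window(b, h, q_idx, kv_idx):
    return torch.abs(q_idx - kv_idx) < WINDOW

mask_mod = and_masks(causal, sliding_window)
block_mask = create_block_mask(mask_mod, B=None, H=None,
                               Q_LEN=128, KV_LEN=128, device=device)

q, k, v = (torch.randn(1, 2, 128, 64, device=device) for _ in range(3))
out = flex_attention(q, k, v, block_mask=block_mask)
print(out.shape)  # torch.Size([1, 2, 128, 64])
```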
+ if self.sliding_window is not None: + # Add sliding window mask for sliding window attention + sliding_window_mask_mod = self.get_sliding_window_mask_mod() + mask_mod = and_masks(mask_mod, sliding_window_mask_mod) + return mask_mod + def get_transformed_score_mod(self) -> Optional[_score_mod_signature]: """Creates the transformed score_mod function for FlexAttention. @@ -472,12 +520,9 @@ class FlexAttentionMetadata: return BlockMask.from_kv_blocks(**block_mask_kwargs) def build_block_mask(self) -> BlockMask: - if self.causal: - mask_mod = self.get_causal_mask_mod() - kv_len = self.total_cache_tokens - else: - mask_mod = self.get_bidirectional_mask_mod() - kv_len = self.num_actual_tokens + mask_mod = self.get_mask_mod() + kv_len = (self.total_cache_tokens + if self.causal else self.num_actual_tokens) return create_block_mask_compiled( mask_mod, None, @@ -498,11 +543,7 @@ class FlexAttentionMetadata: self.doc_ids = _offsets_to_doc_ids_tensor(self.query_start_loc) self.num_blocks = self.total_cache_tokens // self.block_size - if self.causal: - self.mask_mod = self.get_causal_mask_mod() - else: - self.mask_mod = self.get_bidirectional_mask_mod() - + self.mask_mod = self.get_mask_mod() self.transformed_score_mod = self.get_transformed_score_mod() if self.direct_build and self.causal: @@ -607,7 +648,7 @@ class FlexAttentionMetadataBuilder( class FlexAttentionImpl(AttentionImpl): - sliding_window: Optional[tuple[int, int]] + sliding_window: Optional[int] alibi_slopes: Optional[torch.Tensor] logits_soft_cap: Optional[float] @@ -641,11 +682,9 @@ class FlexAttentionImpl(AttentionImpl): "FlexAttention does not support alibi slopes yet.") else: self.alibi_slopes = None - if sliding_window is not None: - raise NotImplementedError( - "FlexAttention does not support sliding window yet.") - else: - self.sliding_window = (-1, -1) + + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap if self.logits_soft_cap is not None: @@ -712,6 +751,21 @@ class FlexAttentionImpl(AttentionImpl): num_actual_tokens = attn_metadata.num_actual_tokens + if attn_metadata.sliding_window != self.sliding_window: + attn_metadata.sliding_window = self.sliding_window + if attn_metadata.direct_build: + # TODO: Support skipping the computation of sliding window + # in direct block mask building code path. + logger.warning_once( + "Using direct block mask building with sliding window, " + "which is suboptimal now. 
Performance may be degraded.") + # update mask mod in attention metadata + attn_metadata.mask_mod = attn_metadata.get_mask_mod() + attn_metadata.block_mask = ( + attn_metadata._build_block_mask_direct()) + else: + attn_metadata.block_mask = attn_metadata.build_block_mask() + if not attn_metadata.causal: assert self.attn_type == AttentionType.ENCODER_ONLY From 30d08911f7cf78287f8da003ddcc99f6ef196f9f Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 21 Sep 2025 04:05:20 -0700 Subject: [PATCH 073/185] [MM][Perf] Minor Optimization on Qwen3-VL `fast_pos_embed_interpolate` (#25337) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_vl.py | 121 +++++++++++-------------- 1 file changed, 53 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 17375ff0959d7..ca232e03767b1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -270,6 +270,7 @@ class Qwen3_VisionTransformer(nn.Module): self.temporal_patch_size = vision_config.temporal_patch_size self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes self.use_data_parallel = use_data_parallel + self.num_grid_per_side = int(self.num_position_embeddings**0.5) # NOTE: This is used for creating empty tensor for all_gather for # DP ViT. Here out_hidden_size is enlarged due to deepstack @@ -377,82 +378,68 @@ class Qwen3_VisionTransformer(nn.Module): rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb - def fast_pos_embed_interpolate(self, grid_thw): - num_grid_per_side = int(self.num_position_embeddings**0.5) + def fast_pos_embed_interpolate(self, + grid_thw: list[list[int]]) -> torch.Tensor: - idx_list = [[] for _ in range(4)] - weight_list = [[] for _ in range(4)] + num_grid_per_side = self.num_grid_per_side + m_size = self.spatial_merge_size + hidden_dim = self.pos_embed.embedding_dim + outputs = [] for t, h, w in grid_thw: h_idxs = torch.linspace(0, num_grid_per_side - 1, h, - dtype=torch.float32) + dtype=torch.float32, + device=self.device) w_idxs = torch.linspace(0, num_grid_per_side - 1, w, - dtype=torch.float32) + dtype=torch.float32, + device=self.device) - h_idxs_floor = h_idxs.to(torch.long) - w_idxs_floor = w_idxs.to(torch.long) - h_idxs_ceil = torch.clamp(h_idxs.to(torch.long) + 1, - max=num_grid_per_side - 1) - w_idxs_ceil = torch.clamp(w_idxs.to(torch.long) + 1, - max=num_grid_per_side - 1) + h_floor = h_idxs.to(torch.long) + w_floor = w_idxs.to(torch.long) + h_ceil = torch.clamp(h_floor + 1, max=num_grid_per_side - 1) + w_ceil = torch.clamp(w_floor + 1, max=num_grid_per_side - 1) - dh = h_idxs - h_idxs_floor - dw = w_idxs - w_idxs_floor + dh = h_idxs - h_floor + dw = w_idxs - w_floor - idx_list[0].extend(((h_idxs_floor * num_grid_per_side)[None].T + - w_idxs_floor[None]).flatten().tolist() * t) - idx_list[1].extend(((h_idxs_floor * num_grid_per_side)[None].T + - w_idxs_ceil[None]).flatten().tolist() * t) - idx_list[2].extend(((h_idxs_ceil * num_grid_per_side)[None].T + - w_idxs_floor[None]).flatten().tolist() * t) - idx_list[3].extend(((h_idxs_ceil * num_grid_per_side)[None].T + - w_idxs_ceil[None]).flatten().tolist() * t) + w00 = ((1 - dh)[:, None] * (1 - dw)[None, :]).reshape(-1) + w01 = ((1 - dh)[:, None] * dw[None, :]).reshape(-1) + w10 = (dh[:, None] * (1 - dw)[None, :]).reshape(-1) + w11 = (dh[:, None] * dw[None, :]).reshape(-1) - weight_list[0].extend( - ((1 - dh)[None].T * (1 - dw)[None]).flatten().tolist() * t) - weight_list[1].extend( - ((1 - dh)[None].T 
* dw[None]).flatten().tolist() * t) - weight_list[2].extend( - (dh[None].T * (1 - dw)[None]).flatten().tolist() * t) - weight_list[3].extend( - (dh[None].T * dw[None]).flatten().tolist() * t) + idx00 = (h_floor[:, None] * num_grid_per_side + + w_floor[None, :]).reshape(-1) + idx01 = (h_floor[:, None] * num_grid_per_side + + w_ceil[None, :]).reshape(-1) + idx10 = (h_ceil[:, None] * num_grid_per_side + + w_floor[None, :]).reshape(-1) + idx11 = (h_ceil[:, None] * num_grid_per_side + + w_ceil[None, :]).reshape(-1) - device = self.pos_embed.weight.device - dtype = self.pos_embed.weight.dtype + indices = torch.stack([idx00, idx01, idx10, idx11], dim=0) + weights = torch.stack([w00, w01, w10, w11], + dim=0).to(dtype=self.dtype, + device=self.device) + weights = weights.unsqueeze(-1) - p0 = self.pos_embed( - torch.tensor( - idx_list[0], dtype=torch.long, device=device)) * torch.tensor( - weight_list[0], dtype=dtype, device=device)[:, None] - p1 = self.pos_embed( - torch.tensor( - idx_list[1], dtype=torch.long, device=device)) * torch.tensor( - weight_list[1], dtype=dtype, device=device)[:, None] - p2 = self.pos_embed( - torch.tensor( - idx_list[2], dtype=torch.long, device=device)) * torch.tensor( - weight_list[2], dtype=dtype, device=device)[:, None] - p3 = self.pos_embed( - torch.tensor( - idx_list[3], dtype=torch.long, device=device)) * torch.tensor( - weight_list[3], dtype=dtype, device=device)[:, None] + embeds = self.pos_embed(indices) + weighted_embeds = embeds * weights + p0, p1, p2, p3 = weighted_embeds.unbind(dim=0) + combined = p0 + p1 + p2 + p3 - patch_pos_embeds = p0 + p1 + p2 + p3 - patch_pos_embeds = patch_pos_embeds.split( - [t * h * w for t, h, w in grid_thw]) - patch_pos_embeds_permute = [] - m_size = self.spatial_merge_size - for pos_embed, (t, h, w) in zip(patch_pos_embeds, grid_thw): - pos_embed = pos_embed.view(t, h // m_size, m_size, w // m_size, - m_size, -1).permute(0, 1, 3, 2, 4, - 5).flatten(0, 4) - patch_pos_embeds_permute.append(pos_embed) - patch_pos_embeds = torch.cat(patch_pos_embeds_permute) - return patch_pos_embeds + combined = combined.view(h * w, hidden_dim) + repeated = combined.unsqueeze(0).expand(t, -1, -1).contiguous() + repeated = repeated.view(t, h // m_size, m_size, w // m_size, + m_size, hidden_dim) + repeated = repeated.permute(0, 1, 3, 2, 4, + 5).reshape(-1, hidden_dim) + outputs.append(repeated) + + return torch.cat(outputs, dim=0) def compute_attn_mask_seqlen( self, @@ -477,12 +464,9 @@ class Qwen3_VisionTransformer(nn.Module): hidden_states = hidden_states + pos_embeds rotary_pos_emb = self.rot_pos_emb(grid_thw) - if isinstance(grid_thw, list): - grid_thw_tensor = torch.tensor(grid_thw, - device=hidden_states.device, - dtype=torch.int32) - else: - grid_thw_tensor = grid_thw + grid_thw_tensor = torch.tensor(grid_thw, + device=self.device, + dtype=torch.int32) cu_seqlens = torch.repeat_interleave( grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2], @@ -1224,7 +1208,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw_list, rope_type="rope_3d") else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. 
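Stepping back, the index and weight tensors built in `fast_pos_embed_interpolate` amount to a vectorized bilinear interpolation over a square position-embedding table. A small self-contained sketch of the same arithmetic; `G`, `D`, `h` and `w` are made-up sizes, not the model's real values:

```python
# Bilinear interpolation of a (G*G, D) embedding table onto an (h, w) grid
# using four corner indices and weights per target position.
import torch

G, D, h, w = 4, 8, 6, 5            # grid side, embed dim, target height/width
table = torch.randn(G * G, D)      # stands in for self.pos_embed.weight

h_idx = torch.linspace(0, G - 1, h)
w_idx = torch.linspace(0, G - 1, w)
h0, w0 = h_idx.long(), w_idx.long()
h1, w1 = (h0 + 1).clamp(max=G - 1), (w0 + 1).clamp(max=G - 1)
dh, dw = h_idx - h0, w_idx - w0

w00 = ((1 - dh)[:, None] * (1 - dw)[None, :]).reshape(-1)
w01 = ((1 - dh)[:, None] * dw[None, :]).reshape(-1)
w10 = (dh[:, None] * (1 - dw)[None, :]).reshape(-1)
w11 = (dh[:, None] * dw[None, :]).reshape(-1)

idx00 = (h0[:, None] * G + w0[None, :]).reshape(-1)
idx01 = (h0[:, None] * G + w1[None, :]).reshape(-1)
idx10 = (h1[:, None] * G + w0[None, :]).reshape(-1)
idx11 = (h1[:, None] * G + w1[None, :]).reshape(-1)

pos = (table[idx00] * w00[:, None] + table[idx01] * w01[:, None] +
       table[idx10] * w10[:, None] + table[idx11] * w11[:, None])
print(pos.shape)  # torch.Size([30, 8]) == (h * w, D)
```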
# Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync @@ -1526,4 +1511,4 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, language_model="language_model", connector="model.visual.merger", tower_model="model.visual.", - ) \ No newline at end of file + ) From 9aea7373fffe71c239b08a89edbbc46bbb560f45 Mon Sep 17 00:00:00 2001 From: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Date: Sun, 21 Sep 2025 13:36:47 +0200 Subject: [PATCH 074/185] [Bugfix] Typos in error message for missing model config file (#25339) Signed-off-by: simondanielsson --- vllm/transformers_utils/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index cafc43f6b7673..52e2c18a77847 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -524,10 +524,10 @@ def get_config( else: raise ValueError( "Could not detect config format for no config file found. " - "With config_format 'auto', ensure your model has either" - "config.json (HF format) or params.json (Mistral format)." - "Otherwise please specify your_custom_config_format" - "in engine args for customized config parser") + "With config_format 'auto', ensure your model has either " + "config.json (HF format) or params.json (Mistral format). " + "Otherwise please specify your_custom_config_format " + "in engine args for customized config parser.") except Exception as e: error_message = ( From 65a5910ce35f889740bddb2e19dad35c83278873 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 21 Sep 2025 19:41:02 +0800 Subject: [PATCH 075/185] [Optimization] Cache chat template result when processor fails to be loaded (#25341) Signed-off-by: DarkLight1337 --- vllm/entrypoints/chat_utils.py | 71 +++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c2c0ad74ef431..df49119d86420 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -421,6 +421,51 @@ def resolve_mistral_chat_template( return None +_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]() +""" +Used in `_try_get_processor_chat_template` to avoid calling +`cached_get_processor` again if the processor fails to be loaded. + +This is needed because `lru_cache` does not cache when an exception happens. 
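The behavior described above is easy to demonstrate: `functools.lru_cache` memoizes only successful calls, so a loader that raises is re-invoked (and re-fails) every time, which is why a plain dict that records the failure is used instead. A minimal sketch; the loader name and error are made up:

```python
# lru_cache does not cache exceptions: every call re-runs the failing loader.
from functools import lru_cache

calls = 0

@lru_cache
def flaky_loader(name: str):
    global calls
    calls += 1
    raise RuntimeError("processor failed to load")

for _ in range(3):
    try:
        flaky_loader("x")
    except RuntimeError:
        pass

print(calls)  # 3 -- one real invocation per call, nothing was memoized
```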
+""" + + +def _try_get_processor_chat_template( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + model_config: ModelConfig, +) -> Optional[str]: + cache_key = (tokenizer.name_or_path, model_config.trust_remote_code) + if cache_key in _PROCESSOR_CHAT_TEMPLATES: + return _PROCESSOR_CHAT_TEMPLATES[cache_key] + + try: + processor = cached_get_processor( + tokenizer.name_or_path, + processor_cls=( + PreTrainedTokenizer, + PreTrainedTokenizerFast, + ProcessorMixin, + ), + trust_remote_code=model_config.trust_remote_code, + ) + if ( + isinstance(processor, ProcessorMixin) + and hasattr(processor, "chat_template") + and (chat_template := processor.chat_template) is not None + ): + _PROCESSOR_CHAT_TEMPLATES[cache_key] = chat_template + return chat_template + except Exception: + logger.debug( + "Failed to load AutoProcessor chat template for %s", + tokenizer.name_or_path, + exc_info=True, + ) + + _PROCESSOR_CHAT_TEMPLATES[cache_key] = None + return None + + def resolve_hf_chat_template( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], chat_template: Optional[str], @@ -434,28 +479,10 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: - try: - processor = cached_get_processor( - tokenizer.name_or_path, - processor_cls=( - PreTrainedTokenizer, - PreTrainedTokenizerFast, - ProcessorMixin, - ), - trust_remote_code=model_config.trust_remote_code, - ) - if ( - isinstance(processor, ProcessorMixin) - and hasattr(processor, "chat_template") - and processor.chat_template is not None - ): - return processor.chat_template - except Exception: - logger.debug( - "Failed to load AutoProcessor chat template for %s", - tokenizer.name_or_path, - exc_info=True, - ) # noqa: E501 + chat_template = _try_get_processor_chat_template(tokenizer, + model_config) + if chat_template is not None: + return chat_template # 3rd priority: AutoTokenizer chat template try: From 26e673fe9303ad759a47e19a087764393c69109f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 21 Sep 2025 08:52:15 -0700 Subject: [PATCH 076/185] [V0 Deprecation] Remove V0 Sequence class & Sampler (#25332) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- tests/conftest.py | 2 +- .../generation/test_granite_speech.py | 2 +- .../multimodal/generation/test_phi4mm.py | 2 +- .../multimodal/generation/test_pixtral.py | 2 +- .../generation/vlm_utils/model_utils.py | 2 +- .../multimodal/generation/vlm_utils/types.py | 2 +- tests/models/utils.py | 2 +- tests/tokenization/test_detokenize.py | 140 +- tests/tool_use/test_jamba_tool_parser.py | 2 +- tests/tool_use/test_qwen3coder_tool_parser.py | 2 +- tests/tool_use/test_seed_oss_tool_parser.py | 2 +- tests/tool_use/test_xlam_tool_parser.py | 2 +- tests/v1/engine/test_output_processor.py | 2 +- vllm/executor/executor_base.py | 2 +- vllm/executor/ray_distributed_executor.py | 2 +- vllm/inputs/__init__.py | 13 +- vllm/inputs/registry.py | 67 +- vllm/model_executor/__init__.py | 4 +- .../model_executor/layers/logits_processor.py | 91 -- vllm/model_executor/layers/sampler.py | 1198 --------------- vllm/model_executor/models/medusa.py | 60 +- vllm/model_executor/models/mlp_speculator.py | 78 +- vllm/model_executor/models/phi4flash.py | 6 +- vllm/model_executor/sampling_metadata.py | 594 +------- vllm/sequence.py | 1322 +---------------- vllm/transformers_utils/detokenizer.py | 162 -- vllm/worker/worker_base.py | 2 +- 27 files changed, 69 insertions(+), 3696 deletions(-) delete mode 100644 
vllm/model_executor/layers/sampler.py delete mode 100644 vllm/transformers_utils/detokenizer.py diff --git a/tests/conftest.py b/tests/conftest.py index f14b1e8780ad9..dc70c98359598 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,10 +48,10 @@ from vllm.distributed import (cleanup_dist_env_and_memory, initialize_model_parallel) from vllm.inputs import TextPrompt from vllm.logger import init_logger +from vllm.logprobs import Logprob from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams -from vllm.sequence import Logprob from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils import set_default_torch_num_threads diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index f2e6fbfad6e80..c1305e0ae31ce 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -7,8 +7,8 @@ from typing import Optional import pytest from transformers import AutoModelForSpeechSeq2Seq +from vllm.logprobs import SampleLogprobs from vllm.lora.request import LoRARequest -from vllm.sequence import SampleLogprobs from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner) diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index 67d35213d6422..77e2b90dd5e96 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -12,10 +12,10 @@ from huggingface_hub import snapshot_download from transformers import AutoTokenizer from vllm.assets.image import ImageAsset +from vllm.logprobs import SampleLogprobs from vllm.lora.request import LoRARequest from vllm.multimodal.image import convert_image_mode, rescale_image_size from vllm.platforms import current_platform -from vllm.sequence import SampleLogprobs from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput, PromptImageInput, VllmRunner) diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index cb3cc1d3d330d..715b08ef90e54 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -13,8 +13,8 @@ from mistral_common.tokens.tokenizers.multimodal import image_from_chunk from transformers import AutoProcessor from vllm import SamplingParams, TextPrompt, TokensPrompt +from vllm.logprobs import Logprob, SampleLogprobs from vllm.multimodal import MultiModalDataBuiltins -from vllm.sequence import Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test from ...utils import check_logprobs_close diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 8b7d051218f14..ba55450ec8a90 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -19,7 +19,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature, GenerationConfig, GenerationMixin) from transformers.video_utils import VideoMetadata -from vllm.sequence import SampleLogprobs +from vllm.logprobs import SampleLogprobs from vllm.utils import is_list_of from .....conftest import HfRunner, ImageAsset, ImageTestAssets diff --git a/tests/models/multimodal/generation/vlm_utils/types.py 
b/tests/models/multimodal/generation/vlm_utils/types.py index 9451131960885..e39ca40fbbf5e 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -12,7 +12,7 @@ from transformers import AutoModelForCausalLM from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config import RunnerOption -from vllm.sequence import SampleLogprobs +from vllm.logprobs import SampleLogprobs from vllm.transformers_utils.tokenizer import AnyTokenizer from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset, diff --git a/tests/models/utils.py b/tests/models/utils.py index 76c6e4823a12c..5da2382cef814 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -12,7 +12,7 @@ from transformers import PretrainedConfig from vllm.config import ModelConfig, ModelDType, RunnerOption from vllm.inputs import InputContext -from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs +from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from .registry import HF_EXAMPLE_MODELS diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index bd2b91073d568..fe6c313d2966f 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -8,10 +8,7 @@ import pytest from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from vllm.inputs import token_inputs -from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer, @@ -217,138 +214,3 @@ def test_oov_decode(tokenizer, fast): assert decoded_text == '' assert out_ids == [len(tokenizer)] - - -@pytest.fixture -def detokenizer(tokenizer_name: str) -> Detokenizer: - tokenizer = get_tokenizer( - tokenizer_name, - tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto", - trust_remote_code=False, - revision=None, - ) - - return Detokenizer(tokenizer) - - -@pytest.fixture(name="complete_sequence_token_ids") -def create_complete_sequence_token_ids(complete_sequence: str, - tokenizer) -> list[int]: - return tokenizer(complete_sequence, add_special_tokens=False).input_ids - - -def create_sequence(prompt_token_ids=None): - prompt_token_ids = prompt_token_ids or [] - return Sequence( - seq_id=0, - inputs=token_inputs(prompt_token_ids), - block_size=16, - ) - - -def create_dummy_logprobs( - complete_sequence_token_ids: list[int]) -> list[dict[int, Logprob]]: - return [{ - token_id: Logprob(logprob=0.0), - token_id + 1: Logprob(logprob=0.1) - } for token_id in complete_sequence_token_ids] - - -def create_dummy_prompt_logprobs( - complete_sequence_token_ids: list[int] -) -> list[Optional[dict[int, Any]]]: - # logprob for the first prompt token is None. 
- logprobs: list[Optional[dict[int, Any]]] = [None] - logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:]) - return logprobs - - -@pytest.mark.parametrize("complete_sequence", TRUTH) -@pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True) -def test_decode_sequence_logprobs(complete_sequence: str, - complete_sequence_token_ids: list[int], - detokenizer: Detokenizer, - skip_special_tokens: bool): - """Verify Detokenizer decodes logprobs correctly.""" - sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, - logprobs=2) - - # Run sequentially. - seq = create_sequence() - dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token: list[str] = [] - sequential_logprobs_text_other_token: list[str] = [] - for new_token, logprobs in zip(complete_sequence_token_ids, - dummy_logprobs): - seq.append_token_id(new_token, logprobs) - detokenizer.decode_sequence_inplace(seq, sampling_params) - sequential_logprobs_text_chosen_token.append( - seq.output_logprobs[-1][new_token].decoded_token) - sequential_logprobs_text_other_token.append( - seq.output_logprobs[-1][new_token + 1].decoded_token) - sequential_result = seq.output_text - - assert sequential_result == "".join(sequential_logprobs_text_chosen_token) - assert sequential_result != "".join(sequential_logprobs_text_other_token) - - if not skip_special_tokens: - # Text for logprobs for the chosen token should be the same as the - # generated text. Note that this will only be true if we skip - # special tokens. - assert sequential_result == complete_sequence - - -@pytest.mark.parametrize("complete_sequence", TRUTH) -@pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -def test_decode_prompt_logprobs(complete_sequence: str, - complete_sequence_token_ids: list[int], - detokenizer: Detokenizer): - - # We want to use skip_special_tokens=False here but Mistral tokenizers - # don't support that. - if complete_sequence not in SPECIAL_TOKS_TRUTH: - skip_special_tokens = True - elif not isinstance(detokenizer.tokenizer, MistralTokenizer): - skip_special_tokens = False - else: - pytest.skip("MistralTokenizers don't support " - "skip_special_tokens=False") - return - """Verify Detokenizer decodes prompt logprobs correctly.""" - sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, - prompt_logprobs=1) - - # Run sequentially. - seq = create_sequence(complete_sequence_token_ids) - seq_group = SequenceGroup(request_id="1", - seqs=[seq], - sampling_params=sampling_params, - arrival_time=0.0) - dummy_logprobs = create_dummy_prompt_logprobs(complete_sequence_token_ids) - detokenizer.decode_prompt_logprobs_inplace(seq_group, - dummy_logprobs, - position_offset=0) - # First logprob is None. - decoded_prompt_logprobs: list[dict[int, Any]] = dummy_logprobs[ - 1:] # type: ignore - - # decoded_prompt_logprobs doesn't contain the first token. - token_ids = complete_sequence_token_ids - tokenizer = detokenizer.tokenizer - text_full = tokenizer.decode(token_ids, - skip_special_tokens=skip_special_tokens) - text_first = tokenizer.decode(token_ids[0], - skip_special_tokens=skip_special_tokens) - text = text_full[len(text_first):] - - # Text for logprobs for the chosen token should be the same as the - # prompt text. Note that the first logprob is None. 
- assert text == "".join([ - logprobs[token_id].decoded_token - for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs) - ]) - assert text != "".join([ - logprobs[token_id + 1].decoded_token - for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs) - ]) diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index 35153139350bf..57ace1fa22ac9 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -12,7 +12,7 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import JambaToolParser -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer MODEL = "ai21labs/Jamba-tiny-dev" diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index ccb2acf512caf..f06fb2b9f2f04 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ToolCall) from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( Qwen3CoderToolParser) -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py index c276a598aa68c..118c7534622e2 100644 --- a/tests/tool_use/test_seed_oss_tool_parser.py +++ b/tests/tool_use/test_seed_oss_tool_parser.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer # Use a common model that is likely to be available diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py index 0bc22e4f1031c..c07ca0f56d6b0 100644 --- a/tests/tool_use/test_xlam_tool_parser.py +++ b/tests/tool_use/test_xlam_tool_parser.py @@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import xLAMToolParser -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer # Use a common model that is likely to be available diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index a9632ce54eac8..bdb40be99aa3f 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -12,9 +12,9 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, STOP_STRINGS, DummyOutputProcessorTestVectors, MockEngineCore) +from vllm.logprobs import 
PromptLogprobs, SampleLogprobs from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import PromptLogprobs, SampleLogprobs from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.output_processor import (OutputProcessor, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index b75b94ad0acc2..fd4b992c3821b 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -15,10 +15,10 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.tasks import SupportedTask from vllm.utils import make_async +from vllm.v1.outputs import SamplerOutput from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 78d0ee6c1e3fc..84747575b4960 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -17,12 +17,12 @@ from vllm.executor.msgspec_utils import encode_hook from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster, ray) from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform from vllm.ray.ray_env import get_env_vars_to_copy from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, make_async) +from vllm.v1.outputs import SamplerOutput if ray is not None: from ray.actor import ActorHandle diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index e9db2a0dc13a8..46f49aaa013da 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -7,15 +7,7 @@ from .data import (DataPrompt, DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt, SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, build_explicit_enc_dec_prompt, embeds_inputs, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import (DummyData, InputContext, InputProcessingContext, - InputRegistry) - -INPUT_REGISTRY = InputRegistry() -""" -The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used -by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the -target model. 
-""" +from .registry import InputContext, InputProcessingContext __all__ = [ "DataPrompt", @@ -36,9 +28,6 @@ __all__ = [ "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", - "INPUT_REGISTRY", - "DummyData", "InputContext", "InputProcessingContext", - "InputRegistry", ] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index f0b392e9767ae..b5316b6d0574c 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union +from typing import TYPE_CHECKING, Any, Union import torch from transformers import BatchFeature, PretrainedConfig, ProcessorMixin @@ -15,16 +15,9 @@ from vllm.utils.jsontree import JSONTree, json_map_leaves if TYPE_CHECKING: from vllm.config import ModelConfig - from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict, - MultiModalRegistry) - from vllm.sequence import SequenceData from vllm.transformers_utils.tokenizer import AnyTokenizer else: ModelConfig = Any - MultiModalDataDict = Any - MultiModalPlaceholderDict = Any - MultiModalRegistry = Any - SequenceData = Any AnyTokenizer = Any _T = TypeVar("_T") @@ -191,61 +184,3 @@ class InputProcessingContext(InputContext): f"on data={data} with kwargs={allowed_kwargs}") raise ValueError(msg) from exc - - -class DummyData(NamedTuple): - """ - Dummy data used for profiling. - - Note: This is only used in V0. - """ - - seq_data: SequenceData - multi_modal_data: Optional[MultiModalDataDict] = None - multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None - - -class InputRegistry: - """ - Note: This is only used in V0. - """ - - def dummy_data_for_profiling( - self, - model_config: ModelConfig, - seq_len: int, - mm_registry: MultiModalRegistry, - is_encoder_data: bool = False, - ) -> DummyData: - """ - Create dummy data for profiling the memory usage of a model. - - The model is identified by ``model_config``. 
- """ - # Avoid circular import - from vllm.multimodal.cache import processor_only_cache_from_config - from vllm.sequence import SequenceData - - if not model_config.is_multimodal_model: - seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) - return DummyData(seq_data=seq_data) - - cache = processor_only_cache_from_config(model_config, mm_registry) - - # Encoder dummy data does not contain multi-modal data - if is_encoder_data: - enc_data = mm_registry.get_encoder_dummy_data(model_config, - seq_len, - cache=cache) - seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids) - return DummyData(seq_data=seq_data) - - dec_data = mm_registry.get_decoder_dummy_data(model_config, - seq_len, - cache=cache) - - return DummyData( - seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), - multi_modal_data=dec_data.multi_modal_data.get_data(), - multi_modal_placeholders=dec_data.multi_modal_placeholders, - ) diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 55dfe8088c8f3..a59aebfac4ff9 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -3,13 +3,11 @@ from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) -from vllm.model_executor.sampling_metadata import (SamplingMetadata, - SamplingMetadataCache) +from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed __all__ = [ "SamplingMetadata", - "SamplingMetadataCache", "set_random_seed", "BasevLLMParameter", "PackedvLLMParameter", diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 8a4ac214443eb..8226437cb1898 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,13 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that compute logits from hidden_stats.""" -import inspect -from concurrent.futures import ThreadPoolExecutor from typing import Optional import torch -import vllm.envs as envs from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) from vllm.model_executor.custom_op import CustomOp @@ -16,11 +13,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform -_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None -if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: - _logits_processor_threadpool = ThreadPoolExecutor( - envs.VLLM_LOGITS_PROCESSOR_THREADS) - @CustomOp.register("logits_processor") class LogitsProcessor(CustomOp): @@ -60,15 +52,10 @@ class LogitsProcessor(CustomOp): hidden_states: torch.Tensor, sampling_metadata: Optional[SamplingMetadata] = None, embedding_bias: Optional[torch.Tensor] = None, - prune_hidden_states: bool = True, ) -> Optional[torch.Tensor]: if self.logits_as_input: logits = hidden_states else: - if sampling_metadata is not None and prune_hidden_states: - hidden_states = _prune_hidden_states(hidden_states, - sampling_metadata) - # Get the logits for the next tokens. logits = self._get_logits(hidden_states, lm_head, embedding_bias) if logits is not None: @@ -79,12 +66,6 @@ class LogitsProcessor(CustomOp): if self.scale != 1.0: logits *= self.scale - - # Apply logits processors (if any). 
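# NOTE (editor): illustrative sketch only, not part of this diff. The
# per-request logits processors removed in this hunk were plain Python
# callables applied to one logits row at a time, taking either
# (past_token_ids, logits_row) or (prompt_token_ids, past_token_ids,
# logits_row) and returning the modified row, as the deleted dispatch code
# below shows. A minimal example of such a callable (names are hypothetical):
import torch

def ban_token_zero(past_token_ids: list[int],
                   logits_row: torch.Tensor) -> torch.Tensor:
    # Forbid token id 0 once at least one token has been generated.
    if past_token_ids:
        logits_row[0] = float("-inf")
    return logits_row
# End of editor's sketch.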
- if sampling_metadata is not None and \ - sampling_metadata.seq_groups is not None: - logits = _apply_logits_processors(logits, sampling_metadata) - return logits def _gather_logits(self, logits: torch.Tensor) -> torch.Tensor: @@ -125,75 +106,3 @@ class LogitsProcessor(CustomOp): s += f", org_vocab_size={self.org_vocab_size}" s += f", scale={self.scale}, logits_as_input={self.logits_as_input}" return s - - -def _prune_hidden_states( - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios - # (warmup, profile_run) we might not have selected_token_indices, - # so we skip pruning. - if sampling_metadata.selected_token_indices is not None: - return hidden_states.index_select( - 0, sampling_metadata.selected_token_indices) - else: - return hidden_states - - -def _apply_logits_processors( - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - found_logits_processors = False - logits_processed = 0 - logits_row_ids_and_logits_row_futures = [] - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - logits_processors = sampling_params.logits_processors - if logits_processors: - found_logits_processors = True - - for seq_id, logits_row_idx in zip(seq_ids, - seq_group.sample_indices): - logits_row = logits[logits_row_idx] - past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids - prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - - if _logits_processor_threadpool is not None: - logits_row_ids_and_logits_row_futures.append( - (logits_row_idx, - _logits_processor_threadpool.submit( - _apply_logits_processors_single_seq, logits_row, - logits_processors, past_tokens_ids, - prompt_tokens_ids))) - else: - logits[logits_row_idx] = \ - _apply_logits_processors_single_seq( - logits_row, logits_processors, past_tokens_ids, - prompt_tokens_ids) - - logits_processed += len(seq_group.sample_indices) + len( - seq_group.prompt_logprob_indices) - - for logits_row_idx, future in logits_row_ids_and_logits_row_futures: - logits[logits_row_idx] = future.result() - - if found_logits_processors: - # verifies that no rows in logits were missed unexpectedly - assert logits_processed == logits.shape[0] - return logits - - -def _apply_logits_processors_single_seq(logits_row, logits_processors, - past_tokens_ids, - prompt_tokens_ids) -> torch.Tensor: - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, - logits_row) - else: - logits_row = logits_processor(past_tokens_ids, logits_row) - return logits_row diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py deleted file mode 100644 index 9d93cad2420ad..0000000000000 --- a/vllm/model_executor/layers/sampler.py +++ /dev/null @@ -1,1198 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A layer that samples the next tokens from the model's outputs.""" -import itertools -from collections.abc import Iterator -from dataclasses import dataclass -from importlib.util import find_spec -from math import inf -from typing import Optional, Union - -import msgspec -import torch -import torch.nn as nn - -import vllm.envs as envs -from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs -from 
vllm.model_executor.layers.utils import apply_penalties -from vllm.model_executor.sampling_metadata import (SamplingMetadata, - SamplingTensors, - SequenceGroupToSample) -from vllm.sampling_params import SamplingType -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, SequenceOutput) - -if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): - # yapf: disable - from flashinfer.sampling import ( - top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) - - # yapf: enable -else: - flashinfer_top_k_top_p_sampling = None - -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -def get_sampler() -> torch.nn.Module: - if envs.VLLM_USE_V1: - # Lazy import: the v1 package isn't distributed - from vllm.v1.sample.sampler import Sampler as V1Sampler - return V1Sampler() - return Sampler() - - -# (num_token_ids, num_parent_ids) per sequence group. -SampleResultType = list[tuple[list[int], list[int]]] - -# Types of temporary data structures used for -# computing sample_result -SampleMetadataType = dict[SamplingType, tuple[list[int], - list[SequenceGroupToSample]]] -MultinomialSamplesType = dict[SamplingType, torch.Tensor] -SampleResultsDictType = dict[int, tuple[list[int], list[int]]] - - -# Encapsulates temporary data structures for computing -# sample_result. -# -# * For multi-step scheduling: must be returned -# by `Sampler.forward()` and used later to compute the pythonized -# sample_result -# -# * For single-step scheduling: consumed immediately -# inside `Sampler.forward()` to compute pythonized sample_result. -@dataclass -class SampleResultArgsType: - sample_metadata: SampleMetadataType - multinomial_samples: MultinomialSamplesType - sample_results_dict: SampleResultsDictType - sampling_metadata: SamplingMetadata - greedy_samples: Optional[torch.Tensor] - - -# Union of non-deferred (single-step scheduling) -# vs deferred (multi-step scheduling) -# sample result types -MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType] - -# Abbreviation of the _sample() return type -SampleReturnType = tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] - - -class SamplerOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """For each sequence group, we generate a list of SequenceOutput object, - each of which contains one possible candidate for the next token. - - This data structure implements methods, so it can be used like a list, but - also has optional fields for device tensors. - """ - - outputs: list[CompletionSequenceGroupOutput] - - # On-device tensor containing probabilities of each token. - sampled_token_probs: Optional[torch.Tensor] = None - - # On-device tensor containing the logprobs of each token. - logprobs: Optional["torch.Tensor"] = None - - # Holds either (1) the pythonized sampler result (single-step scheduling) - # or (2) what will be arguments for later deferred pythonization of the - # sampler result (muliti-step scheduling) - deferred_sample_results_args: Optional[SampleResultArgsType] = None - - # On-device tensor containing the sampled token ids. - sampled_token_ids: Optional[torch.Tensor] = None - # CPU tensor containing the sampled token ids. Used during multi-step to - # return the sampled token ids from last rank to AsyncLLMEngine to be - # 'broadcasted' to all other PP ranks for next step. 
- sampled_token_ids_cpu: Optional[torch.Tensor] = None - - # On-device tensor containing the sampled token embeddings (embeddings - # corresponding to the sampled token ids). Used when prompt embeddings are - # specified in lieu of prompt token ids or text. - sampled_token_embeds: Optional[torch.Tensor] = None - - # Optional last hidden states from the model. - hidden_states: Optional[torch.Tensor] = None - - # Optional prefill hidden states from the model - # (used for models like EAGLE). - prefill_hidden_states: Optional[torch.Tensor] = None - - # Time taken in the forward pass for this across all workers - model_forward_time: Optional[float] = None - - # Time taken in the model execute function. This will include model forward, - # block/sync across workers, cpu-gpu sync time and sampling time. - model_execute_time: Optional[float] = None - - def __getitem__(self, idx: int) -> CompletionSequenceGroupOutput: - return self.outputs[idx] - - def __setitem__(self, idx: int, value): - self.outputs[idx] = value - - def __iter__(self) -> Iterator[CompletionSequenceGroupOutput]: - return iter(self.outputs) - - def __len__(self): - return len(self.outputs) - - def __eq__(self, other: object): - return isinstance(other, - self.__class__) and self.outputs == other.outputs - - def __repr__(self) -> str: - """Show the shape of a tensor instead of its values to reduce noise. - """ - sampled_token_probs_repr = ("None" if self.sampled_token_probs is None - else self.sampled_token_probs.shape) - sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else - self.sampled_token_ids.shape) - return (f"SamplerOutput(outputs={self.outputs}, " - f"sampled_token_probs={sampled_token_probs_repr}, " - f"sampled_token_ids={sampled_token_ids_repr})") - - -class Sampler(nn.Module): - """Samples the next tokens from the model's outputs. - - This layer does the following: - 1. Discard the hidden states that are not used for sampling (i.e., all - tokens except the final one in each prompt). - 2. Compute the logits for the next tokens. - 3. Apply presence, frequency and repetition penalties. - 4. Apply temperature scaling. - 5. Apply top-p and top-k truncation. - 6. Sample the next tokens. - Here, each sequence group within the batch can have different sampling - parameters (e.g., sampling method, temperature, top-p, top-k, etc.). - - The structure of the logits tensor is coupled with the seq_groups in - sampling_metadata. Typically, each sequence in each seq_group has one row in - logits for the next token to be sampled; however, for a seq_group with a - prompt request with the prompt_logprobs sampling parameter, there are rows - in logits for each token in the input prompt. - """ - - def __init__(self): - super().__init__() - - # Whether or not the SamplerOutput should have on-device tensors - # containing the sampled token ids and probabilities. This is used by - # speculative decoding and when prompt embeddings are specified. - self.include_gpu_probs_tensor = False - self.should_modify_greedy_probs_inplace = False - - def _init_sampling_tensors( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ): - """The goal here is to reuse sampling tensors between similar decode - runs. This is possible because sampling logic does not change between - decodes of the same sequences. - """ - _, vocab_size = logits.shape - - # First free any existing stored sampling tensors. - # This is necessary because some sampling tensors may - # have pinned memory. 
- self._sampling_tensors = None - - # Initialize new sampling tensors - (sampling_tensors, do_penalties, do_top_p_top_k, - do_min_p) = SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype) - - self._sampling_tensors = sampling_tensors - self._do_penalties = do_penalties - self._do_top_p_top_k = do_top_p_top_k - self._do_min_p = do_min_p - - def forward( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - """ - Single-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Pythonize sampling result & logprobs tensor - - Multi-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Defer Pythonization of sampling result & logprobs - tensor - * Encapsulate arguments required for deferred Pythonization - in the - [`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput] - structure - - Args: - logits: (num_tokens, vocab_size). - sampling_metadata: Metadata for sampling. - """ - assert logits is not None - _, vocab_size = logits.shape - - # Prepare sampling tensors with pinned memory to avoid blocking. - if not sampling_metadata.reuse_sampling_tensors: - self._init_sampling_tensors(logits, sampling_metadata) - elif self._do_penalties: - # In this case, the sampling tensors logic depends on - # "output_tokens" of a sequence. As a result, we cannot - # reuse sampling tensors, since "output_tokens" changes - # between decode runs. - self._init_sampling_tensors(logits, sampling_metadata) - - assert self._sampling_tensors is not None - sampling_tensors = self._sampling_tensors - do_penalties = self._do_penalties - do_top_p_top_k = self._do_top_p_top_k - do_min_p = self._do_min_p - - logits = _apply_min_tokens_penalty(logits, sampling_metadata) - - # Apply presence and frequency penalties. - if do_penalties: - logits = apply_penalties(logits, sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties) - - # Use float32 to apply temperature scaling. - # Use in-place division to avoid creating a new tensor. - logits = logits.to(torch.float) - logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) - - if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) - - if do_min_p: - logits = _apply_min_p(logits, sampling_tensors.min_ps) - - # We use float32 for probabilities and log probabilities. - # Compute the probabilities. - probs = torch.softmax(logits, dim=-1, dtype=torch.float) - # Compute the log probabilities. - logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) - - # Sample the next tokens. - maybe_deferred_sample_results, maybe_sampled_tokens_tensor = _sample( - probs, - logprobs, - sampling_metadata, - sampling_tensors, - include_gpu_probs_tensor=self.include_gpu_probs_tensor, - modify_greedy_probs=self._should_modify_greedy_probs_inplace, - ) - - if self.include_gpu_probs_tensor: - # Since we will defer sampler result Pythonization, - # preserve GPU-side tensors in support of later - # deferred pythonization of logprobs - assert maybe_sampled_tokens_tensor is not None - on_device_tensors = (probs, logprobs, maybe_sampled_tokens_tensor) - else: - # Since Pythonization has already happened, don't preserve - # GPU-side tensors. 
- on_device_tensors = None - - # Get the logprobs query results. - prompt_logprobs = None - sample_logprobs = None - if not sampling_metadata.skip_sampler_cpu_output: - # Pythonize logprobs now (GPU -> CPU); do not defer. - assert not isinstance(maybe_deferred_sample_results, - SampleResultArgsType) - prompt_logprobs, sample_logprobs = get_logprobs( - logprobs, sampling_metadata, maybe_deferred_sample_results) - - return _build_sampler_output( - maybe_deferred_sample_results, - sampling_metadata, - prompt_logprobs, - sample_logprobs, - on_device_tensors=on_device_tensors, - skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output) - - @property - def _should_modify_greedy_probs_inplace(self) -> bool: - """Whether or not the sampler should modify the probability distribution - of greedily-sampled tokens such that multinomial sampling would sample - the greedily-sampled token. - - In other words, if True then we set the probability of the greedily- - sampled token to 1. - - This is used by speculative decoding, which requires that the sampling - method be encoded into the probability distribution. - """ - return self.should_modify_greedy_probs_inplace - - -def _apply_min_tokens_penalty( - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens - have not been generated yet - """ - # list of indices in logits that will be set to -inf - logits_to_penalize: list[tuple[int, int]] = [] - logits_applied = 0 - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - - sample_indices = seq_group.sample_indices - logits_applied += len(sample_indices) + len( - seq_group.prompt_logprob_indices) - if not seq_group.do_sample: - continue - - start_idx = sample_indices[0] - min_tokens = sampling_params.min_tokens - token_ids_to_penalize = sampling_params.all_stop_token_ids - if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize: list[int] = [] - for j, seq_id in enumerate(seq_ids): - seq_data = seq_group.seq_data[seq_id] - if len(seq_data.output_token_ids_array) < min_tokens: - seqs_to_penalize.append(j) - - if seqs_to_penalize: - # convert to the index into logits - seqs_to_penalize = [start_idx + j for j in seqs_to_penalize] - # itertools.product pairs each seq index with every token id - logits_to_penalize.extend( - itertools.product(seqs_to_penalize, token_ids_to_penalize)) - - if logits_to_penalize: - # use zip and * to group indices along each dimension - # eg. [ (1,2), (1,3), (5,6) ] -> ( (1,1,5), (2,3,6) ) - logits[tuple(zip(*logits_to_penalize))] = -float("inf") - - # verifies that no rows in logits were missed unexpectedly - assert logits_applied == logits.shape[0] - return logits - - -def _apply_top_k_top_p( - logits: torch.Tensor, - p: torch.Tensor, - k: torch.Tensor, -) -> torch.Tensor: - logits_sort, logits_idx = logits.sort(dim=-1, descending=False) - - # Apply top-k. - top_k_mask = logits_sort.size(1) - k.to(torch.long) - # Get all the top_k values. - top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) - top_k_mask = logits_sort < top_k_mask - logits_sort.masked_fill_(top_k_mask, -float("inf")) - - # Apply top-p. - probs_sort = logits_sort.softmax(dim=-1) - probs_sum = probs_sort.cumsum(dim=-1) - top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) - # at least one - top_p_mask[:, -1] = False - logits_sort.masked_fill_(top_p_mask, -float("inf")) - - # Re-sort the probabilities. 
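# NOTE (editor): illustrative sketch only, not part of this diff. A
# standalone toy run of the top-k / top-p masking performed by the
# surrounding _apply_top_k_top_p helper: sort ascending, drop everything
# outside the top k, then drop the low-probability tail whose cumulative
# mass is at most 1 - p. All names below are illustrative.
import torch

logits = torch.log(torch.tensor([[0.1, 0.2, 0.3, 0.4]]))  # toy 4-token vocab
k, p = 3, 0.7
vals, idx = logits.sort(dim=-1, descending=False)
kth_smallest = vals.gather(1, torch.tensor([[vals.size(1) - k]]))
vals[vals < kth_smallest] = -float("inf")                            # top-k mask
vals[vals.softmax(dim=-1).cumsum(dim=-1) <= 1 - p] = -float("inf")   # top-p mask
filtered = torch.full_like(logits, -float("inf")).scatter_(1, idx, vals)
# With these settings only the two most likely tokens (0.3 and 0.4) survive.
# End of editor's sketch.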
- logits = torch.empty_like(logits_sort).scatter_(dim=-1, - index=logits_idx, - src=logits_sort) - return logits - - -def _apply_min_p( - logits: torch.Tensor, - min_p: torch.Tensor, -) -> torch.Tensor: - """ - Adapted from - https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17 - """ - probs = torch.softmax(logits, dim=-1) - top_probs, _ = probs.max(dim=-1, keepdim=True) - scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs - tokens_to_remove = probs < scaled_min_p - logits = logits.masked_fill_(tokens_to_remove, -float("inf")) - - return logits - - -def _greedy_sample( - selected_seq_groups: list[SequenceGroupToSample], - samples: torch.Tensor, -) -> SampleResultType: - """Run greedy sampling on a given samples. - - Args: - selected_seq_groups: A list of sequence groups batched. - samples: (num_selected_samples,) A tensor of samples. The length of - samples could be smaller than selected_seq_groups if - seq_group.do_sample is False. - Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is - same as the length of selected_seq_groups. If the corresponding - seq_group has do_sample=False, tuple contains ([], []) - """ - samples_lst = samples.tolist() - sample_idx = 0 - results: SampleResultType = [] - for seq_group in selected_seq_groups: - if not seq_group.do_sample: - results.append(([], [])) - continue - - seq_ids = seq_group.seq_ids - num_parent_seqs = len(seq_ids) - assert num_parent_seqs == 1, ( - "Greedy sampling should have only one seq.") - parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples_lst[sample_idx]] - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -def _random_sample( - selected_seq_groups: list[SequenceGroupToSample], - random_samples: torch.Tensor, -) -> SampleResultType: - """Run random sampling on a given samples. - - Args: - selected_seq_groups: A list of sequence groups batched. - random_samples: (num_selected_samples,) A tensor of samples. The - length of samples could be smaller than selected_seq_groups if - seq_group.do_sample is False. - Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is - same as the length of selected_seq_groups. If the corresponding - seq_group has do_sample=False, tuple contains ([], []) - """ - # Find the maximum n value of the prompt phase requests. - random_samples = random_samples.cpu() - sample_idx = 0 - results: SampleResultType = [] - for seq_group in selected_seq_groups: - if not seq_group.do_sample: - results.append(([], [])) - continue - - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - is_prompt = seq_group.is_prompt - num_parent_seqs = len(seq_ids) - if is_prompt: - # Prompt phase. - parent_ids = [0] * sampling_params.n - next_token_ids = random_samples[ - sample_idx, :sampling_params.n].tolist() - else: - # Generation phase. - parent_ids = list(range(num_parent_seqs)) - next_token_ids = random_samples[sample_idx:sample_idx + - num_parent_seqs, 0].tolist() - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -# torch.multinomial forces a GPU<->CPU sync. -# Therefore, we use an optimized implementation instead. -# Note that we always sample with replacement. -# probs will be modified in place, but this is fine, as we pass -# in a copy already. 
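# NOTE (editor): illustrative sketch only, not part of this diff. The
# _multinomial helper defined below avoids torch.multinomial's GPU<->CPU
# sync by using the "exponential race" identity: if E_i ~ Exponential(1)
# independently, then argmax_i(p_i / E_i) is distributed as Categorical(p).
# A self-contained version of the same trick (illustrative names):
import torch

def sample_categorical_no_sync(probs: torch.Tensor) -> torch.Tensor:
    # probs: (batch, vocab), rows sum to 1; everything stays on-device.
    q = torch.empty_like(probs).exponential_()   # E ~ Exp(1)
    return probs.div(q).argmax(dim=-1)
# End of editor's sketch.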
-def _multinomial( - probs: torch.Tensor, - num_samples: int, - seq_groups: Optional[list[SequenceGroupToSample]] = None, -) -> torch.Tensor: - if num_samples > 1: - probs = probs.repeat_interleave(num_samples, dim=0) - q = torch.empty_like(probs) - if seq_groups is None: - q.exponential_() - else: - sample_idx = 0 - for seq_group in seq_groups: - seq_ids = seq_group.seq_ids - stride = len(seq_ids) * num_samples - assert seq_group.generator is not None - q[sample_idx:sample_idx + - stride].exponential_(generator=seq_group.generator) - sample_idx += stride - return probs.div_(q).argmax(dim=1).view(-1, num_samples) - - -def _top_k_top_p_multinomial_with_flashinfer( - probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, - num_samples: int, seq_groups: Optional[list[SequenceGroupToSample]]): - if num_samples > 1: - probs = probs.repeat_interleave(num_samples, dim=0) - top_ks = top_ks.repeat_interleave(num_samples) - top_ps = top_ps.repeat_interleave(num_samples) - batch_next_token_ids = flashinfer_top_k_top_p_sampling( - probs, - top_ks, - top_ps, - ) - return batch_next_token_ids.view(-1, num_samples) - - -def get_pythonized_sample_results( - sample_result_args: SampleResultArgsType) -> SampleResultType: - '''This function consumes GPU-side sampler results and computes - Pythonized CPU-side sampler results (GPU -> CPU sync.) - - Single-step scheduling: this function is invoked at sampling-time - for immediate Pythonization. - - Multi-step scheduling: Pythonization is deferred until after multiple - GPU-side steps have been completed. - - Args: - sample_result_args: GPU-side inputs to the Pythonization process - - Returns: - Pythonized sampler results - ''' - - ( - sample_metadata, - sampling_metadata, - greedy_samples, - multinomial_samples, - sample_results_dict, - ) = ( - sample_result_args.sample_metadata, - sample_result_args.sampling_metadata, - sample_result_args.greedy_samples, - sample_result_args.multinomial_samples, - sample_result_args.sample_results_dict, - ) - - for sampling_type in SamplingType: - if sampling_type not in sample_metadata: - continue - (seq_group_id, seq_groups) = sample_metadata[sampling_type] - if sampling_type == SamplingType.GREEDY: - sample_results = _greedy_sample(seq_groups, greedy_samples) - elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - sample_results = _random_sample(seq_groups, - multinomial_samples[sampling_type]) - sample_results_dict.update(zip(seq_group_id, sample_results)) - - return [ - sample_results_dict.get(i, ([], [])) - for i in range(len(sampling_metadata.seq_groups)) - ] - - -def _sample_with_torch( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, - modify_greedy_probs: bool, -) -> SampleReturnType: - '''Torch-oriented _sample() implementation. 
- - Single-step scheduling: - * Perform GPU-side sampling computation - * Immediately Pythonize sampling result - - Multi-step scheduling: - * Perform GPU-side sampling computation - * Defer Pythonization & preserve GPU-side - tensors required for Pythonization - ''' - - categorized_seq_group_ids: dict[SamplingType, list[int]] = { - t: [] - for t in SamplingType - } - categorized_sample_indices = sampling_metadata.categorized_sample_indices - for i, seq_group in enumerate(sampling_metadata.seq_groups): - sampling_params = seq_group.sampling_params - sampling_type = sampling_params.sampling_type - categorized_seq_group_ids[sampling_type].append(i) - - sample_results_dict: SampleResultsDictType = {} - sample_metadata: SampleMetadataType = {} - multinomial_samples: MultinomialSamplesType = {} - greedy_samples: Optional[torch.Tensor] = None - - # Create output tensor for sampled token ids. - if include_gpu_probs_tensor: - sampled_token_ids_tensor = torch.full((logprobs.shape[0], 1), - VLLM_INVALID_TOKEN_ID, - dtype=torch.long, - device=logprobs.device) - else: - sampled_token_ids_tensor = None - - # Counterintuitively, having two loops here is actually faster. - # The first loop can run without waiting on GPU<->CPU sync. - for sampling_type in SamplingType: - sample_indices = categorized_sample_indices[sampling_type] - num_tokens = len(sample_indices) - if num_tokens == 0: - continue - - seq_group_id = categorized_seq_group_ids[sampling_type] - seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_id] - sample_metadata[sampling_type] = (seq_group_id, seq_groups) - long_sample_indices = sample_indices.long() - if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[long_sample_indices], - dim=-1) - - if sampled_token_ids_tensor is not None: - # Store sampled tokens in output tensor. - sampled_token_ids_tensor[ - long_sample_indices] = greedy_samples.unsqueeze(-1) - - if modify_greedy_probs: - # If required, modify the probabilities such that sampling from - # the modified distribution would always sample the argmax - # token id. - _modify_greedy_probs_inplace(logprobs, probs, - long_sample_indices, - greedy_samples) - - elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - max_n_in_batch = 1 - for seq_group in seq_groups: - if seq_group.is_prompt: - sampling_params = seq_group.sampling_params - max_n_in_batch = max(max_n_in_batch, sampling_params.n) - seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else - seq_groups) - - if flashinfer_top_k_top_p_sampling is not None: - logger.warning("FlashInfer 0.2.3+ does not support " - "per-request generators. Falling back to " - "PyTorch-native implementation.") - - multinomial_samples[sampling_type] = _multinomial( - probs[long_sample_indices], - max_n_in_batch, - seq_groups=seq_groups_arg) - - if sampled_token_ids_tensor is not None: - # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = \ - multinomial_samples[sampling_type].to(torch.long) - - else: - raise ValueError(f"Unsupported sampling type: {sampling_type}") - - # Encapsulate arguments for computing Pythonized sampler - # results, whether deferred or otherwise. - maybe_deferred_args = SampleResultArgsType( - sampling_metadata=sampling_metadata, - sample_metadata=sample_metadata, - multinomial_samples=multinomial_samples, - greedy_samples=greedy_samples, - sample_results_dict=sample_results_dict) - - if not sampling_metadata.skip_sampler_cpu_output: - # GPU<->CPU sync happens here. 
- # This also converts the sampler output to a Python object. - # Return Pythonized sampler result & sampled token ids - return get_pythonized_sample_results( - maybe_deferred_args), sampled_token_ids_tensor - else: - # Defer sampler result Pythonization; return deferred - # Pythonization args & sampled token ids - return ( - maybe_deferred_args, - sampled_token_ids_tensor, - ) - - -def _sample( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, - modify_greedy_probs: bool, -) -> SampleReturnType: - """ - Args: - probs: (num_query_tokens_in_batch, num_vocab) - logprobs: (num_query_tokens_in_batch, num_vocab) - sampling_metadata: The metadata for a batch for sampling. - sampling_tensors: Tensors that include sampling related metadata. - - Returns: - (next_token_ids, parent_seq_ids) for each seq group in a batch. - If sampling is skipped, it returns ([], []) - sampled_token_ids_tensor: A tensor of sampled token ids. - """ - return _sample_with_torch( - probs, - logprobs, - sampling_metadata, - sampling_tensors, - include_gpu_probs_tensor=include_gpu_probs_tensor, - modify_greedy_probs=modify_greedy_probs, - ) - - -def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: - """ - This function calculates the ranks of the chosen tokens in a logprob tensor. - - Args: - x (torch.Tensor): 2D logprob tensor of shape (N, M) - where N is the no. of tokens and M is the vocab dim. - indices (torch.Tensor): List of chosen token indices. - - Returns: - torch.Tensor: 1D tensor of shape (N,) where N is the no. of tokens. - Each element in the returned tensor represents the rank - of the chosen token in the input logprob tensor. - """ - vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), - indices] - result = (x > vals[:, None]) - del vals - return result.sum(1).add_(1) - - -def get_logprobs( - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sample_results: SampleResultType, -) -> tuple[list[Optional[PromptLogprobs]], list[SampleLogprobs]]: - """Return sample logprobs and prompt logprobs. - - The logic consists of 3 parts. - - Select indices to compute logprob from, ranks of token ids, and - the top k token ids from logprobs. - - Compute prompt logprobs if required. - - Compute sample logprobs if required. - - Args: - logprobs: (num_query_tokens_across_batch, num_vocab). Each query token's - logprob per vocab. Sequence groups' query tokens are batched in a - single flattened tensor. For example, assuming there are N - seq groups, it is sorted by prefill tokens for seq_group_1 (if - prompt logprob is enabled), decode tokens for seq_group_1 (if - sampling is required), prefill tokens for seq_group_2, ... - sampling_metadata: The sampling metadata. - sample_results: (num_seq_groups) The tuple of (next_token_ids, - parent_ids) for each sequence group. When beam search is enabled, - sample_results can contain different number of seq_ids from - sampling_metadata.seq_groups. It is because beam search creates - 2 * BEAM_WIDTH number of samples (whereas there are only up to - BEAM_WIDTH number of seq_ids). - - Returns: - A tuple of prompt and sample logprobs per sequence group in a batch. - """ - # The index of query token to calculate logprobs. It includes both - # prompt and sample logprob indices. - query_indices: list[int] = [] - # The next token ids to get the logprob value from. 
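# NOTE (editor): illustrative sketch only, not part of this diff. The rank
# returned by the _get_ranks helper above is 1-based: one plus the number of
# vocabulary entries whose logprob is strictly greater than the chosen
# token's. A tiny concrete case:
import torch

logprob_row = torch.tensor([-0.5, -2.0, -0.1, -3.0])
chosen = 1
rank = int((logprob_row > logprob_row[chosen]).sum()) + 1   # rank == 3
# End of editor's sketch.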
- next_token_ids: list[int] = [] - # The largest requested number of logprobs. We find logprobs as many as the - # largest num logprobs in this API. If every logprobs is None, it will be - # set to -1. - largest_num_logprobs = -1 - - # Select indices to compute logprob from, ranks of token ids, and the top - # k token ids from logprobs. - for (seq_group, sample_result) in zip(sampling_metadata.seq_groups, - sample_results): - sampling_params = seq_group.sampling_params - - # Update indices and tokens for prompt logprobs. - if (seq_group.is_prompt - and sampling_params.prompt_logprobs is not None): - largest_num_logprobs = max(largest_num_logprobs, - sampling_params.prompt_logprobs) - next_prompt_tokens = _get_next_prompt_tokens(seq_group) - query_indices.extend(seq_group.prompt_logprob_indices) - next_token_ids.extend(next_prompt_tokens) - - # Update indices and next tokenes for sample logprob. - if seq_group.do_sample: - token_ids, parent_seq_ids = sample_result - # NOTE: We cannot directly use sample_indices because - # sample_indices only contain parent seq_ids of a previous step. - # The current step may have different number of seq_ids, and - # we can obtain it from `sample_result[1]`. - query_idx = seq_group.sample_indices[0] - query_indices.extend( - [query_idx + parent_id for parent_id in parent_seq_ids]) - next_token_ids.extend(token_ids) - - if sampling_params.logprobs is not None: - largest_num_logprobs = max(largest_num_logprobs, - sampling_params.logprobs) - - assert len(next_token_ids) == len(query_indices) - - if len(query_indices) == 0: - empty_sampled_logprob: SampleLogprobs = [] - empty_prompt_logprob: Optional[PromptLogprobs] = None - num_seq_groups = len(sampling_metadata.seq_groups) - return [empty_prompt_logprob - ] * num_seq_groups, [empty_sampled_logprob] * num_seq_groups - - selected_logprobs, ranks = None, None - top_logprobs, top_token_ids = None, None - - # If largest_num_logprobs == -1, i.e. no logprobs are requested, we can - # skip the whole logprob calculation. - if largest_num_logprobs >= 0: - query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) - next_token_ids_gpu = torch.tensor(next_token_ids, - device=logprobs.device) - - # (num_selected_query_tokens, num_logprobs). Note that query_indices can - # contain duplicates if beam search is enabled. - selected_logprobs = logprobs[[ - query_indices_gpu, - next_token_ids_gpu, - ]] - ranks = _get_ranks( - logprobs[query_indices_gpu], - next_token_ids_gpu, - ) - assert selected_logprobs.shape[0] == ranks.shape[0] - - # We need to compute top k only if there exists logprobs > 0. - if largest_num_logprobs > 0: - # Logprobs of topk tokens for a batch of sequence groups. - # (num_query_tokens_across_batch). - top_logprobs, top_token_ids = torch.topk(logprobs, - largest_num_logprobs, - dim=-1) - top_logprobs = top_logprobs.to('cpu') - top_token_ids = top_token_ids.to('cpu') - - selected_logprobs = selected_logprobs.to('cpu') - ranks = ranks.to('cpu') - - # Find prompt/sample logprobs. 
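# NOTE (editor): illustrative sketch only, not part of this diff. The
# selected_logprobs lookup above uses paired (advanced) indexing: element i
# of the result is read at row query_indices[i], column next_token_ids[i].
# In miniature:
import torch

table = torch.tensor([[0.0, 1.0, 2.0],
                      [3.0, 4.0, 5.0]])
rows = torch.tensor([0, 1])
cols = torch.tensor([2, 0])
picked = table[rows, cols]   # tensor([2., 3.])
# End of editor's sketch.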
- prompt_logprobs_per_seq_group: list[Optional[PromptLogprobs]] = [] - sample_logprobs_per_seq_group: list[SampleLogprobs] = [] - top_logprob_idx = 0 - selected_logprobs_idx = 0 - - for seq_group, sample_result in zip(sampling_metadata.seq_groups, - sample_results): - (prompt_logprobs, top_logprob_idx, - selected_logprobs_idx) = _get_prompt_logprob_if_needed( - seq_group, selected_logprobs, ranks, top_token_ids, top_logprobs, - selected_logprobs_idx, top_logprob_idx) - prompt_logprobs_per_seq_group.append(prompt_logprobs) - - (sampled_logprobs, top_logprob_idx, - selected_logprobs_idx) = _get_sampled_logprob_if_needed( - seq_group, sample_result, selected_logprobs, ranks, top_token_ids, - top_logprobs, selected_logprobs_idx, top_logprob_idx) - sample_logprobs_per_seq_group.append(sampled_logprobs) - - return prompt_logprobs_per_seq_group, sample_logprobs_per_seq_group - - -def _get_prompt_logprob_if_needed( - seq_group: SequenceGroupToSample, - selected_logprobs: torch.Tensor, - ranks: torch.Tensor, - top_token_ids: torch.Tensor, - top_logprobs: torch.Tensor, - selected_logprobs_idx: int, - top_logprob_idx: int, -): - """Compute the prompt logprob from a sequence group if needed.""" - sampling_params = seq_group.sampling_params - is_prompt = seq_group.is_prompt - - # Find prompt logprobs - prompt_logprobs: Optional[PromptLogprobs] = None - if is_prompt and sampling_params.prompt_logprobs is not None: - prompt_logprobs = [] - num_logprobs = sampling_params.prompt_logprobs - next_prompt_tokens = _get_next_prompt_tokens(seq_group) - # Pre-select indexes and create a list. It is faster than calling .item - # repetitively. - selected_logprob_items = selected_logprobs[ - selected_logprobs_idx:selected_logprobs_idx + - len(next_prompt_tokens)].tolist() - rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + - len(next_prompt_tokens)].tolist() - - for idx, token_id in enumerate(next_prompt_tokens): - # Calculate the prompt logprob of the real prompt tokens. - # {token_id: (logprob, rank_from_vocab)} - prompt_logprobs_dict: dict[int, tuple[float, int]] = { - token_id: (selected_logprob_items[idx], rank_items[idx]) - } - - # Add top K prompt logprobs along with its rank. - if num_logprobs > 0: - top_ids = top_token_ids[ - top_logprob_idx, :num_logprobs].tolist() - top_probs = top_logprobs[ - top_logprob_idx, :num_logprobs].tolist() - # Top K is already sorted by rank, so we can use 1 ~ - # num_logprobs + 1 for rank. - top_ranks = range(1, num_logprobs + 1) - prompt_logprobs_dict.update({ - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip(top_ids, top_probs, - top_ranks) - }) - prompt_logprobs.append({ - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in prompt_logprobs_dict.items() - }) - # + 1 to go to the next prompt token. - top_logprob_idx += 1 - - # + len(next_prompt_tokens) to go to the next prompt. 
- selected_logprobs_idx += len(next_prompt_tokens) - return prompt_logprobs, top_logprob_idx, selected_logprobs_idx - - -def _get_sampled_logprob_if_needed( - seq_group: SequenceGroupToSample, - sample_result: tuple[list[int], list[int]], - selected_logprobs: torch.Tensor, - ranks: torch.Tensor, - top_token_ids: torch.Tensor, - top_logprobs: torch.Tensor, - selected_logprobs_idx: int, - top_logprob_idx: int, -): - """Compute the sample logprob if needed.""" - seq_ids = seq_group.seq_ids - num_logprobs = seq_group.sampling_params.logprobs - sampled_logprobs: SampleLogprobs = [] - next_token_ids, parent_seq_ids = sample_result - - if seq_group.do_sample: - assert len(next_token_ids) > 0 - if num_logprobs is None: - for next_token_id in next_token_ids: - # Use a dummy logprob - sampled_logprobs.append({next_token_id: Logprob(inf)}) - else: - # Pre-select items from tensor. tolist() is faster than repetitive - # `.item()` calls. - selected_logprob_items = selected_logprobs[ - selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - for idx, (next_token_id, parent_id) in enumerate( - zip(next_token_ids, parent_seq_ids)): - # Get the logprob of a sampled token. - sampled_logprobs_dict = { - next_token_id: - (selected_logprob_items[idx], rank_items[idx]) - } - if num_logprobs is not None and num_logprobs > 0: - # Get top K logprobs. - top_ids = top_token_ids[top_logprob_idx + - parent_id, :num_logprobs].tolist() - top_probs = top_logprobs[ - top_logprob_idx + parent_id, :num_logprobs].tolist() - # Top K is already sorted by rank, so we can use 1 ~ - # num_logprobs + 1 for rank. - top_ranks = range(1, num_logprobs + 1) - sampled_logprobs_dict.update({ - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip( - top_ids, top_probs, top_ranks) - }) - - sampled_logprobs.append({ - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in - sampled_logprobs_dict.items() - }) - - # NOTE: This part of code is not intuitive. `selected_logprobs` include - # logprobs for the current step, which has len(next_token_ids) tokens - # per sequence group. `logprobs` includes logprobs from the previous - # steps, which has len(seq_ids) tokens per sequence group. - - # Iterate to the next sequence group in a batch. - selected_logprobs_idx += len(next_token_ids) - # Iterate to the next sequence group in a batch. - top_logprob_idx += len(seq_ids) - return sampled_logprobs, top_logprob_idx, selected_logprobs_idx - - -def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, - sample_indices: torch.Tensor, - greedy_samples: torch.Tensor) -> None: - """Modify the probability distributions of the greedily-sampled tokens such - that each sampled token has a "probability" of 1.0. This is required by - speculative decoding, which depends on the sampling method being encoded - within the probability distribution for correctness. - - # Why do we only need to do this for greedy sampling? - - vLLM's sampler performs the following steps for greedy or multinomial - (random) sampling: - 1. Get logits from model. - 2. Modify logits according to per-sequence sampling parameters. - - Multiply by temperature, top-k and top-p masking, penalize tokens - according to their frequency, etc. - 3. Sample a token. - - Random sampling simply samples from the modified probability - distribution. - - Greedy sampling performs `argmax` to obtain the token with the - highest likelihood. 
- - Ignoring greedy sampling for a moment, we find that the computed probability - distribution has the following property: we can sample from it independently - and find that the token sampled by the Sampler has a frequency corresponding - to how often we see it in our sampling. In other words, for tokens sampled - with vLLM's random SamplingType, the computed probability distribution - encodes the sampling methodology completely. - - Greedy sampling does not normally have this property. vLLM modifies logits - according to sampling params, then performs `argmax`, then returns the - sampled token and the computed probability distribution. If we sample from - the distribution, we'll find the likelihood of the greedily-sampled token - is not always 1.0. - - Since lossless speculative decoding requires that the sampling methodology - be encoded within the probability distribution, we are motivated to modify - the probability distribution such that the sampled token has probability 1 - when speculative decoding is used. - - NOTE: Alternatively, we could use an extremely low temperature to achieve - greedy sampling using multinomial computation and unite the codepaths. This - has implications on the overall design of the sampler, e.g. how to record - accurate logprobs for the user, so this improvement is deferred to later. - """ - # NOTE: logprobs are not modified so they can be returned to the user. - probs[sample_indices, :] = 0 - probs[sample_indices, greedy_samples] = 1.0 - - -def _build_sampler_output( - maybe_deferred_sample_results: MaybeDeferredSampleResultType, - sampling_metadata: SamplingMetadata, - prompt_logprobs: Optional[list[Optional[PromptLogprobs]]], - sample_logprobs: Optional[list[SampleLogprobs]], - on_device_tensors: Optional[tuple[torch.Tensor, torch.Tensor, - torch.Tensor]], - skip_sampler_cpu_output: bool = False, -) -> SamplerOutput: - """Construct Python objects with the output of sampling. - - Args: - on_device_tensors: Tuple containing on-device tensors with the - probabilities used in sampling and the sampled token ids. This - allows post-processing without copies to CPU/serialization, e.g. in - speculative decoding rejection sampling. - """ - sampler_output: list[CompletionSequenceGroupOutput] = [] - - if skip_sampler_cpu_output: - assert isinstance(maybe_deferred_sample_results, SampleResultArgsType) - deferred_sample_results_args = maybe_deferred_sample_results - else: - assert prompt_logprobs is not None - assert sample_logprobs is not None - assert not isinstance(maybe_deferred_sample_results, - SampleResultArgsType) - assert len(sampling_metadata.seq_groups) \ - == len(maybe_deferred_sample_results) \ - == len(prompt_logprobs) \ - == len(sample_logprobs) - deferred_sample_results_args = None - - for (seq_group, sample_result, group_prompt_logprobs, - group_sample_logprobs) in zip(sampling_metadata.seq_groups, - maybe_deferred_sample_results, - prompt_logprobs, sample_logprobs): - seq_ids = seq_group.seq_ids - next_token_ids, parent_ids = sample_result - seq_outputs: list[SequenceOutput] = [] - for parent_id, next_token_id, logprobs in zip( - parent_ids, next_token_ids, group_sample_logprobs): - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - logprobs)) - sampler_output.append( - CompletionSequenceGroupOutput(seq_outputs, - group_prompt_logprobs)) - - # If not specified, store None values in SamplerOutput. 
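# A minimal standalone sketch (illustrative only, not taken from this patch) of the
# in-place rewrite described above for greedily-sampled positions: the affected rows
# of `probs` become one-hot, so the distribution fully encodes the sampling choice,
# which is what speculative decoding relies on.
import torch

probs = torch.tensor([[0.2, 0.5, 0.3],
                      [0.6, 0.1, 0.3]])
sample_indices = torch.tensor([0, 1])        # positions that were sampled greedily
greedy_samples = probs.argmax(dim=-1)        # argmax token per position -> tensor([1, 0])

probs[sample_indices, :] = 0.0
probs[sample_indices, greedy_samples] = 1.0  # each greedy token now has probability 1.0
# probs is now [[0., 1., 0.], [1., 0., 0.]]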
- if on_device_tensors is not None: - (sampled_token_probs, logprobs_tensor, - sampled_token_ids) = on_device_tensors - else: - sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, - None) - - return SamplerOutput( - outputs=sampler_output, - sampled_token_probs=sampled_token_probs, - sampled_token_ids=sampled_token_ids, - logprobs=logprobs_tensor, - deferred_sample_results_args=deferred_sample_results_args) - - -def _get_next_prompt_tokens( - seq_group: SequenceGroupToSample) -> tuple[int, ...]: - """Get a list of next prompt tokens to compute logprob from a - given sequence group. - - It is used to compute prompt logprob. Imagine you have logprob for each - query token. Query token needs to know the next prompt token id to compute - prompt logprob. This is a helper to obtain next prompt token ids. - - This API has to be used only when the caller knows seq_group is in prefill - stage. - - Returns: - A list of next prompt tokens to compute logprob. - """ - assert seq_group.is_prompt, ( - "Caller should ensure the sequence group is in a prefill stage.") - seq_ids = seq_group.seq_ids - query_len = seq_group.query_len - assert query_len is not None - # prompt has only 1 seq id. - assert len(seq_ids) == 1 - seq_data = seq_group.seq_data[seq_ids[0]] - computed_len = seq_data.get_num_computed_tokens() - prompt_tokens = seq_data.prompt_token_ids - # +1 because we are looking for a next prompt token. - next_token_index_start = computed_len + 1 - next_token_index_end = min(computed_len + query_len + 1, - len(prompt_tokens)) - next_prompt_tokens = prompt_tokens[ - next_token_index_start:next_token_index_end] - return next_prompt_tokens diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 6ba8ad372c95a..b0a96fca2ff8a 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -2,18 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Optional import torch import torch.nn as nn from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from .utils import maybe_prefix @@ -105,8 +102,10 @@ class Medusa(nn.Module): return [block(hidden_states) for block in self.blocks] def compute_logits( - self, hidden_states: list[torch.Tensor], - sampling_metadata: SamplingMetadata) -> list[torch.Tensor]: + self, + hidden_states: list[torch.Tensor], + sampling_metadata, + ) -> list[torch.Tensor]: logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): @@ -130,57 +129,6 @@ class Medusa(nn.Module): return logits_lst - def sample( - self, - logits: list[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> list[SamplerOutput]: - logits = torch.stack(logits, dim=0).float() - logprobs = torch.log_softmax(logits, dim=-1) - token_ids = logits.argmax(-1) # support only top-1 for now - probs = torch.softmax(logits, dim=-1) - - token_id_list = [] - token_prob_list = [] - token_logprob_list = [] - - for idx, seq_group in enumerate(sampling_metadata.seq_groups): - token_id_list.append(token_ids[:, seq_group.sample_indices]) - 
token_prob_list.append(probs[:, seq_group.sample_indices]) - token_logprob_list.append(logprobs[:, seq_group.sample_indices]) - - outputs: list[Optional[SamplerOutput]] = [] - for idx in range(len(sampling_metadata.seq_groups)): - outputs.append( - SamplerOutput( - outputs=None, - sampled_token_probs=token_prob_list[idx].squeeze(1), - logprobs=token_logprob_list[idx].squeeze(1), - sampled_token_ids=token_id_list[idx].squeeze(1), - )) - - return outputs - - def generate_proposals( - self, - previous_hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[list[SamplerOutput]]: - # During preemption, we may receive an empty tensor (batch_size=0) - if previous_hidden_states.size(0) == 0: - # Return None to signal the Top1Proposer that no proposals - # were generated for this batch, allowing it to handle this - # special case appropriately - return None - - return self.sample( - logits=self.compute_logits( - hidden_states=self.forward(previous_hidden_states), - sampling_metadata=sampling_metadata, - ), - sampling_metadata=sampling_metadata, - ) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index c6a97388dc188..d057eb49a62d1 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -8,9 +8,7 @@ import torch import torch.nn as nn from vllm.config import VllmConfig -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -141,55 +139,57 @@ class MLPSpeculator(nn.Module): self.config = config self.logits_processor = LogitsProcessor(config.vocab_size, config.vocab_size, 1.0) - self.sampler = get_sampler() - def generate_proposals( - self, - input_ids: torch.Tensor, - previous_hidden_states: torch.Tensor, - num_predict_tokens: int, - sampling_metadata: SamplingMetadata, - ) -> list[SamplerOutput]: - if num_predict_tokens > self.max_speculative_tokens: - raise ValueError(f"Max speculative tokens for model is " - f"{self.max_speculative_tokens}, but " - f"{num_predict_tokens} were requested") + # NOTE(woosuk): This method is commented out because it is old code + # using V0. We should either port it to V1 or remove it. 
- # b x 1 x d - previous_hidden_states = previous_hidden_states.unsqueeze(1) + # def generate_proposals( + # self, + # input_ids: torch.Tensor, + # previous_hidden_states: torch.Tensor, + # num_predict_tokens: int, + # sampling_metadata: SamplingMetadata, + # ) -> list[SamplerOutput]: + # if num_predict_tokens > self.max_speculative_tokens: + # raise ValueError(f"Max speculative tokens for model is " + # f"{self.max_speculative_tokens}, but " + # f"{num_predict_tokens} were requested") - if self.scale_input: - previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2 + # # b x 1 x d + # previous_hidden_states = previous_hidden_states.unsqueeze(1) - # b x 1 - last_tokens = input_ids.unsqueeze(1) + # if self.scale_input: + # previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2 - next_tokens = [] + # # b x 1 + # last_tokens = input_ids.unsqueeze(1) - for head_index in range(num_predict_tokens): + # next_tokens = [] - # Project and predict - z = self.emb[head_index](last_tokens) # b k d - states = self.proj[head_index](previous_hidden_states) + # for head_index in range(num_predict_tokens): - # Weighted add of state_weight*state and emb_weight*z - # Let subsequent LN take care of denominator - # state_weight is close to 1, so shouldn't be any precision issues - states.add_(z, alpha=self.emb_weight / self.state_weight) + # # Project and predict + # z = self.emb[head_index](last_tokens) # b k d + # states = self.proj[head_index](previous_hidden_states) - states = self.activation(self.ln[head_index](states)) # b k d - previous_hidden_states = states - # TODO: not yet supporting top_k_tokens_per_head - states = states.flatten(0, 1) + # # Weighted add of state_weight*state and emb_weight*z + # # Let subsequent LN take care of denominator + # # state_weight is close to 1, so shouldn't be any precision issues + # states.add_(z, alpha=self.emb_weight / self.state_weight) - logits = self.logits_processor(self.head[head_index], states, - sampling_metadata) + # states = self.activation(self.ln[head_index](states)) # b k d + # previous_hidden_states = states + # # TODO: not yet supporting top_k_tokens_per_head + # states = states.flatten(0, 1) - output = self.sampler(logits, sampling_metadata) - last_tokens = output.sampled_token_ids - next_tokens.append(output) + # logits = self.logits_processor(self.head[head_index], states, + # sampling_metadata) - return next_tokens + # output = self.sampler(logits, sampling_metadata) + # last_tokens = output.sampled_token_ids + # next_tokens.append(output) + + # return next_tokens def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index c4548ee168bd7..aa7c434a44aeb 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -697,16 +697,12 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - # If the shape is the same, it means that we have already - # prune hidden states manually. 
- prune_hidden_states = hidden_states.size( - 0) != sampling_metadata.selected_token_indices.size(0) processed_logits = self.logits_processor( self.lm_head, hidden_states, sampling_metadata, self.embedding_bias, - prune_hidden_states=prune_hidden_states) + ) return processed_logits def load_weights( diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 2315f9dad5a5a..8c4548ff7f7dc 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,597 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from array import array -from dataclasses import dataclass -from typing import Optional - -import torch - -from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData, - SequenceGroupMetadata) -from vllm.utils import (PyObjectCache, async_tensor_h2d, - is_pin_memory_available, make_tensor_with_pad) - -_SAMPLING_EPS = 1e-5 - - -@dataclass -class SequenceGroupToSample: - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Sequence ids for the sequence group in a previous step. - seq_ids: list[int] - sampling_params: SamplingParams - # seq_id -> sequence data. - seq_data: dict[int, SequenceData] - # The length of the sequence (all tokens seen in the past + new token to - # compute attention) of the sequence group. None if it is in a decode - # stage. - seq_len: Optional[int] - # The length of new query tokens to compute in the current step. None if it - # is in a decode stage. The length of query_len <= seq_len if chunked - # prefill is enabled. - query_len: Optional[int] - # A random number generator for sampling. - generator: Optional[torch.Generator] - # True if the sequence group is in prefill stage. False if it is in a - # decode stage. - is_prompt: bool - # Query token indices from logits. to compute prompt logprob. Empty if - # prompt logprob is not required. - prompt_logprob_indices: list[int] - # Sample token indices from logits. Empty if sampling is not required. 
- sample_indices: list[int] - - @property - def do_sample(self): - return len(self.sample_indices) > 0 - - def __post_init__(self): - if len(self.prompt_logprob_indices) > 0: - assert self.sampling_params.prompt_logprobs is not None - if self.is_prompt: - assert self.seq_len is not None - assert self.query_len is not None - - -def gen_seq_group_to_sample_builder(num_seqs: int): - return lambda: SequenceGroupToSample( - seq_ids=[0] * num_seqs, - sampling_params=None, - seq_data=None, # type: ignore - seq_len=0, - query_len=0, - generator=None, - is_prompt=True, - prompt_logprob_indices=[], - sample_indices=[], - ) - - -class SamplingMetadataCache: - """Used to cache SamplingMetadata objects between scheduler iterations""" - - def __init__(self): - self._seq_group_to_sample_cache: dict[int, PyObjectCache] = {} - - def get_cached_seq_group_to_sample(self, num_seqs): - if num_seqs not in self._seq_group_to_sample_cache: - self._seq_group_to_sample_cache[num_seqs] = PyObjectCache( - gen_seq_group_to_sample_builder(num_seqs)) - - obj = self._seq_group_to_sample_cache[num_seqs].get_object() - return obj - - def reset(self): - for cache in self._seq_group_to_sample_cache.values(): - cache.reset() - class SamplingMetadata: - """Metadata for input sequences. Used in sampler. - - The usage is as follows; - ``` - hidden_states = execute_model(...) - logits = hidden_states[sampling_metadata.selected_token_indices] - sample(logits) - - def sample(logits): - # Use categorized_sample_indices for sampling.... - ``` - - Args: - seq_groups: List of batched sequence groups. - selected_token_indices: (num_query_tokens_to_logprob). Indices to find - logits from the initial model output hidden states. - categorized_sample_indices: SamplingType -> token indices to sample. - Each token indices is 2D tensor of (num_indices, num_indices) where - the first item means the sample index within the returned logit - (before pruning padding), and the second item means the sample - index after pruning using selected_token_indices. - For example, if the returned logit is [1, 2, 3], and we select - [1, 2] for sampling, the pruned logit will be [2, 3]. In this case, - The first tuple is [1, 2] (sampled index within original logit), - and the second tuple is [0, 1] (sampled index within pruned logit). - num_prompts: Number of prompt sequence groups in seq_groups. - skip_sampler_cpu_output: Indicates if we want to skip the GPU=>CPU - serialization of token outputs. - reuse_sampling_tensors: Indicates if we want to reuse sampling - tensors that are part of the sampler forward pass. Currently, - it is mainly used for multi-step decode. 
- - """ - - def __init__( - self, - seq_groups: list[SequenceGroupToSample], - selected_token_indices: torch.Tensor, - categorized_sample_indices: dict[SamplingType, torch.Tensor], - num_prompts: int, - skip_sampler_cpu_output: bool = False, - reuse_sampling_tensors: bool = False, - ) -> None: - self.seq_groups = seq_groups - self.selected_token_indices = selected_token_indices - self.categorized_sample_indices = categorized_sample_indices - self.num_prompts = num_prompts - self.skip_sampler_cpu_output = skip_sampler_cpu_output - self.reuse_sampling_tensors = reuse_sampling_tensors - - @staticmethod - def prepare( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], - device: str, - pin_memory: bool, - generators: Optional[dict[str, torch.Generator]] = None, - cache: Optional[SamplingMetadataCache] = None, - ) -> "SamplingMetadata": - ( - seq_groups, - selected_token_indices, - categorized_sample_indices, - num_prompts, - ) = _prepare_seq_groups(seq_group_metadata_list, seq_lens, query_lens, - device, generators, cache) - selected_token_indices = async_tensor_h2d( - selected_token_indices, - dtype=torch.long, - target_device=device, - pin_memory=pin_memory, - ) - categorized_sample_indices = { - t: - async_tensor_h2d( - seq_ids, - dtype=torch.int, - target_device=device, - pin_memory=pin_memory, - ) - for t, seq_ids in categorized_sample_indices.items() - } - - sampling_metadata = SamplingMetadata( - seq_groups=seq_groups, - selected_token_indices=selected_token_indices, - categorized_sample_indices=categorized_sample_indices, - num_prompts=num_prompts, - ) - return sampling_metadata - - def __repr__(self) -> str: - return ( - "SamplingMetadata(" - f"seq_groups={self.seq_groups}, " - f"selected_token_indices={self.selected_token_indices}, " - f"categorized_sample_indices={self.categorized_sample_indices})") - - -def _prepare_seq_groups( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], - device: str, - generators: Optional[dict[str, torch.Generator]] = None, - cache: Optional[SamplingMetadataCache] = None, -) -> tuple[ - list[SequenceGroupToSample], - list[int], - dict[SamplingType, list[int]], - int, -]: - """Prepare sequence groups and indices for sampling. - - Args: - seq_group_metadata_list: A list of sequence group to batch. - seq_lens: A list of sequence lens per sequence group. - Index of prompt len should match with seq_group_metadata_list. - query_lens: A list of query lengths. Prompt lens include the length - of entire prompt tokens, and it could be shorter. - device: A device to use for random number generators, - `SequenceGroupToSample.generator`. - generators: A store of per-request random number generators used - for seeded requests. - - Returns: - seq_groups: A list of sequence group to sample. - selected_token_indices: See the definition from `SamplingMetadata`. - categorized_sample_indices: See the definition from `SamplingMetadata`. - num_prompts: Total number of prompts from `seq_group_metadata_list`. - """ - # Batched sequence groups for the current model forward stsep. - seq_groups: list[SequenceGroupToSample] = [] - # A list of token indices to sample/compute logprob. It is used to - # prune the outcome logits from the model for the performance. - selected_token_indices: list[int] = [] - # Used for selected_token_indices. 
- model_output_idx = 0 - - # Sampling type -> ( - # indices to sample/prompt logprob within pruned output logits, - # indices to sample within pruned logits) - categorized_sample_indices: dict[SamplingType, list[int]] = { - t: [] - for t in SamplingType - } - # Index of logits to compute logprob. Logits include both prompt logprob - # and sample logprob indices. - logit_idx = 0 - # Total number of prompts from given sequence groups. - num_prompts = 0 - - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = seq_group_metadata.seq_data.keys() - - if cache is not None: - sample_obj = cache.get_cached_seq_group_to_sample(len(seq_ids)) - - for j, seq_id in enumerate(seq_ids): - sample_obj.seq_ids[j] = seq_id - - sample_obj.prompt_logprob_indices.clear() - sample_obj.sample_indices.clear() - - sampling_params = seq_group_metadata.sampling_params - is_prompt = seq_group_metadata.is_prompt - generator: Optional[torch.Generator] = None - # If the current seq group is in decode stage, it is None. - seq_len: Optional[int] = None - query_len: Optional[int] = None - prompt_logprob_indices: list[int] = (sample_obj.prompt_logprob_indices - if cache is not None else []) - sample_indices: list[int] = (sample_obj.sample_indices - if cache is not None else []) - do_sample = seq_group_metadata.do_sample - - if seq_group_metadata.is_prompt: - if sampling_params.seed is not None: - generator = torch.Generator(device=device).manual_seed( - sampling_params.seed) - if generators is not None: - generators[seq_group_metadata.request_id] = generator - - num_prompts += 1 - num_prefill_sample = len(seq_ids) - assert num_prefill_sample == 1 - assert query_lens is not None and seq_lens is not None - query_len, seq_len = query_lens[i], seq_lens[i] - # If we need sampling, exclude num_prefill_sample tokens from - # prompt logprob. - prompt_logprob_len = (query_len - num_prefill_sample - if do_sample else query_len) - sample_len = num_prefill_sample if do_sample else 0 - else: - # Decode - prompt_logprob_len = 0 - query_len = query_lens[i] if query_lens is not None and len( - query_lens) > 0 else 1 - sample_len = len(seq_ids) * query_len if do_sample else 0 - - if sampling_params.seed is not None and generators is not None: - generator = generators.get(seq_group_metadata.request_id) - - # Update indices to select from the model output. - """ - This blocks computes selected_token_indices which is used in the - following way. - - hidden_states = model(...) - logits = hidden_states[selected_token_indices] - """ - - if sampling_params.prompt_logprobs is not None: - selected_token_indices.extend( - range(model_output_idx, model_output_idx + prompt_logprob_len)) - model_output_idx += prompt_logprob_len - if do_sample: - selected_token_indices.extend( - range(model_output_idx, model_output_idx + sample_len)) - model_output_idx += sample_len - - # We now find indices for logprob computation and sampling. - """ - This block computes categorized_sample_indices which is used in the - following way. - - hidden_states = model(...) - logits = hidden_states[selected_token_indices] - def sample(logits): - # Use categorized_sample_indices for sampling. - # prompt_logprob_indices to find prompt logprob indices. - # sample_indices to find sample indices. 
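# A rough worked example (illustrative only, with assumed lengths) of the index
# bookkeeping above: one chunked-prefill group with query_len=4, prompt_logprobs
# requested and do_sample=True, followed by one single-sequence decode group.
model_output_idx = 0
logit_idx = 0
selected_token_indices: list[int] = []

# Prefill group: 3 prompt-logprob positions plus 1 sampled position.
prompt_logprob_len, sample_len = 3, 1
selected_token_indices.extend(
    range(model_output_idx, model_output_idx + prompt_logprob_len))
model_output_idx += prompt_logprob_len
selected_token_indices.extend(
    range(model_output_idx, model_output_idx + sample_len))
model_output_idx += sample_len
prompt_logprob_indices = list(range(logit_idx, logit_idx + prompt_logprob_len))
logit_idx += prompt_logprob_len
sample_indices = list(range(logit_idx, logit_idx + sample_len))
logit_idx += sample_len

# Decode group: no prompt logprobs, a single sampled token.
selected_token_indices.extend(range(model_output_idx, model_output_idx + 1))

assert selected_token_indices == [0, 1, 2, 3, 4]
assert prompt_logprob_indices == [0, 1, 2] and sample_indices == [3]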
- """ - - if sampling_params.prompt_logprobs is not None: - prompt_logprob_indices.extend( - range(logit_idx, logit_idx + prompt_logprob_len)) - logit_idx += prompt_logprob_len - if do_sample: - sample_indices.extend(range(logit_idx, logit_idx + sample_len)) - categorized_sample_indices[sampling_params.sampling_type].extend( - list(range(logit_idx, logit_idx + sample_len))) - logit_idx += sample_len - - if cache is not None: - sample_obj.sampling_params = sampling_params - sample_obj.seq_data = seq_group_metadata.seq_data - sample_obj.seq_len = seq_len - sample_obj.query_len = query_len - sample_obj.generator = generator - sample_obj.is_prompt = is_prompt - else: - sample_obj = SequenceGroupToSample( - seq_ids=list(seq_ids), - sampling_params=sampling_params, - seq_data=seq_group_metadata.seq_data, - seq_len=seq_len, - query_len=query_len, - generator=generator, - is_prompt=is_prompt, - prompt_logprob_indices=list(prompt_logprob_indices), - sample_indices=list(sample_indices), - ) - - seq_groups.append(sample_obj) - - if cache is not None: - cache.reset() - - return (seq_groups, selected_token_indices, categorized_sample_indices, - num_prompts) - - -@dataclass -class SamplingTensors: - """Tensors for sampling.""" - - temperatures: torch.Tensor - top_ps: torch.Tensor - top_ks: torch.Tensor - min_ps: torch.Tensor - presence_penalties: torch.Tensor - frequency_penalties: torch.Tensor - repetition_penalties: torch.Tensor - prompt_tokens: torch.Tensor - output_tokens: torch.Tensor - - @classmethod - def from_sampling_metadata( - cls, - sampling_metadata: "SamplingMetadata", - vocab_size: int, - device: torch.device, - dtype: torch.dtype, - ) -> tuple["SamplingTensors", bool, bool, bool]: - prompt_tokens: list[array] = [] - output_tokens: list[array] = [] - top_ks: list[int] = [] - temperatures: list[float] = [] - top_ps: list[float] = [] - min_ps: list[float] = [] - presence_penalties: list[float] = [] - frequency_penalties: list[float] = [] - repetition_penalties: list[float] = [] - do_penalties = False - do_top_p_top_k = False - do_min_p = False - - assert sampling_metadata.seq_groups is not None - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - temperature = sampling_params.temperature - p = sampling_params.presence_penalty - f = sampling_params.frequency_penalty - r = sampling_params.repetition_penalty - top_p = sampling_params.top_p - min_p = sampling_params.min_p - - # k should not be greater than the vocab size. - top_k = min(sampling_params.top_k, vocab_size) - top_k = vocab_size if top_k < 1 else top_k - if temperature < _SAMPLING_EPS: - # NOTE: Zero temperature means deterministic sampling - # (i.e., greedy sampling or beam search). - # Set the temperature to 1 to avoid division by zero. 
- temperature = 1.0 - if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS - or top_k != vocab_size): - do_top_p_top_k = True - if not do_min_p and min_p > _SAMPLING_EPS: - do_min_p = True - if not do_penalties and (abs(p) >= _SAMPLING_EPS - or abs(f) >= _SAMPLING_EPS - or abs(r - 1.0) >= _SAMPLING_EPS): - do_penalties = True - - is_prompt = seq_group.is_prompt - if is_prompt and sampling_params.prompt_logprobs is not None: - # For tokens in the prompt that we only need to get - # their logprobs - query_len = seq_group.query_len - assert query_len is not None - prefill_len = len(seq_group.prompt_logprob_indices) - temperatures += [temperature] * prefill_len - top_ps += [top_p] * prefill_len - top_ks += [top_k] * prefill_len - min_ps += [min_p] * prefill_len - presence_penalties += [0] * prefill_len - frequency_penalties += [0] * prefill_len - repetition_penalties += [1] * prefill_len - - if seq_group.do_sample: - sample_lens = len(seq_group.sample_indices) - assert sample_lens >= len(seq_ids) - temperatures += [temperature] * sample_lens - top_ps += [top_p] * sample_lens - top_ks += [top_k] * sample_lens - min_ps += [min_p] * sample_lens - presence_penalties += [p] * sample_lens - frequency_penalties += [f] * sample_lens - repetition_penalties += [r] * sample_lens - - if do_penalties: - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - if (seq_group.is_prompt - and sampling_params.prompt_logprobs is not None): - prefill_len = len(seq_group.prompt_logprob_indices) - prompt_tokens.extend( - array(VLLM_TOKEN_ID_ARRAY_TYPE) - for _ in range(prefill_len)) - output_tokens.extend( - array(VLLM_TOKEN_ID_ARRAY_TYPE) - for _ in range(prefill_len)) - if seq_group.do_sample: - for seq_id in seq_ids: - seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(seq_data.prompt_token_ids_array) - output_tokens.append(seq_data.output_token_ids_array) - - sampling_tensors = SamplingTensors.from_lists( - temperatures, - top_ps, - top_ks, - min_ps, - presence_penalties, - frequency_penalties, - repetition_penalties, - prompt_tokens, - output_tokens, - vocab_size, - device, - dtype, - ) - return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) - - @classmethod - def from_lists( - cls, - temperatures: list[float], - top_ps: list[float], - top_ks: list[int], - min_ps: list[float], - presence_penalties: list[float], - frequency_penalties: list[float], - repetition_penalties: list[float], - prompt_tokens: list[array], - output_tokens: list[array], - vocab_size: int, - device: torch.device, - dtype: torch.dtype, - ) -> "SamplingTensors": - # Note that the performance will be very bad without - # pinned memory. 
- pin_memory = is_pin_memory_available() - - do_penalties = prompt_tokens or output_tokens - - if do_penalties: - prompt_t = make_tensor_with_pad( - prompt_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) - output_t = make_tensor_with_pad( - output_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) - else: - empty_tensor = torch.empty(0, device=device, dtype=torch.long) - prompt_t = empty_tensor - output_t = empty_tensor - - temperatures_t = torch.tensor( - temperatures, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ps_t = torch.tensor( - top_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - min_ps_t = torch.tensor( - min_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - presence_penalties_t = torch.tensor( - presence_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - frequency_penalties_t = torch.tensor( - frequency_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - repetition_penalties_t = torch.tensor( - repetition_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ks_t = torch.tensor( - top_ks, - device="cpu", - dtype=torch.int, - pin_memory=pin_memory, - ) - # Because the memory is pinned, we can do non-blocking - # transfer to device. - - return cls( - temperatures=temperatures_t.to(device=device, non_blocking=True), - top_ps=top_ps_t.to(device=device, non_blocking=True), - top_ks=top_ks_t.to(device=device, non_blocking=True), - min_ps=min_ps_t.to(device=device, non_blocking=True), - presence_penalties=presence_penalties_t.to(device=device, - non_blocking=True), - frequency_penalties=frequency_penalties_t.to(device=device, - non_blocking=True), - repetition_penalties=repetition_penalties_t.to(device=device, - non_blocking=True), - prompt_tokens=prompt_t.to(device=device, non_blocking=True), - output_tokens=output_t.to(device=device, non_blocking=True), - ) + # Placeholder until it can be safely removed. 
+ pass diff --git a/vllm/sequence.py b/vllm/sequence.py index 24114c0bb792e..a6c194fbac0b2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,28 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sequence and its related classes.""" -import copy -import enum -from abc import ABC, abstractmethod -from array import array -from collections import defaultdict -from collections.abc import Mapping -from collections.abc import Sequence as GenericSequence -from dataclasses import dataclass, field -from functools import reduce -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional, Union import msgspec import torch -from vllm.inputs import SingletonInputs -from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs -from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import RequestOutputKind, SamplingParams - if TYPE_CHECKING: - from vllm.lora.request import LoRARequest from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) else: @@ -34,50 +19,6 @@ VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_INVALID_TOKEN_ID = -1 -def array_full(token_id: int, count: int): - """[`array`][] equivalent of [numpy.full][].""" - return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count - - -class SequenceStatus(enum.IntEnum): - """Status of a sequence.""" - WAITING = 0 - RUNNING = 1 - SWAPPED = 2 - # Note: anything after SWAPPED (2) will be considered - # as a finished status. - FINISHED_STOPPED = 3 - FINISHED_LENGTH_CAPPED = 4 - FINISHED_ABORTED = 5 - FINISHED_IGNORED = 6 - - @staticmethod - def is_finished(status: "SequenceStatus") -> bool: - return status > SequenceStatus.SWAPPED - - @staticmethod - def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: - if status == SequenceStatus.FINISHED_STOPPED: - finish_reason = "stop" - elif status == SequenceStatus.FINISHED_LENGTH_CAPPED: - finish_reason = "length" - elif status == SequenceStatus.FINISHED_ABORTED: - finish_reason = "abort" - elif status == SequenceStatus.FINISHED_IGNORED: - # The ignored sequences are the sequences whose prompt lengths - # are longer than the model's length cap. Therefore, the stop - # reason should also be "length" as in OpenAI API. - finish_reason = "length" - else: - finish_reason = None - return finish_reason - - -class SequenceStage(enum.Enum): - PREFILL = enum.auto() - DECODE = enum.auto() - - @dataclass class RequestMetrics: """Metrics associated with a request. @@ -107,971 +48,12 @@ class RequestMetrics: model_execute_time: Optional[float] = None -class SequenceDataDelta( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Delta SequenceData to send to workers per step.""" - # A new token to be appended to existing SequenceData. - new_output_token_ids: list[int] - # Overwriting existing `cumulative_logprob` - new_cumulative_logprob: float - # Overwriting existing `num_computed_tokens`. - new_num_computed_tokens: int - # Overwriting existing `stage`. - new_stage: SequenceStage - - -class SequenceData(msgspec.Struct, - omit_defaults=True): # type: ignore[call-arg] - """Data associated with a sequence.""" - # NOTE: we cannot use Union[list, array] because msgspec cannot support - # union of 2 list types. 
- _prompt_token_ids: array - _output_token_ids: array = msgspec.field( - default_factory=lambda: array(VLLM_TOKEN_ID_ARRAY_TYPE, [])) - - _prompt_embeds: Optional[torch.Tensor] = None - _output_embeds: Optional[torch.Tensor] = None - - ### The below fields should not be passed as an argument ### - _cumulative_logprob: float = 0.0 - _prompt_token_ids_tuple: tuple[int, - ...] = msgspec.field(default_factory=tuple) - # The number of tokens that are computed (that run against the model). - _num_computed_tokens: int = 0 - # The number of tokens with prefix cache hit. - _num_cached_tokens: int = 0 - _stage: SequenceStage = SequenceStage.PREFILL - _cached_all_token_ids: list[int] = msgspec.field(default_factory=list) - _cached_all_token_embeds: Optional[torch.Tensor] = None - - # It is used to get delta input. It is reset when `get_delta_and_reset` - # is called. - _new_appended_tokens: list[int] = msgspec.field(default_factory=list) - - # It is used to compute mrope_position_ids. - _mrope_position_delta: Optional[int] = None - - @staticmethod - def from_prompt_token_counts( - *token_counts: tuple[int, int]) -> "SequenceData": - """ - Construct a [`SequenceData`][vllm.sequence.SequenceData] instance - by concatenating prompt token sequences. - - Each tuple represents one token sequence, expressed in the form - `(token_id, count)`. - """ - if len(token_counts) == 0: - return SequenceData.from_seqs([]) - - prompt_token_ids_arr = reduce( - array.__iadd__, - (array_full(token_id, count) for token_id, count in token_counts), - ) - - return SequenceData(prompt_token_ids_arr) - - @staticmethod - def from_seqs( - prompt_token_ids: GenericSequence[int], - output_token_ids: Optional[GenericSequence[int]] = None, - *, - prompt_embeds: Optional[torch.Tensor] = None, - ) -> "SequenceData": - """ - Construct a [`SequenceData`][vllm.sequence.SequenceData] instance - from prompt and output token sequences. - """ - prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, - prompt_token_ids) - - if output_token_ids is None: - return SequenceData(prompt_token_ids_arr, - _prompt_embeds=prompt_embeds) - - output_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, - output_token_ids) - - return SequenceData(prompt_token_ids_arr, - _output_token_ids=output_token_ids_arr, - _prompt_embeds=prompt_embeds) - - def __post_init__(self) -> None: - assert self._prompt_token_ids.typecode == "l" - assert self._output_token_ids.typecode == "l" - self._prompt_token_ids_tuple: tuple[int, ...] 
= tuple( - self._prompt_token_ids) - self._update_cached_all_tokens() - if self._prompt_embeds is not None: - self._update_cached_all_token_embeds() - - def _update_cached_all_tokens(self): - assert isinstance(self._prompt_token_ids, array) - assert isinstance(self._output_token_ids, array) - self._cached_all_token_ids: list[int] = list(self._prompt_token_ids + - self._output_token_ids) - - def _update_cached_all_token_embeds(self): - assert isinstance(self._prompt_embeds, torch.Tensor) - self._cached_all_token_embeds: torch.Tensor = self._prompt_embeds - if self._output_embeds is not None: - self._cached_all_token_embeds = torch.cat( - (self._cached_all_token_embeds, self._output_embeds), dim=0) - - @property - def cumulative_logprob(self) -> float: - """The cumulative log probability of the output.""" - return self._cumulative_logprob - - @property - def prompt_token_ids(self) -> tuple[int, ...]: - """The token IDs of the prompt.""" - return self._prompt_token_ids_tuple - - @prompt_token_ids.setter - def prompt_token_ids(self, new_prompt_token_ids) -> None: - raise NotImplementedError - - @property - def prompt_token_ids_array(self) -> array: - """Return the prompt token ids in array type. - - Note that the array is in "I" type, and it is not compatible - with torch.long (2 bytes vs 4 bytes). So beware of the usage. - """ - return self._prompt_token_ids - - @property - def output_token_ids(self) -> tuple[int, ...]: - """The token IDs of the output.""" - return tuple(self._output_token_ids) - - @output_token_ids.setter - def output_token_ids(self, - new_output_token_ids: GenericSequence[int]) -> None: - self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, - new_output_token_ids) - self._update_cached_all_tokens() - - @property - def output_embeds(self) -> Optional[torch.Tensor]: - return self._output_embeds - - @output_embeds.setter - def output_embeds(self, new_output_token_embeds: torch.Tensor) -> None: - self._output_token_embeds = new_output_token_embeds - self._update_cached_all_token_embeds() - - @property - def output_token_ids_array(self) -> array: - """Return the prompt token ids in array type. - - Note that the array is in "I" type, and it is not compatible - with torch.long (2 bytes vs 4 bytes). So beware of the usage. 
- """ - assert isinstance(self._output_token_ids, array) - return self._output_token_ids - - @property - def prompt_embeds(self) -> Optional[torch.Tensor]: - return self._prompt_embeds - - @prompt_embeds.setter - def prompt_embeds(self, prompt_embeds: torch.Tensor) -> None: - self._prompt_embeds = prompt_embeds - self._update_cached_all_token_embeds() - - @property - def mrope_position_delta(self) -> Optional[int]: - return self._mrope_position_delta - - @mrope_position_delta.setter - def mrope_position_delta(self, new_mrope_position_delta): - self._mrope_position_delta = new_mrope_position_delta - - def append_token_id(self, - token_id: int, - logprob: float, - token_embed: Optional[torch.Tensor] = None) -> None: - self._output_token_ids.append(token_id) - self._new_appended_tokens.append(token_id) - self._cached_all_token_ids.append(token_id) - self._cumulative_logprob += logprob - if token_embed is not None: - # Do not pass in with batch or sequence dimensions - assert token_embed.ndim == 1 - token_embed = token_embed.detach().cpu().unsqueeze(0) - if self._output_embeds is None: - self._output_embeds = token_embed - else: - self._output_embeds = torch.cat( - (self._output_embeds, token_embed), dim=0) - assert self._cached_all_token_embeds is not None - self._cached_all_token_embeds = torch.cat( - (self._cached_all_token_embeds, - token_embed.to(device=self._cached_all_token_embeds.device)), - dim=0) - - def get_len(self) -> int: - return len(self._output_token_ids) + len(self._prompt_token_ids) - - def get_prompt_len(self) -> int: - return len(self._prompt_token_ids) - - def get_output_len(self) -> int: - return len(self._output_token_ids) - - def get_token_ids(self) -> list[int]: - return self._cached_all_token_ids - - def get_token_embeddings(self) -> Optional[torch.Tensor]: - return self._cached_all_token_embeds - - def get_prefix_token_ids( - self, num_tokens: int - ) -> tuple[tuple[int, ...], Optional[tuple[int, ...]]]: - """Get prefix tokens, and make the return value hashable""" - prompt_length = self.get_prompt_len() - if num_tokens > prompt_length: - return (self._prompt_token_ids_tuple, - tuple(self._output_token_ids[:num_tokens - prompt_length])) - else: - return (self._prompt_token_ids_tuple[:num_tokens], None) - - def get_num_computed_tokens(self) -> int: - """Return the number of prefill tokens that are already computed.""" - return self._num_computed_tokens - - def update_num_computed_tokens(self, num_new_computed_tokens: int): - """Update number of tokens computed so far.""" - self._num_computed_tokens += num_new_computed_tokens - assert self._num_computed_tokens <= self.get_len(), ( - self._num_computed_tokens, self.get_len()) - # If all tokens are computed, it means it is in decoding phase. - if self.get_num_uncomputed_tokens() == 0: - self._stage = SequenceStage.DECODE - - def get_num_cached_tokens(self) -> int: - """Return the number of tokens with prefix cache hit.""" - return self._num_cached_tokens - - def update_num_cached_tokens(self, num_cached_tokens: int): - """Update the number of tokens with prefix cache hit.""" - self._num_cached_tokens = num_cached_tokens - - def reset_state_for_recompute(self) -> None: - """Reset the number of computed tokens from this sequence. It is - supposed to be called when a sequence needs to be started from - the beginning again (e.g., sequence is preempted). 
- """ - self._num_computed_tokens = 0 - self._stage = SequenceStage.PREFILL - self._new_appended_tokens = [] - - def get_num_uncomputed_tokens(self) -> int: - """Return the number of prefill tokens that are not computed.""" - # we use `get_len()` which includes prompt_len + output_len instead - # of prompt_len here. This is because during recompute we need to - # prefill for both prompt and output. - return self.get_len() - self.get_num_computed_tokens() - - def get_last_token_id(self) -> int: - if not self._output_token_ids: - return self._prompt_token_ids[-1] - return self._output_token_ids[-1] - - def get_prompt_token_ids(self) -> tuple[int, ...]: - return self.prompt_token_ids - - def get_output_token_ids(self) -> tuple[int, ...]: - return self.output_token_ids - - def get_delta_and_reset(self) -> SequenceDataDelta: - delta = SequenceDataDelta(self._new_appended_tokens, - self._cumulative_logprob, - self.get_num_computed_tokens(), self.stage) - # Reset delta state. - self._new_appended_tokens = [] - return delta - - def apply_delta(self, delta: SequenceDataDelta): - self._num_computed_tokens = delta.new_num_computed_tokens - self._cumulative_logprob = delta.new_cumulative_logprob - self._stage = delta.new_stage - self._output_token_ids.extend(delta.new_output_token_ids) - self._cached_all_token_ids.extend(delta.new_output_token_ids) - - @property - def stage(self) -> SequenceStage: - return self._stage - - def __repr__(self) -> str: - return (f"SequenceData(" - f"prompt_token_ids={self._prompt_token_ids}, " - f"prompt_embeds.shape=" - f"{getattr(self._prompt_embeds, 'shape', None)}, " - f"output_token_ids={self.output_token_ids}, " - f"cumulative_logprob={self.cumulative_logprob}, " - f"get_num_computed_tokens={self.get_num_computed_tokens()})") - - -class Sequence: - """Stores the data, status, and block information of a sequence. - - The sequence is constructed from the - [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only) - or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] - (for encoder-decoder) instance passed in through the `inputs` - constructor argument. - - Args: - seq_id: The ID of the sequence. - inputs: The inputs of the sequence. - block_size: The block size of the sequence. Should be the same as the - block size used by the block manager and cache engine. - eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. - lora_request: LoRA request. 
- """ - - def __init__( - self, - seq_id: int, - inputs: SingletonInputs, - block_size: int, - eos_token_id: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, - ) -> None: - self.seq_id = seq_id - self.inputs = inputs - self.block_size = block_size - self.eos_token_id = eos_token_id - self.lora_request = lora_request - - self.data = SequenceData.from_seqs( - self.prompt_token_ids, - prompt_embeds=self.inputs["prompt_embeds"] - if self.inputs["type"] == "embeds" else None) - self.output_logprobs: SampleLogprobs = [] - self.output_text = "" - - self.status = SequenceStatus.WAITING - self.stop_reason: Union[int, str, None] = None - - # These are used to keep track of delta outputs - self._last_output_token_ids_offset: int = 0 - self._last_output_text_offset: int = 0 - - # Used for incremental detokenization - self.prefix_offset = 0 - self.read_offset = 0 - # Input + output tokens - self.tokens: Optional[list[str]] = None - - @property - def n_blocks(self) -> int: - return (self.get_len() + self.block_size - 1) // self.block_size - - @property - def prompt(self) -> Optional[str]: - if self.inputs["type"] == "embeds": - return None - return self.inputs.get("prompt") - - @property - def prompt_token_ids(self) -> list[int]: - if self.inputs["type"] == "embeds": - return [0] * len(self.inputs["prompt_embeds"]) - return self.inputs["prompt_token_ids"] - - @property - def multi_modal_data(self) -> MultiModalKwargs: - if self.inputs["type"] == "multimodal": - return self.inputs["mm_kwargs"].get_data() - - return MultiModalKwargs() - - @property - def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - if self.inputs["type"] == "multimodal": - return self.inputs["mm_placeholders"] - - return {} - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def get_output_text_to_return(self, buffer_length: int, - delta: bool) -> str: - """If delta is True, only new text since the last call to - this method is returned""" - - # We return the full output text if the sequence is finished. - truncate = buffer_length and not self.is_finished() - if not delta: - return self.output_text[:-buffer_length] if truncate else ( - self.output_text) - length = len(self.output_text) - if truncate: - length -= buffer_length - last_offset = self._last_output_text_offset - if last_offset < length: - self._last_output_text_offset = length - return self.output_text[last_offset:length] - return "" - - def get_output_token_ids_to_return( - self, delta: bool) -> Union[GenericSequence[int], int]: - """If delta is True, only new tokens since the last call to - this method are returned""" - if not delta: - return self.get_output_token_ids() - - output_len = self.get_output_len() - - # Get the number of new tokens - num_new_tokens = output_len - self._last_output_token_ids_offset - self._last_output_token_ids_offset = output_len - - # Return new tokens - if num_new_tokens == 1: - # Optimization for single decode token case - # (which is what we have most of the time) - return self.data._cached_all_token_ids[-1] - - if num_new_tokens == 0: - return [] - - return self.data._cached_all_token_ids[-num_new_tokens:] - - def hash_of_block(self, logical_idx: int) -> int: - # TODO This can produce incorrect hash when block size > prompt size - - # Compute the number of tokens in the sequence - # TODO: The current hashing function is O(L^2). We should optimize - # this in the future. 
- num_tokens = self.num_hashed_tokens_of_block(logical_idx) - hashed_tokens = self.data.get_prefix_token_ids(num_tokens) - return hash((hashed_tokens, self.lora_int_id)) - - def extra_hash(self) -> Optional[int]: - """ - This function computes an extra hash for a sequence, specifically - designed for prefix caching mode. The final sequence hash is determined - by applying token_ids from the sequence's blocks. - """ - if self.lora_int_id == 0: - return None - - # NOTE: If there are additional factors influencing the block aside from - # token_ids, include them as input parameters to the hash. - return hash(self.lora_int_id) - - def num_hashed_tokens_of_block(self, logical_idx: int): - return logical_idx * self.block_size + self.block_size - - def reset_state_for_recompute(self): - """Reset the sequence states for recomputation.""" - self.data.reset_state_for_recompute() - - def append_token_id(self, - token_id: int, - logprobs: dict[int, Logprob], - token_embed: Optional[torch.Tensor] = None) -> None: - assert token_id in logprobs - self.output_logprobs.append(logprobs) - self.data.append_token_id(token_id, logprobs[token_id].logprob, - token_embed) - - def get_len(self) -> int: - return self.data.get_len() - - def get_prompt_len(self) -> int: - return self.data.get_prompt_len() - - def get_output_len(self) -> int: - return self.data.get_output_len() - - def get_token_ids(self) -> list[int]: - return self.data.get_token_ids() - - def get_prompt_token_ids(self) -> tuple[int, ...]: - return self.data.get_prompt_token_ids() - - def get_last_token_id(self) -> int: - return self.data.get_last_token_id() - - def get_output_token_ids(self) -> tuple[int, ...]: - return self.data.get_output_token_ids() - - def get_cumulative_logprob(self) -> float: - return self.data.cumulative_logprob - - def is_finished(self) -> bool: - return SequenceStatus.is_finished(self.status) - - def fork(self, new_seq_id: int) -> "Sequence": - new_seq = copy.deepcopy(self) - new_seq.seq_id = new_seq_id - return new_seq - - def get_num_new_tokens(self) -> int: - """Get the number of new tokens to be computed. - - Returns: - The new number of tokens to be computed. I.e., 1 for decode, or - the remaining prompt size for prefill. - """ - if self.data.stage == SequenceStage.DECODE: - return 1 - return self.data.get_num_uncomputed_tokens() - - def get_num_computed_tokens(self) -> int: - return self.data.get_num_computed_tokens() - - def is_prefill(self) -> bool: - return self.data.stage == SequenceStage.PREFILL - - def __repr__(self) -> str: - return (f"Sequence(seq_id={self.seq_id}, " - f"status={self.status.name}, " - f"num_blocks={self.n_blocks})") - - -class SequenceGroupState(msgspec.Struct, - omit_defaults=True): # type: ignore[call-arg] - """Mutable state tied to a specific sequence group""" - - # for multi-step decoding - num_steps: int = 1 - current_step: int = 0 - - @property - def remaining_steps(self) -> int: - return self.num_steps - self.current_step - - -class SequenceGroup: - """A group of sequences that are generated from the same prompt. - - Args: - request_id: The ID of the request. - seqs: The list of sequences. - sampling_params: The sampling parameters used to generate the outputs. - arrival_time: The arrival time of the request. - lora_request: LoRA request. - pooling_params: The parameters used to generate the pooler - for a pooling model. - pooled_data: The extracted hidden states from a pooling model. - encoder_seq: Optional, the single encoder sequence. 
Should be None - unless you are working with an encoder/decoder model. - trace_headers: OpenTelemetry trace headers. - priority: User-defined priority of the request. - draft_size: The number of speculative tokens plus one from the target - model; equal to max number of tokens a step can generate - for single-draft speculative decoding but larger than - that for multi-draft SD (currently not supported). - """ - - def __init__(self, - request_id: str, - seqs: list[Sequence], - arrival_time: float, - sampling_params: Optional[SamplingParams] = None, - lora_request: Optional[LoRARequest] = None, - pooling_params: Optional[PoolingParams] = None, - pooled_data: Optional[torch.Tensor] = None, - encoder_seq: Optional[Sequence] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - draft_size: int = 1) -> None: - self.request_id = request_id - self.seqs = seqs - self.first_seq = seqs[0] - self.arrival_time = arrival_time - self.is_single_seq = len(seqs) == 1 - self.seqs_dict = {seq.seq_id: seq for seq in seqs} - - self.sampling_params = sampling_params - self.metrics = RequestMetrics(arrival_time=arrival_time, - last_token_time=arrival_time, - first_scheduled_time=None, - first_token_time=None, - time_in_queue=None) - self.last_token_latency = 0.0 - self.lora_request = lora_request - self.prompt_logprobs: Optional[PromptLogprobs] = None - self.state = SequenceGroupState() - self.pooling_params = pooling_params - self.pooled_data = pooled_data - self.encoder_seq = encoder_seq - self.trace_headers = trace_headers - self.priority = priority - - self.cached_request_output = None - - @property - def prompt(self) -> Optional[str]: - return self.first_seq.prompt - - @property - def prompt_token_ids(self) -> list[int]: - return self.first_seq.prompt_token_ids - - @property - def encoder_prompt(self) -> Optional[str]: - # There are either 0 or 1 encoder sequences - # If one is present, its prompt is distinct - # from the decoder's. - return (self.encoder_seq.prompt - if self.encoder_seq is not None else None) - - @property - def encoder_prompt_token_ids(self) -> Optional[list[int]]: - # There are either 0 or 1 encoder sequences - # If one is present, its prompt token ids are - # distinct from the decoder's. - return (self.encoder_seq.prompt_token_ids - if self.encoder_seq is not None else None) - - @property - def multi_modal_data(self) -> MultiModalKwargs: - if self.first_seq.multi_modal_data: - return self.first_seq.multi_modal_data - elif self.encoder_seq is not None: - return self.encoder_seq.multi_modal_data - return MultiModalKwargs() - - @property - def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - if self.first_seq.multi_modal_data: - return self.first_seq.multi_modal_placeholders - elif self.encoder_seq is not None: - return self.encoder_seq.multi_modal_placeholders - return {} - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def set_last_token_time(self, now: float) -> None: - """Sets the last token time for Request level timings.""" - # If still in prefill phase, assertion fails. 
- assert not self.is_prefill(), ( - "seq_group.set_last_token_time() should not be called " - "if the seq_group is in prefill phase.") - self.last_token_latency = now - self.metrics.last_token_time - self.metrics.last_token_time = now - - def get_last_token_latency(self) -> float: - """Returns the latency of the last token.""" - assert not self.is_prefill(), ( - "seq_group.get_last_token_latency() should not be called " - "if the seq_group is in prefill phase.") - return self.last_token_latency - - def maybe_set_first_token_time(self, time: float) -> None: - """Sets the first token time for Request level timings.""" - # Note: in a case where a sequence_group is swapped and - # recomputed, the time between iterations is counted - # in TPOT, rather than recalculating TTFT (since from the ) - # POV of the user, there is simply a long generation delay. - if (self.metrics.first_token_time is None - and self.first_seq.get_output_len() == 1): - self.metrics.first_token_time = time - - def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request - level timings.""" - if self.metrics.first_scheduled_time is None: - self.metrics.first_scheduled_time = time - self.metrics.time_in_queue = time - self.metrics.arrival_time - - def set_finished_time(self, time: Optional[float]) -> None: - """Sets the finished time for Request level timings.""" - self.metrics.finished_time = time - - def get_max_num_running_seqs(self) -> int: - """The maximum number of sequences running in parallel in the remaining - lifetime of the request.""" - if self.is_single_seq: - return 0 if self.first_seq.is_finished() else 1 - return self.num_seqs() - self.num_finished_seqs() - - def get_seqs( - self, - status: Optional[SequenceStatus] = None, - ) -> list[Sequence]: - if status is None: - return self.seqs - - if self.is_single_seq: - return self.seqs if self.first_seq.status == status else [] - - return [seq for seq in self.seqs if seq.status == status] - - def is_encoder_decoder(self) -> bool: - return self.encoder_seq is not None - - def get_encoder_seq(self) -> Optional[Sequence]: - return self.encoder_seq - - def get_finished_seqs(self) -> list[Sequence]: - if self.is_single_seq: - return self.seqs if self.first_seq.is_finished() else [] - - return [seq for seq in self.seqs if seq.is_finished()] - - def update_num_computed_tokens(self, num_new_computed_tokens: int): - """Update number of tokens computed so far.""" - for seq in self.seqs: - if not seq.is_finished(): - seq.data.update_num_computed_tokens(num_new_computed_tokens) - - def get_num_uncomputed_tokens(self) -> int: - num_uncomputed_tokens = 0 - for seq in self.seqs: - if not seq.is_finished(): - num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() - return num_uncomputed_tokens - - def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: - # Optimization. We don't need to call get_seqs if we don't need to - # filter by states. 
- if status is None: - return len(self.seqs) - - if self.is_single_seq: - return 1 if self.seqs[0].status == status else 0 - - return len(self.get_seqs(status)) - - def num_finished_seqs(self) -> int: - if self.is_single_seq: - return 1 if self.seqs[0].is_finished() else 0 - return len(self.get_finished_seqs()) - - def is_finished(self) -> bool: - if self.is_single_seq: - return self.first_seq.is_finished() - return all(seq.is_finished() for seq in self.seqs) - - def is_prefill(self) -> bool: - return self.first_seq.is_prefill() - - def __repr__(self) -> str: - return (f"SequenceGroup(request_id={self.request_id}, " - f"sampling_params={self.sampling_params}, " - f"num_seqs={len(self.seqs)})") - - def uses_prompt_embeds(self) -> bool: - """Returns True if the sequence group uses input embeds.""" - return any(seq.data.prompt_embeds is not None for seq in self.seqs) - - -class SequenceGroupMetadataDelta( - msgspec.Struct, - tag=True, # type: ignore[call-arg] - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Delta of SequenceGroupMetadata. - - After sending the first SequenceGroupMetadata, vLLM scheduler - only sends delta to reduce the data payload size. - """ - seq_data_delta: dict[int, SequenceDataDelta] - request_id: str - block_tables: dict[int, list[int]] - is_prompt: bool - do_sample: bool = True - token_chunk_size: Optional[int] = None - computed_block_nums: Optional[list[int]] = None - state: Optional[SequenceGroupState] = msgspec.field( - default_factory=lambda: SequenceGroupState()) - - -class SequenceGroupMetadata( - msgspec.Struct, - tag=True, # type: ignore[call-arg] - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Metadata for a sequence group. Used to create `AttentionMetadata`. - - Attributes: - request_id: The ID of the request. - is_prompt: Whether the request is at prompt stage. - seq_data: The sequence data. (Seq id -> sequence data) - sampling_params: The sampling parameters used to generate the outputs. - block_tables: The block tables. (Seq id -> list of physical block - numbers) - do_sample: True if sampling is required. Sampling is not required when - e.g., prefill is chunked, and the current iteration only computes - query tokens for prefill, we don't need sampling. - pooling_params: Pooling parameters. - lora_request: LoRA request. - computed_block_nums: The block numbers that are already computed, - used in prefix caching. - state: Internal state tied to this sequence group. - token_type_ids: Token type IDs. - multi_modal_data: Multi modal data. - multi_modal_placeholders: Multi modal placeholders. - encoder_seq_data: Optional sequence data for encoder prompt - (SequenceGroup.encoder_seq). Should be None - unless you are working with an encoder/decoder - model. - cross_block_table: Optional cross-attention block table associated - with the encoder prompt - (SequenceGroup.encoder_seq). Should be None - unless you are working with an encoder/decoder - model. 
- """ - - request_id: str - is_prompt: bool - seq_data: dict[int, SequenceData] - sampling_params: Optional[SamplingParams] - block_tables: dict[int, list[int]] - do_sample: bool = True - pooling_params: Optional[PoolingParams] = None - lora_request: Optional[LoRARequest] = None - computed_block_nums: Optional[list[int]] = None - state: Optional[SequenceGroupState] = msgspec.field( - default_factory=lambda: SequenceGroupState()) - multi_modal_data: Optional[MultiModalKwargs] = None - multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None - encoder_seq_data: Optional[SequenceData] = None - cross_block_table: Optional[list[int]] = None - token_chunk_size: Optional[int] = None - - ### Stateful fields that are lazily defined. ### - # The number of speculative tokens adopted in this request. - # None means specuative decoding is not used. - # Zero means speculative decoding is disabled for some reasons. - # TODO: We should maintain this states out of the sequence group. - num_speculative_tokens: Optional[int] = None - - def __post_init__(self): - if self.seq_data is not None and self.token_chunk_size is None: - if self.is_prompt: - self.token_chunk_size = next(iter( - self.seq_data.values())).get_len() - else: - self.token_chunk_size = 1 - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - # Multi-Step Chunked-Prefill property - @property - def is_single_step_prompt(self) -> bool: - # do_sample is true, only when the token_chunk_size matches the - # num_uncomputed_tokens of the sequence. This indicates that - # the prompt will finish processing in a single `execute_model` - # step. - return self.is_prompt and self.do_sample - - def get_first_seq_id(self) -> int: - # This is an efficient way of fetching the seq_id when - # we know this SequenceGroup has only one sequence. - return next(iter(self.seq_data)) - - def apply_delta(self, - sequence_group_metadata_delta: SequenceGroupMetadataDelta): - for id, delta in sequence_group_metadata_delta.seq_data_delta.items(): - self.seq_data[id].apply_delta(delta) - assert self.request_id == sequence_group_metadata_delta.request_id - self.block_tables = sequence_group_metadata_delta.block_tables - self.token_chunk_size = sequence_group_metadata_delta.token_chunk_size - self.do_sample = sequence_group_metadata_delta.do_sample - self.is_prompt = sequence_group_metadata_delta.is_prompt - - def finish_step(self) -> None: - assert self.state is not None - assert self.state.current_step < self.state.num_steps, \ - f"current step {self.state.current_step}, num_steps {self.state.num_steps}" # noqa - self.state.current_step += 1 - - -class SequenceOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """The model output associated with a sequence. - - Attributes: - parent_seq_id: The ID of the parent sequence (for forking in beam - search). - output_token: The output token ID. - logprobs: The logprobs of the output token. - (Token id -> logP(x_i+1 | x_0, ..., x_i)) - output_embed: Optional output embedding tensor. 
- """ - parent_seq_id: int - output_token: int - logprobs: dict[int, Logprob] - output_embed: Optional[torch.Tensor] = None - - def __repr__(self) -> str: - output_embed_shape = \ - self.output_embed.shape if self.output_embed is not None else None - return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " - f"output_token={self.output_token}, " - f"output_embed.shape={output_embed_shape}, " - f"logprobs={self.logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SequenceOutput): - raise NotImplementedError() - equal = (self.parent_seq_id == other.parent_seq_id - and self.output_token == other.output_token) - log_probs_equal = other.logprobs == self.logprobs - return equal and log_probs_equal - - -class SequenceGroupOutput(ABC): - """The base class for model outputs associated with a sequence group.""" - - @abstractmethod - def __repr__(self) -> str: - pass - - @abstractmethod - def __eq__(self, other: object) -> bool: - pass - - -class CompletionSequenceGroupOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """The model output associated with a completion sequence group.""" - __metaclass__ = SequenceGroupOutput - samples: list[SequenceOutput] - # Prompt logprob for each prompt query token. - prompt_logprobs: Optional[PromptLogprobs] - step_index: Optional[int] = 0 - - def __repr__(self) -> str: - return (f"CompletionSequenceGroupOutput(samples={self.samples}, " - f"prompt_logprobs={self.prompt_logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, CompletionSequenceGroupOutput): - raise NotImplementedError() - return (self.samples == other.samples - and self.prompt_logprobs == other.prompt_logprobs) - - class PoolingSequenceGroupOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True, # type: ignore[call-arg] ): """The model output associated with a pooling sequence group.""" - __metaclass__ = SequenceGroupOutput # Annotated as Any to be compatible with msgspec # The actual type is in SequenceGroup.pooled_data data: Any @@ -1161,305 +143,9 @@ class PoolerOutput( self.__class__) and self.outputs == other.outputs -def get_all_seq_ids( - seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]: - """Given a list of SequenceGroupMetadata, create a list of all - sequence ids. - """ - return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] - - -def get_all_seq_ids_and_request_ids( - seq_group_metadata_list: list[SequenceGroupMetadata] -) -> tuple[list[int], dict[str, set[int]]]: - """Given a list of SequenceGroupMetadata, create a list of all - sequence ids. - """ - seq_ids: list[int] = [] - request_id_seq_ids_mapping: defaultdict[str, set[int]] = defaultdict(set) - for sg in seq_group_metadata_list: - for seq_id in sg.seq_data: - seq_ids.append(seq_id) - request_id_seq_ids_mapping[sg.request_id].add(seq_id) - return seq_ids, request_id_seq_ids_mapping - - -class HiddenStates(msgspec.Struct, array_like=True, - omit_defaults=True): # type: ignore[call-arg] - """Hidden states corresponding to in-progress sequences. - Used in speculative decoding to pass hidden states from - the target model to the proposer model. - - seq_ids are the sequence ids of each entry of the batch - dimension of the hidden_states tensor""" - # Scorer hidden states. For prefill step, it is used for hidden states of - # all tokens, whereas for decode step, it is used for last accepted tokens. 
- hidden_states: torch.Tensor - # The sequence group metadata list. Only needed for decode step. - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None - # Scorer hidden states of the 2nd last token proposed by the proposer ( - # irrespective of whether it was accepted or not). Only used for cases when - # last proposed token is accepted (i.e., in case of bonus tokens). For the - # case of no bonus tokens, these are ignored. - second_last_token_hidden_states: Optional[torch.Tensor] = None - - _seq_ids: list[int] = msgspec.field(default_factory=list) - - def __post_init__(self): - if self.seq_group_metadata_list is not None: - assert len(self.seq_group_metadata_list) == len(self.hidden_states) - self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list) - - @property - def seq_ids(self) -> list[int]: - return self._seq_ids - - def update(self, - hidden_states: torch.Tensor, - seq_group_metadata_list: list[SequenceGroupMetadata], - second_last_token_hidden_states: Optional[torch.Tensor] = None): - """Update hidden states from target model invocation. Only used for - decode steps""" - assert len(seq_group_metadata_list) == len(hidden_states) - self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) - self.hidden_states = torch.cat([self.hidden_states, hidden_states]) - - if self.second_last_token_hidden_states is not None: - # Adding dummy hidden_states to this to maintain same shape - self.second_last_token_hidden_states = torch.cat([ - self.second_last_token_hidden_states, - torch.zeros_like(hidden_states) - if second_last_token_hidden_states is None else - second_last_token_hidden_states - ]) - - def prune(self, - seq_group_metadata_list: list[SequenceGroupMetadata]) -> None: - """Prune to provided list of sequence ids. Only used for decode steps. - """ - # Currently this prunes all seq_ids not present in - # seq_group_metadata_list which might cause problems where a sequence - # may be "paused" then "resumed" later. This should only prune sequences - # which are confirmed to be aborted. - seq_ids = get_all_seq_ids(seq_group_metadata_list) - # Only keep sequence IDs that exist in self._seq_ids - seq_ids = [seq_id for seq_id in seq_ids if seq_id in self._seq_ids] - if seq_ids != self._seq_ids: - # Batch contents changed - prune removed sequences. - index = [self._seq_ids.index(seq_id) for seq_id in seq_ids] - self.hidden_states = self.hidden_states[index] - if self.second_last_token_hidden_states is not None: - self.second_last_token_hidden_states = self\ - .second_last_token_hidden_states[index] - self._seq_ids = seq_ids - - def expand_with_bonus_tokens( - self, seq_with_bonus_token_in_last_step: set) -> None: - """Expand hidden states for sequences with bonus tokens. This is in - alignment with `MultiStepWorker._expand_execute_model_request`.""" - if self.second_last_token_hidden_states is None \ - or not seq_with_bonus_token_in_last_step: - return - - index = [] - for seq_id in self._seq_ids: - i = self._seq_ids.index(seq_id) - if seq_id in seq_with_bonus_token_in_last_step: - index.append(i + len(self._seq_ids)) - index.append(i) - - self.hidden_states = torch.cat( - [self.hidden_states, self.second_last_token_hidden_states])[index] - - class ExecuteModelRequest( msgspec.Struct, array_like=True, # type: ignore[call-arg] omit_defaults=True): # type: ignore[call-arg] - """The model execution request, containing CPU metadata only. The LLM - engine should create an instance of this class for each request batch.""" - # The sequence group metadata list. 
- seq_group_metadata_list: list[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]] - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: list[tuple[int, - int]] = msgspec.field(default_factory=list) - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: list[tuple[int, - int]] = msgspec.field(default_factory=list) - # Blocks to copy. Source to dest block. - blocks_to_copy: list[tuple[int, int]] = msgspec.field(default_factory=list) - # Virtual engine ID for pipeline parallel. - virtual_engine: int = 0 - # The number of slots for lookahead decoding. - num_lookahead_slots: int = 0 - # The number of requests in the running queue. - running_queue_size: int = 0 - # Optional hidden states from prior step. - previous_hidden_states: Optional[HiddenStates] = None - # The number of forward steps to run. - num_steps: int = 1 - # Finished request ids since last step. - finished_requests_ids: list[str] = msgspec.field(default_factory=list) - # The last sampled token ids for multi step decoding. - last_sampled_token_ids: Optional[torch.Tensor] = None - # Async callback - async_callback: Optional[Callable] = None - - @property - def is_last_step(self) -> bool: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - first_seq_group = self.seq_group_metadata_list[0] - assert first_seq_group.state is not None - return first_seq_group.state.remaining_steps == 1 - - @property - def current_step(self) -> int: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - state = self.seq_group_metadata_list[0].state - assert state is not None - return state.current_step - - def clone( - self, seq_group_metadata_list: list[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]] - ) -> "ExecuteModelRequest": - """Clone the request with a new sequence group metadata list.""" - return ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=self.blocks_to_swap_in.copy(), - blocks_to_swap_out=self.blocks_to_swap_out.copy(), - blocks_to_copy=self.blocks_to_copy.copy(), - virtual_engine=self.virtual_engine, - num_lookahead_slots=self.num_lookahead_slots, - running_queue_size=self.running_queue_size, - previous_hidden_states=self.previous_hidden_states, - num_steps=self.num_steps, - finished_requests_ids=self.finished_requests_ids, - last_sampled_token_ids=self.last_sampled_token_ids.clone() - if self.last_sampled_token_ids is not None else None, - async_callback=self.async_callback) - - -@dataclass -class SequenceGroupBase: - group_id: str # the original request id before splitting - - assembled_seq_group: Optional[SequenceGroup] = None - - # seq id to a unique index inside this group - seq_id_to_index: dict[str, int] = field(default_factory=dict) - - # seq ids to be finished - to_be_finished: dict[str, SequenceGroup] = field(default_factory=dict) - - # seq id to finished sequences - finished_reqs: dict[str, SequenceGroup] = field(default_factory=dict) - - streaming: bool = False - - output_produced: bool = False - - @staticmethod - def add_request(request_id: str, engine, params, *args, **kwargs): - """When we are ready to add a request with request_id and params - into the engine, we can split the request into multiple requests. - """ - raise NotImplementedError - - def finish_seq(self, seq: SequenceGroup): - """The sequence `seq` finishes, we should record the information. 
- """ - del self.to_be_finished[seq.request_id] - self.finished_reqs[seq.request_id] = seq - - def maybe_assemble_group( - self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: - """Assemble the sequence group, for producing the final - output, or adding request in the engine again. - """ - raise NotImplementedError - - -class ParallelSampleSequenceGroup(SequenceGroupBase): - - @staticmethod - def add_request(request_id: str, engine, params, **kwargs): - original_params = params - group = ParallelSampleSequenceGroup(request_id) - seqs = [] - for i in range(original_params.n): - request_id_i = f"{request_id}_parallel_sample_{i}" - group.seq_id_to_index[request_id_i] = i - params = original_params.clone() - params.n = 1 - if params.seed is not None: - params.seed += i - seq_group = engine._add_processed_request( - request_id_i, - params=params, - **kwargs, - ) # type: ignore - assert seq_group is not None - engine.seq_id_to_seq_group[request_id_i] = group - group.to_be_finished[request_id_i] = seq_group - seqs.append(seq_group.seqs[0]) - - # for parallel sampling, the `assembled_seq_group` is always - # available, since we have all the sequences ready, and they - # will not change. - group.assembled_seq_group = SequenceGroup( - request_id=request_id, - seqs=seqs, - arrival_time=seq_group.arrival_time, - sampling_params=original_params, - lora_request=seq_group.lora_request, - pooling_params=seq_group.pooling_params, - pooled_data=seq_group.pooled_data, - encoder_seq=seq_group.encoder_seq, - trace_headers=seq_group.trace_headers, - priority=seq_group.priority, - ) - - group.streaming = params.output_kind == RequestOutputKind.DELTA - group.output_produced = False - - def maybe_assemble_group( - self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: - - # in the streaming mode, we will return the assembled sequence - # for the first remaining sequence, and then return None for the - # rest of sequences - if self.streaming: - first_remaining_id = next(iter(self.to_be_finished)) - if seq_group.request_id == first_remaining_id: - return self.assembled_seq_group - return None - - # in the non-streaming mode, we will return the assembled sequence - # when the last sequences finishes, and then return None for the - # rest of the time - if (len(self.to_be_finished) == 1 - and seq_group.request_id in self.to_be_finished - and seq_group.is_finished()): - assert self.assembled_seq_group is not None - params = self.assembled_seq_group.sampling_params - assert isinstance(params, SamplingParams) - if not self.output_produced: - self.output_produced = True - if params._real_n is not None: - # Get the top-n sequences. - n = params._real_n or params.n - seqs = self.assembled_seq_group.seqs - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] - self.assembled_seq_group.seqs = top_n_seqs - return self.assembled_seq_group - if self.output_produced: - return None - return None + # Placeholder. Remove. 
+ pass diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py deleted file mode 100644 index e2d2846a28073..0000000000000 --- a/vllm/transformers_utils/detokenizer.py +++ /dev/null @@ -1,162 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -from vllm.logprobs import Logprob -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence, - SequenceGroup) - -from .detokenizer_utils import (convert_prompt_ids_to_tokens, - detokenize_incrementally) -from .tokenizer import AnyTokenizer - - -class Detokenizer: - """Provides methods to decode the output of a model into text.""" - - def __init__(self, tokenizer: AnyTokenizer): - self.tokenizer = tokenizer - - def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, - prompt_logprobs: list[Optional[dict[ - int, Logprob]]], - position_offset: int) -> None: - """Decodes the logprobs for the prompt of a sequence group. - - Args: - seq_group: The sequence group to decode. - prompt_logprobs: The logprobs to decode. - position_offset: Offset of the first index of the logprobs - relative to the start of the sequence (for chunked prefill). - - Returns: - The prompt logprobs with the decoded tokens. - """ - prms = seq_group.sampling_params - assert prms is not None - - # We can pick any sequence for the prompt. - seq = seq_group.get_seqs()[0] - # Only prompt, without the generated token. - all_token_ids = seq.get_token_ids() - prompt_token_ids = all_token_ids[:-1] - prefix_offset = 0 - read_offset = 0 - next_iter_prefix_offset = 0 - next_iter_read_offset = 0 - next_iter_tokens: list[str] = [] - prev_tokens = None - - for token_position_in_logprob, prompt_logprobs_for_token in enumerate( - prompt_logprobs): - - # Absolute token position equals the index in the logprobs - # list plus the offset of the entire logprobs list relative - # to the start of the sequence. - token_position = token_position_in_logprob + position_offset - if not prompt_logprobs_for_token: - continue - for token_id, sample_logprob in prompt_logprobs_for_token.items(): - if (sample_logprob.decoded_token is None - and token_id != VLLM_INVALID_TOKEN_ID): - prompt_token_ids_with_token = ( - prompt_token_ids[:token_position] + [token_id]) - (new_tokens, new_text, new_prefix_offset, - new_read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=prompt_token_ids_with_token, - prev_tokens=prev_tokens, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) - - sample_logprob.decoded_token = new_text - - # Use the offsets & prev tokens corresponding to - # real tokens to ensure detokenization is consistent - # actual with prompt. - if token_id == all_token_ids[token_position]: - next_iter_prefix_offset = new_prefix_offset - next_iter_read_offset = new_read_offset - next_iter_tokens = new_tokens - - # Advance to the next token position. - prefix_offset = next_iter_prefix_offset - read_offset = next_iter_read_offset - if prev_tokens is None: - prev_tokens = next_iter_tokens.copy() - else: - prev_tokens.extend(next_iter_tokens) - - def decode_sequence_inplace(self, seq: Sequence, - prms: SamplingParams) -> int: - """Decodes the new token for a sequence. In-place operation. - - Args: - seq: The sequence to decode. - prms: The sampling parameters used to generate the sequence. 
- - Returns: - The number of characters added to the output text. - """ - all_input_ids = seq.get_token_ids() - token_id_generated_this_iteration = all_input_ids[-1] - - # Convert prompt token IDs to tokens if necessary. - # Do it here so that we don't have to repeat this - # computation for each logprob. - if seq.tokens is None: - (seq.tokens, seq.prefix_offset, - seq.read_offset) = convert_prompt_ids_to_tokens( - tokenizer=self.tokenizer, - prompt_ids=all_input_ids[:-1], - skip_special_tokens=prms.skip_special_tokens, - ) - - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=all_input_ids, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms.spaces_between_special_tokens, - ) - - # Decode logprobs - logprobs = seq.output_logprobs[-1] - if logprobs: - previous_tokens = all_input_ids[:-1] - for token_id, sample_logprob in logprobs.items(): - # If the token was generated this iteration, - # use the provided text. - if token_id == token_id_generated_this_iteration: - sample_logprob.decoded_token = new_decoded_token_text - continue - - if (sample_logprob.decoded_token is None - and token_id != VLLM_INVALID_TOKEN_ID): - all_input_ids_with_logprob = previous_tokens + [token_id] - (_, new_text, _, _) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=all_input_ids_with_logprob, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) - sample_logprob.decoded_token = new_text - - seq.tokens.extend(new_tokens) - seq.prefix_offset = prefix_offset - seq.read_offset = read_offset - seq.output_text += new_decoded_token_text - - return len(new_decoded_token_text) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index eaab976bf7f75..20fabef4f19b9 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -11,12 +11,12 @@ import torch.nn as nn from vllm.config import VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.utils import (enable_trace_function_call_for_thread, resolve_obj_by_qualname, run_method, update_environment_variables, warn_for_unimplemented_methods) +from vllm.v1.outputs import SamplerOutput logger = init_logger(__name__) From 0ff8ebb2d700b2e39457a661ef979b0da2ad73b3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 21 Sep 2025 08:52:32 -0700 Subject: [PATCH 077/185] [V0 Deprecation] Remove async_output_proc, preemption mode, delay factor (#25334) Signed-off-by: Woosuk Kwon --- tests/detokenizer/test_stop_strings.py | 46 ++---------------- .../test_processor_multi_modal_uuids.py | 10 ---- tests/v1/test_oracle.py | 18 ------- vllm/config/__init__.py | 4 -- vllm/config/model.py | 48 +++---------------- vllm/config/scheduler.py | 15 +----- vllm/engine/arg_utils.py | 34 ------------- vllm/entrypoints/llm.py | 4 -- vllm/executor/uniproc_executor.py | 4 -- vllm/platforms/cpu.py | 4 -- vllm/platforms/cuda.py | 10 ---- vllm/platforms/interface.py | 7 --- vllm/platforms/rocm.py | 10 ---- vllm/platforms/tpu.py | 4 -- vllm/platforms/xpu.py | 4 -- 15 files changed, 12 insertions(+), 210 
deletions(-) diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index cb87c44cc3999..46f7d58c438cb 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -32,10 +32,6 @@ def _test_stopping(llm: LLM, assert output.stop_reason == expected_reason -def _set_async_mode(llm, is_async): - llm.llm_engine.scheduler[0].use_async_output_proc = is_async - - def _stop_basic(llm): _test_stopping(llm, stop=["."], @@ -103,40 +99,8 @@ def test_stop_strings(): # async output processing below. llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) - if envs.VLLM_USE_V1: - _stop_basic(llm) - else: - _set_async_mode(llm, True) - _stop_basic(llm) - - _set_async_mode(llm, False) - _stop_basic(llm) - - if envs.VLLM_USE_V1: - _stop_multi_tokens(llm) - else: - _set_async_mode(llm, True) - _stop_multi_tokens(llm) - - _set_async_mode(llm, False) - _stop_multi_tokens(llm) - - if envs.VLLM_USE_V1: - _stop_partial_token(llm) - else: - _set_async_mode(llm, True) - _stop_partial_token(llm) - - _set_async_mode(llm, False) - _stop_partial_token(llm) - - if envs.VLLM_USE_V1: - # FIXME: this does not respect include_in_output=False - # _stop_token_id(llm) - pass - else: - _set_async_mode(llm, True) - _stop_token_id(llm) - - _set_async_mode(llm, False) - _stop_token_id(llm) + _stop_basic(llm) + _stop_multi_tokens(llm) + _stop_partial_token(llm) + # FIXME: this does not respect include_in_output=False + # _stop_token_id(llm) diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py index bdd41eece2317..3a7bcb9571825 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -6,7 +6,6 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig -from vllm.platforms.interface import UnspecifiedPlatform from vllm.sampling_params import SamplingParams from vllm.v1.engine import processor as processor_mod from vllm.v1.engine.processor import Processor @@ -33,15 +32,6 @@ def _mk_processor(monkeypatch, "__post_init__", lambda self, *args: None, raising=True) - monkeypatch.setattr(UnspecifiedPlatform, - "is_async_output_supported", - classmethod(lambda cls, enforce_eager: True), - raising=True) - monkeypatch.setattr( - ModelConfig, - "verify_async_output_proc", - lambda self, parallel_config, speculative_config, device_config: None, - raising=True) monkeypatch.setattr(ModelConfig, "verify_with_parallel_config", lambda self, parallel_config: None, diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 28c24f62895ab..f6b8a18dd7c2c 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -29,24 +29,6 @@ def test_unsupported_configs(monkeypatch): }, ).create_engine_config() - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - preemption_mode="swap", - ).create_engine_config() - - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - disable_async_output_proc=True, - ).create_engine_config() - - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - scheduler_delay_factor=1.2, - ).create_engine_config() - def test_enable_by_default_fallback(monkeypatch): with monkeypatch.context() as m: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ddd8de4324f6f..e31a78ba33baa 100644 --- a/vllm/config/__init__.py +++ 
b/vllm/config/__init__.py @@ -454,9 +454,6 @@ class VllmConfig: self.try_verify_and_update_config() if self.model_config is not None: - self.model_config.verify_async_output_proc(self.parallel_config, - self.speculative_config, - self.device_config) self.model_config.verify_with_parallel_config(self.parallel_config) self.model_config.verify_dual_chunk_attention_config( self.load_config) @@ -877,7 +874,6 @@ class VllmConfig: f"served_model_name={self.model_config.served_model_name}, " f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa - f"use_async_output_proc={self.model_config.use_async_output_proc}, " f"pooler_config={self.model_config.pooler_config!r}, " f"compilation_config={self.compilation_config!r}") diff --git a/vllm/config/model.py b/vllm/config/model.py index 921322bb475c5..b53029dc8c3ec 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -223,8 +223,6 @@ class ModelConfig: that this name(s) will also be used in `model_name` tag content of prometheus metrics, if multiple names provided, metrics tag will take the first one.""" - use_async_output_proc: bool = True - """Whether to use async output processor.""" config_format: Union[str, ConfigFormat] = "auto" """The format of the model config to load:\n - "auto" will try to load the config in hf format if available else it @@ -1119,37 +1117,6 @@ class ModelConfig: raise ValueError("please set VLLM_ATTENTION_BACKEND to " f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") - def verify_async_output_proc(self, parallel_config, speculative_config, - device_config) -> None: - if not self.use_async_output_proc: - # Nothing to check - return - - if parallel_config.pipeline_parallel_size > 1: - self.use_async_output_proc = False - return - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - from vllm.platforms import current_platform - if not current_platform.is_async_output_supported(self.enforce_eager): - self.use_async_output_proc = False - return - - if envs.VLLM_USE_RAY_SPMD_WORKER: - self.use_async_output_proc = False - return - - # Async postprocessor is not necessary for pooling models - # since there is no token generation - if self.runner_type == "pooling": - self.use_async_output_proc = False - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if speculative_config: - self.use_async_output_proc = False - def verify_with_parallel_config( self, parallel_config: ParallelConfig, @@ -1173,15 +1140,12 @@ class ModelConfig: self._verify_with_expert_parallelism() pipeline_parallel_size = parallel_config.pipeline_parallel_size - if pipeline_parallel_size > 1: - if not self.registry.is_pp_supported_model(self.architectures, - self): - raise NotImplementedError( - "Pipeline parallelism is not supported for this model. " - "Supported models implement the `SupportsPP` interface.") - - if self.use_async_output_proc: - self.use_async_output_proc = False + if (pipeline_parallel_size > 1 + and not self.registry.is_pp_supported_model( + self.architectures, self)): + raise NotImplementedError( + "Pipeline parallelism is not supported for this model. 
" + "Supported models implement the `SupportsPP` interface.") def get_sliding_window(self) -> Optional[int]: """Get the sliding window size from the HF text config if present.""" diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index f0f67bab9d6ff..daf094d2df5c8 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -3,7 +3,7 @@ import hashlib from dataclasses import field -from typing import Any, Literal, Optional, Union +from typing import Any, Literal, Union from pydantic import SkipValidation, model_validator from pydantic.dataclasses import dataclass @@ -18,7 +18,6 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, logger = init_logger(__name__) RunnerType = Literal["generate", "pooling", "draft"] -PreemptionMode = Literal["swap", "recompute"] SchedulerPolicy = Literal["fcfs", "priority"] @@ -78,10 +77,6 @@ class SchedulerConfig: 3. more than one value (e.g. 1 2 128) is provided, then the capture list will follow the provided list.""" - delay_factor: float = 0.0 - """Apply a delay (of delay factor multiplied by previous - prompt latency) before scheduling next prompt.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore """If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.""" @@ -103,14 +98,6 @@ class SchedulerConfig: NOTE: This is not currently configurable. It will be overridden by max_num_batched_tokens in case max multimodal embedding size is larger.""" - preemption_mode: Optional[PreemptionMode] = None - """Whether to perform preemption by swapping or - recomputation. If not specified, we determine the mode as follows: - We use recomputation by default since it incurs lower overhead than - swapping. However, when the sequence group has multiple sequences - (e.g., beam search), recomputation is not currently supported. In - such a case, we use swapping instead.""" - send_delta_data: bool = False """Private API. If used, scheduler sends delta data to workers instead of an entire data. It should be enabled only diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 242fcf501bfc4..fef4177b3a333 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -409,9 +409,7 @@ class EngineArgs: get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: Optional[Union[str, List[str]]] = LoadConfig.ignore_patterns - preemption_mode: Optional[str] = SchedulerConfig.preemption_mode - scheduler_delay_factor: float = SchedulerConfig.delay_factor enable_chunked_prefill: Optional[ bool] = SchedulerConfig.enable_chunked_prefill disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input @@ -439,7 +437,6 @@ class EngineArgs: ObservabilityConfig.otlp_traces_endpoint collect_detailed_traces: Optional[list[DetailedTraceModules]] = \ ObservabilityConfig.collect_detailed_traces - disable_async_output_proc: bool = not ModelConfig.use_async_output_proc scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls @@ -561,14 +558,6 @@ class EngineArgs: **model_kwargs["enable_prompt_embeds"]) model_group.add_argument("--served-model-name", **model_kwargs["served_model_name"]) - # This one is a special case because it is the - # opposite of ModelConfig.use_async_output_proc - model_group.add_argument( - "--disable-async-output-proc", - action="store_true", - default=EngineArgs.disable_async_output_proc, - help="Disable async output processing. 
This may result in " - "lower performance.") model_group.add_argument("--config-format", **model_kwargs["config_format"]) # This one is a special case because it can bool @@ -897,10 +886,6 @@ class EngineArgs: **scheduler_kwargs["long_prefill_token_threshold"]) scheduler_group.add_argument("--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]) - scheduler_group.add_argument("--scheduler-delay-factor", - **scheduler_kwargs["delay_factor"]) - scheduler_group.add_argument("--preemption-mode", - **scheduler_kwargs["preemption_mode"]) # multi-step scheduling has been removed; corresponding arguments # are no longer supported. scheduler_group.add_argument("--scheduling-policy", @@ -1029,7 +1014,6 @@ class EngineArgs: interleave_mm_strings=self.interleave_mm_strings, media_io_kwargs=self.media_io_kwargs, skip_mm_profiling=self.skip_mm_profiling, - use_async_output_proc=not self.disable_async_output_proc, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, @@ -1395,11 +1379,9 @@ class EngineArgs: max_model_len=model_config.max_model_len, cuda_graph_sizes=self.cuda_graph_sizes, num_lookahead_slots=num_lookahead_slots, - delay_factor=self.scheduler_delay_factor, enable_chunked_prefill=self.enable_chunked_prefill, disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, - preemption_mode=self.preemption_mode, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), policy=self.scheduling_policy, @@ -1492,22 +1474,6 @@ class EngineArgs: recommend_to_remove=False) return False - if self.preemption_mode != SchedulerConfig.preemption_mode: - _raise_or_fallback(feature_name="--preemption-mode", - recommend_to_remove=True) - return False - - if (self.disable_async_output_proc - != EngineArgs.disable_async_output_proc): - _raise_or_fallback(feature_name="--disable-async-output-proc", - recommend_to_remove=True) - return False - - if self.scheduler_delay_factor != SchedulerConfig.delay_factor: - _raise_or_fallback(feature_name="--scheduler-delay-factor", - recommend_to_remove=True) - return False - # No Mamba or Encoder-Decoder so far. if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0ab806fcb8b5c..092d3f276d1c5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -137,8 +137,6 @@ class LLM: back to the eager mode. disable_custom_all_reduce: See [ParallelConfig][vllm.config.ParallelConfig]. - disable_async_output_proc: Disable async output processing. - This may result in lower performance. hf_token: The token to use as HTTP bearer authorization for remote files . If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
@@ -188,7 +186,6 @@ class LLM: enforce_eager: bool = False, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, - disable_async_output_proc: bool = False, hf_token: Optional[Union[bool, str]] = None, hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, @@ -286,7 +283,6 @@ class LLM: enforce_eager=enforce_eager, max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, - disable_async_output_proc=disable_async_output_proc, hf_token=hf_token, hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 3b566e88a9ec2..7a753d608a43a 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -137,10 +137,6 @@ class ExecutorWithExternalLauncher(UniProcExecutor): def _init_executor(self) -> None: """Initialize the worker and load the model. """ - assert self.vllm_config.scheduler_config.delay_factor == 0.0, \ - ("ExecutorWithExternalLauncher needs deterministic " - "execution, so it" - "does not support delay_factor in scheduling") if envs.VLLM_USE_V1: assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \ ("To get deterministic execution in V1, " diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 544e091491bf5..cd41832bc2ea4 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -126,10 +126,6 @@ class CpuPlatform(Platform): """ torch.cpu.set_device(device) - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return False - @classmethod def inference_mode(cls): return torch.no_grad() diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 87d8f2b7481bb..c263e2afe83b0 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -96,16 +96,6 @@ class CudaPlatformBase(Platform): def get_device_total_memory(cls, device_id: int = 0) -> int: raise NotImplementedError - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - if enforce_eager and not envs.VLLM_USE_V1: - logger.warning( - "To see benefits of async output processing, enable CUDA " - "graph. Since, enforce-eager is enabled, async output " - "processor cannot be used") - return False - return True - @classmethod def is_fully_connected(cls, device_ids: list[int]) -> bool: raise NotImplementedError diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 53fc762dce540..c43580ac5da13 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -275,13 +275,6 @@ class Platform: """Get the total memory of a device in bytes.""" raise NotImplementedError - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - """ - Check if the current platform supports async output. - """ - raise NotImplementedError - @classmethod def inference_mode(cls): """A device-specific wrapper of `torch.inference_mode`. diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 4f540fe965e22..dce2924ac7a9d 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -310,16 +310,6 @@ class RocmPlatform(Platform): device_props = torch.cuda.get_device_properties(device_id) return device_props.total_memory - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - if enforce_eager and not envs.VLLM_USE_V1: - logger.warning( - "To see benefits of async output processing, enable CUDA " - "graph. 
Since, enforce-eager is enabled, async output " - "processor cannot be used") - return False - return True - @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: from vllm.config.compilation import CUDAGraphMode diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 4e4db116abca0..9852d948bc4b8 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -75,10 +75,6 @@ class TpuPlatform(Platform): def get_device_total_memory(cls, device_id: int = 0) -> int: raise NotImplementedError - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return False - @classmethod def get_punica_wrapper(cls) -> str: return "vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPU" diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 67ef058df10f1..4d3bef4b42947 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -98,10 +98,6 @@ class XPUPlatform(Platform): device_props = torch.xpu.get_device_properties(device_id) return device_props.total_memory - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return True - @classmethod def inference_mode(cls): return torch.no_grad() From c438b2951c5adafd4db831605696d097ab3e513a Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Sun, 21 Sep 2025 22:34:45 +0530 Subject: [PATCH 078/185] feat: Enable engine-level arguments with speculators models (#25250) Signed-off-by: Rahul Tuli Co-authored-by: Claude --- .../speculators/test_eagle3.py | 62 ++++++++++++------- vllm/config/model.py | 12 +--- vllm/engine/arg_utils.py | 35 ++++------- vllm/transformers_utils/config.py | 46 +++++++++++--- .../configs/speculators/base.py | 58 +++++++++++------ 5 files changed, 128 insertions(+), 85 deletions(-) diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py index 45ddb2178722a..368238b3a7200 100644 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -3,38 +3,52 @@ import pytest import torch +from vllm.config import SpeculativeConfig from vllm.model_executor.models.interfaces import supports_eagle3 -@pytest.mark.parametrize( - "model_path", - [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) -def test_llama(vllm_runner, example_prompts, model_path, monkeypatch): +@pytest.mark.parametrize("model_path", [ + pytest.param( + "nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized", + id="llama3-eagle3-speculator"), + pytest.param( + "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized", + id="qwen3-eagle3-speculator"), +]) +def test_eagle3_speculators_model(vllm_runner, example_prompts, model_path, + monkeypatch): + """ + Test Eagle3 speculators models properly initialize speculative decoding. + + This test verifies: + 1. Eagle3 support is detected for the model + 2. Speculative config is automatically initialized from embedded config + 3. The draft model path is correctly set to the speculators model + 4. Speculative tokens count is valid + 5. 
Text generation works with speculative decoding enabled + """ # Set environment variable for V1 engine serialization monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + # Verify Eagle3 support is detected eagle3_supported = vllm_model.apply_model(supports_eagle3) - assert eagle3_supported + assert eagle3_supported, f"Eagle3 should be supported for {model_path}" + + vllm_config = vllm_model.llm.llm_engine.vllm_config + + assert isinstance(vllm_config.speculative_config, SpeculativeConfig), \ + "Speculative config should be initialized for speculators model" + + spec_config = vllm_config.speculative_config + assert spec_config.num_speculative_tokens > 0, \ + (f"Expected positive speculative tokens, " + f"got {spec_config.num_speculative_tokens}") + + assert spec_config.model == model_path, \ + f"Draft model should be {model_path}, got {spec_config.model}" vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) - print(vllm_outputs) - assert vllm_outputs - - -@pytest.mark.parametrize( - "model_path", - [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")]) -def test_qwen(vllm_runner, example_prompts, model_path, monkeypatch): - # Set environment variable for V1 engine serialization - monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - - with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: - eagle3_supported = vllm_model.apply_model(supports_eagle3) - assert eagle3_supported - - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens=20) - print(vllm_outputs) - assert vllm_outputs + assert vllm_outputs, \ + f"No outputs generated for speculators model {model_path}" diff --git a/vllm/config/model.py b/vllm/config/model.py index b53029dc8c3ec..95fe52883db0d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -27,8 +27,7 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - is_interleaved, maybe_override_with_speculators_target_model, - try_get_generation_config, try_get_safetensors_metadata, + is_interleaved, try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.runai_utils import (ObjectStorageModel, is_runai_obj_uri) @@ -416,15 +415,6 @@ class ModelConfig: self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) - if self.runner != "draft": - # If we're not running the draft model, check for speculators config - # If speculators config, set model / tokenizer to be target model - self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 - model=self.model, - tokenizer=self.tokenizer, - revision=self.revision, - trust_remote_code=self.trust_remote_code) - if (backend := envs.VLLM_ATTENTION_BACKEND ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: raise ValueError( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fef4177b3a333..7e00260caa39a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -41,7 +41,8 @@ from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 -from vllm.transformers_utils.config import get_model_path, is_interleaved +from 
vllm.transformers_utils.config import (get_model_path, is_interleaved, + maybe_override_with_speculators) from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -1082,29 +1083,8 @@ class EngineArgs: provided as a JSON string input via CLI arguments or directly as a dictionary from the engine. """ - - from vllm.transformers_utils.config import get_config - from vllm.transformers_utils.configs.speculators.base import ( - SpeculatorsConfig) - if self.speculative_config is None: - hf_config = get_config( - self.hf_config_path or target_model_config.model, - self.trust_remote_code, self.revision, self.code_revision, - self.config_format) - - # if loading a SpeculatorsConfig, load the speculative_config - # details from the config directly - # no user input required / expected - if isinstance(hf_config, SpeculatorsConfig): - # We create one since we don't create one - self.speculative_config = {} - self.speculative_config[ - "num_speculative_tokens"] = hf_config.num_lookahead_tokens - self.speculative_config["model"] = target_model_config.model - self.speculative_config["method"] = hf_config.method - else: - return None + return None # Note(Shangming): These parameters are not obtained from the cli arg # '--speculative-config' and must be passed in when creating the engine @@ -1139,6 +1119,15 @@ class EngineArgs: device_config = DeviceConfig( device=cast(Device, current_platform.device_type)) + + (self.model, self.tokenizer, + self.speculative_config) = maybe_override_with_speculators( + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code, + vllm_speculative_config=self.speculative_config, + ) model_config = self.create_model_config() # * If VLLM_USE_V1 is unset, we enable V1 for "supported features" diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 52e2c18a77847..9eed466788661 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -463,15 +463,29 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: return config -def maybe_override_with_speculators_target_model( +def maybe_override_with_speculators( model: str, tokenizer: str, trust_remote_code: bool, revision: Optional[str] = None, + vllm_speculative_config: Optional[dict[str, Any]] = None, **kwargs, -) -> tuple[str, str]: +) -> tuple[str, str, Optional[dict[str, Any]]]: """ - If running a speculators config, override running model with target model + Resolve model configuration when speculators are detected. + + Checks if the provided model is a speculators model and if so, extracts + the target model configuration and builds the speculative config. 
+ + Args: + model: Model name or path + tokenizer: Tokenizer name or path + trust_remote_code: Whether to trust remote code + revision: Model revision + vllm_speculative_config: Existing vLLM speculative config + + Returns: + Tuple of (resolved_model, resolved_tokenizer, speculative_config) """ is_gguf = check_gguf_file(model) if is_gguf: @@ -487,11 +501,27 @@ def maybe_override_with_speculators_target_model( token=_get_hf_token(), **kwargs, ) - spec_config = config_dict.get("speculators_config", None) - # Return the target model - if spec_config is not None: - model = tokenizer = spec_config["verifier"]["name_or_path"] - return model, tokenizer + speculators_config = config_dict.get("speculators_config") + + if speculators_config is None: + # No speculators config found, return original values + return model, tokenizer, vllm_speculative_config + + # Speculators format detected - process overrides + from vllm.transformers_utils.configs.speculators.base import ( + SpeculatorsConfig) + + vllm_speculative_config = SpeculatorsConfig.extract_vllm_speculative_config( + config_dict=config_dict) + + # Set the draft model to the speculators model + vllm_speculative_config["model"] = model + + # Override model and tokenizer with the verifier model from config + verifier_model = speculators_config["verifier"]["name_or_path"] + model = tokenizer = verifier_model + + return model, tokenizer, vllm_speculative_config def get_config( diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index d7c16e180c709..53128b4eecb03 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -24,6 +24,12 @@ class SpeculatorsConfig(PretrainedConfig): config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + vllm_config = cls.extract_vllm_speculative_config(config_dict) + return cls(**vllm_config) + + @classmethod + def extract_vllm_speculative_config( + cls, config_dict: dict[str, Any]) -> dict[str, Any]: speculators_model_type = config_dict.get("speculators_model_type") if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES: raise ValueError( @@ -34,11 +40,12 @@ class SpeculatorsConfig(PretrainedConfig): # TODO: @dsikka - use speculators pydantic model to validate cls.validate_speculators_config(config_dict=config_dict) # Convert from speculators config -> format that can be ingested by vLLM - vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict) + vllm_config = cls.build_vllm_speculative_config( + config_dict=config_dict) # Apply anything specific to the supported algorithm algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type] algo_updater(config_dict=config_dict, vllm_config=vllm_config) - return cls(**vllm_config) + return vllm_config @classmethod def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None: @@ -60,32 +67,45 @@ class SpeculatorsConfig(PretrainedConfig): "'transformer_layer_config' must be a dictionary if provided") @classmethod - def convert_speculators_to_vllm( + def build_vllm_speculative_config( cls, config_dict: dict[str, Any]) -> dict[str, Any]: """ - Convert speculators config format to vLLM format. - - This method handles the translation of field names and structure - between speculators and vLLM formats. 
- - Returns: - Dictionary with vLLM-compatible configuration - """ - # Currently we only support one proposal method - spec_config = config_dict["speculators_config"] - first_method = spec_config.get("proposal_methods")[0] - num_lookahead_tokens = first_method.get("speculative_tokens") + Build vLLM-compatible speculative configuration from speculators format. - if num_lookahead_tokens is None: + This method extracts and transforms speculative configuration from the + speculators format into the structure expected by vLLM. + + Args: + config_dict: Configuration dictionary in speculators format + + Returns: + Dictionary with vLLM-compatible speculative configuration + """ + # Extract speculators configuration + spec_config = config_dict["speculators_config"] + + # Currently we only support one proposal method + proposal_methods = spec_config.get("proposal_methods") + if not proposal_methods: + raise ValueError("No proposal methods found in speculators config") + + first_method = proposal_methods[0] + num_speculative_tokens = first_method.get("speculative_tokens") + + if num_speculative_tokens is None: raise ValueError( "Missing 'speculative_tokens' in proposal method. " f"Got: {first_method}") - # Build base vLLM config + # Build base vLLM speculative configuration vllm_config = { "method": config_dict.get("speculators_model_type"), - "num_lookahead_tokens": num_lookahead_tokens, + "num_speculative_tokens": num_speculative_tokens, "target_model": spec_config.get("verifier")["name_or_path"] } - vllm_config.update(config_dict["transformer_layer_config"]) + + # Merge transformer layer configuration if present + transformer_config = config_dict.get("transformer_layer_config", {}) + vllm_config.update(transformer_config) + return vllm_config From 1c3ffdbeccaf75f7b885b1700a4011fb4f63a7ab Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 21 Sep 2025 10:37:11 -0700 Subject: [PATCH 079/185] [V0 Deprecation] Remove V0 sampling metadata (#25345) Signed-off-by: Woosuk Kwon --- .../vllm_add_dummy_model/my_llava.py | 8 +++----- .../vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py | 8 +++----- vllm/model_executor/__init__.py | 2 -- vllm/model_executor/layers/logits_processor.py | 2 -- vllm/model_executor/models/apertus.py | 5 +---- vllm/model_executor/models/arcee.py | 7 +++---- vllm/model_executor/models/arctic.py | 5 +---- vllm/model_executor/models/aria.py | 7 ++----- vllm/model_executor/models/aya_vision.py | 5 +---- vllm/model_executor/models/baichuan.py | 5 +---- vllm/model_executor/models/bailing_moe.py | 5 +---- vllm/model_executor/models/bamba.py | 5 +---- vllm/model_executor/models/blip2.py | 5 +---- vllm/model_executor/models/bloom.py | 5 +---- vllm/model_executor/models/chameleon.py | 5 +---- vllm/model_executor/models/chatglm.py | 5 +---- vllm/model_executor/models/cohere2_vision.py | 5 +---- vllm/model_executor/models/commandr.py | 6 ++---- vllm/model_executor/models/dbrx.py | 5 +---- vllm/model_executor/models/deepseek.py | 5 +---- vllm/model_executor/models/deepseek_eagle.py | 5 +---- vllm/model_executor/models/deepseek_mtp.py | 9 ++------- vllm/model_executor/models/deepseek_v2.py | 5 +---- vllm/model_executor/models/deepseek_vl2.py | 5 +---- vllm/model_executor/models/dots1.py | 5 +---- vllm/model_executor/models/ernie45_moe.py | 5 +---- vllm/model_executor/models/ernie45_vl.py | 5 +---- vllm/model_executor/models/ernie45_vl_moe.py | 5 +---- vllm/model_executor/models/ernie_mtp.py | 8 ++------ vllm/model_executor/models/exaone.py | 5 +---- vllm/model_executor/models/exaone4.py | 5 
+---- vllm/model_executor/models/falcon.py | 5 +---- vllm/model_executor/models/falcon_h1.py | 5 +---- vllm/model_executor/models/fuyu.py | 4 +--- vllm/model_executor/models/gemma.py | 5 +---- vllm/model_executor/models/gemma2.py | 5 +---- vllm/model_executor/models/gemma3.py | 5 +---- vllm/model_executor/models/gemma3_mm.py | 5 +---- vllm/model_executor/models/gemma3n.py | 5 +---- vllm/model_executor/models/gemma3n_mm.py | 5 +---- vllm/model_executor/models/glm4.py | 5 +---- vllm/model_executor/models/glm4_1v.py | 5 +---- vllm/model_executor/models/glm4_moe.py | 5 +---- vllm/model_executor/models/glm4_moe_mtp.py | 9 ++------- vllm/model_executor/models/gpt2.py | 5 +---- vllm/model_executor/models/gpt_bigcode.py | 5 +---- vllm/model_executor/models/gpt_j.py | 4 +--- vllm/model_executor/models/gpt_neox.py | 5 +---- vllm/model_executor/models/gpt_oss.py | 7 ++----- vllm/model_executor/models/granite.py | 9 +++------ vllm/model_executor/models/granite_speech.py | 7 +------ vllm/model_executor/models/granitemoe.py | 9 +++------ vllm/model_executor/models/granitemoehybrid.py | 5 +---- vllm/model_executor/models/granitemoeshared.py | 9 +++------ vllm/model_executor/models/grok1.py | 5 +---- vllm/model_executor/models/hunyuan_v1.py | 5 +---- vllm/model_executor/models/hyperclovax_vision.py | 5 +---- vllm/model_executor/models/idefics3.py | 7 ++----- vllm/model_executor/models/interfaces_base.py | 3 --- vllm/model_executor/models/internlm2.py | 5 +---- vllm/model_executor/models/interns1.py | 5 +---- vllm/model_executor/models/internvl.py | 5 +---- vllm/model_executor/models/jais.py | 5 +---- vllm/model_executor/models/jamba.py | 5 +---- vllm/model_executor/models/keye.py | 5 +---- vllm/model_executor/models/kimi_vl.py | 5 +---- vllm/model_executor/models/lfm2.py | 7 ++----- vllm/model_executor/models/llama.py | 5 +---- vllm/model_executor/models/llama_eagle3.py | 5 +---- vllm/model_executor/models/llava.py | 5 +---- vllm/model_executor/models/llava_next.py | 5 +---- vllm/model_executor/models/llava_next_video.py | 5 +---- vllm/model_executor/models/llava_onevision.py | 5 +---- vllm/model_executor/models/mamba.py | 7 ++----- vllm/model_executor/models/mamba2.py | 7 ++----- vllm/model_executor/models/medusa.py | 3 +-- vllm/model_executor/models/midashenglm.py | 4 +--- vllm/model_executor/models/mimo.py | 5 +---- vllm/model_executor/models/mimo_mtp.py | 8 ++------ vllm/model_executor/models/minicpm.py | 5 +---- vllm/model_executor/models/minicpm_eagle.py | 5 +---- vllm/model_executor/models/minicpmv.py | 4 +--- vllm/model_executor/models/minimax_text_01.py | 7 ++----- vllm/model_executor/models/minimax_vl_01.py | 5 +---- vllm/model_executor/models/mistral3.py | 5 +---- vllm/model_executor/models/mixtral.py | 5 +---- vllm/model_executor/models/mllama4.py | 5 +---- vllm/model_executor/models/molmo.py | 7 ++----- vllm/model_executor/models/mpt.py | 5 +---- vllm/model_executor/models/nano_nemotron_vl.py | 5 +---- vllm/model_executor/models/nemotron.py | 5 +---- vllm/model_executor/models/nemotron_h.py | 5 +---- vllm/model_executor/models/nemotron_nas.py | 5 +---- vllm/model_executor/models/nemotron_vl.py | 5 +---- vllm/model_executor/models/olmo.py | 5 +---- vllm/model_executor/models/olmo2.py | 5 +---- vllm/model_executor/models/olmoe.py | 7 ++----- vllm/model_executor/models/opt.py | 5 +---- vllm/model_executor/models/orion.py | 5 +---- vllm/model_executor/models/ovis.py | 4 +--- vllm/model_executor/models/ovis2_5.py | 4 +--- vllm/model_executor/models/paligemma.py | 5 +---- 
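Every per-model hunk in this patch applies the same mechanical change: compute_logits drops its SamplingMetadata parameter and the logits processor is invoked with only the LM head and the hidden states. A minimal before/after sketch follows; the class name is a placeholder, not a real vLLM model.

from typing import Optional

import torch


class ToyCausalLM:
    """Placeholder model; real vLLM models keep their own class names."""

    def __init__(self, logits_processor, lm_head):
        self.logits_processor = logits_processor
        self.lm_head = lm_head

    # Before this patch (V0 era), the signature was:
    #   def compute_logits(self, hidden_states, sampling_metadata): ...
    # and sampling_metadata was forwarded to the logits processor.

    def compute_logits(self,
                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
        # After this patch, only the head and hidden states are passed.
        return self.logits_processor(self.lm_head, hidden_states)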
vllm/model_executor/models/persimmon.py | 5 +---- vllm/model_executor/models/phi.py | 4 +--- vllm/model_executor/models/phi3v.py | 5 +---- vllm/model_executor/models/phi4_multimodal.py | 5 +---- vllm/model_executor/models/phi4flash.py | 3 --- vllm/model_executor/models/phi4mm.py | 5 +---- vllm/model_executor/models/phimoe.py | 7 ++----- vllm/model_executor/models/pixtral.py | 5 +---- vllm/model_executor/models/plamo2.py | 5 +---- vllm/model_executor/models/qwen.py | 5 +---- vllm/model_executor/models/qwen2.py | 5 +---- vllm/model_executor/models/qwen2_5_omni_thinker.py | 5 +---- vllm/model_executor/models/qwen2_5_vl.py | 5 +---- vllm/model_executor/models/qwen2_audio.py | 5 +---- vllm/model_executor/models/qwen2_moe.py | 5 +---- vllm/model_executor/models/qwen2_vl.py | 5 +---- vllm/model_executor/models/qwen3.py | 5 +---- vllm/model_executor/models/qwen3_moe.py | 5 +---- vllm/model_executor/models/qwen3_next.py | 5 +---- vllm/model_executor/models/qwen3_next_mtp.py | 5 +---- vllm/model_executor/models/qwen3_vl.py | 5 +---- vllm/model_executor/models/seed_oss.py | 5 +---- vllm/model_executor/models/skyworkr1v.py | 5 +---- vllm/model_executor/models/solar.py | 7 ++----- vllm/model_executor/models/stablelm.py | 5 +---- vllm/model_executor/models/starcoder2.py | 5 +---- vllm/model_executor/models/step3_text.py | 7 ++----- vllm/model_executor/models/step3_vl.py | 5 +---- vllm/model_executor/models/tarsier.py | 5 +---- vllm/model_executor/models/transformers.py | 5 +---- vllm/model_executor/models/ultravox.py | 7 ++----- vllm/model_executor/models/voxtral.py | 5 +---- vllm/model_executor/models/whisper.py | 7 ++----- vllm/model_executor/models/zamba2.py | 5 +---- vllm/model_executor/sampling_metadata.py | 7 ------- vllm/v1/spec_decode/eagle.py | 9 +++------ vllm/v1/spec_decode/medusa.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 9 ++++----- vllm/v1/worker/tpu_model_runner.py | 2 +- 141 files changed, 172 insertions(+), 583 deletions(-) delete mode 100644 vllm/model_executor/sampling_metadata.py diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index da97cf7e2b40b..b431ad1ed0928 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -9,7 +9,6 @@ from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder, LlavaForConditionalGeneration, LlavaMultiModalProcessor, LlavaProcessingInfo) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -18,11 +17,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY dummy_inputs=LlavaDummyInputsBuilder) class MyLlava(LlavaForConditionalGeneration): - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) + logits = super().compute_logits(hidden_states) if logits is not None: logits.zero_() logits[:, 0] += 1.0 diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py index 8c34407e3e071..a6fafff98e9c5 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py +++ 
b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py @@ -6,16 +6,14 @@ from typing import Optional import torch from vllm.model_executor.models.opt import OPTForCausalLM -from vllm.model_executor.sampling_metadata import SamplingMetadata class MyOPTForCausalLM(OPTForCausalLM): - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) + logits = super().compute_logits(hidden_states) if logits is not None: logits.zero_() logits[:, 0] += 1.0 diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index a59aebfac4ff9..3c094cfdb553f 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -3,11 +3,9 @@ from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed __all__ = [ - "SamplingMetadata", "set_random_seed", "BasevLLMParameter", "PackedvLLMParameter", diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 8226437cb1898..2110aa2769b93 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -10,7 +10,6 @@ from vllm.distributed import (tensor_model_parallel_all_gather, from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform @@ -50,7 +49,6 @@ class LogitsProcessor(CustomOp): self, lm_head: VocabParallelEmbedding, hidden_states: torch.Tensor, - sampling_metadata: Optional[SamplingMetadata] = None, embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[torch.Tensor]: if self.logits_as_input: diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index f6400b05e110a..6dab4ed14345f 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -566,10 +565,8 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index be82c2fd59644..1ee378af76c9f 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -399,11 +399,10 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds=inputs_embeds) return model_output - def compute_logits(self, 
hidden_states: torch.Tensor, - sampling_metadata) -> Optional[torch.Tensor]: + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: # Compute final logits from hidden states (last pipeline rank only) - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b6dd559968415..55d16fd75cebb 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -30,7 +30,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -456,10 +455,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index a7cb6b35a4ab4..35c1adbdd00b6 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -644,10 +643,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 687c82ded9d0a..0f05f9b4efcd6 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -16,7 +16,6 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( get_optimal_tiled_canvas) from vllm.config import VllmConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, @@ -464,7 +463,5 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, def 
compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index ae25033410407..db8d0a8710471 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, row_parallel_weight_loader) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant @@ -421,10 +420,8 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 5f6025abf315c..82cd4a26a1baa 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -623,10 +622,8 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 397089f31cdf6..584981ef3ebfd 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -571,10 +570,8 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 
a3131aa3812ef..b7455fba62c02 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -12,7 +12,6 @@ from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig, from vllm.config import CacheConfig, VllmConfig from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -704,10 +703,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4c37622b049c8..30816f72a2678 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP, SupportsQuant @@ -355,10 +354,8 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 7a56236483749..79d648d749c6a 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -28,7 +28,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, row_parallel_weight_loader) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -1046,10 +1045,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) # Disallow image tokens which does not include special # begin-image and end-image tokens diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 1fc2da3e4d7ca..879508400222f 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -27,7 +27,6 @@ from 
vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig @@ -437,10 +436,8 @@ class ChatGLMBaseModel(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 179cc2af8eb3f..6d67eb68d51a8 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, @@ -478,7 +477,5 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 7f87e31abdcd3..f3929ef3b5938 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name, row_parallel_weight_loader) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -448,15 +447,14 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: is_not_lora = hasattr(self.model.embed_tokens, 'weight') if is_not_lora: logits = self.logits_processor(self.model.embed_tokens, - hidden_states, sampling_metadata) + hidden_states) else: logits = self.logits_processor(self.model.embed_tokens.base_layer, - hidden_states, sampling_metadata) + hidden_states) return logits diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 003cf4563a22f..f863b1da5505c 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from 
vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -462,10 +461,8 @@ class DbrxForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 59c9921881497..ffc843fe033cc 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -488,10 +487,8 @@ class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 2770ddebc48ab..ed7e7614800fc 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -19,7 +19,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer, DeepseekV3ForCausalLM) -from vllm.model_executor.sampling_metadata import SamplingMetadata from .utils import AutoWeightsLoader, maybe_prefix @@ -222,10 +221,8 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 8fbf16d206a86..92f311ab465b5 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -15,7 +15,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .deepseek_v2 import (DeepseekV2DecoderLayer, @@ -124,15 +123,13 @@ class DeepSeekMultiTokenPredictor(nn.Module): def compute_logits( self, hidden_states: 
torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> torch.Tensor: current_step_idx = (spec_step_idx % self.num_mtp_layers) mtp_layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)] logits = self.logits_processor(mtp_layer.shared_head.head, - mtp_layer.shared_head(hidden_states), - sampling_metadata) + mtp_layer.shared_head(hidden_states)) return logits @@ -161,11 +158,9 @@ class DeepSeekMTP(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> Optional[torch.Tensor]: - return self.model.compute_logits(hidden_states, sampling_metadata, - spec_step_idx) + return self.model.compute_logits(hidden_states, spec_step_idx) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 636554bd648f2..a99a6679a5696 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -56,7 +56,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import cdiv, direct_register_custom_op @@ -914,10 +913,8 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index d7ae8206baca5..c8ed759d2e972 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -15,7 +15,6 @@ from transformers import BatchFeature from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.transformers import replace_linear_class @@ -647,10 +646,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 20555e48b73d4..2a09234b59ed1 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -52,7 +52,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import 
SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -534,10 +533,8 @@ class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index ebab018ed67e7..d262e9e9da50e 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -591,10 +590,8 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 0d4aced93ca1c..74b358034ef3d 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -39,7 +39,6 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -1292,11 +1291,9 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: """compute logits""" - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def _vision_forward( self, diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 7f791852ceb91..f55016f7ccb36 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .ernie45_moe import Ernie4_5_MoeMLP @@ -587,10 +586,8 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - 
sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py index c446265230313..288fbe736c32f 100644 --- a/vllm/model_executor/models/ernie_mtp.py +++ b/vllm/model_executor/models/ernie_mtp.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -138,12 +137,10 @@ class ErnieMultiTokenPredictor(nn.Module): self, hidden_states: torch.Tensor, lm_head: ParallelLMHead, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> torch.Tensor: self.layers[str(self.mtp_start_layer_idx + spec_step_idx)] - logits = self.logits_processor(lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(lm_head, hidden_states) return logits @@ -180,11 +177,10 @@ class ErnieMTP(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> Optional[torch.Tensor]: return self.model.compute_logits(hidden_states, self.lm_head, - sampling_metadata, spec_step_idx) + spec_step_idx) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index f503fb0f9364a..5dafcd595e4a5 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -534,10 +533,8 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 9f7d57d938140..c78eedff66700 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -45,7 +45,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -517,10 +516,8 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = 
self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 42c378e5c389a..0c50056d1c527 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import RWConfig @@ -496,10 +495,8 @@ class FalconForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 757051b3b1447..83efdd2e433fc 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -33,7 +33,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP @@ -675,10 +674,8 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 90af859ab92ec..53e9e6fe6e460 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -29,7 +29,6 @@ from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor, from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.models.persimmon import PersimmonForCausalLM -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -389,10 +388,9 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: logits = self.language_model.logits_processor( - self.language_model.lm_head, hidden_states, sampling_metadata) + self.language_model.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 12eb27503870c..c19425b6cb6d6 
100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -412,10 +411,8 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.model.embed_tokens, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.model.embed_tokens, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 0bdb6c6bf7ae9..3f76e1e7d42a2 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -409,10 +408,8 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.model.embed_tokens, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.model.embed_tokens, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 7246308d59028..77c0ef8cb91d2 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from ...attention.layers.encoder_only_attention import EncoderOnlyAttention @@ -542,10 +541,8 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.model.embed_tokens, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.model.embed_tokens, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index bee9fbd2c084a..0630ee07c347e 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -14,7 +14,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import 
SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -704,10 +703,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index ffec3408702c9..f4d288fd887e9 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsQuant @@ -814,10 +813,8 @@ class Gemma3nForCausalLM(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: Optional[SamplingMetadata], ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.model.embed_tokens, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.model.embed_tokens, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 8d3079aee0dfb..2acdba54a257d 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -685,10 +684,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 5e2908a82c418..b9d5e24e9f6fa 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -40,7 +40,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -289,10 +288,8 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, 
SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 308b0cb602bc9..56ec634386909 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -52,7 +52,6 @@ from vllm.distributed import (get_tensor_model_parallel_world_size, parallel_state) from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -1654,10 +1653,8 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 1acbd18091fb3..947c6ce62f551 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -703,10 +702,8 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 322c5619c1783..c572978e62206 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .glm4_moe import Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name @@ -155,15 +154,13 @@ class Glm4MoeMultiTokenPredictor(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> torch.Tensor: current_step_idx = (spec_step_idx % self.num_mtp_layers) mtp_layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)] logits = self.logits_processor(mtp_layer.shared_head.head, - 
mtp_layer.shared_head(hidden_states), - sampling_metadata) + mtp_layer.shared_head(hidden_states)) return logits @@ -192,11 +189,9 @@ class Glm4MoeMTP(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> Optional[torch.Tensor]: - return self.model.compute_logits(hidden_states, sampling_metadata, - spec_step_idx) + return self.model.compute_logits(hidden_states, spec_step_idx) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 0f6521e44e6be..24274db148bd7 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from ..layers.pooler import DispatchPooler, Pooler @@ -307,10 +306,8 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 745d0b7759991..162018450e7c0 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -329,10 +328,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 77df6ae6f30c8..698387fab946c 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -329,10 +328,9 @@ class GPTJForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, - 
sampling_metadata, self.lm_head.bias) + self.lm_head.bias) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index e97db188e27eb..7570aefb6e96e 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -40,7 +40,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -321,10 +320,8 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.embed_out, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.embed_out, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index b49fd0d8f88af..4fe59f91124dd 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import cdiv @@ -670,10 +669,8 @@ class GptOssForCausalLM(nn.Module, SupportsPP): return self.model(input_ids, positions, intermediate_tensors, inputs_embeds) - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 4f9cc2532bd8c..795b38e724eab 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -463,11 +462,9 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds) return model_output - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def make_empty_intermediate_tensors( diff --git 
a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 221023f1fb657..a5849184339b1 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -37,7 +37,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -776,12 +775,8 @@ class GraniteSpeechForConditionalGeneration( def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits( - hidden_states, - sampling_metadata, - ) + return self.language_model.compute_logits(hidden_states) def load_weights( self, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index da16c72000c0e..07200fef4799d 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -511,11 +510,9 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds) return hidden_states - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def make_empty_intermediate_tensors( diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 79c6d8146ba9c..e89a1a4a0f7d3 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -672,10 +671,8 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 0b568a4b22685..a5d118f084e6c 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ 
b/vllm/model_executor/models/granitemoeshared.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .granitemoe import GraniteMoeAttention, GraniteMoeModel, GraniteMoeMoE @@ -311,11 +310,9 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds) return hidden_states - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def make_empty_intermediate_tensors( diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index a591134383371..996e41fe84ff3 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -528,10 +527,8 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 4110c8a1fd08d..8a23a6b45bc70 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -54,7 +54,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP @@ -1004,10 +1003,8 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def make_empty_intermediate_tensors( diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 870addd0dcbca..54167f9f10995 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -31,7 +31,6 @@ from transformers.modeling_utils import no_init_weights from vllm.config import VllmConfig from vllm.inputs 
import InputProcessingContext from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -962,10 +961,8 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights( self, diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 9153a0e2c1e5a..18446d126b51a 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -31,7 +31,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -738,10 +737,8 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 19a3ef1a3b800..8fdf70e35a2b8 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -13,11 +13,9 @@ from vllm.utils import supports_kw if TYPE_CHECKING: from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import Pooler - from vllm.model_executor.sampling_metadata import SamplingMetadata else: VllmConfig = Any Pooler = Any - SamplingMetadata = Any logger = init_logger(__name__) @@ -100,7 +98,6 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): def compute_logits( self, hidden_states: T, - sampling_metadata: SamplingMetadata, ) -> Optional[T]: """Return `None` if TP rank > 0.""" ... 
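The interfaces_base.py hunk above is the interface-level summary of this patch: `VllmModelForTextGeneration.compute_logits` now receives only the hidden states, and `SamplingMetadata` is no longer threaded through the models. A minimal sketch of a model-side implementation after this change follows; the `ToyForCausalLM` class and its constructor arguments are illustrative, not part of the patch, and the sketch assumes vLLM's tensor-parallel state is already initialized as it would be during a normal model load.

from typing import Optional

import torch
import torch.nn as nn

from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead


class ToyForCausalLM(nn.Module):
    # Illustrative model; real models also implement forward(), load_weights(), etc.

    def __init__(self, hidden_size: int, vocab_size: int) -> None:
        super().__init__()
        self.lm_head = ParallelLMHead(vocab_size, hidden_size)
        self.logits_processor = LogitsProcessor(vocab_size)

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> Optional[torch.Tensor]:
        # No SamplingMetadata argument anymore: the processor only needs the
        # LM head and the hidden states, and returns None on TP ranks > 0.
        return self.logits_processor(self.lm_head, hidden_states)

Wrapper models in the hunks that follow (multimodal wrappers, MTP/Eagle draft heads) delegate in the same way, e.g. `return self.language_model.compute_logits(hidden_states)`.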
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ce94328797ed6..221ff08b43843 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -29,7 +29,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -358,10 +357,8 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.output, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.output, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index b59d1b88cf5ce..ba72c288b2b12 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -21,7 +21,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.interns1_vit import InternS1VisionModel from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -812,10 +811,8 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 6a5c565b52e85..f4004e518e3ba 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -1399,10 +1398,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 4fee8c32fd581..0eb1578b43610 100644 --- 
a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -42,7 +42,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import JAISConfig @@ -332,10 +331,8 @@ class JAISLMHeadModel(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 5b8fbc7226866..12a49029195ff 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -32,7 +32,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaMLP as JambaMLP from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -581,10 +580,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index afe33b4d4ad26..2e5e276cc1c7d 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -21,7 +21,6 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -1556,10 +1555,8 @@ class BaseKeyeModule(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 94a5933a61416..f554077935bf3 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -67,7 +67,6 @@ from vllm.model_executor.models.interfaces import (SupportsMultiModal, SupportsPP) from vllm.model_executor.models.moonvit import MoonVitPretrainedModel from vllm.model_executor.models.utils import merge_multimodal_embeddings -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from 
vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -484,10 +483,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, return hidden_states def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, **kwargs) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata, **kwargs) + logits = self.logits_processor(self.lm_head, hidden_states, **kwargs) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 927f78c4e4b45..dd97afbeb668a 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -27,7 +27,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, @@ -542,10 +541,8 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f8ea2111fed57..1b03cbef501b3 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP @@ -601,10 +600,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 7027138dfcb17..fb10af6c53c90 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaForCausalLM) -from vllm.v1.sample.metadata import SamplingMetadata from .utils import AutoWeightsLoader, maybe_prefix @@ -244,10 +243,8 @@ class 
Eagle3LlamaForCausalLM(LlamaForCausalLM): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) if self.draft_id_to_target_id is None: assert logits.shape[1] == self.config.vocab_size, \ "Expected logits to have shape " \ diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 4f15e1b5762ea..e2d7b9f23b28a 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -760,10 +759,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index beb3c33100595..c9133fde14552 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -13,7 +13,6 @@ from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) from vllm.config import VllmConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.multimodal.parse import ImageSize @@ -563,10 +562,8 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens]. 
def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index cf9852de633f3..610fb188d57d2 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -13,7 +13,6 @@ from transformers import (BatchFeature, LlavaNextVideoConfig, from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.models.clip import CLIPVisionModel -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -464,10 +463,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 46d54452a52d8..cee9ddaf94cc4 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -14,7 +14,6 @@ from transformers.models.llava_onevision.modeling_llava_onevision import ( from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -934,10 +933,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 9d1017dac8aa1..36141a5d50641 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -26,7 +26,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsAttentionFree, SupportsPP) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -299,10 +298,8 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, 
hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index b1a4138cb8f6c..9c3108146d2e5 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -30,7 +30,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsAttentionFree) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -335,10 +334,8 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index b0a96fca2ff8a..0ae59dc8dfc23 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -104,12 +104,11 @@ class Medusa(nn.Module): def compute_logits( self, hidden_states: list[torch.Tensor], - sampling_metadata, ) -> list[torch.Tensor]: logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): - _logits = self.logits_processor(lm_head, hs, sampling_metadata) + _logits = self.logits_processor(lm_head, hs) if _logits is None: # _logits should only be None on rank > 0, in which case diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 140800dd41c76..82648ba668ca5 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -784,9 +783,8 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.decoder.compute_logits(hidden_states, sampling_metadata) + return self.decoder.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index ea5292d0df202..d256c1f3eed7b 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( 
default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2Model -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix @@ -183,9 +182,7 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: hidden_states = self.model.norm(hidden_states) - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index 09194e9f95d0e..b4abe458e4771 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2DecoderLayer -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .utils import maybe_prefix @@ -140,12 +139,10 @@ class MiMoMultiTokenPredictor(nn.Module): self, hidden_states: torch.Tensor, lm_head: ParallelLMHead, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> torch.Tensor: self.mtp_layers[str(self.mtp_start_layer_idx + spec_step_idx)] - logits = self.logits_processor(lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(lm_head, hidden_states) return logits @@ -178,11 +175,10 @@ class MiMoMTP(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> Optional[torch.Tensor]: return self.model.compute_logits(hidden_states, self.lm_head, - sampling_metadata, spec_step_idx) + spec_step_idx) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 240c23ea2b25d..0986ea07406a9 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -583,10 +582,8 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 848a97b8bb2a0..2af0d546ce63d 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py 
@@ -39,7 +39,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -376,10 +375,8 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 9b2d84e32151a..a17c4f004d75c 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -50,7 +50,6 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1194,9 +1193,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.llm.compute_logits(hidden_states, sampling_metadata) + return self.llm.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 6ce883be0a83c..1d2c7dea811e0 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import maybe_prefix -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid @@ -742,10 +741,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states.float(), - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states.float()) return logits diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index cc7db849a28bf..b2f020f3323e8 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -14,7 +14,6 @@ from vllm.model_executor.layers.activation import get_act_fn from 
vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.sequence import IntermediateTensors @@ -420,10 +419,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index d15776a39362d..94e3d7234b6f4 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -606,10 +605,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 8b3474d809532..bebf0b5adac52 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP @@ -594,10 +593,8 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 2f0e8a2a5e575..131a66b713235 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model from vllm.model_executor.model_loader.weight_utils import 
default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -856,10 +855,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def separate_weights( self, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 2475fe1316097..201bf83cac581 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -26,7 +26,6 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU, SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm @@ -1527,10 +1526,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 48ac91fa6dde0..64d669e8ac3e1 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -320,10 +319,8 @@ class MPTForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 4f8652c006941..ae50f1aefc6f7 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -37,7 +37,6 @@ from vllm.model_executor.models.utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, MultiModalKwargsItems, @@ -1192,10 +1191,8 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid, def compute_logits( self, hidden_states: 
torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): adapter_dict = dict(self.mlp1.named_parameters()) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 21f785e4b91af..6bb2f7392cb49 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -45,7 +45,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import NemotronConfig @@ -498,10 +497,8 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 1e1f0524bd063..ff571541a60a5 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -54,7 +54,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager, from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import NemotronHConfig from vllm.utils import LayerBlockType @@ -622,10 +621,8 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index f8e38dcd80b5a..d474c8db41b2f 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import HasNoOps, SupportsLoRA, SupportsPP @@ -468,10 +467,8 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = 
self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index acda2027401d9..3abbff8c717d4 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -26,7 +26,6 @@ from vllm.model_executor.models.internvl import ( BaseInternVLProcessingInfo, InternVLImageEmbeddingInputs, InternVLImageInputs, InternVLImagePixelInputs, InternVLProcessor) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import NestedTensors @@ -632,10 +631,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 7be3c16528b52..9fa8760073c15 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -45,7 +45,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -391,10 +390,8 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 3e4c580a11211..2e0b1fb2a13f7 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -54,7 +54,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Olmo3Config @@ -427,10 +426,8 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 892e967e4a21f..77ece544d4900 100644 --- 
a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -471,10 +470,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 365aab205b211..4c3ce9f61efb3 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -399,10 +398,8 @@ class OPTForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 944a9151d75d3..586fea343d6f9 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -28,7 +28,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -339,10 +338,8 @@ class OrionForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index f1bb18716b40d..052e143b27f6e 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -39,7 +39,6 @@ from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from 
vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -558,9 +557,8 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.llm.compute_logits(hidden_states, sampling_metadata) + logits = self.llm.compute_logits(hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 5e4758ef8ea5d..f18e38ce154d2 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -19,7 +19,6 @@ from vllm.model_executor.models.siglip2navit import Siglip2NavitModel from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -630,9 +629,8 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.llm.compute_logits(hidden_states, sampling_metadata) + logits = self.llm.compute_logits(hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index d6eec77ebcee5..aef5102304614 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -9,7 +9,6 @@ from transformers import BatchFeature, PaliGemmaConfig from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems, @@ -403,10 +402,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3e854e4d561ff..23fb7bb85215c 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -334,10 +333,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = 
self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 6f39afbecf35b..9cf288e850057 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -59,7 +59,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -346,10 +345,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata, self.lm_head.bias) + self.lm_head.bias) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4522c7043d01a..a2b201fe4228d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -29,7 +29,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -681,10 +680,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 25df9e9261d91..d2a3a8cc04969 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -27,7 +27,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1451,10 +1450,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py 
index aa7c434a44aeb..ae153558e37aa 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -29,7 +29,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, SupportsV0Only) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .utils import make_layers, maybe_prefix @@ -695,12 +694,10 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: processed_logits = self.logits_processor( self.lm_head, hidden_states, - sampling_metadata, self.embedding_bias, ) return processed_logits diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index b3fc55dab6eca..47b5ad55ab2d0 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -18,7 +18,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1257,10 +1256,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 01d16f1f2c387..3ce67ce37a7ab 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -667,10 +666,8 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 142d3251bc67a..7b197844c8b63 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalUUIDDict, NestedTensors) @@ -480,10 +479,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 9f1ee36366fdd..33ee1cf44afd1 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -52,7 +52,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager, from vllm.model_executor.models.utils import ( is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -1022,10 +1021,8 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 7470948499005..e0c08a6a88271 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -30,7 +30,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -282,10 +281,8 @@ class QWenBaseModel(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e13e87b93429d..c536b0f60c30d 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import is_interleaved @@ -510,10 +509,8 @@ class 
Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a7e71309b6074..5f27230c913b4 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -50,7 +50,6 @@ from vllm.model_executor.models.qwen2_5_vl import ( from vllm.model_executor.models.qwen2_audio import ( Qwen2AudioProcessingInfo, _get_feat_extract_output_lengths) from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, @@ -955,10 +954,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index dbf486374bcf3..73b27572a8ebd 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,7 +43,6 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm # yapf: disable @@ -1256,10 +1255,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index c797b71b5d2e1..762ab42e5929e 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -34,7 +34,6 @@ from transformers.models.qwen2_audio import (Qwen2AudioConfig, from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (AudioItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, @@ -481,10 +480,8 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + 
return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 6c6276a930453..6a9acaf2c3fe0 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -546,10 +545,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index dd4e7731e0b08..b3c42c2572566 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -46,7 +46,6 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -1527,10 +1526,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index dddb47048a1fc..ae72fd30c3993 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP @@ -328,10 +327,8 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 029309c49efd4..0661b3707ff44 
100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -54,7 +54,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP @@ -690,10 +689,8 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ce917f92bd2e5..24cebc5bfdd82 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -53,7 +53,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, sharded_weight_loader) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -1208,10 +1207,8 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + return self.logits_processor(self.lm_head, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index c755eeb9b4eaa..c054339842e64 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen3_next import (Qwen3NextDecoderLayer, Qwen3NextRMSNorm) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Qwen3NextConfig @@ -266,11 +265,9 @@ class Qwen3NextMTP(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> Optional[torch.Tensor]: - return self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + return self.logits_processor(self.lm_head, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index ca232e03767b1..aa28c07ddcebe 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -45,7 +45,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from 
vllm.distributed import get_pp_group from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -1493,10 +1492,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index e3c7c700f8fa1..a217c820fedf0 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -472,10 +471,8 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 9857ccdcbe2d4..893ce4497c319 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -22,7 +22,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -897,10 +896,8 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 94c862258b7ad..c774171b9dcd2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, 
SupportsPP @@ -495,10 +494,8 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds) return model_output - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 9e880ebd50813..e4dfe8d5a9a3b 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -42,7 +42,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -332,10 +331,8 @@ class StablelmForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 62ff9b6182755..7f379ab95a03d 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -339,10 +338,8 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 6a5b540fc8174..0cce0c78f8dc6 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -29,7 +29,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -405,10 +404,8 @@ class Step3TextForCausalLM(nn.Module, SupportsPP): inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = 
self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index c2940f8e44450..f667266b77bfa 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -23,7 +23,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1055,10 +1054,8 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index c66867315e553..67cf3ccf315d1 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -23,7 +23,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.llava import LlavaDummyInputsBuilder -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems @@ -638,10 +637,8 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 3bd4d10316ec6..475a68bc642b9 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalUUIDDict, @@ -798,10 +797,8 @@ class TransformersForCausalLM(TransformersBase): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + 
logits = self.logits_processor(self.lm_head, hidden_states) return logits diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index f1f11c5fe8f00..12ae9487ad9dc 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -18,7 +18,6 @@ from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.model_loader import DefaultModelLoader from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -616,10 +615,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): inputs_embeds=inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 16a97389cd21b..b33e8d09c4be1 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -30,7 +30,6 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys # yapf: disable from vllm.model_executor.models.whisper import WhisperEncoder # yapf: enable -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, MultiModalUUIDDict, @@ -454,10 +453,8 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) @classmethod def get_speech_to_text_config(cls, model_config: ModelConfig, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 41ae7b129782d..de3e4f0592a62 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -31,7 +31,6 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -936,10 +935,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, return WhisperAudioInputs(input_features=input_features) - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.proj_out, hidden_states, - sampling_metadata) + 
def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.proj_out, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index e601bc3adb6e9..4350e38e02f96 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid @@ -1036,7 +1035,6 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: """Compute logits for next token prediction. @@ -1047,8 +1045,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): Returns: Logits for next token prediction """ - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py deleted file mode 100644 index 8c4548ff7f7dc..0000000000000 --- a/vllm/model_executor/sampling_metadata.py +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - - -class SamplingMetadata: - # Placeholder until it can be safely removed. - pass diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 2a178ddf48777..5dacf60886966 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -239,7 +239,7 @@ class EagleProposer: else: last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] - logits = self.model.compute_logits(sample_hidden_states, None) + logits = self.model.compute_logits(sample_hidden_states) # Early exit if there is only one draft token to be generated. if self.num_speculative_tokens == 1: @@ -367,8 +367,7 @@ class EagleProposer: else: last_hidden_states, hidden_states = ret_hidden_states hidden_states = hidden_states[:batch_size] - logits = self.model.compute_logits(last_hidden_states[:batch_size], - None) + logits = self.model.compute_logits(last_hidden_states[:batch_size]) draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) @@ -678,9 +677,7 @@ class EagleProposer: # Get the output logits for the draft tokens. logits = self.model.compute_logits( draft_last_hidden_states.reshape(batch_size * level_num_drafts, - -1), - None, - ) + -1)) # Sample a draft token for each child at the next tree level. 
num_children = self.child_drafts_per_level[level + 1] diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 3e90179e78d99..70b29c05c2a50 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -41,7 +41,7 @@ class MedusaProposer: ) -> list[list[int]]: # Generate blocks and compute logits blocks = self.model(target_hidden_states) - logits = self.model.compute_logits(blocks, None) + logits = self.model.compute_logits(blocks) # Get draft tokens and transpose the result # TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0946e8c5d7d8..b0cd0f4133079 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2240,7 +2240,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return output sample_hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(sample_hidden_states, None) + logits = self.model.compute_logits(sample_hidden_states) else: # Rare case. assert not self.is_pooling_model @@ -2258,8 +2258,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits = None else: sample_hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(sample_hidden_states, - None) + logits = self.model.compute_logits(sample_hidden_states) model_output_broadcast_data = {} if logits is not None: @@ -2706,7 +2705,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_idx = self.input_batch.req_id_to_index[req_id] offset = self.query_start_loc.np[req_idx].item() prompt_hidden_states = hidden_states[offset:offset + num_logits] - logits = self.model.compute_logits(prompt_hidden_states, None) + logits = self.model.compute_logits(prompt_hidden_states) # Get the "target" tokens for each index. For prompt at index i, # the token at prompt index i+1 is the "sampled" token we want @@ -3105,7 +3104,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # To avoid breaking the sampler, we use a random tensor here instead. hidden_states = torch.rand_like(hidden_states) - logits = self.model.compute_logits(hidden_states, None) + logits = self.model.compute_logits(hidden_states) num_reqs = logits.size(0) dummy_tensors = lambda v: torch.full( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 48070c1e3e7cb..dd11b1dcbe94c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1692,7 +1692,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): @torch.compile(backend="openxla", fullgraph=True, dynamic=False) def compute_logits(self, sample_hidden_states: torch.Tensor) -> torch.Tensor: - return self.model.compute_logits(sample_hidden_states, None) + return self.model.compute_logits(sample_hidden_states) # TODO: Under SPMD mode, sample_from_logits has correctness issue. # Re-enable the torch.compile once the issue is fixed in torchxla. 
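Taken together, the hunks above converge every model's compute_logits on a single-argument signature and delete the now-empty SamplingMetadata placeholder module, so the model runners and spec-decode proposers call it with hidden states only. A minimal sketch of the resulting interface, using a hypothetical ToyForCausalLM stand-in rather than any real vLLM model:

    from typing import Optional

    import torch
    import torch.nn as nn


    class ToyForCausalLM(nn.Module):
        """Hypothetical stand-in illustrating the post-patch interface."""

        def __init__(self, hidden_size: int = 16, vocab_size: int = 32) -> None:
            super().__init__()
            self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
            # Stand-in for vLLM's LogitsProcessor: project hidden states to logits.
            self.logits_processor = (
                lambda lm_head, hidden_states: lm_head(hidden_states))

        def compute_logits(
            self,
            hidden_states: torch.Tensor,
        ) -> Optional[torch.Tensor]:
            # No SamplingMetadata argument anymore; only hidden states are passed.
            return self.logits_processor(self.lm_head, hidden_states)


    # Caller side, mirroring gpu_model_runner.py and eagle.py after the change:
    model = ToyForCausalLM()
    sample_hidden_states = torch.randn(4, 16)
    logits = model.compute_logits(sample_hidden_states)
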
From af7dfb0d1a95fb097a93de156ddd6c1eb5c72796 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 22 Sep 2025 04:12:45 +0800 Subject: [PATCH 080/185] [Perf] Further optimization for Qwen3-VL `fast_pos_embed_interpolate` (#25347) Signed-off-by: Isotr0py --- vllm/model_executor/models/qwen3_vl.py | 46 +++++++++++++++++--------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index aa28c07ddcebe..98d65dea27393 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -405,25 +405,39 @@ class Qwen3_VisionTransformer(nn.Module): dh = h_idxs - h_floor dw = w_idxs - w_floor - w00 = ((1 - dh)[:, None] * (1 - dw)[None, :]).reshape(-1) - w01 = ((1 - dh)[:, None] * dw[None, :]).reshape(-1) - w10 = (dh[:, None] * (1 - dw)[None, :]).reshape(-1) - w11 = (dh[:, None] * dw[None, :]).reshape(-1) + # Create meshgrid view for all h, w vars + dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing='ij') + h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, + w_floor, + indexing='ij') + h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, + w_ceil, + indexing='ij') + h_floor_grid_idx = h_floor_grid * num_grid_per_side + h_ceil_grid_idx = h_ceil_grid * num_grid_per_side - idx00 = (h_floor[:, None] * num_grid_per_side + - w_floor[None, :]).reshape(-1) - idx01 = (h_floor[:, None] * num_grid_per_side + - w_ceil[None, :]).reshape(-1) - idx10 = (h_ceil[:, None] * num_grid_per_side + - w_floor[None, :]).reshape(-1) - idx11 = (h_ceil[:, None] * num_grid_per_side + - w_ceil[None, :]).reshape(-1) + # original computation of weights + # w00 = (1 - dh_grid) * (1 - dw_grid) + # w01 = (1 - dh_grid) * dw_grid + # w10 = dh_grid * (1 - dw_grid) + # w11 = dh_grid * dw_grid + # we reuse w11 here to avoid duplicate + # dh_grid * dw_grid computation + w11 = dh_grid * dw_grid + w10 = dh_grid - w11 + w01 = dw_grid - w11 + w00 = 1 - dh_grid - dw_grid + w11 - indices = torch.stack([idx00, idx01, idx10, idx11], dim=0) + idx00 = h_floor_grid_idx + w_floor_grid + idx01 = h_floor_grid_idx + w_ceil_grid + idx10 = h_ceil_grid_idx + w_floor_grid + idx11 = h_ceil_grid_idx + w_ceil_grid + + indices = torch.stack([idx00, idx01, idx10, idx11], + dim=0).reshape(4, -1) weights = torch.stack([w00, w01, w10, w11], - dim=0).to(dtype=self.dtype, - device=self.device) - weights = weights.unsqueeze(-1) + dim=0).reshape(4, -1, 1) + weights = weights.to(dtype=self.dtype, device=self.device) embeds = self.pos_embed(indices) weighted_embeds = embeds * weights From bc6e542d9fc400d7002853b3878c81b14fbdc998 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 21 Sep 2025 16:03:28 -0700 Subject: [PATCH 081/185] Remove V0 attention backends (#25351) Signed-off-by: Woosuk Kwon --- examples/offline_inference/qwen_1m.py | 1 - tests/compile/test_fusion_attn.py | 5 +- tests/kernels/attention/test_attention.py | 6 +- .../attention/test_attention_selector.py | 1 + .../kernels/attention/test_prefix_prefill.py | 6 +- .../attention/test_rocm_attention_selector.py | 1 + tests/kernels/utils.py | 66 +- tests/models/test_initialization.py | 5 +- .../backends/differential_flash_attn.py | 931 ---------- .../backends/dual_chunk_flash_attn.py | 1495 ----------------- vllm/attention/backends/flash_attn.py | 929 ---------- vllm/attention/backends/flashmla.py | 227 --- vllm/attention/backends/mla/__init__.py | 0 vllm/attention/backends/mla/common.py | 1305 -------------- vllm/attention/backends/rocm_aiter_mla.py | 407 ----- 
vllm/attention/backends/rocm_flash_attn.py | 953 ----------- vllm/attention/backends/triton_mla.py | 111 -- vllm/attention/backends/utils.py | 14 +- vllm/attention/backends/xformers.py | 805 --------- vllm/config/model.py | 7 +- .../kv_transfer/kv_connector/utils.py | 2 +- vllm/engine/arg_utils.py | 15 +- vllm/envs.py | 1 - .../layers/mamba/mamba2_metadata.py | 19 +- vllm/model_executor/models/deepseek_v2.py | 3 +- vllm/platforms/cuda.py | 139 +- vllm/platforms/rocm.py | 63 +- vllm/utils/__init__.py | 2 - 28 files changed, 143 insertions(+), 7376 deletions(-) delete mode 100644 vllm/attention/backends/differential_flash_attn.py delete mode 100644 vllm/attention/backends/dual_chunk_flash_attn.py delete mode 100755 vllm/attention/backends/flash_attn.py delete mode 100644 vllm/attention/backends/flashmla.py delete mode 100644 vllm/attention/backends/mla/__init__.py delete mode 100644 vllm/attention/backends/mla/common.py delete mode 100644 vllm/attention/backends/rocm_aiter_mla.py delete mode 100644 vllm/attention/backends/rocm_flash_attn.py delete mode 100644 vllm/attention/backends/triton_mla.py delete mode 100644 vllm/attention/backends/xformers.py diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py index d8d61667f688b..c8d0d91ce7b5c 100644 --- a/examples/offline_inference/qwen_1m.py +++ b/examples/offline_inference/qwen_1m.py @@ -5,7 +5,6 @@ from urllib.request import urlopen from vllm import LLM, SamplingParams -os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN" os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1" diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index b6bebbba915be..c3f1c7481d1b3 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -334,8 +334,9 @@ else: [7, 256, 533] if current_platform.is_cuda() else [8]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("model_name, model_class", MODELS) -@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if - current_platform.is_cuda() else [_Backend.ROCM_FLASH]) +@pytest.mark.parametrize("backend", + [_Backend.FLASHINFER] if current_platform.is_cuda() + else [_Backend.TRITON_ATTN_VLLM_V1]) @pytest.mark.parametrize( "split_attention", [False, True] if current_platform.is_rocm() else [False]) diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 7083661575ef2..c7abf652f111b 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -18,7 +18,7 @@ if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask - from vllm.attention.backends.xformers import _make_alibi_bias + from tests.kernels.utils import make_alibi_bias FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. @@ -429,8 +429,8 @@ def test_multi_query_kv_attention( alibi_bias = None if use_alibi: alibi_slopes = torch.randn(num_query_heads, dtype=torch.float) - attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, - seq_lens) + attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, + seq_lens) output = torch.empty_like(query) start = 0 # Dynamic sequence length not supported with custom attn_bias. 
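(For reference, the kernel tests above now build ALiBi biases through the shared make_alibi_bias helper added to tests/kernels/utils.py later in this patch, instead of importing the private _make_alibi_bias from the deleted V0 xFormers backend. A minimal usage sketch; head counts, sequence lengths, and dtype are illustrative only:

    import torch

    from tests.kernels.utils import make_alibi_bias

    num_query_heads, num_kv_heads = 8, 8
    alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
    seq_lens = [17, 33]
    # Returns one bias per sequence (LowerTriangularMaskWithTensorBias), which the
    # tests pass to the xFormers attention call as attn_bias.
    attn_biases = make_alibi_bias(alibi_slopes, num_kv_heads, torch.float16, seq_lens)
)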
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index f8454ad0a4c4d..38ab40f88ae0b 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -67,6 +67,7 @@ def generate_params(): return params +@pytest.mark.skip(reason="Skipped for now. Should be revisited.") @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) def test_env( diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 8544eab3acccd..0695f84aea1af 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -11,7 +11,7 @@ import torch from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask -from vllm.attention.backends.xformers import _make_alibi_bias +from tests.kernels.utils import make_alibi_bias from vllm.attention.ops.chunked_prefill_paged_decode import ( chunked_prefill_paged_decode) from vllm.attention.ops.prefix_prefill import context_attention_fwd @@ -470,7 +470,7 @@ def test_contexted_kv_attention_alibi( key = key.unsqueeze(0) value = value.unsqueeze(0) - attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) + attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) output_ref = torch.empty_like(output) seq_start = 0 query_start = 0 @@ -479,7 +479,7 @@ def test_contexted_kv_attention_alibi( # FIXME(DefTruth): Because xformers does not support dynamic sequence # lengths with custom attention bias, we process each prompt one by # one. This is inefficient, especially when we have many short prompts. - # modified from: vllm/attention/backends/xformers.py#L343 + # modified from: vllm/v1/attention/backends/xformers.py#L343 for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)): seq_end = seq_start + seq_len query_end = query_start + query_len diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index d56d3f4638f1c..af301d9de4350 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -16,6 +16,7 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() +@pytest.mark.skip(reason="Skipped for now. Should be revisited.") def test_selector(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index c9bf85f6e2a5c..8d6ce381976b5 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -513,10 +513,6 @@ def make_backend(backend_name: str) -> AttentionBackend: Construct the backend instance determined by the backend_name string argument. - "XFORMERS" -> construct xformers backend - - TODO: other backends - Note: at time of writing the Attention wrapper automatically selects its own backend for Attention.forward(); so the backend instance which you generate with this function is not meant to be used for *running* @@ -528,18 +524,68 @@ def make_backend(backend_name: str) -> AttentionBackend: * Backend instance ''' - if backend_name == STR_XFORMERS_ATTN_VAL: - # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. 
- from vllm.attention.backends.xformers import XFormersBackend - return XFormersBackend() - elif backend_name == STR_FLASH_ATTN_VAL: - from vllm.attention.backends.flash_attn import FlashAttentionBackend + if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"): + from vllm.v1.attention.backends.xformers import ( + XFormersAttentionBackend) + return XFormersAttentionBackend() + if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"): + from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend return FlashAttentionBackend() + if backend_name == "TRITON_ATTN_VLLM_V1": + from vllm.v1.attention.backends.triton_attn import ( + TritonAttentionBackend) + return TritonAttentionBackend() + if backend_name == "FLEX_ATTENTION": + from vllm.v1.attention.backends.flex_attention import ( + FlexAttentionBackend) + return FlexAttentionBackend() + if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"): + from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend + return TorchSDPABackend() + if backend_name == "FLASHINFER": + from vllm.v1.attention.backends.flashinfer import FlashInferBackend + return FlashInferBackend() raise AssertionError( f"Unrecognized backend_name {backend_name} for unit test") +def make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + seq_lens: list[int], +) -> list[Any]: + """Create ALiBi biases compatible with xFormers attention tests.""" + from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias + + if alibi_slopes is None: + return [None for _ in seq_lens] + + attn_biases: list[Any] = [] + num_heads = alibi_slopes.shape[0] + assert num_heads >= num_kv_heads, ( + "ALiBi slopes expect at least as many heads as KV heads") + + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + bias_tensor = torch.empty( + 1, + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias_tensor.mul_(alibi_slopes[:, None, None]) + attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor)) + + return attn_biases + + def _make_metadata_tensors( seq_lens: Optional[list[int]], context_lens: Optional[list[int]], diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index b9601114a3183..bfde6e20a3b17 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -78,9 +78,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, return if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"): - # Phi4FlashForCausalLM and MotifForCausalLM - # only supports DIFFERENTIAL_FLASH_ATTN backend - m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN") + pytest.skip( + "Differential Flash Attention backend has been removed.") if model_arch == "GptOssForCausalLM": # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU # has cc==8.9 which hasn't supported FA3 yet. 
Remove this hack when diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py deleted file mode 100644 index 87a4558e377d0..0000000000000 --- a/vllm/attention/backends/differential_flash_attn.py +++ /dev/null @@ -1,931 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""" An implementation of https://arxiv.org/pdf/2410.05258 """ -from collections import defaultdict -from dataclasses import dataclass -from itertools import accumulate -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -from einops import rearrange - -from vllm import _custom_ops as ops -# yapf conflicts with isort for this block -# yapf: disable -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.flash_attn import FlashAttentionBackend -# yapf: enable -from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, - compute_slot_mapping, - compute_slot_mapping_start_idx, - is_all_cross_attn_metadata_set, - is_all_encoder_attn_metadata_set, - is_block_tables_empty) -from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, - get_flash_attn_version) -from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap -from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) - -logger = init_logger(__name__) - - -class DifferentialFlashAttentionBackend(AttentionBackend): - accept_output_buffer = False - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - if block_size % 16 != 0: - raise ValueError("Block size must be a multiple of 16.") - assert num_kv_heads % 2 == 0, "num_kv_heads must be divisible by 2" - return (2, 2, num_blocks, block_size, num_kv_heads // 2, head_size) - - @staticmethod - def get_name() -> str: - return "DIFFERENTIAL_FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["DifferentialFlashAttentionImpl"]: - return DifferentialFlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["DifferentialFlashAttentionMetadata"]: - return DifferentialFlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["DifferentialFlashAttentionMetadataBuilder"]: - return DifferentialFlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - src_key_cache = src_kv_cache[0] - dst_key_cache = dst_kv_cache[0] - ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - src_value_cache = src_kv_cache[1] - dst_value_cache = dst_kv_cache[1] - ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - - ops.copy_blocks(key_caches, value_caches, src_to_dists) - - -@dataclass -class 
DifferentialFlashAttentionMetadata(AttentionMetadata): - """Metadata for FlashAttentionBackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - - use_cuda_graph: bool - - # Maximum query length in the batch. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional[ - "DifferentialFlashAttentionMetadata"] = None - _cached_decode_metadata: Optional[ - "DifferentialFlashAttentionMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. 
- encoder_seq_start_loc: Optional[torch.Tensor] = None - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - # Cross-layer shared attention block tables - cross_layer_shared_block_tables: Optional[torch.Tensor] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata( - self) -> Optional["DifferentialFlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - cross_layer_shared_block_tables = ( - None if self.cross_layer_shared_block_tables is None else - self.cross_layer_shared_block_tables[:self.num_prefills]) - - self._cached_prefill_metadata = DifferentialFlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_query_len=0, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - cross_layer_shared_block_tables=cross_layer_shared_block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata( - self) -> Optional["DifferentialFlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - cross_layer_shared_block_tables = ( - None if self.cross_layer_shared_block_tables is None else - self.cross_layer_shared_block_tables[self.num_prefills:]) - self._cached_decode_metadata = DifferentialFlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - # Batch may be composed of prefill|decodes, adjust query start - # indices to refer to the start of decodes. E.g. - # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - cross_layer_shared_block_tables=cross_layer_shared_block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_decode_metadata - - -class DifferentialFlashAttentionMetadataBuilder( - AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]): - - def __init__(self, input_builder): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.cross_layer_shared_block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - self.has_prefix_cache_hit = False - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. - """ - # TODO: add support for chunked prefill and prefix caching. - assert not chunked_prefill_enabled, \ - "chunked prefill is not supported for now" - assert not prefix_cache_hit, "prefix caching is not supported for now" - - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - - if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. 
- block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - cross_layer_shared_block_table = [] - if prefix_cache_hit: - cross_layer_shared_block_table = block_tables[seq_id] - elif block_tables is not None: - if curr_sliding_window_block == 0: - cross_layer_shared_block_table = block_tables[seq_id] - else: - cross_layer_shared_block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.cross_layer_shared_block_tables.append( - cross_layer_shared_block_table) - - # Compute slot mapping. - is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - def _get_graph_runner_block_tables(self, num_seqs: int, - block_tables: List[List[int]], - graph_block_tables) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - # max_batch_size, max_blocks = self.runner.graph_block_tables.shape - max_batch_size, max_blocks = graph_block_tables.shape - assert max_batch_size >= num_seqs - - # graph_block_tables = self.runner.graph_block_tables[:num_seqs] - graph_block_tables = graph_block_tables[:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. 
- """ - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled, - prefix_cache_hit) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - num_seqs = len(seq_lens) - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend([] * cuda_graph_pad_size) - - self.cross_layer_shared_block_tables.extend([] * - cuda_graph_pad_size) - - num_decode_tokens = batch_size - self.num_prefill_tokens - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables, self.runner.graph_block_tables) - cross_layer_shared_block_tables = \ - self._get_graph_runner_block_tables( - num_seqs, self.cross_layer_shared_block_tables, - self.runner.cross_layer_shared_graph_block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - cross_layer_shared_block_tables = make_tensor_with_pad( - self.cross_layer_shared_block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - - return DifferentialFlashAttentionMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - cross_layer_shared_block_tables=cross_layer_shared_block_tables, - use_cuda_graph=use_captured_graph, - ) - - -class DifferentialFlashAttentionImpl(AttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - 
|<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - differential_flash_attention_config: Optional[Dict[str, Any]] = None, - ) -> None: - if differential_flash_attention_config is None: - differential_flash_attention_config = {} - self.differential_flash_attention_config = \ - differential_flash_attention_config - self.used_shared_kv_cache = kv_sharing_target_layer_name is not None - self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - if use_irope: - logger.warning( - "Using irope in V0 is not supported yet, it will fall back " - "to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window - 1, - 0) if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - self.vllm_flash_attn_version = get_flash_attn_version( - requires_alibi=self.alibi_slopes is not None) - if is_quantized_kv_cache(self.kv_cache_dtype) and ( - not self.kv_cache_dtype.startswith("fp8") - or not flash_attn_supports_fp8()): - raise NotImplementedError( - f"FlashAttention does not support {self.kv_cache_dtype} " - "kv-cache on this device " - f"(FA supports fp8 = {flash_attn_supports_fp8()}).") - if logits_soft_cap is None: - # In flash-attn, setting logits_soft_cap as 0 means no soft cap. - logits_soft_cap = 0 - self.logits_soft_cap = logits_soft_cap - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() - if head_size not in support_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") - self.attn_type = attn_type - - self.lambda_full = None - self.subln = self.differential_flash_attention_config["subln"] - - def split_heads(self, x): - # split by num_heads, the stripe pattern is friendly to tensor parallel. - x = rearrange(x, "... (H two) D -> ... H two D", two=2) - x1 = x[..., 0, :] - x2 = x[..., 1, :] - return x1.contiguous(), x2.contiguous() - - def split_kv_cache(self, x): - # split by num_heads, the stripe pattern is friendly to tensor parallel. 
- if x.numel() == 0: - return torch.empty(0), torch.empty(0) - - x1, x2 = x[0], x[1] - return x1, x2 - - def populate_kv_cache(self, layer: AttentionLayer, key: torch.Tensor, - value: torch.Tensor, kv_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata): - if kv_cache.numel() > 0 and key is not None and value is not None: - updated_slot_mapping = attn_metadata.slot_mapping - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - updated_slot_mapping.flatten(), - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - def forward_generate_kv_cache( - self, query: torch.Tensor, key: Optional[torch.Tensor], - value: Optional[torch.Tensor], k_cache: torch.Tensor, - v_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata) -> torch.Tensor: - - head_size = self.head_size - num_heads = self.num_heads // 2 - num_kv_heads = self.num_kv_heads // 2 - - query = query.view(-1, num_heads, head_size) - if key is not None: - assert value is not None - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - else: - assert value is None - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[ - 0] == num_prefill_tokens + num_decode_tokens, "key shape mismatch" - assert value.shape[ - 0] == num_prefill_tokens + num_decode_tokens, "value shape mismatch" - - output = torch.empty_like(query) - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] - # QKV for prefill. - query = query[:num_prefill_tokens] - if key is not None and value is not None: - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - assert query.shape[0] == num_prefill_tokens, "query shape mismatch" - assert decode_query.shape[ - 0] == num_decode_tokens, "decode query shape mismatch" - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if k_cache.numel() == 0 \ - or prefill_meta.block_tables is None \ - or prefill_meta.block_tables.numel() == 0: - # normal attention - prefill_output = flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - ) - assert prefill_output.shape == output[: - num_prefill_tokens].shape - output[:num_prefill_tokens] = prefill_output - else: - raise Exception("prefix caching not supported") - - if decode_meta := attn_metadata.decode_metadata: - block_tables_arg = decode_meta.block_tables - try: - output[num_prefill_tokens:] = flash_attn_with_kvcache( - q=decode_query.unsqueeze(1), - k_cache=k_cache, - v_cache=v_cache, - block_table=block_tables_arg, - cache_seqlens=decode_meta.seq_lens_tensor, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - ).squeeze(1) - except Exception as e: - logger.error("Error in PagedAttention.forward_decode: %s", - str(e)) - raise e - - # Reshape the output tensor. 
- return output.view(-1, num_heads, head_size) - - def forward_with_kv_cache_only( - self, - query: torch.Tensor, - k_cache: torch.Tensor, - v_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata, - ): - if not attn_metadata.decode_metadata: - block_tables_arg = attn_metadata.cross_layer_shared_block_tables - else: - block_tables_arg = attn_metadata.block_tables - - output = flash_attn_with_kvcache( - q=query.unsqueeze(1), - k_cache=k_cache, - v_cache=v_cache, - block_table=block_tables_arg, - cache_seqlens=attn_metadata.seq_lens_tensor, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - ).squeeze(1) - return output - - def forward( - self, - layer: AttentionLayer, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention. - - Args: - layer: Attention layer instance. - q: Query tensor with shape = [num_tokens, num_heads, head_size] - k: Key tensor with shape = [num_tokens, num_kv_heads, head_size] - v: Value tensor with shape = [num_tokens, num_kv_heads, head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size, num_kv_heads, head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - output: Output tensor with shape [num_tokens, num_heads, head_size] - output_scale: Optional output scale tensor. - output_block_scale: Optional output block scale tensor. - NOTE: It in-place updates the output tensor. - NOTE: FP8 quantization, flash-attn expect the size of - {q,k,v}_descale to be (num_sequences, num_kv_heads). - We use torch's .expand() to avoid duplicating values - """ - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for DifferentialFlashAttentionImpl") - - if self.lambda_full is None: - self.lambda_init = self.differential_flash_attention_config[ - "lambda_init"] - lambda_q1 = self.differential_flash_attention_config["lambda_q1"] - lambda_k1 = self.differential_flash_attention_config["lambda_k1"] - lambda_q2 = self.differential_flash_attention_config["lambda_q2"] - lambda_k2 = self.differential_flash_attention_config["lambda_k2"] - lambda_1 = torch.exp( - torch.sum(lambda_q1 * lambda_k1, dim=-1).float()).type_as(q) - lambda_2 = torch.exp( - torch.sum(lambda_q2 * lambda_k2, dim=-1).float()).type_as(q) - self.lambda_full = lambda_1 - lambda_2 + self.lambda_init - - if not self.used_shared_kv_cache: # need to generate kv-cache - q = q.view(-1, self.num_heads, self.head_size) - k = k.view(-1, self.num_kv_heads, self.head_size) - v = v.view(-1, self.num_kv_heads, self.head_size) - - q1, q2 = self.split_heads(q) - k1, k2 = self.split_heads(k) - v1, v2 = self.split_heads(v) - - # kv_cache shape is (2, 2, num_blocks, block_size, num_kv_heads // 2, head_size) # noqa: E501 - # Split by half along the first dimension. 
- kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache) - assert kv_cache1.is_contiguous(), "kv_cache1 is not contiguous" - assert kv_cache2.is_contiguous(), "kv_cache2 is not contiguous" - - if kv_cache1.numel() != 0: - self.populate_kv_cache(layer, k1, v1, kv_cache1, attn_metadata) - self.populate_kv_cache(layer, k2, v2, kv_cache2, attn_metadata) - - key_cache1, value_cache1 = self.split_kv_cache(kv_cache1) - key_cache2, value_cache2 = self.split_kv_cache(kv_cache2) - else: - key_cache1, value_cache1 = torch.empty(0), torch.empty(0) - key_cache2, value_cache2 = torch.empty(0), torch.empty(0) - attn11 = self.forward_generate_kv_cache(q1, k1, v1, key_cache1, - value_cache1, - attn_metadata) - attn12 = self.forward_generate_kv_cache(q1, k1, v2, key_cache1, - value_cache2, - attn_metadata) - attn11 = attn11.view(q1.shape) - attn12 = attn12.view(q1.shape) - attn1 = torch.cat([attn11, attn12], dim=-1) - - attn21 = self.forward_generate_kv_cache(q2, k2, v1, key_cache2, - value_cache1, - attn_metadata) - attn22 = self.forward_generate_kv_cache(q2, k2, v2, key_cache2, - value_cache2, - attn_metadata) - attn21 = attn21.view(q2.shape) - attn22 = attn22.view(q2.shape) - attn2 = torch.cat([attn21, attn22], dim=-1) - - attn = attn1 - self.lambda_full * attn2 - # attn shape (-1, self.num_heads // 2, 2 * self.head_dim) - attn = self.subln(attn) - attn = attn * (1 - self.lambda_init) - # reshape back to 2 * num_head - attn_output = rearrange(attn, - "... H (two D) -> ... (H two) D", - two=2) - - else: # reuse the kv cache, full attention - q = q.view(-1, self.num_heads, self.head_size) - q1, q2 = self.split_heads(q) - # kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501 - kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache) - key_cache1, value_cache1 = kv_cache1[0], kv_cache1[1] - key_cache2, value_cache2 = kv_cache2[0], kv_cache2[1] - - attn11 = self.forward_with_kv_cache_only(q1, key_cache1, - value_cache1, - attn_metadata) - attn12 = self.forward_with_kv_cache_only(q1, key_cache1, - value_cache2, - attn_metadata) - attn11 = attn11.view(q1.shape) - attn12 = attn12.view(q1.shape) - attn1 = torch.cat([attn11, attn12], dim=-1) - - attn21 = self.forward_with_kv_cache_only(q2, key_cache2, - value_cache1, - attn_metadata) - attn22 = self.forward_with_kv_cache_only(q2, key_cache2, - value_cache2, - attn_metadata) - attn21 = attn21.view(q2.shape) - attn22 = attn22.view(q2.shape) - attn2 = torch.cat([attn21, attn22], dim=-1) - - attn = attn1 - self.lambda_full * attn2 - attn = self.subln(attn) - attn = attn * (1 - self.lambda_init) - # reshape back to 2 * num_head - attn_output = rearrange(attn, - "... H (two D) -> ... (H two) D", - two=2) - attn_output = attn_output.view(-1, self.num_heads * self.head_size) - return attn_output diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py deleted file mode 100644 index de47bb8ebd8fb..0000000000000 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ /dev/null @@ -1,1495 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with Dual chunk flash attention and sparse attention. 
-""" -import math -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -import torch.distributed -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import AttentionLayer, AttentionType -from vllm.attention.backends.flash_attn import (FlashAttentionBackend, - FlashAttentionImpl, - FlashAttentionMetadata, - FlashAttentionMetadataBuilder) -from vllm.distributed.parallel_state import get_tensor_model_parallel_rank -from vllm.logger import init_logger -from vllm.utils import async_tensor_h2d -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache, sparse_attn_func) - -logger = init_logger(__name__) - - -class DualChunkFlashAttentionBackend(FlashAttentionBackend): - - accept_output_buffer: bool = False - - @staticmethod - def get_name() -> str: - return "DUAL_CHUNK_FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["DualChunkFlashAttentionImpl"]: - return DualChunkFlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["DualChunkFlashAttentionMetadata"]: - return DualChunkFlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["DualChunkFlashAttentionMetadataBuilder"]: - return DualChunkFlashAttentionMetadataBuilder - - -@dataclass -class DualChunkFlashAttentionMetadata(FlashAttentionMetadata): - # Block size of the paged kv cache. - block_size: int = 16 - - # Original max position embeddings. - original_max_position_embeddings: int = 0 - - # Chunk size - chunk_size: int = 8192 - - # Local size - local_size: int = 1024 - - # (batch_size,). The orig sequence length per sequence. - orig_seq_lens: Optional[List[int]] = None - - # orig_seq_lens stored as a tensor. - orig_seq_lens_tensor: Optional[torch.Tensor] = None - - # Length scaling factor - scaling_factor: Optional[torch.Tensor] = None - - # (batch_size,). Sequence lengths for intra attention. - seq_lens_intra: Optional[torch.Tensor] = None - - # Max sequence length for intra attention. - max_seq_len_intra: Optional[int] = None - - # (batch_size, num_blocks). Block table for intra attention. - block_tables_intra: Optional[torch.Tensor] = None - - # (batch_size,). Sequence lengths for succ attention. - seq_lens_succ: Optional[torch.Tensor] = None - - # Max sequence length for succ attention. - max_seq_len_succ: Optional[int] = None - - # (batch_size, num_blocks). Block table for succ attention. - block_tables_succ: Optional[torch.Tensor] = None - - # (batch_size,). Sequence lengths for inter attention. - seq_lens_inter: Optional[torch.Tensor] = None - - # Max sequence length for inter attention. 
- max_seq_len_inter: Optional[int] = None - - _cached_prefill_metadata: Optional[ - "DualChunkFlashAttentionMetadata"] = None - _cached_decode_metadata: Optional["DualChunkFlashAttentionMetadata"] = None - - @property - def prefill_metadata(self) -> Optional["DualChunkFlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - prefill_metadata = super().prefill_metadata - if prefill_metadata is None: - return None - - prefill_metadata = DualChunkFlashAttentionMetadata( - **prefill_metadata.asdict_zerocopy()) - - prefill_metadata.orig_seq_lens = ( - None if self.orig_seq_lens is None else - self.orig_seq_lens[:self.num_prefills]) - prefill_metadata.orig_seq_lens_tensor = ( - None if self.orig_seq_lens_tensor is None else - self.orig_seq_lens_tensor[:self.num_prefills]) - - if self.original_max_position_embeddings > 0: - assert prefill_metadata.orig_seq_lens_tensor is not None - prefill_metadata.scaling_factor = ( - 0.1 * torch.log(prefill_metadata.orig_seq_lens_tensor / - self.original_max_position_embeddings) + - 1.0).clip(min=1) - - self._cached_prefill_metadata = prefill_metadata - return prefill_metadata - - @property - def decode_metadata(self) -> Optional["DualChunkFlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - - decode_metadata = super().decode_metadata - if decode_metadata is None: - return None - - decode_metadata = DualChunkFlashAttentionMetadata( - **decode_metadata.asdict_zerocopy()) - - decode_metadata.orig_seq_lens_tensor = ( - None if self.orig_seq_lens_tensor is None else - self.orig_seq_lens_tensor[self.num_prefills:]) - - assert decode_metadata.orig_seq_lens_tensor is not None - assert decode_metadata.block_tables is not None - - cache_seq_lens = decode_metadata.orig_seq_lens_tensor - chunk_len = self.chunk_size - self.local_size - chunk_num_curr = (cache_seq_lens - 1) // chunk_len - batch_size = decode_metadata.num_decode_tokens - - if self.original_max_position_embeddings > 0: - decode_metadata.scaling_factor = (0.1 * torch.log( - cache_seq_lens / self.original_max_position_embeddings) + - 1.0).clip(min=1) - - seq_lens_intra = cache_seq_lens - chunk_num_curr * chunk_len - max_seq_len_intra = seq_lens_intra.max().item() - decode_metadata.seq_lens_intra = seq_lens_intra - decode_metadata.max_seq_len_intra = max_seq_len_intra - - block_tables_intra = torch.zeros( - batch_size, - (max_seq_len_intra - 1) // self.block_size + 1, - dtype=decode_metadata.block_tables.dtype, - device=decode_metadata.block_tables.device, - ) - for i in range(batch_size): - st = chunk_num_curr[i] * chunk_len // self.block_size - ed = min( - st + (max_seq_len_intra - 1) // self.block_size + 1, - (cache_seq_lens[i] - 1) // self.block_size + 1, - ) - block_tables_intra[i, :ed - - st] = decode_metadata.block_tables[i, st:ed] - decode_metadata.block_tables_intra = block_tables_intra - - seq_lens_succ = (chunk_num_curr - - (chunk_num_curr - 1).clip(min=0)) * chunk_len - max_seq_len_succ = seq_lens_succ.max().item() - decode_metadata.seq_lens_succ = seq_lens_succ - decode_metadata.max_seq_len_succ = max_seq_len_succ - if max_seq_len_succ: - block_tables_succ = torch.zeros( - batch_size, - (max_seq_len_succ - 1) // self.block_size + 1, - dtype=decode_metadata.block_tables.dtype, - device=decode_metadata.block_tables.device, - ) - for i in range(batch_size): - start = ((chunk_num_curr[i] - 
1).clip(min=0) * chunk_len // - self.block_size) - end = min( - start + (max_seq_len_succ - 1) // self.block_size + 1, - (cache_seq_lens[i] - 1) // self.block_size + 1, - ) - block_tables_succ[ - i, :end - start] = decode_metadata.block_tables[i, - start:end] - decode_metadata.block_tables_succ = block_tables_succ - - seq_lens_inter = (chunk_num_curr - 1).clip(min=0) * chunk_len - max_seq_len_inter = seq_lens_inter.max().item() - decode_metadata.seq_lens_inter = seq_lens_inter - decode_metadata.max_seq_len_inter = max_seq_len_inter - - self._cached_decode_metadata = decode_metadata - return decode_metadata - - -class DualChunkFlashAttentionMetadataBuilder(FlashAttentionMetadataBuilder): - - def prepare(self): - super().prepare() - self.orig_seq_lens: List[int] = [] - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - super()._add_seq_group(inter_data, chunked_prefill_enabled, - prefix_cache_hit) - for prompt_len, seq_len in zip(inter_data.prompt_lens, - inter_data.seq_lens): - self.orig_seq_lens.append(max(prompt_len, seq_len)) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - attn_metadata = super().build(seq_lens, query_lens, - cuda_graph_pad_size, batch_size) - attn_metadata = DualChunkFlashAttentionMetadata( - **attn_metadata.asdict_zerocopy()) - - device = self.runner.device - attn_metadata.orig_seq_lens = self.orig_seq_lens - attn_metadata.orig_seq_lens_tensor = async_tensor_h2d( - self.orig_seq_lens, torch.int, device, self.runner.pin_memory) - - attn_metadata.block_size = self.runner.block_size - dual_chunk_attn_config = getattr(self.runner.model_config.hf_config, - "dual_chunk_attention_config", {}) - attn_metadata.original_max_position_embeddings = \ - dual_chunk_attn_config.get("original_max_position_embeddings", 0) - attn_metadata.chunk_size = dual_chunk_attn_config.get( - "chunk_size", 8192) - attn_metadata.local_size = dual_chunk_attn_config.get( - "local_size", 1024) - - return attn_metadata - - -class DualChunkFlashAttentionImpl(FlashAttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - The prompts might have different lengths, while the generation tokens - always have length 1. - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. 
- """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - layer_idx: int = -1, - dual_chunk_attention_config: Optional[Dict[str, Any]] = None, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "DUAL_CHUNK_FLASH_ATTN backend.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window, sliding_window) - if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - if sliding_window is not None: - # NOTE(woosuk): flash-attn's sliding window does not work with - # paged KV cache. - raise ValueError( - "Sliding window is not supported in FlashAttention.") - - support_head_sizes = ( - DualChunkFlashAttentionBackend.get_supported_head_sizes()) - - if head_size not in support_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") - - assert dual_chunk_attention_config is not None - self.chunk_size = dual_chunk_attention_config.get("chunk_size", 8192) - self.local_size = dual_chunk_attention_config.get("local_size", 1024) - self.original_max_position_embeddings = dual_chunk_attention_config.get( - "original_max_position_embeddings", 0) - self.sparse_attention_config = dual_chunk_attention_config.get( - "sparse_attention_config", None) - if not self.sparse_attention_config: - logger.warning_once("Sparse attention will not be enabled as " - "sparse attention config is not provided.") - self.sparse_attention_enabled = dual_chunk_attention_config.get( - "sparse_attention_enabled", self.sparse_attention_config - is not None) - self.sparse_attention_threshold = dual_chunk_attention_config.get( - "sparse_attention_threshold", 32768) - self.sparse_attention_last_q = dual_chunk_attention_config.get( - "sparse_attention_last_q", 64) - self.layer_idx = layer_idx - self.dual_chunk_attention_config = dual_chunk_attention_config - - if self.sparse_attention_config: - self.sparse_attention_config = { - int(i): j - for i, j in self.sparse_attention_config[ - self.layer_idx].items() - } - start_head = self.num_heads * get_tensor_model_parallel_rank() - end_head = start_head + self.num_heads - self.sparse_attention_config = [ - self.sparse_attention_config[i] - for i in range(start_head, end_head) - ] - - if self.sparse_attention_enabled: - self.arange = torch.arange(self.sparse_attention_last_q, - device="cuda") - self.last_q_mask = (self.arange[None, None, :, None] - >= self.arange[None, None, None, :]) - - def forward( # type: ignore - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: DualChunkFlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with DualChunkFlashAttention. 
- Args: - query: shape = [num_tokens, num_heads * head_size] - query_succ: shape = [num_tokens, num_heads * head_size] - query_inter: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads * head_size] - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - assert output is None, "Output tensor not supported for DualChunk" - - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for FlashAttentionImpl") - - ( - query, - query_succ, - query_inter, - query_succ_critical, - query_inter_critical, - ) = torch.split(query, query.shape[-1] // 5, dim=-1) - - assert ( - query_succ is not None and query_inter is not None - ), "query_succ and query_inter are required in Dual Chunk Attention." - - num_tokens, hidden_size = query.shape - - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - query_succ = query_succ.view(-1, self.num_heads, self.head_size) - query_inter = query_inter.view(-1, self.num_heads, self.head_size) - query_succ_critical = query_succ_critical.view(-1, self.num_heads, - self.head_size) - query_inter_critical = query_inter_critical.view( - -1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - if self.original_max_position_embeddings > 0: - if prefill_meta := attn_metadata.prefill_metadata: - assert prefill_meta.scaling_factor is not None - assert prefill_meta.query_start_loc is not None - assert prefill_meta.orig_seq_lens is not None - current_start = 0 - query_start_loc_cpu = prefill_meta.query_start_loc.cpu() - for i in range(len(prefill_meta.orig_seq_lens)): - current_end = (current_start + - (query_start_loc_cpu[i + 1] - - query_start_loc_cpu[i]).item()) - key[current_start:current_end].mul_( - prefill_meta.scaling_factor[i]) - current_start = current_end - assert current_end <= attn_metadata.num_prefill_tokens - if decode_meta := attn_metadata.decode_metadata: - assert decode_meta.scaling_factor is not None - scaling_factor = decode_meta.scaling_factor - key[attn_metadata.num_prefill_tokens:].mul_( - scaling_factor.unsqueeze(-1).unsqueeze(-1)) - - if kv_cache is not None and kv_cache.numel() > 0: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping.flatten(), - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens - output = torch.empty_like(query) - - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] - decode_query_succ = query_succ[num_prefill_tokens:] - decode_query_inter = query_inter[num_prefill_tokens:] - - # QKV for prefill. 
- query = query[:num_prefill_tokens] - query_succ = query_succ[:num_prefill_tokens] - query_inter = query_inter[:num_prefill_tokens] - query_succ_critical = query_succ_critical[:num_prefill_tokens] - query_inter_critical = query_inter_critical[:num_prefill_tokens] - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if (kv_cache is None or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - # normal attention, called during the profiling run. - out = flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - ) - assert output[:num_prefill_tokens].shape == out.shape - output[:num_prefill_tokens] = out - else: - # prefix-enabled attention - assert prefill_meta.seq_lens is not None - assert prefill_meta.orig_seq_lens is not None - output[:num_prefill_tokens] = ( - self._dual_chunk_flash_attn_prefill( - q=query, - q_succ=query_succ, - q_inter=query_inter, - q_succ_critical=query_succ_critical, - q_inter_critical=query_inter_critical, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - orig_seq_lens=prefill_meta.orig_seq_lens, - scaling_factor=prefill_meta.scaling_factor, - softmax_scale=self.scale, - causal=True, - window_size=(-1, -1), - alibi_slopes=self.alibi_slopes, - block_table=prefill_meta.block_tables, - chunk_size=self.chunk_size, - local_size=self.local_size, - )) - - if decode_meta := attn_metadata.decode_metadata: - # Decoding run. - output[num_prefill_tokens:] = ( - self._dual_chunk_flash_attn_decoding( - decode_query.unsqueeze(1), - decode_query_succ.unsqueeze(1), - decode_query_inter.unsqueeze(1), - key_cache, - value_cache, - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - chunk_size=self.chunk_size, - local_size=self.local_size, - original_max_position_embeddings=self. - original_max_position_embeddings, - decode_meta=decode_meta, - ).squeeze(1)) - # Reshape the output tensor. 
- return output.view(num_tokens, hidden_size) - - def _dual_chunk_flash_attn_prefill( - self, - q, - q_succ, - q_inter, - q_succ_critical, - q_inter_critical, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - orig_seq_lens: List[int], - scaling_factor: torch.Tensor, - softmax_scale: float, - causal: Optional[bool] = True, - window_size: Tuple[int, int] = (-1, -1), - alibi_slopes: Optional[torch.Tensor] = None, - block_table: Optional[torch.Tensor] = None, - chunk_size: int = 8192, - local_size: int = 1024, - ): - if alibi_slopes is not None: - raise ValueError( - "Dual Chunk Attention does not support alibi_slopes") - if not causal: - raise ValueError( - "Dual Chunk Attention does not support causal=False") - if window_size != (-1, -1): - raise ValueError( - "Dual Chunk Attention does not support window_size") - - cu_seqlens_q_cpu = cu_seqlens_q.cpu().tolist() - cu_seqlens_k_cpu = cu_seqlens_k.cpu().tolist() - all_outputs = [] - - for i in range(0, len(cu_seqlens_q_cpu) - 1): - qs = cu_seqlens_q_cpu[i] - qe = cu_seqlens_q_cpu[i:i + 2][-1] - ks = cu_seqlens_k_cpu[i] - ke = cu_seqlens_k_cpu[i:i + 2][-1] - - current_q = q[qs:qe] - current_q_succ = q_succ[qs:qe] - current_q_inter = q_inter[qs:qe] - current_q_succ_critical = q_succ_critical[qs:qe] - current_q_inter_critical = q_inter_critical[qs:qe] - - if block_table is None: - current_k = k[ks:ke] - current_v = v[ks:ke] - current_block_table = None - current_orig_seq_len = orig_seq_lens[i] - else: - current_block_table = block_table[i] - current_orig_seq_len = orig_seq_lens[i] - current_k = k - current_v = v - sparse_attn_enabled = (self.sparse_attention_enabled - and current_orig_seq_len - > self.sparse_attention_threshold) - - if current_q.shape[0] == 0: - continue - - if current_k.shape[0] == 0: - all_outputs.append( - torch.zeros( - (current_q.shape[0], current_q.shape[1], v.shape[2]), - device=q.device, - dtype=q.dtype, - )) - continue - - current_output = torch.empty_like(current_q) - group_size = int(current_q.size(-2) / current_k.size(-2)) - - if sparse_attn_enabled: - num_device_q_heads = current_q.size(-2) - heads_vertical_size = torch.empty(size=(num_device_q_heads, ), - dtype=torch.int32) - heads_slash_size = torch.empty(size=(num_device_q_heads, ), - dtype=torch.int32) - for head_id in range(current_q.size(-2)): - ( - ty, - vertical_size, - slash_size, - _, - ) = self.sparse_attention_config[head_id] - assert ty == "vertical_and_slash", "only support slash mode" - - if vertical_size == 30: - vertical_size += 100 - heads_vertical_size[head_id] = vertical_size - heads_slash_size[head_id] = slash_size - - current_output = self._dual_chunk_flash_attn_prefill_func( - current_q, # allheads - current_q_succ, - current_q_inter, - current_q_succ_critical, - current_q_inter_critical, - current_k, - current_v, - current_block_table, - softmax_scale, - chunk_size, - local_size, - scaling_factor[i].item(), - ke - ks, - sparse_attn_enabled=sparse_attn_enabled, - heads_vertical_size=heads_vertical_size, - heads_slash_size=heads_slash_size, - group_size=group_size) - else: - for head_id in range(current_q.size(-2)): - # (seq_len, num_heads, head_size) - current_q_head = current_q[:, head_id, :].unsqueeze(1) - current_q_succ_head = \ - current_q_succ[:, head_id, :].unsqueeze(1) - current_q_inter_head = \ - current_q_inter[:, head_id, :].unsqueeze(1) - current_q_succ_head_critical = \ - current_q_succ_critical[:, head_id, :].unsqueeze(1) - current_q_inter_head_critical = \ - current_q_inter_critical[:, head_id, :].unsqueeze(1) - if block_table is not 
None: - current_k_head = current_k[..., head_id // - group_size, :].unsqueeze(2) - current_v_head = current_v[..., head_id // - group_size, :].unsqueeze(2) - - else: - current_k_head = current_k[:, head_id, :].unsqueeze(1) - current_v_head = current_v[:, head_id, :].unsqueeze(1) - - current_out = self._dual_chunk_flash_attn_prefill_func( - current_q_head, - current_q_succ_head, - current_q_inter_head, - current_q_succ_head_critical, - current_q_inter_head_critical, - current_k_head, - current_v_head, - current_block_table, - softmax_scale, - chunk_size, - local_size, - scaling_factor[i].item(), - ke - ks, - sparse_attn_enabled=sparse_attn_enabled, - ) - current_output[:, head_id:head_id + 1, :] = current_out - all_outputs.append(current_output) - return torch.cat(all_outputs, dim=0) - - def _dual_chunk_flash_attn_prefill_func( - self, - q, - q_succ, - q_inter, - q_succ_critical, - q_inter_critical, - k, - v, - block_table, - softmax_scale: float, - chunk_size: int, - local_size: int, - scaling_factor: float, - k_length: int, - sparse_attn_enabled: Optional[bool] = True, - heads_vertical_size=None, - heads_slash_size=None, - group_size=None, - ): - flash_results = [] - chunk_len = chunk_size - local_size - - if block_table is not None: - block_size = v.shape[1] - if chunk_len % block_size != 0: - raise ValueError("chunk_len must be divisible by block_size.") - else: - block_size = 1 - - if self.original_max_position_embeddings > 0: - softmax_scale = softmax_scale * scaling_factor - - begin = k_length - q.shape[0] - while begin < k_length: - flash_per_chunk = [] - - prev_chunk_end_pos = (begin // chunk_len) * chunk_len - next_chunk_end_pos = prev_chunk_end_pos + chunk_len - end = min(next_chunk_end_pos, k_length) - qbegin = begin - (k_length - q.shape[0]) - qend = end - (k_length - q.shape[0]) - - qk_chunks = [] - q_states_intra = q[qbegin:qend] - # choose critical token - if block_table is not None: - block_tables_intra = _get_block(block_table, block_size, - prev_chunk_end_pos, end) - k_states_intra = k[block_tables_intra].view( - -1, *k.shape[-2:])[:(end - prev_chunk_end_pos)] - v_states_intra = v[block_tables_intra].view( - -1, *v.shape[-2:])[:(end - prev_chunk_end_pos)] - else: - block_tables_intra = None - k_states_intra = k[prev_chunk_end_pos:end] - v_states_intra = v[prev_chunk_end_pos:end] - - if sparse_attn_enabled: - last_q_size = min(qend - qbegin, self.sparse_attention_last_q) - _, num_device_k_heads, head_dim = k_states_intra.shape - k_states_intra = (k_states_intra.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, head_dim)) - v_states_intra = (v_states_intra.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, head_dim)) - qk_chunks.append( - (q_states_intra.transpose(0, 1)[:, -last_q_size:] * - softmax_scale) @ k_states_intra.permute(1, 2, 0)) - - if prev_chunk_end_pos - chunk_len >= 0: - q_states_succ = q_succ[qbegin:qend] - q_states_succ_critical = q_succ_critical[qbegin:qend] - if block_table is not None: - block_tables_succ = _get_block( - block_table, block_size, - prev_chunk_end_pos - chunk_len, prev_chunk_end_pos) - k_states_succ = k[block_tables_succ].view( - -1, *k.shape[-2:])[:chunk_len] - v_states_succ = v[block_tables_succ].view( - -1, *v.shape[-2:])[:chunk_len] - else: - k_states_succ = k[prev_chunk_end_pos - - chunk_len:prev_chunk_end_pos] - v_states_succ = v[prev_chunk_end_pos - - chunk_len:prev_chunk_end_pos] - - if sparse_attn_enabled: - k_states_succ = 
(k_states_succ.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - v_states_succ = (v_states_succ.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - qk_chunks.append((q_states_succ_critical.transpose( - 0, 1)[:, -last_q_size:] * softmax_scale) - @ k_states_succ.permute(1, 2, 0)) - - if prev_chunk_end_pos - chunk_len * 2 >= 0: - q_states_inter = q_inter[qbegin:qend] - q_states_inter_critical = q_inter_critical[qbegin:qend] - if block_table is not None: - block_tables_inter = _get_block( - block_table, block_size, 0, - prev_chunk_end_pos - chunk_len) - k_states_inter = k[block_tables_inter].view( - -1, *k.shape[-2:])[:(prev_chunk_end_pos - chunk_len)] - v_states_inter = v[block_tables_inter].view( - -1, *v.shape[-2:])[:(prev_chunk_end_pos - chunk_len)] - else: - k_states_inter = k[:prev_chunk_end_pos - chunk_len] - v_states_inter = v[:prev_chunk_end_pos - chunk_len] - - if sparse_attn_enabled: - k_states_inter = (k_states_inter.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - v_states_inter = (v_states_inter.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - qk_chunks.append((q_states_inter_critical.transpose( - 0, 1)[:, -last_q_size:] * softmax_scale) - @ k_states_inter.permute(1, 2, 0)) - - if sparse_attn_enabled: - reversed_qk = qk_chunks[::-1] - qk = torch.cat(reversed_qk, dim=-1) - - qk[:, :, -last_q_size:] = torch.where( - self.last_q_mask[..., -last_q_size:, - -last_q_size:].to(qk.device), - qk[:, :, -last_q_size:], -torch.inf) - qk = F.softmax(qk, dim=-1, dtype=torch.float32) - - vertical = qk.sum(-2, keepdim=True) - vertical[..., :30] = torch.inf - - # Avoid sorting by using the min/max ints to fill the indexer - # buffers. 
- int32_max = torch.iinfo(torch.int32).max - int32_min = torch.iinfo(torch.int32).min - n_heads = qk.size()[0] - max_slash_topk = torch.max(heads_slash_size).item() - max_vertical_topk = torch.max(heads_vertical_size).item() - # store each head's slash topk, vertical topk - vertical = vertical.reshape((n_heads, -1)) - # prevent out of range when prompt size < max_vertical_topk - max_vertical_topk = min(vertical.shape[-1], max_vertical_topk) - vertical_topk_buffer = torch.topk(vertical, max_vertical_topk, - -1).indices - slash_topk_buffer = torch.empty(size=(n_heads, max_slash_topk), - dtype=torch.int64, - device=qk.device) - for head_i in range(n_heads): - # (nqheads=1, lastq, k_len) - head_score = qk[head_i:head_i + 1, :, :] - slash_scores = _sum_all_diagonal_matrix(head_score) - if head_score.size(1) != 1: - # drop right up corner - slash_scores = slash_scores[..., :-last_q_size + 1] - slash_scores[..., -100:] = torch.inf - - head_slash_size = heads_slash_size[head_i] - head_slash_size = min(head_slash_size, vertical.size(-1)) - slash_topk = torch.topk(slash_scores, head_slash_size, - -1).indices - #(nheads, max_topk) - slash_topk_buffer[head_i, :head_slash_size] = slash_topk - - # reset heads topk - heads_slash_size[head_i] = head_slash_size - heads_vertical_size[head_i] = min( - heads_vertical_size[head_i], max_vertical_topk) - - # store - vertical_buffer = torch.full((n_heads, max_vertical_topk), - int32_max, - dtype=torch.int64, - device=q.device) - slash_buffer = torch.full((n_heads, max_slash_topk), - int32_min, - dtype=torch.int64, - device=q.device) - succ_vertical_buffer = torch.full((n_heads, max_vertical_topk), - int32_max, - dtype=torch.int64, - device=q.device) - succ_slash_buffer = torch.full((n_heads, max_slash_topk), - int32_min, - dtype=torch.int64, - device=q.device) - inter_vertical_buffer = torch.full( - (n_heads, max_vertical_topk), - int32_max, - dtype=torch.int64, - device=q.device) - inter_slash_buffer = torch.full((n_heads, max_slash_topk), - int32_min, - dtype=torch.int64, - device=q.device) - - vertical_size_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - slash_sizes_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - succ_vertical_size_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - succ_slash_sizes_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - inter_vertical_size_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - inter_slash_sizes_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - - for head_i in range(n_heads): - vertical_topk = vertical_topk_buffer[ - head_i, :heads_vertical_size[head_i]] - # intra - intra_vertical_indices = vertical_topk[ - vertical_topk >= - prev_chunk_end_pos] - prev_chunk_end_pos - if intra_vertical_indices.nelement() == 0: - intra_vertical_indices = torch.cat([ - intra_vertical_indices, - torch.arange(0, - k_states_intra.size(0), - max(1, - k_states_intra.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - slash_topk = slash_topk_buffer[ - head_i, :heads_slash_size[head_i]] - intra_slash_indices = ( - (qk.size(-1) - 1) - - slash_topk[slash_topk >= prev_chunk_end_pos]) - # fill buffer - v_count = intra_vertical_indices.nelement() - s_count = intra_slash_indices.nelement() - vertical_size_buffer[head_i] = v_count - slash_sizes_buffer[head_i] = s_count - vertical_buffer[head_i, :v_count].copy_( - intra_vertical_indices) - 
slash_buffer[head_i, :s_count].copy_(intra_slash_indices) - # succ - if prev_chunk_end_pos - chunk_len >= 0: - succ_vertical_indices = vertical_topk[ - (vertical_topk < prev_chunk_end_pos) - & (vertical_topk >= prev_chunk_end_pos - - chunk_len)] - (prev_chunk_end_pos - chunk_len) - # TODO: support no vertical - if succ_vertical_indices.nelement() == 0: - succ_vertical_indices = torch.cat([ - succ_vertical_indices, - torch.arange( - 0, - k_states_succ.size(0), - max(1, - k_states_succ.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - succ_slash_indices = ( - (prev_chunk_end_pos + (qend - qbegin) - 1) - - slash_topk[((slash_topk >= - (prev_chunk_end_pos - chunk_len)) & - (slash_topk < (prev_chunk_end_pos + - (qend - qbegin))))]) - if succ_slash_indices.nelement() == 0: - succ_slash_indices = torch.cat([ - succ_slash_indices, - torch.arange( - 0, - k_states_succ.size(0), - max(1, - k_states_succ.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - # fill buffer - v_count = succ_vertical_indices.nelement() - s_count = succ_slash_indices.nelement() - succ_vertical_size_buffer[head_i] = v_count - succ_slash_sizes_buffer[head_i] = s_count - succ_vertical_buffer[head_i, :v_count].copy_( - succ_vertical_indices) - succ_slash_buffer[head_i, :s_count].copy_( - succ_slash_indices) - - if prev_chunk_end_pos - 2 * chunk_len >= 0: - inter_vertical_indices = vertical_topk[ - vertical_topk < prev_chunk_end_pos - chunk_len] - - if inter_vertical_indices.nelement() == 0: - inter_vertical_indices = torch.cat([ - inter_vertical_indices, - torch.arange( - 0, - k_states_inter.size(0), - max(1, - k_states_inter.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - inter_slash_indices = ( - (prev_chunk_end_pos - chunk_len + - (qend - qbegin) - 1) - - slash_topk[slash_topk < (prev_chunk_end_pos - - chunk_len + - (qend - qbegin))]) - if inter_slash_indices.nelement() == 0: - inter_slash_indices = torch.cat([ - inter_slash_indices, - torch.arange( - 0, - k_states_inter.size(0), - max(1, - k_states_inter.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - # fill buffer - v_count = inter_vertical_indices.nelement() - s_count = inter_slash_indices.nelement() - inter_vertical_size_buffer[head_i] = v_count - inter_slash_sizes_buffer[head_i] = s_count - inter_vertical_buffer[head_i, :v_count].copy_( - inter_vertical_indices) - inter_slash_buffer[head_i, :s_count].copy_( - inter_slash_indices) - else: - intra_vertical_indices, intra_slash_indices = None, None - succ_vertical_indices, succ_slash_indices = None, None - inter_vertical_indices, inter_slash_indices = None, None - - if sparse_attn_enabled: - flash_result = self._do_flash_attn( - q_states_intra, - k_states_intra, - v_states_intra, - softmax_scale=softmax_scale, - causal=True, - stage="intra", - vertical_indices=vertical_buffer, - slash_indices=slash_buffer, - vertical_indices_count=vertical_size_buffer, - slash_indices_count=slash_sizes_buffer, - mergehead_softmax_scale=softmax_scale, - sparse_attn_enabled=sparse_attn_enabled) - else: - flash_result = self._do_flash_attn( - q_states_intra, - k_states_intra, - v_states_intra, - softmax_scale=softmax_scale, - causal=True, - stage="intra", - vertical_indices=intra_vertical_indices, - slash_indices=intra_slash_indices, - sparse_attn_enabled=sparse_attn_enabled) - flash_per_chunk.append(flash_result) - - if prev_chunk_end_pos - chunk_len >= 0: - if sparse_attn_enabled: - flash_result = 
self._do_flash_attn( - q_states_succ, - k_states_succ, - v_states_succ, - softmax_scale=softmax_scale, - causal=False, - stage="succ", - vertical_indices=succ_vertical_buffer, - slash_indices=succ_slash_buffer, - vertical_indices_count=succ_vertical_size_buffer, - slash_indices_count=succ_slash_sizes_buffer, - mergehead_softmax_scale=softmax_scale, - sparse_attn_enabled=sparse_attn_enabled) - else: - flash_result = self._do_flash_attn( - q_states_succ, - k_states_succ, - v_states_succ, - softmax_scale=softmax_scale, - causal=False, - stage="succ", - vertical_indices=succ_vertical_indices, - slash_indices=succ_slash_indices, - sparse_attn_enabled=sparse_attn_enabled) - flash_per_chunk.append(flash_result) - - if prev_chunk_end_pos - chunk_len * 2 >= 0: - if sparse_attn_enabled: - flash_result = self._do_flash_attn( - q_states_inter, - k_states_inter, - v_states_inter, - softmax_scale=softmax_scale, - causal=False, - stage="inter", - vertical_indices=inter_vertical_buffer, - slash_indices=inter_slash_buffer, - vertical_indices_count=inter_vertical_size_buffer, - slash_indices_count=inter_slash_sizes_buffer, - mergehead_softmax_scale=softmax_scale, - sparse_attn_enabled=sparse_attn_enabled) - else: - flash_result = self._do_flash_attn( - q_states_inter, - k_states_inter, - v_states_inter, - softmax_scale=softmax_scale, - causal=False, - stage="inter", - vertical_indices=inter_vertical_indices, - slash_indices=inter_slash_indices, - sparse_attn_enabled=sparse_attn_enabled) - flash_per_chunk.append(flash_result) - - flash_results.append(flash_per_chunk) - begin = end - - attn_output = self._merge_attn_outputs(flash_results) - del flash_results - return attn_output - - def _do_flash_attn( - self, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - softmax_scale: float, - causal: bool = True, - max_seqlen_k: Optional[int] = None, - stage: str = "intra", - vertical_indices: Optional[torch.Tensor] = None, - slash_indices: Optional[torch.Tensor] = None, - vertical_indices_count: Optional[torch.Tensor] = None, - slash_indices_count: Optional[torch.Tensor] = None, - mergehead_softmax_scale: Optional[float] = None, - sparse_attn_enabled: Optional[bool] = False, - ): - if max_seqlen_k is None: - max_seqlen_k = key_states.shape[0] - - q_len = query_states.shape[0] - q_heads = query_states.shape[1] - h_dim = query_states.shape[-1] - - if sparse_attn_enabled: - assert slash_indices is not None - if stage == "intra": - assert causal - else: - assert not causal - - query_states = query_states.unsqueeze(0).transpose(1, 2) - key_states = key_states.unsqueeze(0).transpose(1, 2) - value_states = value_states.unsqueeze(0).transpose(1, 2) - - q = query_states - k = key_states - v = value_states - - if (vertical_indices_count is not None and \ - slash_indices_count is not None): - assert mergehead_softmax_scale is not None - - res, s_lse = _vertical_slash_sparse_attention( - q, - k, - v, - vertical_indices, - slash_indices, - mergehead_softmax_scale, - causal=causal, - stage=stage, - vertical_indices_count=vertical_indices_count, - slash_indices_count=slash_indices_count) - res = res.view(q_heads, q_len, - h_dim).transpose(0, 1) # (qlen,nhead,h_dim) - s_lse = s_lse.view( - q_heads, q_len, - 1).squeeze(-1).unsqueeze(0).float() # (1, nhead,qlen) - else: - res, s_lse = _vertical_slash_sparse_attention(q, - k, - v, - vertical_indices, - slash_indices, - softmax_scale, - causal=causal, - stage=stage) - res = res.view(q_len, q_heads, h_dim) - s_lse = s_lse.view(q_len, q_heads, 
1).transpose(0, 2).float() - return res, s_lse - - output, softmax_lse = flash_attn_varlen_func( - q=query_states, - k=key_states, - v=value_states, - softmax_scale=softmax_scale, - cu_seqlens_q=torch.tensor([0, query_states.shape[0]], - dtype=torch.int32, - device=query_states.device), - max_seqlen_q=query_states.shape[0], - cu_seqlens_k=torch.tensor([0, max_seqlen_k], - dtype=torch.int32, - device=query_states.device), - max_seqlen_k=max_seqlen_k, - causal=causal, - return_softmax_lse=True, - ) - softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0, - 2).float() - return output, softmax_lse - - def _merge_attn_outputs( - self, - flash_results: List[List[Tuple[torch.Tensor, torch.Tensor]]], - return_lse: Optional[bool] = False, - ) -> torch.Tensor: - attn_outputs_all = [] - logits_all = [] - - for flash_per_chunk in flash_results: - if len(flash_per_chunk) == 1: - attn_outputs_all.append(flash_per_chunk[0][0]) - if return_lse: - logits_all.append(flash_per_chunk[0][1]) - continue - - attn_outputs = torch.stack([ - flash_attn_output[0] for flash_attn_output in flash_per_chunk - ]) - logits = torch.stack([ - flash_attn_output[1] for flash_attn_output in flash_per_chunk - ]) - logits = logits.to(torch.float32) - - if return_lse: - max_val = torch.max(logits, dim=0).values - diff = torch.abs(logits[0] - logits[1]) - log_sum_exp = max_val + torch.log1p(torch.exp(-diff)) - logits_all.append(log_sum_exp) - - max_logits = torch.max(logits, dim=0).values - stable_logits = logits - max_logits.unsqueeze(0) - lse_s = torch.exp(stable_logits).detach() - lse_sum = torch.sum(lse_s, dim=0) - lse_s /= lse_sum - attn_outputs *= lse_s.unsqueeze(-1).transpose(2, 3).squeeze(1) - attn_outputs_all.append(attn_outputs.sum(dim=0)) - - if return_lse: - return (torch.cat(attn_outputs_all, - dim=0), torch.cat(logits_all, dim=-1)) - else: - return torch.cat(attn_outputs_all, dim=0) - - def _dual_chunk_flash_attn_decoding( - self, - query: torch.Tensor, - query_succ: torch.Tensor, - query_inter: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_table: torch.Tensor, - cache_seqlens: torch.Tensor, - softmax_scale: float, - causal: bool, - alibi_slopes: Optional[torch.Tensor], - chunk_size: int, - local_size: int, - original_max_position_embeddings: int, - decode_meta: DualChunkFlashAttentionMetadata, - ): - if not causal: - raise ValueError( - "Dual Chunk Attention does not support causal=False") - - block_size = value_cache.shape[1] - chunk_len = chunk_size - local_size - if chunk_len % block_size != 0: - raise ValueError("chunk_len must be divisible by block_size.") - if original_max_position_embeddings > 0: - assert decode_meta.scaling_factor is not None - scaling_factor = decode_meta.scaling_factor - query = (query * scaling_factor.view(-1, 1, 1, 1)).to( - query.dtype - ) # possible for numerical issue, need to fused in the kernel - query_succ = (query_succ * scaling_factor.view(-1, 1, 1, 1)).to( - query.dtype) - query_inter = (query_inter * scaling_factor.view(-1, 1, 1, 1)).to( - query.dtype) - outputs_list = [] - softmax_lses_list = [] - - # intra-attention - intra_output, intra_softmax_lse = ( - self._dual_chunk_flash_attn_decoding_with_exp_sums( - query, - key_cache, - value_cache, - decode_meta.block_tables_intra, - decode_meta.seq_lens_intra, - softmax_scale, - alibi_slopes, - causal=False, - )) - outputs_list.append(intra_output) - softmax_lses_list.append(intra_softmax_lse) - - # succ-attention - if decode_meta.max_seq_len_succ: - succ_output, succ_softmax_lse = ( - 
self._dual_chunk_flash_attn_decoding_with_exp_sums( - query_succ, - key_cache, - value_cache, - decode_meta.block_tables_succ, - decode_meta.seq_lens_succ, - softmax_scale, - alibi_slopes, - causal=False, - )) - outputs_list.append(succ_output) - softmax_lses_list.append(succ_softmax_lse) - - # inter-attention - if decode_meta.max_seq_len_inter: - inter_output, inter_softmax_lse = ( - self._dual_chunk_flash_attn_decoding_with_exp_sums( - query_inter, - key_cache, - value_cache, - block_table[:, :decode_meta.max_seq_len_inter], - decode_meta.seq_lens_inter, - softmax_scale, - alibi_slopes, - causal=False, - )) - outputs_list.append(inter_output) - softmax_lses_list.append(inter_softmax_lse) - outputs = torch.stack(outputs_list, dim=0) - del outputs_list - softmax_lses = torch.stack(softmax_lses_list, dim=0).to(torch.float32) - del softmax_lses_list - max_logits = torch.max(softmax_lses, dim=0).values - stable_logits = softmax_lses - max_logits.unsqueeze(0) - lse_s = torch.exp(stable_logits).detach() - lse_sum = torch.sum(lse_s, dim=0) - lse_s /= lse_sum - outputs *= lse_s.unsqueeze(-1).transpose(2, 3) - return outputs.sum(0) - - def _dual_chunk_flash_attn_decoding_with_exp_sums( - self, - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_table: torch.Tensor, - cache_seqlens: torch.Tensor, - softmax_scale: float, - alibi_slopes: Optional[torch.Tensor], - causal: bool, - ): - out, softmax_lse = flash_attn_with_kvcache( - q=query, - k_cache=key_cache, - v_cache=value_cache, - block_table=block_table, - cache_seqlens=cache_seqlens, - softmax_scale=softmax_scale, - alibi_slopes=alibi_slopes, - causal=causal, - return_softmax_lse=True, - ) - mask = (cache_seqlens == 0) - out[mask] = 0 - softmax_lse[mask] = -float("inf") - return out, softmax_lse - - -def _vertical_slash_sparse_attention( - query: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD] - key: torch.Tensor, # [BATCH, N_HEADS, N_KV_CTX, D_HEAD] - value: torch.Tensor, # [BATCH, N_HEADS, N_KV_CTX, D_HEAD] - v_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_V] - s_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_S] - softmax_scale: float, - causal: bool = True, - stage: str = "intra", - block_size_M: int = 64, - block_size_N: int = 64, - vertical_indices_count: torch.Tensor = None, # [N_HEADS,] - slash_indices_count: torch.Tensor = None, -): - if stage == "intra": - assert causal - else: - assert not causal - - batch_size, num_heads, context_size, head_dim = query.shape - _, _, kv_seq_len, _ = key.shape - - if head_dim not in [16, 32, 64, 128, 256, 512]: - target_dim = 2**math.ceil(math.log2(head_dim)) - head_dim - query = F.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0]) - key = F.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0]) - value = F.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0]) - - v_idx = v_idx.to(torch.int32).reshape( - (batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0] - s_idx = s_idx.to(torch.int32).reshape( - (batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0] - q_seqlens = torch.tensor([context_size], - dtype=torch.int32, - device=query.device) - kv_seqlens = torch.tensor([kv_seq_len], - dtype=torch.int32, - device=query.device) - - if vertical_indices_count is not None and slash_indices_count is not None: - ( - block_count, - block_offset, - column_count, - column_index, - ) = ops.convert_vertical_slash_indexes_mergehead( - q_seqlens, kv_seqlens, v_idx, s_idx, vertical_indices_count, - slash_indices_count, context_size, block_size_M, block_size_N, - causal) - else: - ( - block_count, - 
block_offset, - column_count, - column_index, - ) = ops.convert_vertical_slash_indexes(q_seqlens, kv_seqlens, v_idx, - s_idx, context_size, - block_size_M, block_size_N, - causal) - - q = query.transpose(1, 2).contiguous() - k = key.transpose(1, 2).contiguous() - v = value.transpose(1, 2).contiguous() - out, lse = sparse_attn_func( - q, - k, - v, - block_count, - block_offset, - column_count, - column_index, - causal=causal, - softmax_scale=softmax_scale, - return_softmax_lse=True, - ) - out = out.transpose(1, 2).contiguous() - softmax_lse = lse.reshape(*lse.shape, 1) - return (out[..., :context_size, :head_dim], - softmax_lse[..., :context_size, :]) - - -def _sum_all_diagonal_matrix(mat: torch.tensor): - h, n, m = mat.shape - # Zero matrix used for padding - zero_mat = torch.zeros((h, n, n), device=mat.device) - # pads the matrix on left and right - mat_padded = torch.cat((zero_mat, mat, zero_mat), -1) - # Change the strides - mat_strided = mat_padded.as_strided((1, n, n + m), - (n * (2 * n + m), 2 * n + m + 1, 1)) - # Sums the resulting matrix's columns - sum_diags = torch.sum(mat_strided, 1) - return sum_diags[:, 1:] # drop left bottom corner - - -def _get_block(block_table: torch.Tensor, block_size: int, begin: int, - end: int): - begin_block = begin // block_size - end_block = (end - 1) // block_size + 1 - return block_table[begin_block:end_block] diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py deleted file mode 100755 index edb3afb4aa075..0000000000000 --- a/vllm/attention/backends/flash_attn.py +++ /dev/null @@ -1,929 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with FlashAttention.""" -from collections import defaultdict -from dataclasses import dataclass -from itertools import accumulate -from typing import Dict, List, Optional, Tuple, Type - -import torch - -from vllm import _custom_ops as ops -# yapf conflicts with isort for this block -# yapf: disable -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -# yapf: enable -from vllm.attention.backends.utils import ( - PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping, - compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, - get_seq_len_block_table_args, is_all_cross_attn_metadata_set, - is_all_encoder_attn_metadata_set, is_block_tables_empty) -from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, - get_flash_attn_version) -from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap -from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) - -logger = init_logger(__name__) - - -class FlashAttentionBackend(AttentionBackend): - - accept_output_buffer: bool = True - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] - - @staticmethod - def get_name() -> str: - return "FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["FlashAttentionImpl"]: - return FlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return FlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]: - return FlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> 
Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - if block_size % 16 != 0: - raise ValueError("Block size must be a multiple of 16.") - return (2, num_blocks, block_size, num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - src_key_cache = src_kv_cache[0] - dst_key_cache = dst_kv_cache[0] - ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - src_value_cache = src_kv_cache[1] - dst_value_cache = dst_kv_cache[1] - ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - - ops.copy_blocks(key_caches, value_caches, src_to_dists) - - -@dataclass -class FlashAttentionMetadata(AttentionMetadata): - """Metadata for FlashAttentionBackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - - use_cuda_graph: bool - - # Maximum query length in the batch. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. 
- seq_start_loc: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional["FlashAttentionMetadata"] = None - _cached_decode_metadata: Optional["FlashAttentionMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - encoder_seq_start_loc: Optional[torch.Tensor] = None - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - self._cached_prefill_metadata = FlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_query_len=0, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - self._cached_decode_metadata = FlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - # Batch may be composed of prefill|decodes, adjust query start - # indices to refer to the start of decodes. E.g. - # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_decode_metadata - - -class FlashAttentionMetadataBuilder( - AttentionMetadataBuilder[FlashAttentionMetadata]): - - def __init__(self, input_builder): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - self.has_prefix_cache_hit = False - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. 
- """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - - if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. - is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - def _get_graph_runner_block_tables( - self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs - - graph_block_tables = self.runner.graph_block_tables[:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. 
- """ - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled, - prefix_cache_hit) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - num_seqs = len(seq_lens) - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size - self.num_prefill_tokens - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - - return FlashAttentionMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=use_captured_graph, - ) - - -class FlashAttentionImpl(AttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. 
- - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "FLASH_ATTN backend.") - if use_irope: - logger.warning( - "Using irope in V0 is not supported yet, it will fall back " - "to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window - 1, - 0) if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - self.vllm_flash_attn_version = get_flash_attn_version( - requires_alibi=self.alibi_slopes is not None) - if is_quantized_kv_cache(self.kv_cache_dtype) and ( - not self.kv_cache_dtype.startswith("fp8") - or not flash_attn_supports_fp8()): - raise NotImplementedError( - f"FlashAttention does not support {self.kv_cache_dtype} " - "kv-cache on this device " - f"(FA supports fp8 = {flash_attn_supports_fp8()}).") - if logits_soft_cap is None: - # In flash-attn, setting logits_soft_cap as 0 means no soft cap. - logits_soft_cap = 0 - self.logits_soft_cap = logits_soft_cap - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() - if head_size not in support_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: FlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention. - - Args: - query: shape = [num_tokens, num_heads, head_size] - key: shape = [num_tokens, num_kv_heads, head_size] - value: shape = [num_tokens, num_kv_heads, head_size] - output: shape = [num_tokens, num_heads, head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size, num_kv_heads, head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - NOTE: It in-place updates the output tensor. - NOTE: FP8 quantization, flash-attn expect the size of - {q,k,v}_descale to be (num_sequences, num_kv_heads). - We use torch's .expand() to avoid duplicating values - """ - assert output is not None, "Output tensor must be provided." 
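# Layout reminder with hypothetical sizes: given two prefills of 4 and 6
# tokens plus one single-token decode, the flattened query is
#   query[0:4]   -> prefill 0
#   query[4:10]  -> prefill 1
#   query[10:11] -> decode 0
# and query_start_loc = [0, 4, 10, 11], so sequence i occupies
# query[query_start_loc[i]:query_start_loc[i + 1]]. The prefill/decode
# split further below relies on prefills always occupying the front of
# this flattened batch.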
- - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for FlashAttentionImpl") - - # NOTE(woosuk): FlashAttention2 does not support FP8 KV cache. - if not flash_attn_supports_fp8() or output.dtype != torch.bfloat16: - assert ( - layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0), ( - "key/v_scale is only supported in FlashAttention 3 with " - "base dtype bfloat16") - - attn_type = self.attn_type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - kv_cache_dtype: str = self.kv_cache_dtype - softmax_scale: float = self.scale - window_size = self.sliding_window - alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes - logits_soft_cap: Optional[float] = self.logits_soft_cap - fp8_attention = kv_cache_dtype.startswith("fp8") - - if fp8_attention and not flash_attn_supports_fp8(): - raise NotImplementedError( - "FlashAttention does not support FP8 kv-cache on this device.") - - if kv_cache.numel() > 0: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - # We skip updating the KV cache under two conditions: - # a. When the Attention Type is ENCODER. In this phase, we compute - # only the encoder attention without updating the cache. - # b. When both Key and Value are None. This occurs during - # cross-attention computation in the decoding phase, where the - # KV cache is already populated with the cross-attention - # tensor. Thus, we skip cache updates during this time. - if (attn_type != AttentionType.ENCODER) and (key is not None) and ( - value is not None): - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory - # profiling run. - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - updated_slot_mapping.flatten(), # type: ignore[union-attr] - kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - if fp8_attention: - kv_cache = kv_cache.view(torch.float8_e4m3fn) - key_cache = key_cache.view(torch.float8_e4m3fn) - value_cache = value_cache.view(torch.float8_e4m3fn) - - if fp8_attention: - num_tokens, num_heads, head_size = query.shape - query, _ = ops.scaled_fp8_quant( - query.reshape( - (num_tokens, num_heads * head_size)).contiguous(), - layer._q_scale) - query = query.reshape((num_tokens, num_heads, head_size)) - - (num_prefill_query_tokens, num_prefill_kv_tokens, - num_decode_query_tokens) = \ - get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) - decode_query = query[num_prefill_query_tokens:] - decode_output = output[num_prefill_query_tokens:] - # QKV for prefill. 
- query = query[:num_prefill_query_tokens] - prefill_output = output[:num_prefill_query_tokens] - assert query.shape[0] == num_prefill_query_tokens - assert decode_query.shape[0] == num_decode_query_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if (kv_cache.numel() == 0 or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - # normal attention - # When block_tables are not filled, it means q and k are the - # prompt, and they have the same length. - q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \ - _get_query_key_seq_metadata(prefill_meta, True, attn_type) - - key = key[:num_prefill_kv_tokens] - value = value[:num_prefill_kv_tokens] - - if fp8_attention: - num_kv_tokens, num_kv_heads, head_size = key.shape - - key, _ = ops.scaled_fp8_quant( - key.reshape((num_kv_tokens, - num_kv_heads * head_size)).contiguous(), - layer._k_scale) - key = key.reshape((num_kv_tokens, num_kv_heads, head_size)) - - value, _ = ops.scaled_fp8_quant( - value.reshape((num_kv_tokens, - num_kv_heads * head_size)).contiguous(), - layer._v_scale) - value = value.reshape( - (num_kv_tokens, num_kv_heads, head_size)) - - descale_shape = (q_seq_start_loc.shape[0] - 1, key.shape[1]) - flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=q_seq_start_loc, - cu_seqlens_k=k_seq_start_loc, - max_seqlen_q=q_seq_len, - max_seqlen_k=k_seq_len, - softmax_scale=softmax_scale, - causal=_get_causal_option(attn_type), - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - out=prefill_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - else: - # prefix-enabled attention - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support prefix caching") - assert prefill_meta.seq_lens is not None - assert prefill_meta.query_start_loc is not None - max_seq_len = max(prefill_meta.seq_lens) - descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, - key.shape[1]) - flash_attn_varlen_func( # noqa - q=query, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - max_seqlen_q=prefill_meta.max_query_len, - seqused_k=prefill_meta.seq_lens_tensor, - max_seqlen_k=max_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - block_table=prefill_meta.block_tables, - softcap=logits_soft_cap, - out=prefill_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - - if decode_meta := attn_metadata.decode_metadata: - # Decoding run. - # Use flash_attn_varlen_func kernel for speculative decoding - # because different queries might have different lengths. 
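# For example (hypothetical batch): with speculative decoding the decode
# queries can be [1, 3, 1] tokens long, so max_decode_query_len == 3 and the
# varlen path below is taken; with plain decoding every query is a single
# token and flash_attn_with_kvcache is used instead.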
- - assert decode_meta.max_decode_query_len is not None - # use only for actual varlen decoding - if decode_meta.max_decode_query_len > 1: - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support max_decode_query_len > 1" - ) - assert decode_meta.query_start_loc is not None - descale_shape = (decode_meta.query_start_loc.shape[0] - 1, - key.shape[1]) - flash_attn_varlen_func( - q=decode_query, - k=key_cache, - v=value_cache, - cu_seqlens_q=decode_meta.query_start_loc, - max_seqlen_q=decode_meta.max_decode_query_len, - seqused_k=decode_meta.seq_lens_tensor, - max_seqlen_k=decode_meta.max_decode_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - block_table=decode_meta.block_tables, - out=decode_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - else: - # Use flash_attn_with_kvcache for normal decoding. - ( - seq_lens_arg, - _, - block_tables_arg, - ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - descale_shape = (seq_lens_arg.shape[0], key_cache.shape[-2]) - flash_attn_with_kvcache( - q=decode_query.unsqueeze(1), - k_cache=key_cache, - v_cache=value_cache, - block_table=block_tables_arg, - cache_seqlens=seq_lens_arg, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - out=decode_output.unsqueeze(1), - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - return output - - -def _get_query_key_seq_metadata( - attn_metadata: FlashAttentionMetadata, - is_prompt: bool, - attn_type: str, -) -> tuple: - """ - Returns sequence metadata for key and query based on the specified - attention type and whether input is a prompt. - - This function computes the starting locations and maximum sequence lengths - for key and query sequences for different attention types. - - Args: - attn_metadata: The attention metadata object - is_prompt (bool): A flag indicating if the input is a prompt - attn_type (AttentionType): The type of attention being used. - - Returns: - tuple: A tuple containing four integers: - - Starting location for the query sequence. - - Maximum sequence length for the query sequence. - - Starting location for the key sequence. - - Maximum sequence length for the key sequence. - - Raises: - AttributeError: If an invalid attention type is provided. - """ - if attn_type == AttentionType.DECODER: - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_start_loc, max_seq_len, - attn_metadata.seq_start_loc, max_seq_len) - - elif attn_type == AttentionType.ENCODER_DECODER: - # This is cross attention between the where the key - # is the precomputed encoder attention and query - # is the input sequence. - # Choose query max length based on whether it is prompt - # or not. 
- if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_start_loc, max_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len) - elif attn_type == AttentionType.ENCODER: - # For encoder attention both the query and the key are same i.e. the - # encoder sequence. - return (attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len) - elif attn_type == AttentionType.ENCODER_ONLY: - assert is_prompt, "Should not have decode for encoder only model." - return (attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len, - attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -def _get_causal_option(attn_type: str) -> bool: - """ - Determine whether the given attention type is suitable for causal - attention mechanisms. - - Args: - attn_type (AttentionType): The type of attention being evaluated - - Returns: - bool: Returns `True` if the attention type is suitable for causal - attention (i.e., not encoder, encoder-only, or encoder-decoder), - otherwise returns `False`. - """ - return not (attn_type == AttentionType.ENCODER - or attn_type == AttentionType.ENCODER_ONLY - or attn_type == AttentionType.ENCODER_DECODER) diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py deleted file mode 100644 index aeaa0ab631cfb..0000000000000 --- a/vllm/attention/backends/flashmla.py +++ /dev/null @@ -1,227 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from contextlib import contextmanager -from dataclasses import dataclass -from typing import List, Optional, Tuple, Type - -import torch - -from vllm.attention.backends.abstract import (AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.mla.common import (MLACommonBackend, - MLACommonImpl, - MLACommonMetadata, - MLACommonMetadataBuilder, - MLACommonState) -from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, - get_mla_metadata, - is_flashmla_supported) - - -class FlashMLABackend(MLACommonBackend): - - @staticmethod - def get_name() -> str: - return "FLASHMLA" - - @staticmethod - def get_impl_cls() -> Type["FlashMLAImpl"]: - return FlashMLAImpl - - @staticmethod - def get_metadata_cls() -> Type["FlashMLAMetadata"]: - return FlashMLAMetadata - - @staticmethod - def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]: - return FlashMLAMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["FlashMLAState"]: - return FlashMLAState - - -@dataclass -class FlashMLAMetadata(MLACommonMetadata): - decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor, - torch.Tensor]] = None - decode_num_splits: Optional[torch.Tensor] = None - - @property - def decode_metadata(self): - decode_metadata = super().decode_metadata - # TODO: cache assignment? 
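# The tile scheduler metadata and num_splits are produced by
# get_mla_metadata() in the builder's build() below and are simply
# re-attached to the cached decode view each time this property is read
# (hence the TODO above about caching the assignment).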
- if decode_metadata is not None: - decode_metadata.decode_tile_scheduler_metadata=\ - self.decode_tile_scheduler_metadata - decode_metadata.decode_num_splits=\ - self.decode_num_splits - return decode_metadata - - -class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.num_q_heads = self.runner.model_config.get_num_attention_heads( - self.runner.parallel_config) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - m = super().build(seq_lens, query_lens, cuda_graph_pad_size, - batch_size) - - if m.num_decode_tokens > 0: - m.decode_tile_scheduler_metadata, m.decode_num_splits = \ - get_mla_metadata( - m.seq_lens_tensor[m.num_prefills:], - self.num_q_heads, - 1, # MQA for the decode path - ) - - return m - - -class FlashMLAState(MLACommonState[FlashMLAMetadata]): - - def __init__(self, *args, **kwds): - super().__init__(*args, **kwds) - - self.num_q_heads = self.runner.model_config.get_num_attention_heads( - self.runner.parallel_config) - - @contextmanager - def graph_capture(self, max_batch_size: int): - # Run a dummy `get_mla_metadata` so we can get the right shapes - self._graph_decoder_tile_scheduler_metadata, \ - self._graph_decode_num_splits = get_mla_metadata( - torch.ones( - max_batch_size, dtype=torch.int32, device=self.runner.device), - self.num_q_heads, - 1, # MQA for the decode path - ) - - with super().graph_capture(max_batch_size): - yield - - del self._graph_decoder_tile_scheduler_metadata - del self._graph_decode_num_splits - - def graph_capture_get_metadata_for_batch( - self, batch_size: int, is_encoder_decoder_model: bool = False): - metadata = super().graph_capture_get_metadata_for_batch( - batch_size, is_encoder_decoder_model) - assert metadata.num_decode_tokens > 0 - - decoder_tile_scheduler_metadata, decode_num_splits = get_mla_metadata( - self._graph_seq_lens[:batch_size], - self.num_q_heads, - 1, # MQA for the decode path - ) - - self._graph_decoder_tile_scheduler_metadata.copy_( - decoder_tile_scheduler_metadata) - self._graph_decode_num_splits[:batch_size + 1].copy_(decode_num_splits) - - metadata.decode_tile_scheduler_metadata=\ - self._graph_decoder_tile_scheduler_metadata - metadata.decode_num_splits=\ - self._graph_decode_num_splits[:batch_size + 1] - - return metadata - - def get_graph_input_buffers(self, - attn_metadata, - is_encoder_decoder_model: bool = False): - input_buffers = super().get_graph_input_buffers( - attn_metadata, is_encoder_decoder_model) - input_buffers["decode_tile_scheduler_metadata"] = \ - attn_metadata.decode_metadata.decode_tile_scheduler_metadata - input_buffers["decode_num_splits"] = \ - attn_metadata.decode_metadata.decode_num_splits - - return input_buffers - - def prepare_graph_input_buffers(self, - input_buffers, - attn_metadata, - is_encoder_decoder_model: bool = False): - super().prepare_graph_input_buffers(input_buffers, attn_metadata, - is_encoder_decoder_model) - - input_buffers["decode_tile_scheduler_metadata"].copy_( - attn_metadata.decode_metadata.decode_tile_scheduler_metadata) - input_buffers["decode_num_splits"].copy_( - attn_metadata.decode_metadata.decode_num_splits) - - -class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - 
attn_type: str, - kv_sharing_target_layer_name: Optional[str] = None, - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - is_supported, reason = is_flashmla_supported() - assert is_supported, reason - - unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] - if any(unsupported_features): - raise NotImplementedError( - "FlashMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, logits_soft_cap") - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashMLAImpl") - - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "FlashMLA with FP8 KV cache not yet supported") - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: FlashMLAMetadata, - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - - q = torch.cat([q_nope, q_pe], dim=-1)\ - .unsqueeze(1) # Add seqlen dim of 1 (decode) - - o, _ = flash_mla_with_kvcache( - q=q, - k_cache=kv_c_and_k_pe_cache.unsqueeze(-2), # Add head dim of 1 - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, - head_dim_v=self.kv_lora_rank, - tile_scheduler_metadata=decode_meta.decode_tile_scheduler_metadata, - num_splits=decode_meta.decode_num_splits, - softmax_scale=self.scale, - causal=True, - ) - - return self._v_up_proj(o) diff --git a/vllm/attention/backends/mla/__init__.py b/vllm/attention/backends/mla/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py deleted file mode 100644 index 826b63e1ccda8..0000000000000 --- a/vllm/attention/backends/mla/common.py +++ /dev/null @@ -1,1305 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -# MLA Common Components - -This file implements common components for MLA implementations. - -First we define: - -Sq as Q sequence length -Skv as KV sequence length - -MLA has two possible ways of computing, a data-movement friendly approach and a -compute friendly approach, we generally want to use the compute friendly -approach for "prefill" (i.e. the ratio Sq / Skv is "small", is near 1) -and the data-movement friendly approach for "decode" (i.e. the ratio -Sq / Skv is "large"). - -NOTE what we deem small and large is currently determined by if its labelled -prefill or decode by the scheduler, but this is something we should probably -tune. - -Main reference: DeepseekV2 paper, and FlashInfer Implementation -(https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). - -Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. -* For decode (i.e. the memory friendly approach) the attention "simulates" a -multi-head attention, while the compute is similar to multi-query attention. 
- -Below is example of both paths assuming batchsize = 1 - -## More Extent Definitions: - -C Context length, `Skv - Sq` -H hidden size -N number of attention heads -Lq latent dimension for Q 1536 in DSV3 -Lkv latent dimension for K/V 512 in DSV3 -P nope dimension, no rope. 128 in DSV3 -R rope dimension, goes through rope. 64 in DSV3 -V V head dim. 128 in DSV3 - -## Vector/Matrix Definitions - -h_t hidden states (input to attention) shape [Sq, H] -q_c latent/compressed Q shape [Sq, Lq] -q_nope uncompressed Q (no-rope) shape [Sq, N, P] -q_pe uncompressed Q (rope) shape [Sq, N, R] -kv_c latent/compressed KV shape [Skv, Lkv] -k_pe decoupled k position embeddings shape [Skv, R] -new_kv_c new kv_c from current iter shape [Sq, Lkv] -new_k_pe new k_pe from current iter shape [Sq, R] -cache_kv_c cached k_c from previous iters shape [C, Lkv] -cache_k_pe cached k_pe from previous iters shape [C, R] -W_DQ project h_t to q_c shape [H, Lq] -W_UQ project q_c to q_nope shape [Lq, N * P] -W_QR project q_c to q_pe shape [Lq, N * R] -W_DKV project h_t to kv_c shape [H, Lkv] -W_UK project kv_c to k_nope shape [Lkv, N, P] -W_KR project h_t to k_pe shape [H, R] -W_UV project kv_c to v shape [Lkv, N, V] -W_O project v to h_t shape [N * V, H] - - -## Compute Friendly Approach (i.e. "_forward_prefill"): - -q_c = h_t @ W_DQ -q_nope = (q_c @ W_UQ).view(Sq, N, P) -q_pe = RoPE(q_c @ W_QR).view(Sq, N, R) -new_kv_c = h_t @ W_DKV -new_k_pe = RoPE(h_t @ W_KR) -kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0) -k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0) -k_nope = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P) -v = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V) - -// MHA with QK headdim = P + R -// V headdim = V -// spda_o shape [Sq, N, V] -spda_o = scaled_dot_product_attention( - torch.cat([q_nope, q_pe], dim=-1), - torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1), - v -) -return spda_o @ W_O - -NOTE: in the actual code, - `kv_b_proj` is [W_UK; W_UV] concatenated per head - `q_b_proj` is [W_UQ; W_QR] concatenated per head - `out_proj` is W_O - - -## Data-Movement Friendly Approach (i.e. "_forward_decode"): - -Runtime -q_c = h_t @ W_DQ -q_nope = (q_c @ W_UQ).view(-1, N, P) -ql_nope = einsum("snh,lnh->snl", q, W_UK) -q_pe = RoPE(q_c @ W_QR).view(Sq, N, R) -new_kv_c = h_t @ W_DKV -new_k_pe = RoPE(h_t @ W_KR) -kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0) -k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0) - -// MQA with QK headdim = Lkv + R -// V headdim = Lkv -// spda_o shape [Sq, N, Lkv] -// NOTE: this is less compute-friendly since Lkv > P -// but is more data-movement friendly since its MQA vs MHA -spda_o = scaled_dot_product_attention( - torch.cat([ql_nope, q_pe], dim=-1), - torch.cat([kv_c, k_pe], dim=-1), - kv_c -) - -o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV) -return o.view(-1, N * V) @ self.num_heads @ W_O - - -## Chunked Prefill - -For chunked prefill we want to use the compute friendly algorithm. We are -assuming sufficiently large Sq / Skv ratio, in the future may want to switch to -the data-movement friendly approach if the chunk (i.e. `Sq`) is small. - -However, the compute-friendly approach can potentially run out of memory if Skv -is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)` - -To mitigate this, we chunk the computation of attention with respect to the -current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a -fixed workspace size. 
- -The chunked prefill approach is as follows: - -MCC Max chunk of context to process per iter, computed dynamically, - used to bound the memory usage - -q_c = h_t @ W_DQ -q_nope = (q_c @ W_UQ).view(Sq, N, P) -q_pe = RoPE(q_c @ W_QR).view(Sq, N, R) -new_kv_c = h_t @ W_DKV -new_k_pe = RoPE(h_t @ W_KR) -new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P) -new_v = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V) - -// MHA between queries and new KV -// with QK headdim = P + R -// V headdim = V -// curr_o shape [Sq, N, V] -// curr_lse shape [N, Sq], this is just order FA returns -curr_o, curr_lse = scaled_dot_product_attention( - torch.cat([q_nope, q_pe], dim=-1), - torch.cat([new_k_nope, new_k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1), - new_v, - casual=True, - return_softmax_lse=True -) - -// Compute attention with the already existing context -for chunk_idx in range(cdiv(C, MCC)): - chunk_start = chunk_idx * MCC - chunk_end = min(chunk_start + MCC, C) - Sc = chunk_end - chunk_start - cache_kv_c_chunk = cache_kv_c[chunk_start:chunk_end] - cache_k_pe_chunk = cache_k_pe[chunk_start:chunk_end] - cache_k_nope_chunk = (cache_kv_c_chunk @ W_UK).view(-1, N, P) - cache_v_chunk = (cache_kv_c_chunk @ W_UV).view(-1, N, V) - - chunk_o, chunk_lse = scaled_dot_product_attention( - torch.cat([q_nope, q_pe], dim=-1), - torch.cat([cache_k_nope_chunk, - cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)], - dim=-1), - cache_v_chunk, - casual=False, - return_softmax_lse=True - ) - - curr_o, curr_lse = merge_attn_states( - suffix_output=curr_o, - suffix_lse=curr_lse, - prefix_output=chunk_o, - prefix_lse=chunk_lse, - ) - -return curr_o @ W_O -""" - -import functools -from abc import abstractmethod -from collections import defaultdict -from contextlib import contextmanager -from dataclasses import dataclass -from itertools import accumulate -from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar - -import torch - -from vllm import _custom_ops as ops -from vllm import envs -from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionState, MLAAttentionImpl) -from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) -from vllm.attention.ops.merge_attn_states import merge_attn_states -from vllm.attention.utils.fa_utils import get_flash_attn_version -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearBase, - UnquantizedLinearMethod) -from vllm.multimodal import MultiModalPlaceholderMap -from vllm.platforms import current_platform -from vllm.triton_utils import HAS_TRITON -from vllm.utils import async_tensor_h2d, cdiv, make_tensor_with_pad, round_down - -if HAS_TRITON: - from vllm.attention.ops.triton_flash_attention import triton_attention -else: - triton_attention = None - -try: - from vllm.vllm_flash_attn import flash_attn_varlen_func - is_vllm_fa = True -except ImportError: - is_vllm_fa = False - try: - # For rocm use upstream flash attention - from flash_attn import flash_attn_varlen_func - except ImportError: - flash_attn_varlen_func = None - -is_hip = current_platform.is_rocm() - - -class MLACommonBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "TRITON_MLA" - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return MLACommonMetadata - - @staticmethod - def get_builder_cls() -> Type["MLACommonMetadataBuilder"]: - return MLACommonMetadataBuilder 
- - @staticmethod - def get_state_cls() -> Type["MLACommonState"]: - return MLACommonState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, # assumed to be 1 for MLA - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, block_size, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - ops.copy_blocks_mla(kv_caches, src_to_dists) - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [576] - - -T = TypeVar("T", bound="MLACommonMetadata") - - -class MLACommonState(AttentionState, Generic[T]): - - def __init__(self, runner): - self.runner = runner - self._is_graph_capturing = False - - scheduler_config = runner.scheduler_config - self.model_config = runner.model_config - cache_config = runner.cache_config - - self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled - self.enable_prefix_caching = cache_config.enable_prefix_caching - - if self.chunked_prefill_enabled or self.enable_prefix_caching: - self.context_chunk_workspace_size = min( - # Max sure there is enough for 8 full length request or at least - # 4 pages of cache per request - max( - 8 * self.model_config.max_model_len, 4 * - scheduler_config.max_num_seqs * cache_config.block_size), - # For long-context models try not to over-allocate limiting - # kv-cache space, limiting it to 64k tokens, - # which would result in the workspace being: - # 2*(576)*(64*1024) = 144mb - # (assuming 576 MLA head dim, and fp16) - # which would result in up-projected context being - # 2*(192*128)*(64*1024) = 3gb - # (assuming 192 QK head dim, 128 heads, and fp16) - 128 * 1024) - assert self.context_chunk_workspace_size >= \ - scheduler_config.max_num_seqs * cache_config.block_size - - @contextmanager - def graph_capture(self, max_batch_size: int): - self._is_graph_capturing = True - - self._graph_slot_mapping = torch.full((max_batch_size, ), - PAD_SLOT_ID, - dtype=torch.long, - device=self.runner.device) - self._graph_seq_lens = torch.ones(max_batch_size, - dtype=torch.int32, - device=self.runner.device) - self._graph_block_tables = torch.from_numpy( - self.runner.graph_block_tables).to(device=self.runner.device) - - self._positions = torch.zeros((max_batch_size, ), - dtype=torch.long, - device=self.runner.device) - - yield - - self._is_graph_capturing = False - del self._graph_slot_mapping - del self._graph_seq_lens - del self._graph_block_tables - del self._positions - - def graph_clone(self, batch_size: int): - assert self._is_graph_capturing - return self.__class__(self.runner) - - def graph_capture_get_metadata_for_batch( - self, - batch_size: int, - is_encoder_decoder_model: bool = False) -> T: - assert self._is_graph_capturing - - attn_metadata = self.runner.attn_backend.make_metadata( - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - use_cuda_graph=True, - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=batch_size, - slot_mapping=self._graph_slot_mapping[:batch_size], - seq_lens=None, - seq_lens_tensor=self._graph_seq_lens[:batch_size], - max_query_len=1, - max_decode_query_len=1, - max_prefill_seq_len=0, - max_decode_seq_len=self.runner.max_seq_len_to_capture, - query_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, - 
block_tables=self._graph_block_tables[:batch_size], - head_dim=self.runner.model_config.get_head_size()) - - if is_encoder_decoder_model: - raise NotImplementedError( - "MLACommonState does not support encoder/decoder yet") - - return attn_metadata - - def get_graph_input_buffers(self, - attn_metadata, - is_encoder_decoder_model: bool = False): - input_buffers = { - "slot_mapping": attn_metadata.slot_mapping, - "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, - "block_tables": attn_metadata.decode_metadata.block_tables, - } - if is_encoder_decoder_model: - raise NotImplementedError( - "MLACommonState does not support encoder/decoder yet") - - return input_buffers - - def prepare_graph_input_buffers(self, - input_buffers, - attn_metadata, - is_encoder_decoder_model: bool = False): - input_buffers["seq_lens_tensor"].copy_( - attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True) - input_buffers["block_tables"].copy_( - attn_metadata.decode_metadata.block_tables, non_blocking=True) - if is_encoder_decoder_model: - raise NotImplementedError( - "TritonMLAState does not support encoder/decoder yet") - - def begin_forward(self, model_input): - if self.chunked_prefill_enabled or self.enable_prefix_caching: - if not hasattr(self, "context_chunk_workspace"): - # not self.runner.device does not return the correct device - # for this process, (init_device sets the correct device but - # only on the Worker). The only way Ive figured out to get the - # correct device is to allocate the workspace on the first call - # to begin_forward and use the device of the input tokens - assert model_input.input_tokens is not None - self.context_chunk_workspace = torch.empty( - (self.context_chunk_workspace_size, - self.model_config.get_head_size()), - dtype=self.model_config.dtype, - device=model_input.input_tokens.device, - ) - - model_input.attn_metadata.context_chunk_workspace = \ - self.context_chunk_workspace - - -@dataclass -class MLACommonMetadata(AttentionMetadata): - """Metadata for MLACommon. - - NOTE: Please read the comment at the top of the file before trying to - understand this class - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). 
- context_lens_tensor: Optional[torch.Tensor] - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - - # Maximum query length in the batch. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional[Any] = None - _cached_decode_metadata: Optional[Any] = None - - num_prefill_tokens: int - - # The dimension of the attention heads - head_dim: Optional[int] = None - - # Used when chunked prefill is enabled to simulate worst case workspace - # allocations, hopefully to avoid going OOM - is_profile_run: bool = False - - # New for MLA (compared to FlashAttention) - # For chunked prefill - context_chunk_cu_seq_lens: Optional[torch.Tensor] = None - context_chunk_starts: Optional[torch.Tensor] = None - context_chunk_seq_tot: Optional[List[int]] = None - context_chunk_max_seq_lens: Optional[List[int]] = None - # Set by MLAAttentionState in `begin_forward` so it doesn't get broadcasted - context_chunk_workspace: Optional[torch.Tensor] = None - - def __post_init__(self): - supported_head_sizes = MLACommonBackend.get_supported_head_sizes() - if self.head_dim is not None and self.head_dim \ - not in supported_head_sizes: - raise ValueError( - f"Only {supported_head_sizes} are supported for head_dim,", - f" received {self.head_dim}.") - - @property - def prefill_metadata(self): - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - self._cached_prefill_metadata = self.__class__( - # Required by ModelRunner - use_cuda_graph=False, # Not Attention Related - # Required by Attention Metadata - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - # Required by Attention Metadata (not used) - 
multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - # MLACommonMetadata - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_query_len=0, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - head_dim=self.head_dim, - is_profile_run=self.is_profile_run, - # MLACommonMetadata Chunk prefill specific - context_chunk_cu_seq_lens=self.context_chunk_cu_seq_lens, - context_chunk_starts=self.context_chunk_starts, - context_chunk_seq_tot=self.context_chunk_seq_tot, - context_chunk_max_seq_lens=self.context_chunk_max_seq_lens, - ) - return self._cached_prefill_metadata - - @property - def decode_metadata(self): - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert self.seq_lens_tensor is not None - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - self._cached_decode_metadata = self.__class__( - # Required by ModelRunner - use_cuda_graph=self.use_cuda_graph, # Not Attention Related - # Required by Attention Metadata - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - # Required by Attention Metadata (not used) - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - # MLACommonMetadata - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - # Batch may be composed of prefill|decodes, adjust query start - # indices to refer to the start of decodes. E.g. - # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. 
- query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - head_dim=self.head_dim, - is_profile_run=self.is_profile_run) - return self._cached_decode_metadata - - -class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): - """ - NOTE: Please read the comment at the top of the file before trying to - understand this class - """ - BLOCK_TABLE_EXTENDER: list[list[int]] = [] - - def __init__(self, input_builder): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - self.chunked_prefill_enabled = \ - self.runner.scheduler_config.chunked_prefill_enabled - self.enable_prefix_caching = \ - self.runner.cache_config.enable_prefix_caching - - if self.chunked_prefill_enabled or self.enable_prefix_caching: - attn_state = self.input_builder.runner.attn_state - self.context_chunk_workspace_size = \ - attn_state.context_chunk_workspace_size - self.page_size = self.runner.block_size - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - self.has_prefix_cache_hit = False - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. - """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - if is_prompt: - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. 
- is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - def _get_graph_runner_block_tables( - self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs - - graph_block_tables = self.runner.graph_block_tables[:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. - """ - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled, - prefix_cache_hit) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - num_seqs = len(seq_lens) - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend(self.__class__.BLOCK_TABLE_EXTENDER * - cuda_graph_pad_size) - num_decode_tokens = batch_size - self.num_prefill_tokens - - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, 
self.runner.pin_memory) - - context_chunk_cu_seq_lens = None - context_chunk_starts = None - context_chunk_seq_tot = None - context_chunk_max_seq_lens = None - - if (self.chunked_prefill_enabled or self.enable_prefix_caching) \ - and self.num_prefills > 0 \ - and context_lens_tensor is not None \ - and context_lens_tensor[:self.num_prefills].max() > 0: - - # NOTE: it is recommended you read the `Chunked Prefill` section in - # the comment at the top of the file before trying to understand - # the following code - - num_prefills_with_context = \ - (context_lens_tensor[:self.num_prefills] > 0).sum().item() - - # currently we allocate an equal amount of workspace for each - # prefill in the batch, we could probably use a more advanced - # algorithm here and allocate more workspace to prefills with - # longer context lengths - max_context_chunk = \ - self.context_chunk_workspace_size // num_prefills_with_context - - # align max_context_chunk to page_size by rounding down, - # currently the `gather_and_maybe_dequant_cache` kernel cannot - # handle `context_chunk_starts` that are not aligned to page_size - max_context_chunk = round_down(max_context_chunk, self.page_size) - assert max_context_chunk > 0 - num_chunks = cdiv(context_lens_tensor.max(), max_context_chunk) - - # if `max_context_chunk = 256`, `num_chunks = 3`, and - # `num_prefills_with_context = 4`, create a tensor that looks like - # [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]] - context_chunk_starts = \ - torch.arange(num_chunks, device=device, dtype=torch.int32)\ - .unsqueeze(1).expand(-1, self.num_prefills)\ - * max_context_chunk - chunk_ends = torch.min(context_lens_tensor[:self.num_prefills]\ - .unsqueeze(0), context_chunk_starts + max_context_chunk) - chunk_seq_lens = (chunk_ends - context_chunk_starts).clamp(min=0) - _context_chunk_cu_seq_lens = chunk_seq_lens.cumsum(dim=1).to( - torch.int32) - zero = torch.zeros(num_chunks, dtype=torch.int32, device=device)\ - .unsqueeze(-1) - context_chunk_cu_seq_lens = \ - torch.cat([zero, _context_chunk_cu_seq_lens], dim=1) - context_chunk_max_seq_lens = \ - chunk_seq_lens.max(dim=1).values.tolist() - context_chunk_seq_tot = chunk_seq_lens.sum(dim=1).tolist() - assert max(context_chunk_seq_tot) <= \ - self.context_chunk_workspace_size - - return self.runner.attn_backend.make_metadata( - # Required by ModelRunner - use_cuda_graph=use_captured_graph, # Not Attention Related - # Required by Attention Metadata - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - # Required by Attention Metadata (not used) - multi_modal_placeholder_index_maps=None, # Not Attention Related - enable_kv_scales_calculation=False, - # MLACommonMetadata - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - head_dim=self.runner.model_config.get_head_size(), - is_profile_run=self.runner.in_profile_run, - # MLACommonMetadata Chunk prefill specific - context_chunk_cu_seq_lens=context_chunk_cu_seq_lens, - context_chunk_starts=context_chunk_starts, - context_chunk_seq_tot=context_chunk_seq_tot, - context_chunk_max_seq_lens=context_chunk_max_seq_lens, - ) - - -class 
MLACommonImpl(MLAAttentionImpl[T], Generic[T]): - """ - NOTE: Please read the comment at the top of the file before trying to - understand this class - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - q_lora_rank: Optional[int], - kv_lora_rank: int, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - qk_head_dim: int, - v_head_dim: int, - kv_b_proj: ColumnParallelLinear, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing not supported in V0.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - self.kv_cache_dtype = kv_cache_dtype - - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_head_dim = qk_head_dim - self.v_head_dim = v_head_dim - self.kv_b_proj = kv_b_proj - - self.triton_fa_func = triton_attention - # Handle the differences between the flash_attn_varlen from flash_attn - # and the one from vllm_flash_attn. The former is used on RoCM and the - # latter has an additional parameter to control FA2 vs FA3 - self.flash_attn_varlen_func = flash_attn_varlen_func - self.vllm_flash_attn_version = get_flash_attn_version() - if self.vllm_flash_attn_version is not None: - self.flash_attn_varlen_func = \ - functools.partial(flash_attn_varlen_func, - fa_version=self.vllm_flash_attn_version) - - # For MLA the v head dim is smaller than qk head dim so we pad out - # v with 0s to match the qk head dim for attention backends that do - # not support different headdims - # We don't need to pad V if we are on a hopper system with FA3 - self._pad_v = self.vllm_flash_attn_version is None or not ( - self.vllm_flash_attn_version == 3 - and current_platform.get_device_capability()[0] == 9) - - def _flash_attn_varlen_diff_headdims(self, q, k, v, softmax_scale, - return_softmax_lse, **kwargs): - maybe_padded_v = v - if self._pad_v: - maybe_padded_v = torch.nn.functional.pad( - v, [0, q.shape[-1] - v.shape[-1]], value=0) - - if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN \ - and not return_softmax_lse: - attn_out = self.triton_fa_func( - q, - k, - maybe_padded_v, - None, # output - kwargs["cu_seqlens_q"], - kwargs["cu_seqlens_k"], - kwargs["max_seqlen_q"], - kwargs["max_seqlen_k"], - kwargs["causal"], - softmax_scale, - None, # bias - ) - elif is_vllm_fa: - attn_out = self.flash_attn_varlen_func( - q=q, - k=k, - v=maybe_padded_v, - return_softmax_lse=return_softmax_lse, - softmax_scale=softmax_scale, - **kwargs, - ) - else: - # Use return_attn_probs instead of return_softmax_lse for RoCM - attn_out = self.flash_attn_varlen_func( - q=q, - k=k, - v=maybe_padded_v, - return_attn_probs=return_softmax_lse, - softmax_scale=softmax_scale, - **kwargs, - ) - - # Unpack the output if there is multiple results, - # triton always returns (output, softmax_lse), - # vllm_flash_attn returns (output, softmax_lse) when - # `return_softmax_lse = True` - # flash_attn (RoCM) returns (output, softmax_lse, ...) 
when - # `return_attn_probs = True` - rest = None - if isinstance(attn_out, tuple): - attn_out, *rest = attn_out - - # Remain consistent with old `flash_attn_varlen_func` where there - # is only one output tensor if `return_softmax_lse` is False. - if return_softmax_lse: - assert rest is not None - return attn_out, rest[0] - return attn_out - - def _v_up_proj(self, x): - # Convert from (B, N, L) to (N, B, L) - x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) - # Multiply (N, B, L) x (N, L, V) -> (N, B, V) - x = torch.bmm(x, self.W_UV) - # Convert from (N, B, V) to (B, N * V) - return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) - - def process_weights_after_loading(self, act_dtype: torch.dtype): - - def get_layer_weight(layer): - WEIGHT_NAMES = ("weight", "qweight", "weight_packed") - for attr in WEIGHT_NAMES: - if hasattr(layer, attr): - return getattr(layer, attr) - raise AttributeError( - f"Layer '{layer}' has no recognized weight attribute:" - f" {WEIGHT_NAMES}.") - - def get_and_maybe_dequant_weights(layer: LinearBase): - if not isinstance(layer.quant_method, UnquantizedLinearMethod): - # NOTE: This should only be used offline, since it's O(N^3) - eye = torch.eye(layer.input_size_per_partition, - dtype=act_dtype, - device=get_layer_weight(layer).device) - dequant_weights = layer.quant_method.apply(layer, - eye, - bias=None) - del eye - # standardize to (output, input) - return dequant_weights.T - return layer.weight - - # we currently do not have quantized bmm's which are needed for - # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform - # the bmm's in 16-bit, the extra memory overhead of this is fairly low - kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T - assert kv_b_proj_weight.shape == ( - self.kv_lora_rank, - self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), ( - f"{kv_b_proj_weight.shape=}, " - f"{self.kv_lora_rank=}, " - f"{self.num_heads=}, " - f"{self.qk_nope_head_dim=}, " - f"{self.v_head_dim=}") - kv_b_proj_weight = kv_b_proj_weight.view( - self.kv_lora_rank, - self.num_heads, - self.qk_nope_head_dim + self.v_head_dim, - ) - - W_UK, W_UV = kv_b_proj_weight.split( - [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - # Convert from (L, N, V) to (N, L, V) - self.W_UV = W_UV.transpose(0, 1) - # Convert from (L, N, P) to (N, P, L) - self.W_UK_T = W_UK.permute(1, 2, 0) - - def _compute_prefill_context( - self, - q: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - k_scale: torch.Tensor, - ): - prefill_metadata = attn_metadata.prefill_metadata - assert prefill_metadata is not None - assert prefill_metadata.context_chunk_seq_tot is not None - assert prefill_metadata.context_chunk_cu_seq_lens is not None - assert prefill_metadata.context_chunk_starts is not None - assert prefill_metadata.context_chunk_max_seq_lens is not None - assert prefill_metadata.context_lens_tensor is not None - - output = None - iters = len(prefill_metadata.context_chunk_seq_tot) - - # Fetch from attn_metadata directly, since it late bound by - # MLAAttentionState, grabbing it directly `attn_metadata` can avoid - # any weirdness around prefill_metadata caching - assert attn_metadata.context_chunk_workspace is not None - workspace = attn_metadata.context_chunk_workspace - - for i in range(iters): - toks = prefill_metadata.context_chunk_seq_tot[i] - - ops.gather_and_maybe_dequant_cache( - src_cache=kv_c_and_k_pe_cache, - dst=workspace, - block_table=prefill_metadata.block_tables, - 
cu_seq_lens=prefill_metadata.context_chunk_cu_seq_lens[i], - batch_size=prefill_metadata.num_prefills, - kv_cache_dtype=self.kv_cache_dtype, - scale=k_scale, - seq_starts=prefill_metadata.context_chunk_starts[i], - ) - - kv_c_normed = workspace[:toks]\ - [..., :self.kv_lora_rank] - k_pe = workspace[:toks]\ - [..., self.kv_lora_rank:].unsqueeze(1) - - kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \ - -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv_nope\ - .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), - dim=-1) - - attn_output, attn_softmax_lse = \ - self._flash_attn_varlen_diff_headdims( - q=q, - k=k, - v=v, - cu_seqlens_q=prefill_metadata.query_start_loc, - cu_seqlens_k=prefill_metadata.context_chunk_cu_seq_lens[i], - max_seqlen_q=prefill_metadata.max_query_len, - max_seqlen_k=prefill_metadata.context_chunk_max_seq_lens[i], - softmax_scale=self.scale, - causal=False, # Context is unmasked - return_softmax_lse=True, - ) - - if output is None: - output = attn_output - output_lse = attn_softmax_lse - else: - output_tmp = torch.empty_like(output) - output_lse_tmp = torch.empty_like(output_lse) - merge_attn_states( - output=output_tmp, - output_lse=output_lse_tmp, - prefix_output=output, - prefix_lse=output_lse, - suffix_output=attn_output, - suffix_lse=attn_softmax_lse, - ) - output = output_tmp - output_lse = output_lse_tmp - - return output, output_lse - - def _forward_prefill( - self, - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - k_scale: torch.Tensor, - ) -> torch.Tensor: - - prefill_metadata = attn_metadata.prefill_metadata - assert prefill_metadata is not None - - has_context = prefill_metadata.context_lens_tensor is not None \ - and prefill_metadata.context_lens_tensor.max() > 0 - - kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\ - -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv_nope\ - .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) - - output = self._flash_attn_varlen_diff_headdims( - q=q, - k=k, - v=v, - cu_seqlens_q=prefill_metadata.query_start_loc, - cu_seqlens_k=prefill_metadata.query_start_loc, - max_seqlen_q=prefill_metadata.max_prefill_seq_len, - max_seqlen_k=prefill_metadata.max_prefill_seq_len, - softmax_scale=self.scale, - causal=True, - return_softmax_lse=has_context, - ) - - if has_context: - # ROCm flash_attn_varlen_func will return 3 objects instead of 2 - suffix_output, suffix_lse = output - context_output, context_lse = self._compute_prefill_context( \ - q, kv_c_and_k_pe_cache, attn_metadata, k_scale) - - output = torch.empty_like(suffix_output) - merge_attn_states( - output=output, - prefix_output=context_output, - prefix_lse=context_lse, - suffix_output=suffix_output, - suffix_lse=suffix_lse, - ) - - # unpad if necessary - if self._pad_v: - output = output[..., :v.shape[-1]] - - return output.flatten(start_dim=-2) - - @abstractmethod - def _forward_decode( - self, - ql_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: T, - ) -> torch.Tensor: - raise NotImplementedError - - def forward( - self, - layer: AttentionLayer, - q: torch.Tensor, # query in unified attn - k_c_normed: torch.Tensor, # key in unified attn - k_pe: torch.Tensor, # value in unified attn - kv_cache: torch.Tensor, - attn_metadata: T, - 
output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - if output is not None: - raise NotImplementedError( - "output is not yet supported for MLAImplBase") - - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for MLAImplBase") - - if attn_metadata.is_profile_run and \ - attn_metadata.context_chunk_workspace is not None: - # During the profile run try to simulate to worse case output size - # for `self.kv_b_proj(kv_c_normed)` in `_compute_prefill_context` - # since this can be large - _ = torch.empty( - (attn_metadata.context_chunk_workspace.shape[0], - self.num_heads, self.qk_nope_head_dim + self.v_head_dim), - device=k_c_normed.device, - dtype=k_c_normed.dtype, - ) - - has_decode = attn_metadata.decode_metadata is not None - has_prefill = attn_metadata.prefill_metadata is not None - - num_prefill_tokens: int = attn_metadata.num_prefill_tokens - q = q.view(-1, self.num_heads, self.qk_head_dim) - - decode_q = q[num_prefill_tokens:] - - prefill_q = q[:num_prefill_tokens] - prefill_k_pe = k_pe[:num_prefill_tokens] - prefill_k_c_normed = k_c_normed[:num_prefill_tokens] - - # write the latent and rope to kv cache - if kv_cache.numel() > 0: - ops.concat_and_cache_mla( - k_c_normed, - k_pe.squeeze(1), - kv_cache, - attn_metadata.slot_mapping.flatten(), - kv_cache_dtype=self.kv_cache_dtype, - scale=layer._k_scale, - ) - - output = torch.empty(attn_metadata.num_prefill_tokens + - attn_metadata.num_decode_tokens, - self.v_head_dim * self.num_heads, - device=q.device, - dtype=q.dtype) - if has_prefill: - output[:num_prefill_tokens] = self._forward_prefill( - prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache, - attn_metadata, layer._k_scale) - - if has_decode: - decode_q_nope, decode_q_pe = decode_q.split( - [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - # Convert from (B, N, P) to (N, B, P) - decode_q_nope = decode_q_nope.transpose(0, 1) - # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) - # Convert from (N, B, L) to (B, N, L) - decode_ql_nope = decode_ql_nope.transpose(0, 1) - - output[num_prefill_tokens:] = self._forward_decode( - decode_ql_nope, decode_q_pe, kv_cache, attn_metadata) - - return output diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py deleted file mode 100644 index 587d08858b92b..0000000000000 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ /dev/null @@ -1,407 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from contextlib import contextmanager -from dataclasses import dataclass -from typing import Optional, Type, Union - -import torch - -import vllm.envs as envs -from vllm.attention.backends.mla.common import (MLACommonBackend, - MLACommonImpl, - MLACommonMetadata, - MLACommonMetadataBuilder, - MLACommonState) -from vllm.attention.backends.utils import (compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) -from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd, - get_aiter_mla_metadata) - - -def is_aiter_mla_enabled() -> bool: - return envs.VLLM_ROCM_USE_AITER \ - and envs.VLLM_ROCM_USE_AITER_MLA - - -class AiterMLABackend(MLACommonBackend): - - @staticmethod - def get_name() -> str: - return "ROCM_AITER_MLA" - - @staticmethod - def get_impl_cls() -> 
Type["AiterMLAImpl"]: - return AiterMLAImpl - - @staticmethod - def get_metadata_cls() -> Type["AiterMLAMetadata"]: - return AiterMLAMetadata - - @staticmethod - def get_builder_cls() -> Type["AiterMLAMetadataBuilder"]: - return AiterMLAMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["AiterMLAState"]: - return AiterMLAState - - -@dataclass -class AiterMLAMetadata(MLACommonMetadata): - # The following 5 tensors are for current version of AITER MLA - block_table_bound: Optional[torch.Tensor] = None - # The indptr of the paged kv cache, shape: [batch_size + 1] - paged_kv_indptr: Optional[torch.Tensor] = None - # The page indices of the paged kv cache - paged_kv_indices: Optional[torch.Tensor] = None - # The number of entries in the last page of each request in - # the paged kv cache, shape: [batch_size] - paged_kv_last_page_lens: Optional[torch.Tensor] = None - - # This is just to make new AITER MLA API work - # -- MTP support is not added yet. - qo_indptr: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self): - prefill_metadata = super().prefill_metadata - self._cached_prefill_metadata = prefill_metadata - - if prefill_metadata is not None: - prefill_metadata.paged_kv_indptr = self.paged_kv_indptr - prefill_metadata.paged_kv_indices = self.paged_kv_indices - prefill_metadata\ - .paged_kv_last_page_lens = self.paged_kv_last_page_lens - prefill_metadata.block_table_bound = self.block_table_bound - prefill_metadata.qo_indptr = self.qo_indptr - - # update the cache - self._cached_prefill_metadata = self.__class__( - **prefill_metadata.__dict__) - - return self._cached_prefill_metadata - - @property - def decode_metadata(self): - decode_metadata = super().decode_metadata - - self._cached_decode_metadata = decode_metadata - - if decode_metadata is not None: - decode_metadata.paged_kv_indptr = self.paged_kv_indptr - decode_metadata.paged_kv_indices = self.paged_kv_indices - decode_metadata\ - .paged_kv_last_page_lens = self.paged_kv_last_page_lens - decode_metadata.block_table_bound = self.block_table_bound - decode_metadata.qo_indptr = self.qo_indptr - - # update the cache - self._cached_decode_metadata = self.__class__( - **decode_metadata.__dict__) - - return self._cached_decode_metadata - - -class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): - BLOCK_TABLE_EXTENDER: list[list[int]] = [[]] - - def __init__(self, input_builder): - super().__init__(input_builder) - assert self.block_size == 1, "AITER MLA requires only block size 1." - - def prepare(self): - super().prepare() - self.paged_kv_indices: list[int] = [] - self.paged_kv_indptr: list[int] = [0] - self.paged_kv_last_page_lens: list[int] = [] - self.total_blocks = 0 - self.qo_indptr: list[int] = [0] - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. 
- """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - if is_prompt: - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. - is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - if is_profile_run: - return - - # Update paged_kv_* tensors only for non-profile run - block_table = block_tables[seq_id] - self._update_paged_kv_tensors(block_table, seq_len) - - def _update_paged_kv_tensors(self, block_table: list[int], seq_len: int): - # Get the number of valid blocks based on sequence length. - # If seq_len = 16, block_size = 16, - # block_table_bound is 1 with 1 valid block. - # If seq_len = 15, block_size = 16, - # block_table_bound is 0 + 1 with 1 valid block. 
- self.total_blocks += len(block_table) - block_table_bound = seq_len // self.block_size + 1 \ - if seq_len % self.block_size != 0 \ - else seq_len // self.block_size - self.paged_kv_indices.extend(block_table[:block_table_bound]) - self.paged_kv_indptr.append(self.paged_kv_indptr[-1] + - block_table_bound) - self.qo_indptr.append(self.qo_indptr[-1] + 1) - - last_page_len = seq_len % self.block_size - if last_page_len == 0: - last_page_len = self.block_size - self.paged_kv_last_page_lens.append(last_page_len) - - def build(self, seq_lens: list[int], query_lens: list[int], - cuda_graph_pad_size: int, batch_size: int) -> AiterMLAMetadata: - metadata = super().build(seq_lens, query_lens, cuda_graph_pad_size, - batch_size) - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - if use_captured_graph: - last_paged_kv_indptr = self.paged_kv_indptr[-1] - self.paged_kv_indptr.extend([last_paged_kv_indptr] * - cuda_graph_pad_size) - self.paged_kv_last_page_lens.extend([0] * cuda_graph_pad_size) - last_qo_indptr = self.qo_indptr[-1] - self.qo_indptr.extend([last_qo_indptr] * cuda_graph_pad_size) - - # For current version of AITER MLA - if len(self.paged_kv_indptr) > 0: - # extend to the maximum number of blocks as returned by the - # scheduler - self.paged_kv_indices.extend( - [0] * (self.total_blocks - len(self.paged_kv_indices))) - paged_kv_indices_tensor = torch.tensor(self.paged_kv_indices, - device=device, - dtype=torch.int) - paged_kv_indptr_tensor = torch.tensor(self.paged_kv_indptr, - device=device, - dtype=torch.int) - paged_kv_last_page_lens_tensor = torch.tensor( - self.paged_kv_last_page_lens, device=device, dtype=torch.int) - block_table_bound_tensor = torch.zeros(len(self.paged_kv_indptr) - - 1, - device=device, - dtype=torch.int) - - qo_indptr = torch.tensor(self.qo_indptr, - device=device, - dtype=torch.int) - else: - paged_kv_indices_tensor = None - paged_kv_indptr_tensor = None - paged_kv_last_page_lens_tensor = None - block_table_bound_tensor = None - qo_indptr = None - - metadata.paged_kv_indptr = paged_kv_indptr_tensor - metadata.paged_kv_indices = paged_kv_indices_tensor - metadata.paged_kv_last_page_lens = paged_kv_last_page_lens_tensor - metadata.block_table_bound = block_table_bound_tensor - metadata.qo_indptr = qo_indptr - - return metadata - - -class AiterMLAState(MLACommonState[AiterMLAMetadata]): - - @contextmanager - def graph_capture(self, max_batch_size: int): - kv_indices, kv_indptr, last_page_lens, qo_indptr = \ - get_aiter_mla_metadata( - max_batch_size=max_batch_size, - block_size=self.runner.block_size, - max_block_per_batch=\ - self.runner.get_max_block_per_batch(), - device=self.runner.device) - self._paged_kv_indices_tensor = kv_indices - self._paged_kv_indptr_tensor = kv_indptr - self._paged_kv_last_page_lens_tensor = last_page_lens - self._qo_indptr_tensor = qo_indptr - - with super().graph_capture(max_batch_size): - yield - - del self._paged_kv_indices_tensor - del self._paged_kv_indptr_tensor - del self._paged_kv_last_page_lens_tensor - del self._qo_indptr_tensor - - def graph_capture_get_metadata_for_batch( - self, - batch_size: int, - is_encoder_decoder_model: bool = False) -> AiterMLAMetadata: - - metadata = super().graph_capture_get_metadata_for_batch( - batch_size, is_encoder_decoder_model) - - paged_kv_indptr = self._paged_kv_indptr_tensor[:batch_size + 1] - paged_kv_indices = self._paged_kv_indices_tensor - paged_kv_last_page_lens = self._paged_kv_last_page_lens_tensor[: - batch_size] - qo_indptr = 
self._qo_indptr_tensor[:batch_size + 1] - - metadata.paged_kv_indptr = paged_kv_indptr - metadata.paged_kv_indices = paged_kv_indices - metadata.paged_kv_last_page_lens = paged_kv_last_page_lens - metadata.qo_indptr = qo_indptr - - return metadata - - def get_graph_input_buffers(self, - attn_metadata: AiterMLAMetadata, - is_encoder_decoder_model: bool = False): - input_buffers = super().get_graph_input_buffers( - attn_metadata, is_encoder_decoder_model) - input_buffers[ - 'paged_kv_indptr'] = attn_metadata.decode_metadata.paged_kv_indptr - input_buffers[ - "paged_kv_indices"] = attn_metadata.\ - decode_metadata.paged_kv_indices - input_buffers[ - "paged_kv_last_page_lens"] = attn_metadata.\ - decode_metadata.paged_kv_last_page_lens - input_buffers['qo_indptr'] = attn_metadata.qo_indptr - - return input_buffers - - def prepare_graph_input_buffers(self, - input_buffers, - attn_metadata: AiterMLAMetadata, - is_encoder_decoder_model: bool = False): - super().prepare_graph_input_buffers(input_buffers, attn_metadata, - is_encoder_decoder_model) - - num_total_blocks = attn_metadata.decode_metadata.paged_kv_indices.shape[ - 0] - input_buffers["paged_kv_indptr"].copy_( - attn_metadata.decode_metadata.paged_kv_indptr, non_blocking=True) - input_buffers["paged_kv_indices"][:num_total_blocks].copy_( - attn_metadata.decode_metadata.paged_kv_indices, non_blocking=True) - input_buffers["paged_kv_last_page_lens"].copy_( - attn_metadata.decode_metadata.paged_kv_last_page_lens, - non_blocking=True) - input_buffers["qo_indptr"].copy_( - attn_metadata.decode_metadata.qo_indptr, non_blocking=True) - - -class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[list[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] - if any(unsupported_features): - raise NotImplementedError( - "Aiter MLA does not support one of the following: " - "alibi_slopes, sliding_window, logits_soft_cap") - - from aiter import flash_attn_varlen_func - self.flash_attn_varlen_func = flash_attn_varlen_func - - def _flash_attn_varlen_diff_headdims( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - softmax_scale: float, return_softmax_lse: bool, - **kwargs) -> Union[tuple[torch.Tensor, ...], torch.Tensor]: - output = self.flash_attn_varlen_func( - q, - k, - v, - **kwargs, - ) - - return output - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: AiterMLAMetadata, - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - B = q_nope.shape[0] - - q = torch.cat([q_nope, q_pe], dim=-1) - o = torch.empty(B, - self.num_heads, - self.kv_lora_rank, - dtype=q.dtype, - device=q.device) - - kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2) - - aiter_mla_decode_fwd(q, kv_buffer, o, self.scale, - attn_metadata.qo_indptr, - attn_metadata.max_query_len, - attn_metadata.paged_kv_indptr, - attn_metadata.paged_kv_indices, - 
attn_metadata.paged_kv_last_page_lens) - - return self._v_up_proj(o) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py deleted file mode 100644 index 9262144e37b54..0000000000000 --- a/vllm/attention/backends/rocm_flash_attn.py +++ /dev/null @@ -1,953 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer ROCm GPUs.""" -import itertools -from dataclasses import dataclass -from functools import cache -from typing import List, Optional, Tuple, Type - -import torch - -import vllm.envs as envs -from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import (CommonAttentionState, - CommonMetadataBuilder) -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) -from vllm.config import get_current_vllm_config -from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - QuantKey, kFp8StaticTensorSym) -from vllm.platforms import current_platform - -logger = init_logger(__name__) -_PARTITION_SIZE_ROCM = 256 - - -@cache -def is_rocm_aiter_paged_attn_enabled() -> bool: - return envs.VLLM_ROCM_USE_AITER_PAGED_ATTN \ - and envs.VLLM_ROCM_USE_AITER \ - - -@cache -def _get_paged_attn_module() -> PagedAttention: - """ - Initializes the appropriate PagedAttention module from `attention/ops`, - which is used as helper function - by `ROCmFlashAttentionImpl` and `ROCmFlashAttentionBackend`. - - The choice of attention module depends on whether - AITER paged attention is enabled: - - If enabled, `ROCmFlashAttentionImpl` uses `AITERPagedAttention`. - - Otherwise, it defaults to using the original `PagedAttention`. 
- """ - if is_rocm_aiter_paged_attn_enabled(): - # Import AITERPagedAttention only when the flag is enabled - from vllm.attention.ops.rocm_aiter_paged_attn import ( - AITERPagedAttention) - return AITERPagedAttention() - return PagedAttention() - - -class ROCmFlashAttentionBackend(AttentionBackend): - accept_output_buffer: bool = True - - @staticmethod - def get_name() -> str: - return "ROCM_FLASH" - - @staticmethod - def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]: - return ROCmFlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return ROCmFlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["ROCmFlashAttentionMetadataBuilder"]: - return ROCmFlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - paged_attn = _get_paged_attn_module() - return paged_attn.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - paged_attn = _get_paged_attn_module() - paged_attn.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - paged_attn = _get_paged_attn_module() - paged_attn.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for FlashAttentionBackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. 
- seq_start_loc: Optional[torch.Tensor] = None - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - _cached_prefill_metadata: Optional["ROCmFlashAttentionMetadata"] = None - _cached_decode_metadata: Optional["ROCmFlashAttentionMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - assert self.block_tables is not None - - self._cached_prefill_metadata = ROCmFlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=self.slot_mapping[:self.num_prefill_tokens], - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=self.seq_lens[:self.num_prefills], - seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - query_start_loc=None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1], - seq_start_loc=None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1], - context_lens_tensor=None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills], - block_tables=self.block_tables[:self.num_prefills], - use_cuda_graph=False, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert self.block_tables is not None - assert self.seq_lens_tensor is not None - - self._cached_decode_metadata = ROCmFlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=self.slot_mapping[self.num_prefill_tokens:], - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], - max_query_len=None, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, - block_tables=self.block_tables[self.num_prefills:], - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - # Batch may be composed of prefill|decodes, adjust query start indices - # to refer to the start of decodes when the two are split apart. - # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - if self._cached_decode_metadata.query_start_loc is not None: - qs = self._cached_decode_metadata.query_start_loc - self._cached_decode_metadata.query_start_loc = qs - qs[0] - return self._cached_decode_metadata - - -class ROCmFlashAttentionMetadataBuilder( - CommonMetadataBuilder[ROCmFlashAttentionMetadata]): - - _metadata_cls = ROCmFlashAttentionMetadata - - -def _make_alibi_bias(alibi_slopes: torch.Tensor, - dtype: torch.dtype, - seq_lens: Optional[List[int]], - make_attn_mask: bool = True) -> List[torch.Tensor]: - attn_biases = [] - if seq_lens: - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - bias = bias[None, :] - bias[:, None] - - num_heads = alibi_slopes.shape[0] - bias = bias[None, :].repeat( - (num_heads, 1, 1)).to(alibi_slopes.device) - bias.mul_(alibi_slopes[:, None, None]) - if make_attn_mask: - inf_mask = torch.empty( - (1, seq_len, seq_len), - dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1).to( - alibi_slopes.device) - attn_biases.append((bias + inf_mask).to(dtype)) - else: - attn_biases.append(bias.to(dtype)) - - return attn_biases - - -def _get_seq_len_block_table_args( - attn_metadata: ROCmFlashAttentionMetadata, - attn_type: str, -) -> tuple: - ''' - The particular choice of sequence-length - attributes which should be extracted from attn_metadata is dependent - on the type of attention operation. 
- - Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths - Encoder attn -> select encoder sequence lengths fields - Encoder-only attn -> select prefill sequence lengths with - bidirectional attention - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention op - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention, encoder-only - - Returns: - - * Appropriate sequence-lengths tensors for query and key - * Appropriate max sequence-length scalar - * Causal masking flag - ''' - - if attn_type == AttentionType.ENCODER: - assert attn_metadata.encoder_seq_lens is not None - assert attn_metadata.encoder_seq_lens_tensor is not None - query_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)), - device=attn_metadata.encoder_seq_lens_tensor.device, - dtype=attn_metadata.encoder_seq_lens_tensor.dtype) - causal_mask = False - - # No block tables associated with encoder attention - return (query_seq_start_loc, attn_metadata.max_encoder_seq_len, - query_seq_start_loc, attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_lens, causal_mask) - - elif attn_type == AttentionType.ENCODER_ONLY: - # For encoder-only models, we use the prefill sequence lengths - assert attn_metadata.seq_lens is not None - assert attn_metadata.seq_lens_tensor is not None - query_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.seq_lens)), - device=attn_metadata.seq_lens_tensor.device, - dtype=attn_metadata.seq_lens_tensor.dtype) - max_seq_len = attn_metadata.max_prefill_seq_len - # Encoder-only models typically use bidirectional attention - causal_mask = False - - return (query_seq_start_loc, max_seq_len, query_seq_start_loc, - max_seq_len, attn_metadata.seq_lens, causal_mask) - - elif attn_type == AttentionType.DECODER: - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - assert attn_metadata.seq_lens is not None - assert attn_metadata.seq_lens_tensor is not None - query_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.seq_lens)), - device=attn_metadata.seq_lens_tensor.device, - dtype=attn_metadata.seq_lens_tensor.dtype) - max_seq_len = attn_metadata.max_prefill_seq_len - causal_mask = True - - return (query_seq_start_loc, max_seq_len, query_seq_start_loc, - max_seq_len, attn_metadata.seq_lens, causal_mask) - elif attn_type == AttentionType.ENCODER_DECODER: - assert attn_metadata.seq_lens is not None - assert attn_metadata.encoder_seq_lens_tensor is not None - query_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.seq_lens)), - device=attn_metadata.encoder_seq_lens_tensor.device, - dtype=attn_metadata.encoder_seq_lens_tensor.dtype) - - assert attn_metadata.encoder_seq_lens is not None - assert attn_metadata.seq_lens_tensor is not None - key_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)), - device=attn_metadata.seq_lens_tensor.device, - dtype=attn_metadata.seq_lens_tensor.dtype) - causal_mask = False - - # Enc/dec cross-attention KVs match encoder sequence length; - # cross-attention utilizes special "cross" block tables - return (query_start_loc, attn_metadata.max_prefill_seq_len, - key_seq_start_loc, attn_metadata.max_encoder_seq_len, - attn_metadata.seq_lens, causal_mask) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -class 
ROCmFlashAttentionImpl(AttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prompt_tokens -------------->| - |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->| - - Otherwise, the layout is as follows: - |<------------------ num_generation_tokens (M) ----------------->| - |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - - |<----- num_prefill_tokens ---->|<------- num_decode_tokens ----------->| - |<-prompt_0->|...|<-prompt_N-1->|<-generation_0->|...|<-generation_M-1->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "ROCM_FLASH backend.") - if use_irope: - logger.warning_once( - "Using irope in ROCm Flash Attention is not supported yet, it " - "will fail back to global attention for long context.") - if use_irope: - logger.warning( - "Using irope in V0 is not supported yet, it will fall back " - "to global attention for long context.") - if logits_soft_cap is None: - # In flash-attn, setting logits_soft_cap as 0 means no soft cap. - self.logits_soft_cap = 0.0 - else: - self.logits_soft_cap = logits_soft_cap - self.attn_type = attn_type - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window, sliding_window) - if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - self.paged_attn_module = _get_paged_attn_module() - supported_head_sizes = self.paged_attn_module.get_supported_head_sizes( - ) - - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - - self.use_naive_attn = False - # NOTE: Allow for switching between Triton and CK. Defaulting to triton. - self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN - if self.use_triton_flash_attn: - if logits_soft_cap is not None: - raise ValueError( - "ROCm Triton FlashAttention does not support attention" - " logits soft capping." 
- " please try using the ROCm CK " - "FA backend instead by setting the env var " - "`VLLM_USE_TRITON_FLASH_ATTN=0`") - - from vllm.attention.ops.triton_flash_attention import ( # noqa: F401 - triton_attention) - self.triton_attn_func = triton_attention - logger.debug("Using Triton FA in ROCmBackend") - if self.sliding_window != (-1, -1): - logger.warning("ROCm Triton FA does not currently support " - "sliding window attention. If using half " - "precision, please try using the ROCm CK " - "FA backend instead by setting the env var " - "`VLLM_USE_TRITON_FLASH_ATTN=0`") - else: - # if not using triton, navi3x/navi21/navi10 do not use flash-attn - # either - if not current_platform.has_device_capability(90): - self.use_naive_attn = True - else: - try: - from flash_attn import flash_attn_varlen_func # noqa: F401 - self.fa_attn_func = flash_attn_varlen_func - logger.debug("Using CK FA in ROCmBackend") - except ModuleNotFoundError: - self.use_naive_attn = True - - if self.use_naive_attn: - if logits_soft_cap is not None: - raise ValueError( - "ROCm Naive FlashAttention does not support " - "attention logits soft capping.") - - self.sdpa_attn_func = _sdpa_attention - logger.debug("Using naive (SDPA) attention in ROCmBackend") - - self.aiter_kv_scales_initialized = False - self.force_fp8_attention = ( - get_current_vllm_config() is not None - and get_current_vllm_config().model_config.override_attention_dtype - == "fp8") - - def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: - """torch.repeat_interleave(x, dim=1, repeats=n_rep)""" - tokens, n_kv_heads, head_dim = x.shape - return (x[:, :, - None, :].expand(tokens, n_kv_heads, n_rep, - head_dim).reshape(tokens, n_kv_heads * n_rep, - head_dim)) - - def fused_output_quant_supported(self, quant_key: QuantKey): - if self.use_triton_flash_attn: - return quant_key == kFp8StaticTensorSym - - # Only supported in the Triton backend - return False - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: ROCmFlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention and PagedAttention. - - For decoder-only models: query, key and value must be non-None. - - For encoder/decoder models: - * ROCmFlashAttentionImpl.forward() may be invoked for both self- and - cross-attention layers. - * For self-attention: query, key and value must be non-None. - * For cross-attention: - * Query must be non-None - * During prefill, key and value must be non-None; key and value - get cached for use during decode. 
- * During decode, key and value may be None, since: - (1) key and value tensors were cached during prefill, and - (2) cross-attention key and value tensors do not grow during - decode - - A note on how the attn_type (attention type enum) argument impacts - attention forward() behavior: - - * DECODER: normal decoder-only behavior; - use decoder self-attention block table - * ENCODER: no KV caching; pass encoder sequence - attributes (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) to kernel, in lieu of decoder - sequence attributes (seq_lens/seq_lens_tensor/max_seq_len) - * ENCODER_DECODER: cross-attention behavior; - use cross-attention block table for caching KVs derived - from encoder hidden states; since KV sequence lengths - will match encoder sequence lengths, pass encoder sequence - attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) - * ENCODER_ONLY: bidirectional attention with no KV caching; - use prefill sequence attributes - - Args: - layer: Attention layer instance. - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size * num_kv_heads * head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - output: Optional output tensor. - output_scale: Optional output scale tensor. - output_block_scale: Optional output block scale tensor. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - assert output is not None, "Output tensor must be provided." - - if output_scale is not None and not self.use_triton_flash_attn: - raise NotImplementedError( - "fused output quantization only supported for Triton" - " implementation in ROCMFlashAttentionImpl for now") - - if output_block_scale is not None: - raise NotImplementedError( - "fused nvfp4 output quantization is not supported" - " for ROCMFlashAttentionImpl") - - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - paged_attn = self.paged_attn_module - - # Reshaping kv tensors is required for AITER paged attention kernel - # because it works on a different tensor shape, - # when the size of one element is one byte (int8/fp8 dtypes). - # This reshaping is only required on the first forward call - # and the kv cache must not be empty. 
- if (is_rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1 - and not self.aiter_kv_scales_initialized - and kv_cache.shape != torch.Size([0])): - num_blocks = kv_cache.shape[1] - block_size = kv_cache.shape[2] // (self.num_kv_heads * - self.head_size) - k_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), - dtype=torch.float32, - device=kv_cache.device) - v_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), - dtype=torch.float32, - device=kv_cache.device) - self.aiter_kv_scales_initialized = True - k_scale.fill_(layer._k_scale.item()) - v_scale.fill_(layer._v_scale.item()) - layer._k_scale = k_scale - layer._v_scale = v_scale - - # Only update KV cache for decoder self-attention - # and encoder-decoder cross-attention - if self.attn_type not in [ - AttentionType.ENCODER, AttentionType.ENCODER_ONLY - ] and kv_cache.numel() > 0: - key_cache, value_cache = paged_attn.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - if key is not None and value is not None: - # Reshape the input keys and values and store them in the - # cache. If kv_cache is not provided, the new key and value - # tensors are not cached. This happens during the initial - # memory profiling run. - paged_attn.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping - if self.attn_type != AttentionType.ENCODER_DECODER else - attn_metadata.cross_slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - if self.attn_type != AttentionType.ENCODER: - num_prefill_tokens = attn_metadata.num_prefill_tokens - elif self.attn_type == AttentionType.ENCODER_ONLY: - # For encoder-only models, all tokens are processed in one go - num_prefill_tokens = query.shape[0] - else: - assert attn_metadata.num_encoder_tokens is not None - num_prefill_tokens = attn_metadata.num_encoder_tokens - - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] - # QKV for prefill. - query = query[:num_prefill_tokens] - - # For encoder-only and encoder models, - # we process all tokens at once - # For decoder and encoder-decoder, - # we may need to limit key/value to prefill tokens - if key is not None and value is not None \ - and self.attn_type not in [AttentionType.ENCODER_DECODER, - AttentionType.ENCODER_ONLY]: - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - # normal attention and DECODER - if self.attn_type == AttentionType.DECODER and ( - kv_cache.numel() == 0 or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - (query_seq_start_loc, query_max_seq_len, key_seq_start_loc, - key_max_seq_len, seq_lens, - causal_mask) = (prefill_meta.seq_start_loc, - prefill_meta.max_prefill_seq_len, - prefill_meta.seq_start_loc, - prefill_meta.max_prefill_seq_len, - attn_metadata.seq_lens, True) - # prefix-enabled attention and ENCODER/ENCODER_DECODER - else: - (query_seq_start_loc, query_max_seq_len, key_seq_start_loc, - key_max_seq_len, seq_lens, - causal_mask) = _get_seq_len_block_table_args( - prefill_meta, self.attn_type) - # Prompt run. - if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: - # triton attention - # When block_tables are not filled, it means q and k are the - # prompt, and they have the same length. 
- attn_masks = None - if self.use_triton_flash_attn: - if self.alibi_slopes is not None: - attn_masks = _make_alibi_bias( - self.alibi_slopes, - query.dtype, - seq_lens, - make_attn_mask=causal_mask) # type: ignore - - use_fp8_scales = (layer._q_scale and layer._k_scale - and layer._v_scale and layer._prob_scale - and (self.kv_cache_dtype == "fp8" - or self.force_fp8_attention)) - - full_scales = ( - layer._q_scale.item(), layer._k_scale.item(), - layer._v_scale.item(), - layer._prob_scale.item()) if use_fp8_scales else None - self.triton_attn_func( - query, - key, - value, - output[:num_prefill_tokens], - query_seq_start_loc, - key_seq_start_loc, - query_max_seq_len, - key_max_seq_len, - causal_mask, - self.scale, - attn_masks[0][None] - if attn_masks is not None else None, - full_scales, - output_scale, - ) - elif self.use_naive_attn: - if self.num_kv_heads != self.num_heads: - # Interleave for MQA workaround. - key = self.repeat_kv(key, self.num_queries_per_kv) - value = self.repeat_kv(value, self.num_queries_per_kv) - if self.alibi_slopes is not None: - attn_masks = _make_alibi_bias( - self.alibi_slopes, - query.dtype, - attn_metadata.seq_lens, - make_attn_mask=causal_mask) # type: ignore - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - # sdpa math backend attention - self.sdpa_attn_func( - query, - key, - value, - output[:num_prefill_tokens], - query_seq_start_loc, - num_prefill_tokens, - self.num_heads, - self.head_size, - self.scale, - attn_masks, - ) - else: - # upstream FA does not support an output arg, copy - output[:num_prefill_tokens] = self.fa_attn_func( - q=query, - k=key, - v=value, - cu_seqlens_q=query_seq_start_loc, - cu_seqlens_k=key_seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=key_max_seq_len, - softmax_scale=self.scale, - causal=causal_mask, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - ) - - else: - # prefix-enabled attention - - # not applicable for encoder-only models - if self.attn_type != AttentionType.ENCODER_ONLY: - output[:num_prefill_tokens] = paged_attn.forward_prefix( - query, - key, - value, - self.kv_cache_dtype, - key_cache, - value_cache, - prefill_meta.block_tables, - prefill_meta.query_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.max_query_len, - self.alibi_slopes, - self.sliding_window[0], - layer._k_scale, - layer._v_scale, - ) - # Skip decode phase for encoder-only models - if (decode_meta := attn_metadata.decode_metadata) and ( - self.attn_type != AttentionType.ENCODER_ONLY): - # Decoding run. 
- # Whether to use rocm custom paged attention or not - num_seqs, num_heads, head_size = decode_query.shape - block_size = value_cache.shape[3] - gqa_ratio = num_heads // self.num_kv_heads - from vllm.platforms.rocm import use_rocm_custom_paged_attention - use_custom = use_rocm_custom_paged_attention( - decode_query.dtype, head_size, block_size, gqa_ratio, - decode_meta.max_decode_seq_len, self.sliding_window, - self.kv_cache_dtype, self.alibi_slopes) - - if use_custom: - max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type - != AttentionType.ENCODER_DECODER else - decode_meta.max_encoder_seq_len) - assert max_seq_len is not None - max_num_partitions = ( - (max_seq_len + _PARTITION_SIZE_ROCM - 1) // - _PARTITION_SIZE_ROCM) - assert _PARTITION_SIZE_ROCM % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=query.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - - query_start_loc = None - ops.paged_attention_rocm( - output[num_prefill_tokens:], - exp_sums, - max_logits, - tmp_output, - decode_query, - key_cache, - value_cache, - self.num_kv_heads, - self.scale, - decode_meta.block_tables - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.cross_block_tables, - decode_meta.seq_lens_tensor - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.encoder_seq_lens_tensor, - query_start_loc, - block_size, - max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - output_scale, - ) - else: - # PagedAttention does not support fused quant, manually quantize - if output_scale is None: - out_pa = output[num_prefill_tokens:] - else: - out_pa = torch.empty_like(output[num_prefill_tokens:], - dtype=query.dtype) - - out_pa[:] = paged_attn.forward_decode( - decode_query, - key_cache, - value_cache, - decode_meta.block_tables - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.cross_block_tables, - decode_meta.seq_lens_tensor - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.encoder_seq_lens_tensor, - decode_meta.max_decode_seq_len - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.max_encoder_seq_len, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Manually perform quantization - if output_scale is not None: - out_uq = out_pa.view(-1, self.num_heads * self.head_size) - out_q = output.view(-1, self.num_heads * self.head_size) - ops.scaled_fp8_quant(out_uq, - output_scale, - output=out_q[num_prefill_tokens:]) - - # Reshape the output tensor. 
- return output.view(-1, self.num_heads * self.head_size) - - -def _sdpa_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - output: torch.Tensor, - seq_lens: torch.Tensor, - num_tokens: int, - num_heads: int, - head_size: int, - scale: float, - attn_masks: Optional[List[torch.Tensor]] = None, -) -> torch.Tensor: - start = 0 - assert output.shape == (num_tokens, num_heads, head_size) - assert output.dtype == query.dtype - assert output.device == query.device - - for i, seq_len in enumerate(seq_lens): - end = start + seq_len - with torch.nn.attention.sdpa_kernel( - torch.nn.attention.SDPBackend.MATH): - sub_out = torch.nn.functional.scaled_dot_product_attention( - query[:, start:end, :], - key[:, start:end, :], - value[:, start:end, :], - dropout_p=0.0, - is_causal=attn_masks is None, - attn_mask=attn_masks[i] if attn_masks else None, - scale=scale).movedim(query.dim() - 2, 0) - output[start:end, :, :] = sub_out - start = end - - return output diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py deleted file mode 100644 index fba5b5f6bca86..0000000000000 --- a/vllm/attention/backends/triton_mla.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Type - -import torch - -from vllm.attention.backends.abstract import (AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.mla.common import (MLACommonBackend, - MLACommonImpl, - MLACommonMetadata) -from vllm.attention.ops.triton_decode_attention import decode_attention_fwd - - -class TritonMLABackend(MLACommonBackend): - - @staticmethod - def get_name() -> str: - return "TRITON_MLA" - - @staticmethod - def get_impl_cls() -> Type["TritonMLAImpl"]: - return TritonMLAImpl - - -class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] - if any(unsupported_features): - raise NotImplementedError( - "TritonMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, logits_soft_cap") - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "TritonMLAImpl") - - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "TritonMLA with FP8 KV cache not yet supported") - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - B = q_nope.shape[0] - - q = torch.cat([q_nope, q_pe], dim=-1) - o = torch.zeros(B, - self.num_heads, - self.kv_lora_rank, - dtype=q.dtype, - device=q.device) - - num_kv_splits = 4 # TODO: heuristic - - # TODO(lucas) Allocate ahead of time - attn_logits = 
torch.empty( - ( - B, - self.num_heads, - num_kv_splits, - # NOTE(lucas) idk why the +1 is here but sglang has it so we - # just mirror that - self.kv_lora_rank + 1, - ), - dtype=torch.float32, - device=q.device, - ) - - # Add a head dim of 1 - kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.unsqueeze(2) - kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank] - PAGE_SIZE = kv_c_and_k_pe_cache.size(1) - - # Run MQA - decode_attention_fwd(q, kv_c_and_k_pe_cache, kv_c_cache, o, - decode_meta.block_tables, - decode_meta.seq_lens_tensor, attn_logits, - num_kv_splits, self.scale, PAGE_SIZE) - - return self._v_up_proj(o) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index b28e6a4237cb6..3f15580872c7f 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -338,10 +338,9 @@ class CommonAttentionState(AttentionState): # The encoder decoder model works only with XFormers and # Flash Attention backend. Assert the same. assert self.runner.attn_backend.get_name() in \ - ["XFORMERS", "FLASH_ATTN", "ROCM_FLASH"], \ - f"Expected attn_backend name to be either 'XFORMERS'," \ - f"'ROCM_FLASH', or 'FLASH_ATTN', but " \ - f"got '{self.runner.attn_backend.get_name()}'" + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or " \ + f"'FLASH_ATTN', but got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -360,10 +359,9 @@ class CommonAttentionState(AttentionState): # The encoder decoder model works only with XFormers and # Flash Attention backend. Assert the same. assert self.runner.attn_backend.get_name() in \ - ["XFORMERS", "FLASH_ATTN", "ROCM_FLASH"], \ - f"Expected attn_backend name to be either 'XFORMERS'," \ - f"'ROCM_FLASH', or 'FLASH_ATTN', but " \ - f"got '{self.runner.attn_backend.get_name()}'" + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or " \ + f"'FLASH_ATTN', but got '{self.runner.attn_backend.get_name()}'" self._add_additional_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py deleted file mode 100644 index 302d3d7ea903f..0000000000000 --- a/vllm/attention/backends/xformers.py +++ /dev/null @@ -1,805 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with xFormers and PagedAttention.""" -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type - -import torch -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (AttentionBias, - BlockDiagonalCausalMask, - BlockDiagonalMask, - LowerTriangularMaskWithTensorBias) - -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import ( - CommonAttentionState, CommonMetadataBuilder, - get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, - is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class XFormersBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "XFORMERS" - - @staticmethod - def get_impl_cls() -> 
Type["XFormersImpl"]: - return XFormersImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return XFormersMetadata - - @staticmethod - def get_builder_cls() -> Type["XFormersMetadataBuilder"]: - return XFormersMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], - ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for XFormersbackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # FIXME: It is for flash attn. - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] = None - - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] = None - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - - # Self-attention prefill/decode metadata cache - _cached_prefill_metadata: Optional["XFormersMetadata"] = None - _cached_decode_metadata: Optional["XFormersMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... 
- - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - encoder_seq_start_loc: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[AttentionBias]] = None - self.encoder_attn_bias: Optional[List[AttentionBias]] = None - self.cross_attn_bias: Optional[List[AttentionBias]] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata(self) -> Optional["XFormersMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - # Recover cached prefill-phase attention - # metadata structure - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - # Construct & cache prefill-phase attention metadata structure - self._cached_prefill_metadata = XFormersMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. 
- multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["XFormersMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - # Recover cached decode-phase attention - # metadata structure - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - # Construct & cache decode-phase attention metadata structure - self._cached_decode_metadata = XFormersMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - block_tables=block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - - # Batch may be composed of prefill|decodes, adjust query start indices - # to refer to the start of decodes when the two are split apart. - # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - if self._cached_decode_metadata.query_start_loc is not None: - qs = self._cached_decode_metadata.query_start_loc - self._cached_decode_metadata.query_start_loc = qs - qs[0] - return self._cached_decode_metadata - - -def _get_attn_bias( - attn_metadata: XFormersMetadata, - attn_type: str, -) -> Optional[AttentionBias]: - ''' - Extract appropriate attention bias from attention metadata - according to attention type. 
- - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate attention bias value given the attention type - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - return attn_metadata.attn_bias - elif attn_type == AttentionType.ENCODER: - return attn_metadata.encoder_attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - return attn_metadata.cross_attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -def _set_attn_bias( - attn_metadata: XFormersMetadata, - attn_bias: List[Optional[AttentionBias]], - attn_type: str, -) -> None: - ''' - Update appropriate attention bias field of attention metadata, - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_bias: The desired attention bias value - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - attn_metadata.attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER: - attn_metadata.encoder_attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - attn_metadata.cross_attn_bias = attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]): - - _metadata_cls = XFormersMetadata - - -class XFormersImpl(AttentionImpl[XFormersMetadata]): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "XFORMERS backend.") - if logits_soft_cap is not None: - logger.warning_once("XFormers does not support logits soft cap. 
" - "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in XFormers is not supported yet, it will fall" - " back to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: Optional[torch.Tensor], - value: Optional[torch.Tensor], - kv_cache: torch.Tensor, - attn_metadata: "XFormersMetadata", - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with xFormers and PagedAttention. - - For decoder-only models: query, key and value must be non-None. - - For encoder/decoder models: - * XFormersImpl.forward() may be invoked for both self- and cross- - attention layers. - * For self-attention: query, key and value must be non-None. - * For cross-attention: - * Query must be non-None - * During prefill, key and value must be non-None; key and value - get cached for use during decode. - * During decode, key and value may be None, since: - (1) key and value tensors were cached during prefill, and - (2) cross-attention key and value tensors do not grow during - decode - - A note on how the attn_type (attention type enum) argument impacts - attention forward() behavior: - - * DECODER: normal decoder-only behavior; - use decoder self-attention block table - * ENCODER: no KV caching; pass encoder sequence - attributes (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) to kernel, in lieu of decoder - sequence attributes (seq_lens/seq_lens_tensor/max_seq_len). - Used for encoder branch of encoder-decoder models. - * ENCODER_ONLY: no kv_caching, uses the normal attention - attributes (seq_lens/seq_lens_tensor/max_seq_len). - * ENCODER_DECODER: cross-attention behavior; - use cross-attention block table for caching KVs derived - from encoder hidden states; since KV sequence lengths - will match encoder sequence lengths, pass encoder sequence - attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) - - Args: - layer: Attention layer instance. - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size * num_kv_heads * head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - output: Optional output tensor. - output_scale: Optional output scale tensor. - output_block_scale: Optional output block scale tensor. 
- Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for XFormersImpl") - - attn_type = self.attn_type - # Check that appropriate attention metadata attributes are - # selected for the desired attention type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - # Self-attention vs. cross-attention will impact - # which KV cache memory-mapping & which - # seqlen datastructures we utilize - - if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): - # KV-cache during decoder-self- or - # encoder-decoder-cross-attention, but not - # during encoder attention. - # - # Even if there are no new key/value pairs to cache, - # we still need to break out key_cache and value_cache - # i.e. for later use by paged attention - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - if (key is not None) and (value is not None): - - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - # During cross-attention decode, key & value will be None, - # preventing this IF-statement branch from running - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory - # profiling run. - PagedAttention.write_to_paged_cache( - key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) - (num_prefill_query_tokens, num_prefill_kv_tokens, - num_decode_query_tokens) = \ - get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) - - output = torch.empty_like(query) - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_query_tokens:] - # QKV for prefill. - query = query[:num_prefill_query_tokens] - if key is not None and value is not None: - key = key[:num_prefill_kv_tokens] - value = value[:num_prefill_kv_tokens] - - assert query.shape[0] == num_prefill_query_tokens - assert decode_query.shape[0] == num_decode_query_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: - # normal attention. - # block tables are empty if the prompt does not have a cached - # prefix. 
- out = self._run_memory_efficient_xformers_forward( - query, key, value, prefill_meta, attn_type=attn_type) - assert out.shape == output[:num_prefill_query_tokens].shape - output[:num_prefill_query_tokens] = out - else: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have prefix attention.") - - assert prefill_meta.query_start_loc is not None - assert prefill_meta.max_query_len is not None - - # prefix-enabled attention - # TODO(Hai) this triton kernel has regression issue (broke) to - # deal with different data types between KV and FP8 KV cache, - # to be addressed separately. - out = PagedAttention.forward_prefix( - query, - key, - value, - self.kv_cache_dtype, - key_cache, - value_cache, - prefill_meta.block_tables, - prefill_meta.query_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.max_query_len, - self.alibi_slopes, - self.sliding_window, - layer._k_scale, - layer._v_scale, - ) - assert output[:num_prefill_query_tokens].shape == out.shape - output[:num_prefill_query_tokens] = out - - if decode_meta := attn_metadata.decode_metadata: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have decode metadata.") - - ( - seq_lens_arg, - max_seq_len_arg, - block_tables_arg, - ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - - output[num_prefill_query_tokens:] = PagedAttention.forward_decode( - decode_query, - key_cache, - value_cache, - block_tables_arg, - seq_lens_arg, - max_seq_len_arg, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Reshape the output tensor. - return output.view(-1, self.num_heads * self.head_size) - - def _run_memory_efficient_xformers_forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_metadata: XFormersMetadata, - attn_type: str = AttentionType.DECODER, - ) -> torch.Tensor: - """Attention for 1D query of multiple prompts. Multiple prompt - tokens are flattened in to `query` input. - - See https://facebookresearch.github.io/xformers/components/ops.html - for API spec. - - Args: - query: shape = [num_prefill_tokens, num_heads, head_size] - key: shape = [num_prefill_tokens, num_kv_heads, head_size] - value: shape = [num_prefill_tokens, num_kv_heads, head_size] - attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. Defaults to decoder self-attention, - which is the vLLM default generally - """ - - original_query = query - if self.num_kv_heads != self.num_heads: - # GQA/MQA requires the shape [B, M, G, H, K]. - # Note that the output also has the same shape (which is different - # from a spec from the doc). - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - - # Set attention bias if not provided. This typically happens at - # the very attention layer of every iteration. - # FIXME(woosuk): This is a hack. 
- attn_bias = _get_attn_bias(attn_metadata, attn_type) - if attn_bias is None: - if self.alibi_slopes is None: - - # Cross attention block of decoder branch of encoder-decoder - # model uses seq_lens for dec / encoder_seq_lens for enc - if (attn_type == AttentionType.ENCODER_DECODER): - assert attn_metadata.seq_lens is not None - assert attn_metadata.encoder_seq_lens is not None - - # Cross-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.seq_lens, - attn_metadata.encoder_seq_lens, - device=query.device) - - # Encoder branch of encoder-decoder model uses - # attn_metadata.encoder_seq_lens - elif attn_type == AttentionType.ENCODER: - - assert attn_metadata.encoder_seq_lens is not None - - # Encoder self-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.encoder_seq_lens, device=query.device) - - # Self-attention block of encoder-only model just - # uses the seq_lens directly. - elif attn_type == AttentionType.ENCODER_ONLY: - assert attn_metadata.seq_lens is not None - - # Encoder self-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.seq_lens, device=query.device) - - # Self-attention block of decoder branch just - # uses the seq_lens directly - elif attn_type == AttentionType.DECODER: - assert attn_metadata.seq_lens is not None - - # Decoder self-attention mask is causal - attn_bias = BlockDiagonalCausalMask.from_seqlens( - attn_metadata.seq_lens, device=query.device) - else: - raise ValueError("Unknown AttentionType: %s", attn_type) - - if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) - attn_bias = [attn_bias] - else: - assert attn_type == AttentionType.DECODER - assert attn_metadata.seq_lens is not None - attn_bias = _make_alibi_bias(self.alibi_slopes, - self.num_kv_heads, query.dtype, - attn_metadata.seq_lens) - - _set_attn_bias(attn_metadata, attn_bias, attn_type) - - # No alibi slopes. - # TODO(woosuk): Too many view operations. Let's try to reduce - # them in the future for code readability. - if self.alibi_slopes is None: - # Add the batch dimension. - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - out = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=attn_bias[0], - p=0.0, - scale=self.scale) - return out.view_as(original_query) - - # Attention with alibi slopes. - # FIXME(woosuk): Because xformers does not support dynamic sequence - # lengths with custom attention bias, we process each prompt one by - # one. This is inefficient, especially when we have many short prompts. - assert attn_metadata.seq_lens is not None - output = torch.empty_like(original_query) - start = 0 - for i, seq_len in enumerate(attn_metadata.seq_lens): - end = start + seq_len - out = xops.memory_efficient_attention_forward( - query[None, start:end], - key[None, start:end], - value[None, start:end], - attn_bias=attn_bias[i], - p=0.0, - scale=self.scale) - # TODO(woosuk): Unnecessary copy. Optimize. - output[start:end].copy_(out.view_as(original_query[start:end])) - start += seq_len - return output - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - dtype: torch.dtype, - seq_lens: List[int], -) -> List[AttentionBias]: - attn_biases: List[AttentionBias] = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. 
We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - # Calculate a matrix where each element represents ith element- jth - # element. - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - 1, # batch size - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) - - return attn_biases diff --git a/vllm/config/model.py b/vllm/config/model.py index 95fe52883db0d..33e5d3ea04a48 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -32,8 +32,7 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.runai_utils import (ObjectStorageModel, is_runai_obj_uri) from vllm.transformers_utils.utils import maybe_model_redirect -from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, - LazyLoader, common_broadcastable_dtype) +from vllm.utils import LayerBlockType, LazyLoader, common_broadcastable_dtype if TYPE_CHECKING: from transformers import PretrainedConfig @@ -1103,10 +1102,6 @@ class ModelConfig: self.hf_config.dual_chunk_attention_config[ "sparse_attention_enabled"] = True - if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL: - raise ValueError("please set VLLM_ATTENTION_BACKEND to " - f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") - def verify_with_parallel_config( self, parallel_config: ParallelConfig, diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 911d77ba36fa0..efa4c9abf47f7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -44,7 +44,7 @@ class model_aware_kv_ops_helper: # When VLLM_MLA_DISABLE=1, standard FA is used instead, leading # to a kv_cache shape of [2, num_blks, blk_size, # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim]. - # For more details, see vllm/attention/backends/mla/common.py. + # For more details, see vllm/v1/attention/backends/mla/common.py. if self.is_deepseek_mla and self.use_mla_opt: head_size = model_config.kv_lora_rank + \ model_config.qk_rope_head_dim diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7e00260caa39a..b09d43f705580 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,8 +44,8 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.config import (get_model_path, is_interleaved, maybe_override_with_speculators) from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, - GiB_bytes, get_ip, is_in_ray_actor) +from vllm.utils import (FlexibleArgumentParser, GiB_bytes, get_ip, + is_in_ray_actor) from vllm.v1.sample.logits_processor import LogitsProcessor # yapf: enable @@ -1163,17 +1163,6 @@ class EngineArgs: self._set_default_args_v0(model_config) assert self.enable_chunked_prefill is not None - if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]: - assert self.enforce_eager, ( - "Cuda graph is not supported with DualChunkFlashAttention. 
" - "To run the model in eager mode, set 'enforce_eager=True' " - "or use '--enforce-eager' in the CLI.") - assert current_platform.is_cuda(), ( - "DualChunkFlashAttention is only supported on CUDA platform.") - assert not use_v1, ( - "DualChunkFlashAttention is not supported on V1 engine. " - "To run the model in V0 engine, try set 'VLLM_USE_V1=0'") - sliding_window: Optional[int] = None if not is_interleaved(model_config.hf_text_config): # Only set CacheConfig.sliding_window if the model is all sliding diff --git a/vllm/envs.py b/vllm/envs.py index 3991a789d80f6..cbd1d5474e60f 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -529,7 +529,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "TORCH_SDPA": use torch.nn.MultiheadAttention # - "FLASH_ATTN": use FlashAttention # - "XFORMERS": use XFormers - # - "ROCM_FLASH": use ROCmFlashAttention # - "FLASHINFER": use flashinfer # - "FLASHMLA": use FlashMLA # - "FLASH_ATTN_MLA": use FlashAttention for MLA diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index c926e17a2c197..7f376b70a7ae0 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -53,13 +53,18 @@ class Mamba2Metadata: def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: """Returns the appropriate metadata classes for the current platform.""" if current_platform.is_rocm(): - from vllm.attention.backends.rocm_flash_attn import ( - ROCmFlashAttentionMetadata) - return (ROCmFlashAttentionMetadata, PlaceholderAttentionMetadata) - elif current_platform.is_cuda(): - from vllm.attention.backends.flash_attn import FlashAttentionMetadata - from vllm.attention.backends.xformers import XFormersMetadata - return (FlashAttentionMetadata, XFormersMetadata, + from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) + from vllm.v1.attention.backends.triton_attn import ( + TritonAttentionMetadata) + return (AiterFlashAttentionMetadata, TritonAttentionMetadata, + PlaceholderAttentionMetadata) + if current_platform.is_cuda(): + from vllm.v1.attention.backends.flash_attn import ( + FlashAttentionMetadata) + from vllm.v1.attention.backends.xformers import ( + XFormersAttentionMetadata) + return (FlashAttentionMetadata, XFormersAttentionMetadata, PlaceholderAttentionMetadata) raise ValueError( f"Unsupported platform for Mamba2: {current_platform.device_type}") diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index a99a6679a5696..415d36c681d8c 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -478,7 +478,8 @@ class DeepseekV2MLAAttention(nn.Module): Main reference: DeepseekV2 paper, and FlashInfer Implementation (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). 
- For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py + For more info see MLACommonImpl in: + vllm/v1/attention/backends/mla/utils.py """ def __init__( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c263e2afe83b0..05f129f513a0a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -226,8 +226,10 @@ class CudaPlatformBase(Platform): kv_cache_dtype, block_size, use_v1, use_mla, has_sink) -> str: if use_mla: - # TODO(lucas): refactor to be more concise - # we should probably consider factoring out V1 here + if not use_v1: + raise RuntimeError( + "MLA attention backends require the V1 engine. " + "Set VLLM_USE_V1=1 to enable them.") from vllm.attention.ops.flashmla import is_flashmla_supported from vllm.attention.utils.fa_utils import flash_attn_supports_mla @@ -246,35 +248,17 @@ class CudaPlatformBase(Platform): use_triton = selected_backend == _Backend.TRITON_MLA or ( selected_backend is None) - def _get_version(name, import_suffix) -> str: - if use_v1: - logger.info_once(f"Using {name} backend on V1 engine.") - return f"vllm.v1.attention.backends.mla.{import_suffix}" - else: - logger.info_once(f"Using {name} backend.") - return f"vllm.attention.backends.{import_suffix}" - if use_cutlassmla: - if use_v1: - logger.info_once("Using Cutlass MLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." - "cutlass_mla.CutlassMLABackend") - else: - logger.warning( - "Cutlass MLA backend is only supported on V1 engine") + logger.info_once("Using Cutlass MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "cutlass_mla.CutlassMLABackend") if use_flashinfermla: - if use_v1: - from vllm.v1.attention.backends.utils import ( - set_kv_cache_layout) - set_kv_cache_layout("HND") - logger.info_once( - "Using FlashInfer MLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." - "flashinfer_mla.FlashInferMLABackend") - else: - logger.warning( - "FlashInfer MLA backend is only supported on V1 engine" - ) + from vllm.v1.attention.backends.utils import ( + set_kv_cache_layout) + set_kv_cache_layout("HND") + logger.info_once("Using FlashInfer MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "flashinfer_mla.FlashInferMLABackend") if use_flashmla: if block_size != 64: logger.warning( @@ -282,20 +266,18 @@ class CudaPlatformBase(Platform): " (currently only supports block size 64).", block_size) else: - return _get_version("FlashMLA", "flashmla.FlashMLABackend") - if use_flashattn: - if use_v1: - logger.info_once( - "Using FlashAttention MLA backend on V1 engine.") + logger.info_once("Using FlashMLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla." - "flashattn_mla.FlashAttnMLABackend") - else: - logger.warning( - "FlashAttention MLA backend is only supported on V1 " - "engine.") + "flashmla.FlashMLABackend") + if use_flashattn: + logger.info_once( + "Using FlashAttention MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "flashattn_mla.FlashAttnMLABackend") if use_triton: - return _get_version("Triton MLA", - "triton_mla.TritonMLABackend") + logger.info_once("Using Triton MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." 
+ "triton_mla.TritonMLABackend") if use_v1: FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501 FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 @@ -382,78 +364,9 @@ class CudaPlatformBase(Platform): ) return FLEX_ATTENTION_V1 - # Backends for V0 engine - if selected_backend == _Backend.XFORMERS: - logger.info("Using XFormers backend.") - return "vllm.attention.backends.xformers.XFormersBackend" - elif selected_backend == _Backend.DUAL_CHUNK_FLASH_ATTN: - logger.info("Using DualChunkFlashAttention backend.") - return ("vllm.attention.backends.dual_chunk_flash_attn." - "DualChunkFlashAttentionBackend") - elif selected_backend == _Backend.DIFFERENTIAL_FLASH_ATTN: - logger.info("Using DifferentialFlashAttention backend.") - return ("vllm.attention.backends.differential_flash_attn." - "DifferentialFlashAttentionBackend") - elif selected_backend == _Backend.FLASH_ATTN: - pass - elif selected_backend: - raise ValueError( - f"Invalid attention backend for {cls.device_name}, " - f"with use_v1: {use_v1} use_mla: {use_mla}") - - target_backend = _Backend.FLASH_ATTN - if not cls.has_device_capability(80): - # Volta and Turing NVIDIA GPUs. - logger.info( - "Cannot use FlashAttention-2 backend for Volta and Turing " - "GPUs.") - target_backend = _Backend.XFORMERS - elif dtype not in (torch.float16, torch.bfloat16): - logger.info( - "Cannot use FlashAttention-2 backend for dtype other than " - "torch.float16 or torch.bfloat16.") - target_backend = _Backend.XFORMERS - elif block_size % 16 != 0: - logger.info( - "Cannot use FlashAttention-2 backend for block size not " - "divisible by 16.") - target_backend = _Backend.XFORMERS - - # FlashAttn is valid for the model, checking if the package is - # installed. - if target_backend == _Backend.FLASH_ATTN: - try: - import vllm.vllm_flash_attn # noqa: F401 - from vllm.attention.backends.flash_attn import ( # noqa: F401 - FlashAttentionBackend, flash_attn_supports_fp8) - - supported_sizes = \ - FlashAttentionBackend.get_supported_head_sizes() - if head_size not in supported_sizes: - logger.info( - "Cannot use FlashAttention-2 backend for head size %d.", - head_size) - target_backend = _Backend.XFORMERS - fp8_kv_cache = (kv_cache_dtype is not None - and kv_cache_dtype.startswith("fp8")) - if (fp8_kv_cache and not flash_attn_supports_fp8()): - logger.info( - "Cannot use FlashAttention backend for FP8 KV cache.") - target_backend = _Backend.XFORMERS - except ImportError: - logger.info( - "Cannot use FlashAttention-2 backend because the " - "vllm.vllm_flash_attn package is not found. " - "Make sure that vllm_flash_attn was built and installed " - "(on by default).") - target_backend = _Backend.XFORMERS - - if target_backend == _Backend.XFORMERS: - logger.info("Using XFormers backend.") - return "vllm.attention.backends.xformers.XFormersBackend" - - logger.info("Using Flash Attention backend.") - return "vllm.attention.backends.flash_attn.FlashAttentionBackend" + raise RuntimeError( + "V0 attention backends have been removed. 
Set VLLM_USE_V1=1 " + "to select a supported backend.") @classmethod def get_punica_wrapper(cls) -> str: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index dce2924ac7a9d..9470434aa428b 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -191,6 +191,11 @@ class RocmPlatform(Platform): kv_cache_dtype, block_size, use_v1, use_mla, has_sink) -> str: if use_mla: + if not use_v1: + raise RuntimeError( + "MLA attention backends require the V1 engine. " + "Set VLLM_USE_V1=1 to enable them.") + from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( is_aiter_mla_enabled) @@ -201,39 +206,24 @@ class RocmPlatform(Platform): if selected_backend == _Backend.TRITON_MLA: if block_size != 1: - if use_v1: - logger.info_once( - "Using Triton MLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." - "triton_mla.TritonMLABackend") - else: - logger.info("Using Triton MLA backend.") - return "vllm.attention.backends.triton_mla.TritonMLABackend" # noqa: E501 - else: - raise ValueError( - f" The selected backend, {selected_backend.name}," - f"does not support block size {block_size}.") - elif selected_backend == _Backend.ROCM_AITER_MLA \ - or selected_backend == _Backend.ROCM_AITER_MLA_VLLM_V1: - if block_size == 1: - if use_v1: - logger.info("Using AITER MLA backend on V1 engine.") - return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501 - else: - logger.info("Using AITER MLA backend") - return "vllm.attention.backends.rocm_aiter_mla.AiterMLABackend" # noqa: E501 - else: - raise ValueError( - f" The selected backend, {selected_backend.name}," - f"does not support block size {block_size}." - "(currently only supports block size 1)") - else: + logger.info_once("Using Triton MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "triton_mla.TritonMLABackend") raise ValueError( f" The selected backend, {selected_backend.name}," - f"is not MLA type while requested for MLA backend.") - - if selected_backend is None or selected_backend == _Backend.FLASH_ATTN: - selected_backend = _Backend.ROCM_FLASH + f"does not support block size {block_size}.") + if selected_backend in (_Backend.ROCM_AITER_MLA, + _Backend.ROCM_AITER_MLA_VLLM_V1): + if block_size == 1: + logger.info("Using AITER MLA backend on V1 engine.") + return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501 + raise ValueError( + f" The selected backend, {selected_backend.name}," + f"does not support block size {block_size}." + "(currently only supports block size 1)") + raise ValueError( + f" The selected backend, {selected_backend.name}," + f"is not MLA type while requested for MLA backend.") if envs.VLLM_USE_V1: if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA \ @@ -245,14 +235,9 @@ class RocmPlatform(Platform): logger.info("Using Triton Attention backend on V1 engine.") return ("vllm.v1.attention.backends." "triton_attn.TritonAttentionBackend") - if selected_backend == _Backend.ROCM_FLASH: - if not cls.has_device_capability(90): - # not Instinct series GPUs. - logger.info("flash_attn is not supported on NAVI GPUs.") - else: - logger.info("%s is not supported in AMD GPUs.", selected_backend) - logger.info("Using ROCmFlashAttention backend.") - return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend" # noqa: E501 + raise RuntimeError( + "V0 attention backends have been removed. 
Set VLLM_USE_V1=1 " + "to select a supported backend.") @classmethod def set_device(cls, device: torch.device) -> None: diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 968bba664f0a9..834ec9b1d30b4 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -157,10 +157,8 @@ STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND" # register, corresponding to possible backends STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER" STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA" -STR_ROCM_FLASH_ATTN_VAL: str = "ROCM_FLASH" STR_XFORMERS_ATTN_VAL: str = "XFORMERS" STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" -STR_DUAL_CHUNK_FLASH_ATTN_VAL: str = "DUAL_CHUNK_FLASH_ATTN" STR_INVALID_VAL: str = "INVALID" MB_bytes = 1_000_000 From 04d3752329e39d11674862be341faaab42ac4d11 Mon Sep 17 00:00:00 2001 From: Yang Liu <127183760+KKSK-DON@users.noreply.github.com> Date: Sun, 21 Sep 2025 16:06:16 -0700 Subject: [PATCH 082/185] [Bugfix][V0 Deprecation][CI] use async mock and await for async method (#25325) Signed-off-by: Yang --- .../entrypoints/openai/test_lora_resolvers.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index e2c83b9c40045..9d5ee84a19567 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -5,7 +5,7 @@ from contextlib import suppress from dataclasses import dataclass, field from http import HTTPStatus from typing import Optional -from unittest.mock import MagicMock +from unittest.mock import AsyncMock, MagicMock import pytest @@ -83,20 +83,31 @@ def register_mock_resolver(): def mock_serving_setup(): """Provides a mocked engine and serving completion instance.""" mock_engine = MagicMock(spec=AsyncLLM) - mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - def mock_add_lora_side_effect(lora_request: LoRARequest): + tokenizer = get_tokenizer(MODEL_NAME) + mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer) + + async def mock_add_lora_side_effect(lora_request: LoRARequest): """Simulate engine behavior when adding LoRAs.""" if lora_request.lora_name == "test-lora": # Simulate successful addition - return - elif lora_request.lora_name == "invalid-lora": + return True + if lora_request.lora_name == "invalid-lora": # Simulate failure during addition (e.g. 
invalid format) raise ValueError(f"Simulated failure adding LoRA: " f"{lora_request.lora_name}") + return True + + mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect) + + async def mock_generate(*args, **kwargs): + for _ in []: + yield _ + + mock_engine.generate = MagicMock(spec=AsyncLLM.generate, + side_effect=mock_generate) - mock_engine.add_lora.side_effect = mock_add_lora_side_effect mock_engine.generate.reset_mock() mock_engine.add_lora.reset_mock() @@ -131,7 +142,7 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup, with suppress(Exception): await serving_completion.create_completion(req_found) - mock_engine.add_lora.assert_called_once() + mock_engine.add_lora.assert_awaited_once() called_lora_request = mock_engine.add_lora.call_args[0][0] assert isinstance(called_lora_request, LoRARequest) assert called_lora_request.lora_name == lora_model_name @@ -157,7 +168,7 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup, response = await serving_completion.create_completion(req) - mock_engine.add_lora.assert_not_called() + mock_engine.add_lora.assert_not_awaited() mock_engine.generate.assert_not_called() assert isinstance(response, ErrorResponse) @@ -181,7 +192,7 @@ async def test_serving_completion_resolver_add_lora_fails( response = await serving_completion.create_completion(req) # Assert add_lora was called before the failure - mock_engine.add_lora.assert_called_once() + mock_engine.add_lora.assert_awaited_once() called_lora_request = mock_engine.add_lora.call_args[0][0] assert isinstance(called_lora_request, LoRARequest) assert called_lora_request.lora_name == invalid_model From 5aeb9254521023f97aca292b3478aa7ff485ffb2 Mon Sep 17 00:00:00 2001 From: Deboleina Date: Sun, 21 Sep 2025 19:07:11 -0400 Subject: [PATCH 083/185] Multimodal - audio tests (#25285) Signed-off-by: Debolina Roy --- tests/multimodal/test_audio.py | 140 +++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/multimodal/test_audio.py diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py new file mode 100644 index 0000000000000..ba39af845041a --- /dev/null +++ b/tests/multimodal/test_audio.py @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# test_audio.py +import base64 +from pathlib import Path +from unittest.mock import patch + +import numpy as np +import pytest + +from vllm.multimodal.audio import (AudioMediaIO, AudioResampler, + resample_audio_librosa, + resample_audio_scipy) + + +@pytest.fixture +def dummy_audio(): + return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float) + + +def test_resample_audio_librosa(dummy_audio): + with patch("vllm.multimodal.audio.librosa.resample") as mock_resample: + mock_resample.return_value = dummy_audio * 2 + out = resample_audio_librosa(dummy_audio, + orig_sr=44100, + target_sr=22050) + mock_resample.assert_called_once_with(dummy_audio, + orig_sr=44100, + target_sr=22050) + assert np.all(out == dummy_audio * 2) + + +def test_resample_audio_scipy(dummy_audio): + out_down = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=2) + out_up = resample_audio_scipy(dummy_audio, orig_sr=2, target_sr=4) + out_same = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=4) + + assert len(out_down) == 3 + assert len(out_up) == 10 + assert np.all(out_same == dummy_audio) + + +@pytest.mark.xfail( + reason="resample_audio_scipy is buggy for non-integer ratios") +def 
test_resample_audio_scipy_non_integer_ratio(dummy_audio): + out = resample_audio_scipy(dummy_audio, orig_sr=5, target_sr=3) + + expected_len = int(round(len(dummy_audio) * 3 / 5)) + assert len(out) == expected_len + + assert isinstance(out, np.ndarray) + assert np.isfinite(out).all() + + +def test_audio_resampler_librosa_calls_resample(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="librosa") + with patch( + "vllm.multimodal.audio.resample_audio_librosa") as mock_resample: + mock_resample.return_value = dummy_audio + out = resampler.resample(dummy_audio, orig_sr=44100) + mock_resample.assert_called_once_with(dummy_audio, + orig_sr=44100, + target_sr=22050) + assert np.all(out == dummy_audio) + + +def test_audio_resampler_scipy_calls_resample(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="scipy") + with patch("vllm.multimodal.audio.resample_audio_scipy") as mock_resample: + mock_resample.return_value = dummy_audio + out = resampler.resample(dummy_audio, orig_sr=44100) + mock_resample.assert_called_once_with(dummy_audio, + orig_sr=44100, + target_sr=22050) + assert np.all(out == dummy_audio) + + +def test_audio_resampler_invalid_method(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="invalid") + with pytest.raises(ValueError): + resampler.resample(dummy_audio, orig_sr=44100) + + +def test_audio_resampler_no_target_sr(dummy_audio): + resampler = AudioResampler(target_sr=None) + with pytest.raises(RuntimeError): + resampler.resample(dummy_audio, orig_sr=44100) + + +@pytest.fixture +def dummy_audio_bytes(): + return b"FAKEAUDIOBYTES" + + +def test_audio_media_io_load_bytes(dummy_audio_bytes): + audio_io = AudioMediaIO() + with patch("vllm.multimodal.audio.librosa.load") as mock_load: + mock_load.return_value = (np.array([0.1, 0.2]), 16000) + out = audio_io.load_bytes(dummy_audio_bytes) + mock_load.assert_called_once() + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 + + +def test_audio_media_io_load_base64(dummy_audio_bytes): + audio_io = AudioMediaIO() + encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8") + with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes: + mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000) + out = audio_io.load_base64("audio/wav", encoded) + mock_load_bytes.assert_called_once() + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 + + +def test_audio_media_io_load_file(): + audio_io = AudioMediaIO() + path = Path("/fake/path.wav") + with patch("vllm.multimodal.audio.librosa.load") as mock_load: + mock_load.return_value = (np.array([0.1, 0.2]), 16000) + out = audio_io.load_file(path) + mock_load.assert_called_once_with(path, sr=None) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 + + +def test_audio_media_io_encode_base64(dummy_audio): + audio_io = AudioMediaIO() + media = (dummy_audio, 16000) + with patch("vllm.multimodal.audio.soundfile.write") as mock_write: + + def write_to_buffer(buffer, *_args, **_kwargs): + buffer.write(b"dummy_wav_data") + + mock_write.side_effect = write_to_buffer + + out = audio_io.encode_base64(media) + decoded = base64.b64decode(out) + assert decoded == b"dummy_wav_data" + mock_write.assert_called_once() From 7b57a433daf94dc075ccf52267b82aae359d0d53 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 21 Sep 2025 19:24:40 -0700 Subject: [PATCH 084/185] [Model] Support Dots OCR (#24645) Signed-off-by: Roger Wang Co-authored-by: yinz-aizip --- docs/models/supported_models.md | 1 + 
examples/offline_inference/vision_language.py | 18 + tests/models/registry.py | 2 + vllm/model_executor/models/dots_ocr.py | 824 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/dotsocr.py | 69 ++ 7 files changed, 917 insertions(+) create mode 100644 vllm/model_executor/models/dots_ocr.py create mode 100644 vllm/transformers_utils/configs/dotsocr.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index cbc0a56a645ea..9d288667a318f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -352,6 +352,7 @@ th { | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | +| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ | | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index de3f3afc17948..f8ddb5a22b31a 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -126,6 +126,23 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ) +# Dots-OCR +def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions] + engine_args = EngineArgs( + model="rednote-hilab/dots.ocr", + limit_mm_per_prompt={modality: 1}, + trust_remote_code=True, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1676,6 +1693,7 @@ model_example_map = { "aya_vision": run_aya_vision, "blip-2": run_blip2, "chameleon": run_chameleon, + "dots_ocr": run_dots_ocr, "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, "ernie45_vl": run_ernie45_vl, diff --git a/tests/models/registry.py b/tests/models/registry.py index e9cc5170ade74..29b6980aaa42a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -448,6 +448,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 + "DotsOCRForCausalLM": _HfExamplesInfo("rednote-hilab/dots.ocr", + trust_remote_code=True), "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT", # noqa: E501 trust_remote_code=True), diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py new file mode 100644 index 0000000000000..04fa5584199a3 --- /dev/null +++ b/vllm/model_executor/models/dots_ocr.py @@ -0,0 +1,824 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import LayerNorm +from transformers.modeling_utils import PreTrainedModel +from transformers.models.qwen2_vl import Qwen2VLProcessor + +from vllm.attention.layer import check_upstream_fa_availability +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, + SupportsMultiModal, + SupportsPP) +from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM +from vllm.model_executor.models.qwen2_vl import (Qwen2VLDummyInputsBuilder, + Qwen2VLMultiModalProcessor, + Qwen2VLProcessingInfo) +from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, + init_vllm_registered_model, + maybe_prefix, + merge_multimodal_embeddings) +from vllm.model_executor.models.vision import get_vit_attn_backend +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict +from vllm.platforms import _Backend +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.dotsocr import (DotsOCRConfig, + DotsVisionConfig) + +IMAGE_TOKEN = "<|imgpad|>" + + +class DotsOCRImagePixelInputs(TypedDict): + type: Literal["pixel_values", "image_grid_thw"] + + pixel_values: torch.Tensor + image_grid_thw: torch.Tensor + + +class DotsOCRImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds", "image_grid_thw"] + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. 
+ """ + + image_grid_thw: torch.Tensor + + +DotsOCRImageInputs = Union[DotsOCRImagePixelInputs, + DotsOCRImageEmbeddingInputs] + + +class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + return IMAGE_TOKEN * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501 + ) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + + +class DotsOCRProcessingInfo(Qwen2VLProcessingInfo): + + def get_hf_config(self) -> DotsOCRConfig: + config = self.ctx.get_hf_config() + if not config.__class__.__name__ == 'DotsOCRConfig': + raise TypeError(f"Expected DotsOCRConfig, got {type(config)}") + + if hasattr(config, "vision_config") and isinstance( + config.vision_config, dict): + config.vision_config = DotsVisionConfig(**config.vision_config) + + return config + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + max_image_tokens = self.get_max_image_tokens() + return {"image": max_image_tokens} + + def get_hf_processor( + self, + **kwargs: object, + ) -> Qwen2VLProcessor: + self.get_tokenizer( + ).image_token = IMAGE_TOKEN # Ensure image token is set + processor = self.ctx.get_hf_processor( + Qwen2VLProcessor, + **kwargs, + ) + processor.image_token = IMAGE_TOKEN + processor.video_token = "<|video_pad|>" + return processor + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision(tensor: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: + orig_dtype = tensor.dtype + tensor = tensor.float() + + cos = freqs.cos() + sin = freqs.sin() + + cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + + output = (tensor * cos) + (rotate_half(tensor) * sin) + + output = output.to(orig_dtype) + + return output + + +class VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + +class PatchMerger(nn.Module): + + def __init__( + self, + dim: int, + context_dim: int, + spatial_merge_size: int = 2, + pre_norm="layernorm", + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + self.pre_norm = pre_norm + if self.pre_norm == "layernorm": + self.ln_q = LayerNorm(context_dim, eps=1e-6) + elif self.pre_norm == "rmsnorm": + self.ln_q = RMSNorm(context_dim, eps=1e-6) + else: + print("no norm in patch merger") + + self.mlp = nn.Sequential( + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + return_bias=False, + disable_tp=True), + nn.GELU(), + RowParallelLinear(self.hidden_size, + dim, + bias=True, + 
return_bias=False, + disable_tp=True), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.pre_norm: + x = self.mlp(self.ln_q(x).view(-1, self.hidden_size)) + else: + x = self.mlp(x.view(-1, self.hidden_size)) + return x + + +class DotsVisionAttention(nn.Module): + + def __init__(self, + config, + dim: int, + num_heads: int = 16, + bias: bool = True, + *, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__() + from vllm.distributed import (parallel_state, + tensor_model_parallel_all_gather) + from vllm.distributed import utils as dist_utils + + self.embed_dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.num_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size) + + # qkv/proj follow Qwen2-VL style; bias controlled by arg + self.qkv = QKVParallelLinear(hidden_size=dim, + head_size=dim // num_heads, + total_num_heads=num_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=dim, + output_size=dim, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.proj") + self._all_gather = tensor_model_parallel_all_gather + self._split_last = dist_utils.split_tensor_along_last_dim + + # Select attention backend + self.attn_backend = get_vit_attn_backend(self.head_dim, + torch.get_default_dtype()) + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA + }: + raise RuntimeError( + f"Unsupported vision attention backend: {self.attn_backend}") + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA + } + + def _split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # qkv: [S, B, 3*dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = self._all_gather(qkv) + q, k, v = qkv.chunk(3, dim=2) + if self.tp_size > 1: + q = self._split_last(q, num_partitions=self.tp_size)[self.tp_rank] + k = self._split_last(k, num_partitions=self.tp_size)[self.tp_rank] + v = self._split_last(v, num_partitions=self.tp_size)[self.tp_rank] + new_shape = (seq_len, bs, self.num_heads_per_partition, self.head_dim) + return (q.view(*new_shape), k.view(*new_shape), v.view(*new_shape)) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + *, + max_seqlen: Optional[int] = None, + seqlens: Optional[list[int]] = None, + ) -> torch.Tensor: + # [S, C] -> [S, B=1, C] + x = hidden_states.unsqueeze(1) + x, _ = self.qkv(x) + q, k, v = self._split_qkv(x) + bs = q.shape[1] + # [S,B,H,D] -> [B,S,H,D] + q = q.permute(1, 0, 2, 3).contiguous() + k = k.permute(1, 0, 2, 3).contiguous() + v = v.permute(1, 0, 2, 3).contiguous() + + if rotary_pos_emb is not None: + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) + + if self.is_flash_attn_backend: + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: 
+ from vllm.vllm_flash_attn import flash_attn_varlen_func + q_ = q.reshape(bs * q.shape[1], q.shape[2], q.shape[3]) + k_ = k.reshape(bs * k.shape[1], k.shape[2], k.shape[3]) + v_ = v.reshape(bs * v.shape[1], v.shape[2], v.shape[3]) + output = flash_attn_varlen_func(q_, + k_, + v_, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0.0, + causal=False) + context_layer = output.view(bs, -1, self.num_heads_per_partition, + self.head_dim) + elif self.attn_backend == _Backend.TORCH_SDPA: + outputs = [] + for i in range(1, len(cu_seqlens)): + s = int(cu_seqlens[i - 1]) + e = int(cu_seqlens[i]) + q_i = q[:, s:e].permute(0, 2, 1, 3) + k_i = k[:, s:e].permute(0, 2, 1, 3) + v_i = v[:, s:e].permute(0, 2, 1, 3) + out_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) + out_i = out_i.permute(0, 2, 1, 3) + outputs.append(out_i) + context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0] + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None, + device=q.device) + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + else: + raise RuntimeError("Unsupported attention backend") + + # [B,S,H,D] -> [S,B,H*D] -> [S, C] + context_layer = context_layer.permute(1, 0, 2, 3).contiguous() + context_layer = context_layer.view(context_layer.shape[0], bs, -1) + out, _ = self.proj(context_layer) + return out.squeeze(1) + + +class DotsSwiGLUFFN(nn.Module): + + def __init__(self, + config, + *, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + hidden_features = config.intermediate_size + in_features = config.embed_dim + bias = config.use_bias + + # Referenced aimv2.py AIMv2SwiGLUFFN + self.fc13 = MergedColumnParallelLinear(in_features, + [hidden_features] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc13", + disable_tp=True) + self.fc2 = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + disable_tp=True) + self.act_fn = SiluAndMul() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc13(x) + x = self.act_fn(x) + x, _ = self.fc2(x) + return x + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + params = dict(self.named_parameters()) + loaded: set[str] = set() + for name, w in weights: + # Map fc1 -> fc13 (shard 0) + if name.startswith("fc1."): + tgt = name.replace("fc1.", "fc13.") + if tgt in params: + params[tgt].weight_loader(params[tgt], w, 0) + loaded.add(tgt) + continue + # Map fc3 -> fc13 (shard 1) + if name.startswith("fc3."): + tgt = name.replace("fc3.", "fc13.") + if tgt in params: + params[tgt].weight_loader(params[tgt], w, 1) + loaded.add(tgt) + continue + # Pass-through for fc2 and others + if name in params: + params[name].weight_loader(params[name], w) + loaded.add(name) + return loaded + + +class DotsPatchEmbed(nn.Module): + + def __init__(self, config): + super().__init__() + self.num_channels = config.num_channels + self.patch_size = config.patch_size + self.temporal_patch_size = config.temporal_patch_size + self.embed_dim = config.embed_dim + self.config = config + self.proj = nn.Conv2d( + config.num_channels, + config.embed_dim, + kernel_size=(config.patch_size, config.patch_size), + 
stride=(config.patch_size, config.patch_size), + ) + self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) + + def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor: + x = x.view(-1, self.num_channels, self.temporal_patch_size, + self.patch_size, self.patch_size)[:, :, 0] + x = self.proj(x).view(-1, self.embed_dim) + x = self.norm(x) + return x + + +class DotsViTPreprocessor(nn.Module): + + def __init__(self, config): + super().__init__() + self.patch_h = config.patch_size + self.patch_w = config.patch_size + self.embed_dim = config.embed_dim + self.config = config + self.patchifier = DotsPatchEmbed(config) + + def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor: + tokens = self.patchifier(x, grid_thw) + return tokens + + +class DotsVisionBlock(nn.Module): + + def __init__(self, + config, + *, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + + self.attn = DotsVisionAttention( + config, + config.embed_dim, + num_heads=config.num_attention_heads, + bias=config.use_bias, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) + self.mlp = DotsSwiGLUFFN(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) + + def forward(self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, + seqlens: Optional[list[int]] = None) -> torch.Tensor: + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class DotsVisionTransformer(PreTrainedModel): + + def __init__( + self, + config: DotsVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: + super().__init__(config) + self.config = config + self.spatial_merge_size = config.spatial_merge_size + + self.patch_embed = DotsViTPreprocessor(config) + + head_dim = config.embed_dim // config.num_attention_heads + self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + + # Keep blocks for compatibility with other vision towers + num_layers = (config.num_hidden_layers if num_hidden_layers_override + is None else num_hidden_layers_override) + self.blocks = nn.ModuleList([ + DotsVisionBlock(config, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{i}") + for i in range(num_layers) + ]) + if require_post_norm is None: + require_post_norm = (len(self.blocks) == config.num_hidden_layers) + if require_post_norm and self.config.post_norm: + self.post_trunk_norm = RMSNorm(config.embed_dim, + eps=config.rms_norm_eps) + else: + self.post_trunk_norm = None + + self.merger = PatchMerger( + dim=config.hidden_size, + context_dim=config.embed_dim, + spatial_merge_size=config.spatial_merge_size, + ) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.patchifier.proj.weight.dtype + + @property + def device(self) -> torch.device: + return 
self.patch_embed.patchifier.proj.weight.device + + def get_pos_ids_by_grid(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + return pos_ids + + def rot_pos_emb(self, grid_thw): + pos_ids = self.get_pos_ids_by_grid(grid_thw) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + + def forward(self, hidden_states: torch.Tensor, + grid_thw: torch.Tensor) -> torch.Tensor: + hidden_states = hidden_states.to(self.dtype) + hidden_states = self.patch_embed(hidden_states, grid_thw) + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + dtype=grid_thw.dtype + if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + for blk in self.blocks: + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens) + + if self.post_trunk_norm is not None: + hidden_states = self.post_trunk_norm(hidden_states) + + hidden_states = self.merger(hidden_states) + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen2VLMultiModalProcessor, + info=DotsOCRProcessingInfo, + dummy_inputs=DotsOCRDummyInputsBuilder, +) +class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".attn.qkv_proj.": ".attn.qkv.", + ".attn.out_proj.": ".attn.proj.", + }, + orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }, + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|img|><|imgpad|><|endofimg|>" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + self.config: DotsOCRConfig = vllm_config.model_config.hf_config + self.quant_config = vllm_config.quant_config + self.multimodal_config = vllm_config.model_config.multimodal_config + + if isinstance(self.config.vision_config, dict): + vision_config = DotsVisionConfig(**self.config.vision_config) + self.config.vision_config = vision_config + else: + vision_config = self.config.vision_config + + self.vision_tower = DotsVisionTransformer( + 
vision_config, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "vision_tower"), + ) + self.language_model: Qwen2ForCausalLM = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=self.config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Qwen2ForCausalLM"], + ) + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[DotsOCRImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return DotsOCRImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + return DotsOCRImageEmbeddingInputs(type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) + + def _process_image_input( + self, image_input: DotsOCRImageInputs) -> tuple[torch.Tensor, ...]: + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type( + self.vision_tower.dtype) + else: + pixel_values = image_input["pixel_values"].type( + self.vision_tower.dtype) + image_embeds = self.vision_tower( + pixel_values, grid_thw)[:, :self.config.hidden_size] + + # Split concatenated embeddings for each image item. 
+ merge_size = self.vision_tower.spatial_merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() + + return image_embeds.split(sizes) + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return [] + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.image_token_id, + ) + + return inputs_embeds + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + elif inputs_embeds is None and kwargs.get("pixel_values") is not None: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + inputs_embeds = None + else: + assert input_ids is not None + inputs_embeds = self.get_multimodal_embeddings( + input_ids, + image_input=image_input, + ) + input_ids = None + + hidden_states = self.language_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 5dc5d545bb9c5..86123bc092b9b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -219,6 +219,7 @@ _MULTIMODAL_MODELS = { "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), + "DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"), "Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501 "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 91bfeb8c55ee5..52fa49ad302bd 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -9,6 +9,7 @@ Model configs may be defined in this directory for the following reasons: from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config +from 
vllm.transformers_utils.configs.dotsocr import DotsOCRConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the @@ -36,6 +37,7 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ "ChatGLMConfig", "DeepseekVLV2Config", + "DotsOCRConfig", "EAGLEConfig", "RWConfig", "JAISConfig", diff --git a/vllm/transformers_utils/configs/dotsocr.py b/vllm/transformers_utils/configs/dotsocr.py new file mode 100644 index 0000000000000..6bb3c12d9c7eb --- /dev/null +++ b/vllm/transformers_utils/configs/dotsocr.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional + +from transformers.configuration_utils import PretrainedConfig +from transformers.models.qwen2 import Qwen2Config + + +class DotsVisionConfig(PretrainedConfig): + model_type: str = "dots_vit" + + def __init__( + self, + embed_dim: int = 1536, # vision encoder embed size + hidden_size: int = 1536, # after merger hidden size + intermediate_size: int = 4224, + num_hidden_layers: int = 42, + num_attention_heads: int = 12, + num_channels: int = 3, + patch_size: int = 14, + spatial_merge_size: int = 2, + temporal_patch_size: int = 1, + rms_norm_eps: float = 1e-5, + use_bias: bool = False, + attn_implementation="flash_attention_2", + initializer_range=0.02, + init_merger_std=0.02, + is_causal=False, # ve causal forward + post_norm=True, + gradient_checkpointing=False, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.rms_norm_eps = rms_norm_eps + self.use_bias = use_bias + self.attn_implementation = attn_implementation + self.initializer_range = initializer_range + self.init_merger_std = init_merger_std + self.is_causal = is_causal + self.post_norm = post_norm + self.gradient_checkpointing = gradient_checkpointing + + +class DotsOCRConfig(Qwen2Config): + model_type = "dots_ocr" + + def __init__(self, + image_token_id=151665, + video_token_id=151656, + vision_config: Optional[dict] = None, + *args, + **kwargs): + super().__init__(*args, **kwargs) + self.image_token_id = image_token_id + self.video_token_id = video_token_id + self.vision_config = DotsVisionConfig(**(vision_config or {})) + + def save_pretrained(self, save_directory, **kwargs): + self._auto_class = None + super().save_pretrained(save_directory, **kwargs) From 793be8d0579f11b3326f7e2099432c1433033571 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Mon, 22 Sep 2025 10:49:13 +0800 Subject: [PATCH 085/185] [Docs] GSM8K Accuracy Evaluation doc update (#25360) Signed-off-by: David Chen <530634352@qq.com> --- tests/evals/gsm8k/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/evals/gsm8k/README.md b/tests/evals/gsm8k/README.md index 58572c3a6fbc1..29c5199e1e87a 100644 --- a/tests/evals/gsm8k/README.md +++ b/tests/evals/gsm8k/README.md @@ -19,7 +19,7 @@ pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \ vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000 # Run evaluation 
-python tests/gsm8k/gsm8k_eval.py --port 8000 +python tests/evals/gsm8k/gsm8k_eval.py --port 8000 ``` ## Configuration Format From 0eecb3166365a29db117c2aff6ca441b484b514d Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Mon, 22 Sep 2025 11:35:39 +0800 Subject: [PATCH 086/185] [Bugfix] Fix hermes tool parser handling of non-string argument types (#22002) Signed-off-by: wangzi <3220100013@zju.edu.cn> Signed-off-by: David Chen <530634352@qq.com> Co-authored-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: Chauncey --- .../tool_parsers/test_hermes_tool_parser.py | 131 ++++++++++++++++++ .../openai/tool_parsers/hermes_tool_parser.py | 42 +++++- 2 files changed, 166 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index 4bab849f47c27..e0e6b2c07e171 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -45,8 +45,39 @@ TOOLS = [{ }, }] +PRODUCT_TOOLS = [{ + "type": "function", + "function": { + "name": "get_product_info", + "description": "Get detailed information of a product based on its " + "product ID.", + "parameters": { + "type": "object", + "properties": { + "inserted": { + "type": "boolean", + "description": "inserted.", + }, + "product_id": { + "type": "integer", + "description": "The product ID of the product.", + }, + }, + "required": ["product_id", "inserted"], + }, + }, +}] + MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}] +PRODUCT_MESSAGES = [{ + "role": + "user", + "content": + "Hi! Do you have any detailed information about the product id " + "7355608 and inserted true?" 
+}] + @pytest.mark.asyncio async def test_non_streaming_tool_call(): @@ -127,3 +158,103 @@ async def test_streaming_tool_call(): print("\n[Streaming Test Passed]") print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") print(f"Reconstructed Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_non_streaming_product_tool_call(): + """Test tool call integer and boolean parameters in non-streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + response = await client.chat.completions.create( + model=LORA_MODEL, + messages=PRODUCT_MESSAGES, + tools=PRODUCT_TOOLS, + tool_choice="auto", + temperature=0.66, + ) + + assert response.choices + choice = response.choices[0] + message = choice.message + + assert choice.finish_reason == "tool_calls" + assert message.tool_calls is not None + + tool_call = message.tool_calls[0] + assert tool_call.type == "function" + assert tool_call.function.name == "get_product_info" + + arguments = json.loads(tool_call.function.arguments) + assert "product_id" in arguments + assert "inserted" in arguments + + product_id = arguments.get("product_id") + inserted = arguments.get("inserted") + + assert isinstance(product_id, int) + assert product_id == 7355608 + assert isinstance(inserted, bool) + assert inserted is True + + print("\n[Non-Streaming Product Test Passed]") + print(f"Tool Call: {tool_call.function.name}") + print(f"Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_streaming_product_tool_call(): + """Test tool call integer and boolean parameters in streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + stream = await client.chat.completions.create( + model=LORA_MODEL, + messages=PRODUCT_MESSAGES, + tools=PRODUCT_TOOLS, + tool_choice="auto", + temperature=0.66, + stream=True, + ) + + tool_call_chunks = {} + async for chunk in stream: + if not chunk.choices: + continue + + delta = chunk.choices[0].delta + if not delta or not delta.tool_calls: + continue + + for tool_chunk in delta.tool_calls: + index = tool_chunk.index + if index not in tool_call_chunks: + tool_call_chunks[index] = {"name": "", "arguments": ""} + + if tool_chunk.function.name: + tool_call_chunks[index]["name"] += tool_chunk.function.name + if tool_chunk.function.arguments: + tool_call_chunks[index][ + "arguments"] += tool_chunk.function.arguments + + assert len(tool_call_chunks) == 1 + reconstructed_tool_call = tool_call_chunks[0] + + assert reconstructed_tool_call["name"] == "get_product_info" + + arguments = json.loads(reconstructed_tool_call["arguments"]) + assert "product_id" in arguments + assert "inserted" in arguments + + # Handle type coercion for streaming test as well + product_id = arguments.get("product_id") + inserted = arguments.get("inserted") + + assert isinstance(product_id, int) + assert product_id == 7355608 + assert isinstance(inserted, bool) + assert inserted is True + + print("\n[Streaming Product Test Passed]") + print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") + print(f"Reconstructed Arguments: {arguments}") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index e74c420da1d3c..87595953da067 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -368,16 +368,32 @@ class Hermes2ProToolParser(ToolParser): # case -- 
we now have the first info about arguments available from # autocompleting the JSON elif cur_arguments and not prev_arguments: + # extract the content after {"name": ..., "arguments": + # directly from tool_call_portion as cur_arguments_json, + # since cur_arguments may differ from the original text + # due to partial JSON parsing + # for example, tool_call_portion = + # {"name": "search", "arguments": {"search_request": {" + # but cur_arguments = + # {"search_request": {}} + function_name = current_tool_call.get("name") + match = re.search( + r'\{"name":\s*"' + + re.escape(function_name) + r'"\s*,\s*"arguments":\s*(.*)', + tool_call_portion.strip(), re.DOTALL) + if match: + cur_arguments_json = match.group(1) + else: + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) - cur_arguments_json = json.dumps(cur_arguments, - ensure_ascii=False) logger.debug("finding %s in %s", delta_text, cur_arguments_json) - # get the location where previous args differ from current - if (delta_text not in cur_arguments_json[:-2]): + # get the location where previous args differ from current. + if (delta_text not in cur_arguments_json): return None - args_delta_start_loc = cur_arguments_json[:-2]. \ + args_delta_start_loc = cur_arguments_json. \ rindex(delta_text) + \ len(delta_text) @@ -397,8 +413,20 @@ class Hermes2ProToolParser(ToolParser): # last case -- we have an update to existing arguments. elif cur_arguments and prev_arguments: - if isinstance(delta_text, str) and len(delta_text.rstrip( - )) >= 1 and delta_text.rstrip()[-1] == '}': + # judge whether the tool_call_portion is a complete JSON + try: + json.loads(tool_call_portion) + is_complete_json = True + except Exception: + is_complete_json = False + + # if the delta_text ends with a '}' and tool_call_portion is a + # complete JSON, then the last '}' does not belong to the + # arguments, so we should trim it off + if isinstance(delta_text, str) \ + and len(delta_text.rstrip()) >= 1 \ + and delta_text.rstrip()[-1] == '}' \ + and is_complete_json: delta_text = delta_text.rstrip()[:-1] logger.debug("got diff %s", delta_text) From 6d0b827cbd0510173f6a9e77549d917828e80c76 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 22 Sep 2025 13:58:26 +0800 Subject: [PATCH 087/185] [V0 Deprecation] Remove V0-only methods in multi-modal registry (#25362) Signed-off-by: DarkLight1337 --- .../multimodal/generation/test_qwen2_vl.py | 1 - vllm/multimodal/registry.py | 32 +------------------ 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index 8336ebc0d59cb..c8a3513ac7ad1 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -209,7 +209,6 @@ def batch_make_video_embeddings( return visual(pixel_values_on_device, grid_thw=video_grid_thw_on_device).cpu() - # V1 Test: this calls a V0 internal. 
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 38adbf8f3536a..5d485bc361d11 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -12,8 +12,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) from vllm.utils import ClassRegistry -from .cache import (BaseMultiModalProcessorCache, - processor_only_cache_from_config) +from .cache import BaseMultiModalProcessorCache from .processing import BaseMultiModalProcessor, BaseProcessingInfo from .profiling import (BaseDummyInputsBuilder, DummyDecoderData, DummyEncoderData, MultiModalProfiler) @@ -176,35 +175,6 @@ class MultiModalRegistry: if mm_limits[key] > 0 } - # TODO: Remove once V0 is gone - def get_max_tokens_by_modality( - self, - model_config: "ModelConfig", - ) -> Mapping[str, int]: - """ - Get the maximum number of tokens from each modality - for profiling the memory usage of a model. - """ - cache = processor_only_cache_from_config(model_config, self) - mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) - max_tokens_per_item = self.get_max_tokens_per_item_by_modality( - model_config, - cache=cache, - ) - - return { - key: mm_limits[key] * max_tokens_per_mm_item - for key, max_tokens_per_mm_item in max_tokens_per_item.items() - } - - # TODO: Remove once V0 is gone - def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: - """ - Get the maximum number of multi-modal tokens - for profiling the memory usage of a model. - """ - return sum(self.get_max_tokens_by_modality(model_config).values()) - def get_mm_limits_per_prompt( self, model_config: "ModelConfig", From f92d952632541889c38f107f0425a5bd4707d9d9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 22 Sep 2025 16:49:19 +0800 Subject: [PATCH 088/185] [V0 Deprecation] Remove `MultiModalPlaceholderMap` (#25366) Signed-off-by: DarkLight1337 --- tests/kernels/utils.py | 2 - vllm/attention/backends/abstract.py | 10 --- vllm/attention/backends/placeholder_attn.py | 23 +------ vllm/attention/backends/utils.py | 18 ----- vllm/multimodal/__init__.py | 2 - vllm/multimodal/base.py | 74 +-------------------- vllm/v1/attention/backends/cpu_attn.py | 1 - 7 files changed, 2 insertions(+), 128 deletions(-) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 8d6ce381976b5..39ea07309134b 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -959,7 +959,6 @@ def make_test_metadata( return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -1009,7 +1008,6 @@ def make_test_metadata( return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index ab7ef2112b083..1b392cd7c88d3 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -10,7 +10,6 @@ from typing import (Any, Dict, Generic, List, Optional, Protocol, Set, Tuple, import torch from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey 
-from vllm.multimodal import MultiModalPlaceholderMap class AttentionType: @@ -116,15 +115,6 @@ class AttentionMetadata: # in block 0, and 1st slot in block 1, respectively. slot_mapping: torch.Tensor - # The index maps that relate multi-modal embeddings to the corresponding - # placeholders. - # - # N.B. These aren't really related to attention and don't belong on this - # type -- this is just a temporary solution to make them available to - # `model_executable`. - multi_modal_placeholder_index_maps: Optional[Dict[ - str, MultiModalPlaceholderMap.IndexMap]] - # Enable/disable KV scales calculation. This is so that we can disable the # calculation until after prefill and cuda graph capture. enable_kv_scales_calculation: bool diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index f82d28938f45c..cddeb2cf39bf0 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import Dict, List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type import torch @@ -12,7 +11,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionMetadataBuilder) from vllm.attention.backends.utils import CommonAttentionState -from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import async_tensor_h2d # Placeholder attention backend for models like Mamba and pooling models that @@ -141,8 +139,6 @@ class PlaceholderAttentionMetadata(AttentionMetadata): num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. 
- multi_modal_placeholder_index_maps, enable_kv_scales_calculation=self.enable_kv_scales_calculation, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, @@ -178,7 +174,6 @@ class PlaceholderAttentionMetadata(AttentionMetadata): num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=seq_lens_tensor, @@ -210,9 +205,6 @@ class PlaceholderAttentionMetadataBuilder( self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -232,12 +224,6 @@ class PlaceholderAttentionMetadataBuilder( self.context_lens.append(context_len) if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -295,12 +281,6 @@ class PlaceholderAttentionMetadataBuilder( seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - # Placeholders slot_mapping_tensor = torch.empty(0) block_tables = torch.empty(0) @@ -308,7 +288,6 @@ class PlaceholderAttentionMetadataBuilder( return PlaceholderAttentionMetadata( num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, - multi_modal_placeholder_index_maps=placeholder_index_maps, enable_kv_scales_calculation=True, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 3f15580872c7f..33d8168f8a13c 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention backend utils""" -from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate @@ -15,7 +14,6 @@ from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder, from vllm.attention.backends.abstract import AttentionType from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import async_tensor_h2d, make_tensor_with_pad logger = init_logger(__name__) @@ -135,9 +133,6 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -154,12 +149,6 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - 
self.multimodal_placeholder_maps[modality].extend( - placeholders) - self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -254,16 +243,10 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): self.runner.pin_memory) seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } return self._metadata_cls( # type: ignore num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, - multi_modal_placeholder_index_maps=placeholder_index_maps, enable_kv_scales_calculation=True, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -320,7 +303,6 @@ class CommonAttentionState(AttentionState): num_prefill_tokens=0, num_decode_tokens=batch_size, slot_mapping=self._graph_slot_mapping[:batch_size], - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=self._graph_seq_lens[:batch_size], diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 7ffa732cf3708..8ea79078465e6 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .base import MultiModalPlaceholderMap from .hasher import MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, MultiModalDataDict, MultiModalKwargs, @@ -27,7 +26,6 @@ __all__ = [ "MultiModalKwargs", "MultiModalKwargsItems", "MultiModalPlaceholderDict", - "MultiModalPlaceholderMap", "MultiModalUUIDDict", "NestedTensors", "MULTIMODAL_REGISTRY", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index e0edb3e883ed6..faffddd57199d 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -3,83 +3,11 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import Generic, NamedTuple, TypeVar +from typing import Generic, TypeVar _T = TypeVar("_T") -class MultiModalPlaceholderMap: - """ - Relates multi-modal embeddings to their corresponding placeholders. - - Note: This is only used in V0. - """ - - class IndexMap(NamedTuple): - src: list[int] - dest: list[int] - - src_ranges: list[range] - """ - The indices of the multi-modal embeddings that will replace the - corresponding placeholder embeddings pointed to by ``dest_ranges``. - """ - - src_len: int - """ - The total number of flattened multi-modal embeddings. - """ - - dest_ranges: list[range] - """ - The indices of the placeholder embeddings that will be replaced by the - multimodal embeddings. - """ - - dest_len: int - """ - The total number of embeddings in the destination tensor. - """ - - def __init__(self): - self.src_ranges = [] - self.src_len = 0 - self.dest_ranges = [] - self.dest_len = 0 - - def extend(self, other: "MultiModalPlaceholderMap"): - """ - Adds the placeholders from another ``MultiModalPlaceholderMap`` to this - instance based on the source and destination tensors being - concatenated. 
- """ - - self.src_ranges.extend( - range(self.src_len + r.start, self.src_len + r.stop) - for r in other.src_ranges) - self.src_len += other.src_len - self.dest_ranges.extend( - range(self.dest_len + r.start, self.dest_len + r.stop) - for r in other.dest_ranges) - self.dest_len += other.dest_len - - def index_map(self) -> "IndexMap": - """ - Finalizes the placeholder map into lists of indices that can be used to - index the source and destination tensors. - """ - - src_indices = [i for r in self.src_ranges for i in r] - dest_indices = [i for r in self.dest_ranges for i in r] - - if len(src_indices) != len(dest_indices): - raise ValueError( - f"The number of source ({len(src_indices)}) and destination " - f"indices ({len(dest_indices)}) must be the same.") - - return self.IndexMap(src=src_indices, dest=dest_indices) - - class MediaIO(ABC, Generic[_T]): @abstractmethod diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 6627164c98798..7e485fea2689d 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -425,7 +425,6 @@ class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): num_prompt_req], # prefill query_start_loc=query_start_loc_cpu[:num_reqs + 1], # for logits index - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=False, ) From 21467f9a1c6219cbd66640e539b1f23221cff375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Mon, 22 Sep 2025 10:50:39 +0200 Subject: [PATCH 089/185] Enable Eagle3 speculative decoding for GPT-OSS model (#25246) Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> --- vllm/config/speculative.py | 2 +- vllm/model_executor/models/gpt_oss.py | 19 ++++++++++++++-- vllm/v1/spec_decode/eagle.py | 32 +++++++++++++++++++-------- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 2c861723c3966..d533930e1c7aa 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -527,7 +527,7 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{self.disable_by_batch_size=}") - eagle3_target_supported = ["llama", "qwen"] + eagle3_target_supported = ["llama", "qwen", "gpt_oss"] if self.method == "eagle3" and self.target_model_config and not any( supported_model in self.target_model_config.hf_text_config.model_type diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 4fe59f91124dd..7c755a00e1c98 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -27,7 +27,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from vllm.utils import cdiv -from .interfaces import SupportsPP +from .interfaces import SupportsEagle3, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -238,6 +238,7 @@ class GptOssModel(nn.Module): self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], self.config.hidden_size)) + self.aux_hidden_state_layers = tuple[int, ...]() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embedding(input_ids) @@ -261,8 +262,12 @@ class GptOssModel(nn.Module): x = 
intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + aux_hidden_states = [] for i in range(self.start_layer, self.end_layer): layer = self.layers[i] + if i in self.aux_hidden_state_layers: + aux_hidden_states.append(x if residual is None else x + + residual) x, residual = layer(x, positions, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -270,6 +275,9 @@ class GptOssModel(nn.Module): "residual": residual }) x, _ = self.norm(x, residual) + + if len(aux_hidden_states) > 0: + return x, aux_hidden_states return x def _load_weights_mxfp4( @@ -610,7 +618,7 @@ class GptOssModel(nn.Module): weights, stacked_params_mapping) -class GptOssForCausalLM(nn.Module, SupportsPP): +class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3): packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]} hf_to_vllm_mapper = WeightsMapper( @@ -658,6 +666,13 @@ class GptOssForCausalLM(nn.Module, SupportsPP): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5dacf60886966..dc97d5c8f39d4 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -823,15 +823,29 @@ class EagleProposer: else: target_language_model = target_model # share embed_tokens with the target model if needed - if get_pp_group().world_size == 1 \ - and self.model.model.embed_tokens.weight.shape \ - == target_language_model.model.embed_tokens.weight.shape: - logger.info( - "Assuming the EAGLE head shares the same vocab embedding" - " with the target model.") - del self.model.model.embed_tokens - self.model.model.embed_tokens = ( - target_language_model.model.embed_tokens) + if get_pp_group().world_size == 1: + if hasattr(target_language_model.model, 'embed_tokens'): + target_embed_tokens = target_language_model.model.embed_tokens + elif hasattr(target_language_model.model, 'embedding'): + target_embed_tokens = target_language_model.model.embedding + else: + raise AttributeError( + "Target model does not have 'embed_tokens' or 'embedding' " + "attribute") + + # Check if shapes match and we found the embedding + eagle_shape = self.model.model.embed_tokens.weight.shape + target_shape = target_embed_tokens.weight.shape + if eagle_shape == target_shape: + logger.info( + "Assuming the EAGLE head shares the same vocab embedding" + " with the target model.") + del self.model.model.embed_tokens + self.model.model.embed_tokens = target_embed_tokens + else: + logger.info( + "The EAGLE head's vocab embedding will be loaded separately" + " from the target model.") else: logger.info( "The EAGLE head's vocab embedding will be loaded separately" From a66d13138137147d5b3fbdfe64afb103fbc52f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Mon, 22 Sep 2025 11:55:04 +0200 Subject: [PATCH 090/185] [TPU][Bugfix][CI] Fix broken tests/build dependency (#25255) Signed-off-by: NickLucche --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 1073a4ee30afa..e76528a178205 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ - && python3 -m pip install --progress-bar off hf-transfer + && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" export VLLM_USE_V1=1 export VLLM_XLA_CHECK_RECOMPILATION=1 diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 505664f3aecd0..69366cd503219 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ - && python3 -m pip install --progress-bar off hf-transfer + && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" export VLLM_USE_V1=1 export VLLM_XLA_CHECK_RECOMPILATION=1 From 4cf71cc88a19f97bf6f5db9802395edd646375ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Mon, 22 Sep 2025 12:12:57 +0200 Subject: [PATCH 091/185] [TPU] Deprecate `xm.mark_step` in favor of ``torch_xla.sync` (#25254) Signed-off-by: NickLucche Co-authored-by: Ye (Charlotte) Qi --- tests/tpu/test_moe_pallas.py | 3 +- tests/v1/tpu/test_topk_topp_sampler.py | 11 ++++--- vllm/lora/punica_wrapper/punica_tpu.py | 4 +-- .../model_loader/default_loader.py | 9 +++-- vllm/v1/worker/tpu_model_runner.py | 33 ++++++++++--------- 5 files changed, 31 insertions(+), 29 deletions(-) diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py index 407a824d81748..1e5d9d923d004 100644 --- a/tests/tpu/test_moe_pallas.py +++ b/tests/tpu/test_moe_pallas.py @@ -6,6 +6,7 @@ Run `pytest tests/kernels/moe/test_moe_pallas.py`. """ import pytest import torch +import torch_xla # yapf conflicts with isort for this block # yapf: disable @@ -77,7 +78,7 @@ def test_pallas_moe( expert_map=e_map, renormalize=False, ) - xm.mark_step() + torch_xla.sync(wait=False) # Compare outputs torch.testing.assert_close( diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py index 05751badc7619..665cf8cd2629e 100644 --- a/tests/v1/tpu/test_topk_topp_sampler.py +++ b/tests/v1/tpu/test_topk_topp_sampler.py @@ -4,6 +4,7 @@ import math import pytest import torch +import torch_xla from vllm.platforms import current_platform from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p @@ -63,7 +64,7 @@ def test_topp_result_sums_past_p(): probs.masked_fill_(logits_masked.isinf(), 0) masked_prob_sum = probs.sum(dim=-1) - xm.mark_step() + torch_xla.sync() # Perform assertion on CPU. 
assert torch.all(torch.ge(masked_prob_sum.cpu() + TOLERANCE, p.cpu())) @@ -82,7 +83,7 @@ def test_topp_basic(): k=torch.tensor([3, 3]), p=torch.tensor([0.79, 0.79])) - xm.mark_step() + torch_xla.sync() # Expect the smallest elements to be dropped. expected_result = logits.clone().cpu() @@ -104,7 +105,7 @@ def test_topp_select_all(): k=torch.tensor([3, 3]), p=torch.tensor([1.0, 1.0])) - xm.mark_step() + torch_xla.sync() assert torch.allclose(logits.cpu(), result.cpu()) @@ -122,7 +123,7 @@ def test_topp_with_ties(): k=torch.tensor([4]), p=torch.tensor([0.2])) - xm.mark_step() + torch_xla.sync() # All tie values are included in the top-p set. Tie breaking is left # to be done during final sampling (all tie tokens have equal @@ -146,7 +147,7 @@ def test_both_topk_topp(): k=torch.tensor([1, 3]), p=torch.tensor([0.79, 0.79])) - xm.mark_step() + torch_xla.sync() # Since for the first batch k=1, expect only the largest element gets # selected. diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 07dc337a1cc87..5896da516540b 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional, Union import torch import torch.nn.functional as F -import torch_xla.core.xla_model as xm +import torch_xla from vllm.lora.ops.xla_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink from vllm.lora.punica_wrapper.utils import convert_mapping @@ -323,7 +323,7 @@ class PunicaWrapperTPU(PunicaWrapperBase): extra_vocab_size: int, ): # Make sure we don't accidentally collect outside operations - xm.mark_step() + torch_xla.sync() # Pad the prompt mapping to avoid running into recompiles on the TPU # TODO: Should this happen inside mapping internally? If so how can we diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index d1bdec21fd974..4b7bcd37d4bc2 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -211,16 +211,15 @@ class DefaultModelLoader(BaseModelLoader): from vllm.platforms.tpu import USE_TPU_COMMONS if not USE_TPU_COMMONS: - # In PyTorch XLA, we should call `xm.mark_step` + # In PyTorch XLA, we should call `torch_xla.sync` # frequently so that not too many ops are accumulated - # in the XLA program. import torch_xla.core.xla_model - # as xm - import torch_xla.core.xla_model as xm + # in the XLA program. + import torch_xla def _xla_weights_iterator(iterator: Generator): for weights in iterator: yield weights - xm.mark_step() + torch_xla.sync(wait=False) weights_iterator = _xla_weights_iterator(weights_iterator) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index dd11b1dcbe94c..4cbf991a14c11 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -10,6 +10,7 @@ import numpy as np import torch import torch.nn as nn # TPU XLA related +import torch_xla import torch_xla.core.xla_model as xm import torch_xla.distributed.spmd as xs import torch_xla.runtime as xr @@ -846,10 +847,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # 2. A list or tuple (length: num_items) of tensors, each of shape # (feature_size, hidden_size) in case the feature size is dynamic # depending on the input multimodal items. 
- xm.mark_step() + torch_xla.sync(wait=False) curr_group_outputs = self.model.get_multimodal_embeddings( **mm_kwargs_group) - xm.mark_step() + torch_xla.sync(wait=False) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -952,7 +953,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mm_embeds = self._gather_mm_embeddings(scheduler_output) else: mm_embeds = [] - xm.mark_step() + torch_xla.sync(wait=False) # Prepare inputs, the requests might be split into multiple # executions, combine the result of each execution. start_index = 0 @@ -969,7 +970,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): end_index = self._prepare_inputs(scheduler_output, start_index) input_ids, inputs_embeds = self._get_model_inputs( self.input_ids, mm_embeds) - xm.mark_step() + torch_xla.sync(wait=False) # Run the decoder with set_forward_context( attn_metadata, @@ -1183,7 +1184,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Sync all pending XLA execution during model initialization and weight # loading. - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() if not hasattr(self, "model"): self.model = model @@ -1267,10 +1268,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, lora_requests) -> None: - xm.mark_step() # Captures input updates + torch_xla.sync(wait=False) # Captures input updates super()._set_active_loras(prompt_lora_mapping, token_lora_mapping, lora_requests) - xm.mark_step() # Captures metadata updates + torch_xla.sync(wait=False) # Captures metadata updates def _precompile_mm_encoder(self) -> None: if not self.supports_mm_inputs: @@ -1297,10 +1298,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_items, ) # Run multimodal encoder. - xm.mark_step() + torch_xla.sync(wait=False) mm_embeds = self.model.get_multimodal_embeddings( **batched_dummy_mm_inputs) - xm.mark_step() + torch_xla.sync(wait=False) num_patches = mm_embeds[0].shape[0] items_size = num_patches * num_items @@ -1325,7 +1326,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): a, b = self._get_model_inputs(placeholders_ids, [mm_embeds]) assert a is None - xm.mark_step() + torch_xla.sync(wait=False) # Pre-compile `get_input_embeddings` when mm_embeddings are not # present. Chunk is only made of text, no mm_placeholders. @@ -1336,7 +1337,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): placeholders_ids = placeholders_ids.to(self.device) a, b = self._get_model_inputs(placeholders_ids, []) assert a is None - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() end = time.perf_counter() @@ -1532,11 +1533,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Isolate encoder graph from post-processing to minimize # impact of recompilation until it's fixed. 
start = time.perf_counter() - xm.mark_step() + torch_xla.sync(wait=False) dummy_encoder_outputs = \ self.model.get_multimodal_embeddings( **batched_dummy_mm_inputs) - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() end = time.perf_counter() logger.info( @@ -1559,7 +1560,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self._dummy_run(num_tokens, self.num_reqs_most_model_len, self.num_blocks_per_most_len_req) - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() self.encoder_cache.clear() gc.collect() @@ -1927,11 +1928,11 @@ def replace_set_lora(model): # to a tensor doesn't seem to work anymore. This might be fixed with a # later release of torch_xla. self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias) - xm.mark_step() + torch_xla.sync(wait=False) def _tpu_reset_lora(self, index: int): self._original_reset_lora(index) - xm.mark_step() + torch_xla.sync(wait=False) for _, module in model.named_modules(): if isinstance(module, BaseLayerWithLoRA): From b6f01bd9a7941c3c06175feaf7588fa55c9db646 Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:22:29 +0800 Subject: [PATCH 092/185] refactor: abstract graph mode support into platform interface (#25161) Signed-off-by: Yizhou Liu --- vllm/config/__init__.py | 2 +- vllm/platforms/cuda.py | 4 ++++ vllm/platforms/interface.py | 7 +++++++ vllm/platforms/rocm.py | 4 ++++ vllm/platforms/xpu.py | 13 +++++++------ 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index e31a78ba33baa..9bb8087a511d5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -503,7 +503,7 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - if current_platform.is_cuda_alike() or current_platform.is_xpu(): + if current_platform.support_static_graph_mode(): # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 05f129f513a0a..7baa5a9742f44 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -498,6 +498,10 @@ class CudaPlatformBase(Platform): def support_hybrid_kv_cache(cls) -> bool: return True + @classmethod + def support_static_graph_mode(cls) -> bool: + return True + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index c43580ac5da13..8a05c84d4242e 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -587,6 +587,13 @@ class Platform: """ return False + @classmethod + def support_static_graph_mode(cls) -> bool: + """ + Returns if the graph mode is supported by the current platform. 
+ """ + return False + @classmethod def use_sync_weight_loader(cls) -> bool: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 9470434aa428b..0c7b9c2a4abf6 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -477,3 +477,7 @@ class RocmPlatform(Platform): @classmethod def support_hybrid_kv_cache(cls) -> bool: return True + + @classmethod + def support_static_graph_mode(cls) -> bool: + return True diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 4d3bef4b42947..eb591ae4454e4 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -113,12 +113,9 @@ class XPUPlatform(Platform): # lazy import to avoid circular import from vllm.config import CompilationLevel, CUDAGraphMode compilation_config = vllm_config.compilation_config - if compilation_config.cudagraph_mode is None or \ - compilation_config.cudagraph_mode.max_cudagraph_mode() \ - != CUDAGraphMode.NONE: - logger.info("[XPU] CUDA graph is not supported on XPU, disabling " - "cudagraphs. Fallback to cudagraph_mode=NONE") - compilation_config.cudagraph_mode = CUDAGraphMode.NONE + + assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE, \ + "CUDA graph mode should be NONE on XPU" if vllm_config.lora_config is not None: compilation_config.level = CompilationLevel.NO_COMPILATION @@ -169,6 +166,10 @@ class XPUPlatform(Platform): def support_hybrid_kv_cache(cls) -> bool: return True + @classmethod + def support_static_graph_mode(cls) -> bool: + return False + @classmethod def is_pin_memory_available(cls): return True From 417a164af66a11978f11a9ba64d4d76b4cf3e72a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 22 Sep 2025 19:04:32 +0800 Subject: [PATCH 093/185] [Misc] Remove unused encoder-decoder error strings (#25374) Signed-off-by: DarkLight1337 --- vllm/attention/backends/utils.py | 5 --- vllm/utils/__init__.py | 58 -------------------------------- 2 files changed, 63 deletions(-) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 33d8168f8a13c..63ee8f50825c5 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -18,11 +18,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad logger = init_logger(__name__) -# Error string(s) for encoder/decoder -# unsupported attention scenarios -STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported " - "with encoder/decoder models.") - PAD_SLOT_ID = -1 # Switch to numpy implementation of compute_slot_mapping diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 834ec9b1d30b4..b74b746a35830 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -88,64 +88,6 @@ DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 -# Exception strings for non-implemented encoder/decoder scenarios - -# Reminder: Please update docs/features/compatibility_matrix.md -# If the feature combo become valid - -STR_NOT_IMPL_ENC_DEC_SWA = \ - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ - "Prefix caching for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL = \ - "Chunked prefill for encoder/decoder models " + \ - "is not currently supported." 
- -STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = ( - "Models with logits_soft_cap " - "require FlashInfer backend, which is " - "currently not supported for encoder/decoder " - "models.") - -STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is not currently " - "supported with encoder/decoder " - "models.") - -STR_NOT_IMPL_ENC_DEC_PP = ("Pipeline parallelism is not " - "currently supported with " - "encoder/decoder models.") - -STR_NOT_IMPL_ENC_DEC_MM = ("Multimodal is not currently " - "supported with encoder/decoder " - "models.") - -STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ("Speculative decoding is not " - "currently supported with encoder/" - "decoder models.") - -STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only " - "backends currently supported with encoder/" - "decoder models.") - -# Efficiently import all enc/dec error strings -# rather than having to import all of the above -STR_NOT_IMPL_ENC_DEC_ERR_STRS = { - "STR_NOT_IMPL_ENC_DEC_SWA": STR_NOT_IMPL_ENC_DEC_SWA, - "STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE": STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - "STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL": - STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL, - "STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP": STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP, - "STR_NOT_IMPL_ENC_DEC_LORA": STR_NOT_IMPL_ENC_DEC_LORA, - "STR_NOT_IMPL_ENC_DEC_PP": STR_NOT_IMPL_ENC_DEC_PP, - "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM, - "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, - "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, -} - # Constants related to forcing the attention backend selection # String name of register which may be set in order to From 64c824cd787dd610008e1fc91dfe7c06debf909b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:08:25 +0100 Subject: [PATCH 094/185] Make pickle import check fast (#25379) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 5 +- .../{ => pre_commit}/check_pickle_imports.py | 79 ++++--------------- 2 files changed, 18 insertions(+), 66 deletions(-) rename tools/{ => pre_commit}/check_pickle_imports.py (61%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a4ea888af3f3e..bf36db7d15ed9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -155,11 +155,10 @@ repos: additional_dependencies: [regex] - id: check-pickle-imports name: Prevent new pickle/cloudpickle imports - entry: python tools/check_pickle_imports.py + entry: python tools/pre_commit/check_pickle_imports.py language: python types: [python] - pass_filenames: false - additional_dependencies: [pathspec, regex] + additional_dependencies: [regex] - id: validate-config name: Validate configuration has default values and that each field has a docstring entry: python tools/validate_config.py diff --git a/tools/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py similarity index 61% rename from tools/check_pickle_imports.py rename to tools/pre_commit/check_pickle_imports.py index fe717121db40d..acbbc1f181d69 100644 --- a/tools/check_pickle_imports.py +++ b/tools/pre_commit/check_pickle_imports.py @@ -1,20 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import sys import regex as re -try: - import pathspec -except ImportError: - print( - "ERROR: The 'pathspec' library is required. 
" - "Install it with 'pip install pathspec'.", - file=sys.stderr) - sys.exit(2) - # List of files (relative to repo root) that are allowed to import pickle or # cloudpickle # @@ -25,7 +15,7 @@ except ImportError: # Before adding new uses of pickle/cloudpickle, please consider safer # alternatives like msgpack or pydantic that are already in use in vLLM. Only # add to this list if absolutely necessary and after careful security review. -ALLOWED_FILES = set([ +ALLOWED_FILES = { # pickle 'vllm/v1/serial_utils.py', 'vllm/v1/executor/multiproc_executor.py', @@ -36,11 +26,9 @@ ALLOWED_FILES = set([ 'tests/tokenization/test_cached_tokenizer.py', 'vllm/distributed/utils.py', 'vllm/distributed/parallel_state.py', - 'vllm/engine/multiprocessing/client.py', 'vllm/distributed/device_communicators/all_reduce_utils.py', 'vllm/distributed/device_communicators/shm_broadcast.py', 'vllm/distributed/device_communicators/shm_object_storage.py', - 'vllm/engine/multiprocessing/engine.py', 'benchmarks/kernels/graph_machete_bench.py', 'benchmarks/kernels/benchmark_lora.py', 'benchmarks/kernels/benchmark_machete.py', @@ -55,65 +43,30 @@ ALLOWED_FILES = set([ 'tests/utils.py', # pickle and cloudpickle 'vllm/utils/__init__.py', - 'vllm/v1/serial_utils.py', - 'vllm/v1/executor/multiproc_executor.py', - 'vllm/transformers_utils/config.py', - 'vllm/model_executor/models/registry.py', - 'vllm/engine/multiprocessing/client.py', - 'vllm/engine/multiprocessing/engine.py', -]) +} PICKLE_RE = re.compile(r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)" r"|from\s+(pickle|cloudpickle)\s+import\b)") -def is_python_file(path): - return path.endswith('.py') - - -def scan_file(path): +def scan_file(path: str) -> int: with open(path, encoding='utf-8') as f: - for line in f: + for i, line in enumerate(f, 1): if PICKLE_RE.match(line): - return True - return False - - -def load_gitignore(repo_root): - gitignore_path = os.path.join(repo_root, '.gitignore') - patterns = [] - if os.path.exists(gitignore_path): - with open(gitignore_path, encoding='utf-8') as f: - patterns = f.read().splitlines() - # Always ignore .git directory - patterns.append('.git/') - return pathspec.PathSpec.from_lines('gitwildmatch', patterns) + print(f"{path}:{i}: " + "\033[91merror:\033[0m " # red color + "Found pickle/cloudpickle import") + return 1 + return 0 def main(): - repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - spec = load_gitignore(repo_root) - bad_files = [] - for dirpath, _, filenames in os.walk(repo_root): - for filename in filenames: - if not is_python_file(filename): - continue - abs_path = os.path.join(dirpath, filename) - rel_path = os.path.relpath(abs_path, repo_root) - # Skip ignored files - if spec.match_file(rel_path): - continue - if scan_file(abs_path) and rel_path not in ALLOWED_FILES: - bad_files.append(rel_path) - if bad_files: - print("\nERROR: The following files import 'pickle' or 'cloudpickle' " - "but are not in the allowed list:") - for f in bad_files: - print(f" {f}") - print("\nIf this is intentional, update the allowed list in " - "tools/check_pickle_imports.py.") - sys.exit(1) - sys.exit(0) + returncode = 0 + for filename in sys.argv[1:]: + if filename in ALLOWED_FILES: + continue + returncode |= scan_file(filename) + return returncode def test_regex(): @@ -149,4 +102,4 @@ if __name__ == '__main__': if '--test-regex' in sys.argv: test_regex() else: - main() + sys.exit(main()) From 3d2c56b7a958aa2c92f61424e828b1d3c3c933d4 Mon Sep 17 00:00:00 2001 From: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:23:45 +0100 Subject: [PATCH 095/185] Make `mypy` behave like a proper pre-commit hook (#25313) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .github/CODEOWNERS | 1 + .pre-commit-config.yaml | 34 ++++----- pyproject.toml | 21 ------ tools/mypy.sh | 35 --------- tools/pre_commit/mypy.py | 140 +++++++++++++++++++++++++++++++++++ vllm/entrypoints/llm.py | 4 +- vllm/entrypoints/renderer.py | 2 +- vllm/utils/__init__.py | 9 ++- vllm/utils/tensor_schema.py | 7 +- 9 files changed, 166 insertions(+), 87 deletions(-) delete mode 100755 tools/mypy.sh create mode 100755 tools/pre_commit/mypy.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 37bd0ace98a97..9d749fe8d3238 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -72,6 +72,7 @@ mkdocs.yaml @hmellor # Linting .markdownlint.yaml @hmellor .pre-commit-config.yaml @hmellor +/tools/pre_commit @hmellor # CPU /vllm/v1/worker/cpu* @bigPYJ1151 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf36db7d15ed9..8ca414ee4269b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -60,38 +60,32 @@ repos: files: ^requirements/test\.(in|txt)$ - id: mypy-local name: Run mypy for local Python installation - entry: tools/mypy.sh 0 "local" - language: python - types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] + entry: python tools/pre_commit/mypy.py 0 "local" stages: [pre-commit] # Don't run in CI + <<: &mypy_common + language: python + types_or: [python, pyi] + require_serial: true + additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 - entry: tools/mypy.sh 1 "3.9" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.9" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 - entry: tools/mypy.sh 1 "3.10" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.10" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.11 - entry: tools/mypy.sh 1 "3.11" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.11" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.12 - entry: tools/mypy.sh 1 "3.12" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.12" + <<: *mypy_common stages: [manual] # Only run in CI - id: shellcheck name: Lint shell scripts diff --git a/pyproject.toml b/pyproject.toml index f43ae69e00bdd..88c5c4067f5ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,27 +110,6 @@ ignore_missing_imports = true check_untyped_defs = true follow_imports = 
"silent" -# After fixing type errors resulting from follow_imports: "skip" -> "silent", -# move the directory here and remove it from tools/mypy.sh -files = [ - "vllm/*.py", - "vllm/assets", - "vllm/entrypoints", - "vllm/inputs", - "vllm/logging_utils", - "vllm/multimodal", - "vllm/platforms", - "vllm/transformers_utils", - "vllm/triton_utils", - "vllm/usage", -] -# TODO(woosuk): Include the code from Megatron and HuggingFace. -exclude = [ - "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", - # Ignore triton kernels in ops. - 'vllm/attention/ops/.*\.py$' -] - [tool.isort] skip_glob = [ ".buildkite/*", diff --git a/tools/mypy.sh b/tools/mypy.sh deleted file mode 100755 index 63e3b9a916634..0000000000000 --- a/tools/mypy.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -CI=${1:-0} -PYTHON_VERSION=${2:-local} - -if [ "$CI" -eq 1 ]; then - set -e -fi - -if [ $PYTHON_VERSION == "local" ]; then - PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') -fi - -run_mypy() { - echo "Running mypy on $1" - if [ "$CI" -eq 1 ] && [ -z "$1" ]; then - mypy --python-version "${PYTHON_VERSION}" "$@" - return - fi - mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" -} - -run_mypy # Note that this is less strict than CI -run_mypy tests -run_mypy vllm/attention -run_mypy vllm/compilation -run_mypy vllm/distributed -run_mypy vllm/engine -run_mypy vllm/executor -run_mypy vllm/inputs -run_mypy vllm/lora -run_mypy --exclude 'vllm/model_executor/layers/fla/ops' vllm/model_executor -run_mypy vllm/plugins -run_mypy vllm/worker -run_mypy vllm/v1 diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py new file mode 100755 index 0000000000000..039cf6075f631 --- /dev/null +++ b/tools/pre_commit/mypy.py @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Run mypy on changed files. + +This script is designed to be used as a pre-commit hook. It runs mypy +on files that have been changed. It groups files into different mypy calls +based on their directory to avoid import following issues. + +Usage: + python tools/pre_commit/mypy.py + +Args: + ci: "1" if running in CI, "0" otherwise. In CI, follow_imports is set to + "silent" for the main group of files. + python_version: Python version to use (e.g., "3.10") or "local" to use + the local Python version. + changed_files: List of changed files to check. +""" + +import subprocess +import sys +from typing import Optional + +import regex as re + +FILES = [ + "vllm/*.py", + "vllm/assets", + "vllm/entrypoints", + "vllm/inputs", + "vllm/logging_utils", + "vllm/multimodal", + "vllm/platforms", + "vllm/transformers_utils", + "vllm/triton_utils", + "vllm/usage", +] + +# After fixing errors resulting from changing follow_imports +# from "skip" to "silent", move the following directories to FILES +SEPARATE_GROUPS = [ + "tests", + "vllm/attention", + "vllm/compilation", + "vllm/distributed", + "vllm/engine", + "vllm/executor", + "vllm/inputs", + "vllm/lora", + "vllm/model_executor", + "vllm/plugins", + "vllm/worker", + "vllm/v1", +] + +# TODO(woosuk): Include the code from Megatron and HuggingFace. +EXCLUDE = [ + "vllm/model_executor/parallel_utils", + "vllm/model_executor/models", + "vllm/model_executor/layers/fla/ops", + # Ignore triton kernels in ops. + "vllm/attention/ops", +] + + +def group_files(changed_files: list[str]) -> dict[str, list[str]]: + """ + Group changed files into different mypy calls. 
+ + Args: + changed_files: List of changed files. + + Returns: + A dictionary mapping file group names to lists of changed files. + """ + exclude_pattern = re.compile(f"^{'|'.join(EXCLUDE)}.*") + files_pattern = re.compile(f"^({'|'.join(FILES)}).*") + file_groups = {"": []} + file_groups.update({k: [] for k in SEPARATE_GROUPS}) + for changed_file in changed_files: + # Skip files which should be ignored completely + if exclude_pattern.match(changed_file): + continue + # Group files by mypy call + if files_pattern.match(changed_file): + file_groups[""].append(changed_file) + continue + else: + for directory in SEPARATE_GROUPS: + if re.match(f"^{directory}.*", changed_file): + file_groups[directory].append(changed_file) + break + return file_groups + + +def mypy(targets: list[str], python_version: Optional[str], + follow_imports: Optional[str], file_group: str) -> int: + """ + Run mypy on the given targets. + + Args: + targets: List of files or directories to check. + python_version: Python version to use (e.g., "3.10") or None to use + the default mypy version. + follow_imports: Value for the --follow-imports option or None to use + the default mypy behavior. + file_group: The file group name for logging purposes. + + Returns: + The return code from mypy. + """ + args = ["mypy"] + if python_version is not None: + args += ["--python-version", python_version] + if follow_imports is not None: + args += ["--follow-imports", follow_imports] + print(f"$ {' '.join(args)} {file_group}") + return subprocess.run(args + targets, check=False).returncode + + +def main(): + ci = sys.argv[1] == "1" + python_version = sys.argv[2] + file_groups = group_files(sys.argv[3:]) + + if python_version == "local": + python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + + returncode = 0 + for file_group, changed_files in file_groups.items(): + follow_imports = None if ci and file_group == "" else "skip" + if changed_files: + returncode |= mypy(changed_files, python_version, follow_imports, + file_group) + return returncode + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 092d3f276d1c5..c41f44aa47187 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1468,7 +1468,7 @@ class LLM: def _validate_and_add_requests( self, - prompts: Union[PromptType, Sequence[PromptType]], + prompts: Union[PromptType, Sequence[PromptType], DataPrompt], params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams, Sequence[PoolingParams]], *, @@ -1478,7 +1478,7 @@ class LLM: ) -> None: if isinstance(prompts, (str, dict)): # Convert a single prompt to a list. - prompts = [prompts] + prompts = [prompts] # type: ignore[list-item] num_requests = len(prompts) if isinstance(params, Sequence) and len(params) != num_requests: diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index fb859d57be9fe..d7ce57c728ba6 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -280,7 +280,7 @@ class CompletionRenderer(BaseRenderer): if truncate_prompt_tokens < 0: truncate_prompt_tokens = self.model_config.max_model_len - if max_length is not None and truncate_prompt_tokens > max_length: + if max_length is not None and truncate_prompt_tokens > max_length: # type: ignore[operator] raise ValueError( f"truncate_prompt_tokens ({truncate_prompt_tokens}) " f"cannot be greater than max_length ({max_length}). 
" diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index b74b746a35830..022e35a399c53 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -551,9 +551,10 @@ class AsyncMicrobatchTokenizer: # If every request uses identical kwargs we can run a single # batched tokenizer call for a big speed-up. if can_batch and len(prompts) > 1: - encode_fn = partial(self.tokenizer, prompts, **kwargs) + batch_encode_fn = partial(self.tokenizer, prompts, + **kwargs) results = await self._loop.run_in_executor( - self._executor, encode_fn) + self._executor, batch_encode_fn) for i, fut in enumerate(result_futures): if not fut.done(): @@ -889,7 +890,7 @@ def get_open_port() -> int: def get_open_ports_list(count: int = 5) -> list[int]: """Get a list of open ports.""" - ports = set() + ports = set[int]() while len(ports) < count: ports.add(get_open_port()) return list(ports) @@ -1279,7 +1280,7 @@ def as_list(maybe_list: Iterable[T]) -> list[T]: def as_iter(obj: Union[T, Iterable[T]]) -> Iterable[T]: if isinstance(obj, str) or not isinstance(obj, Iterable): - obj = [obj] + return [obj] # type: ignore[list-item] return obj diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py index 21d3249fe1547..d75dbcd5401b2 100644 --- a/vllm/utils/tensor_schema.py +++ b/vllm/utils/tensor_schema.py @@ -22,9 +22,8 @@ class TensorShape: self.dims = dims self.dynamic_dims = dynamic_dims if dynamic_dims else set() - def resolve(self, **bindings: dict[str, - int]) -> tuple[Union[int, str], ...]: - resolved = [] + def resolve(self, **bindings: int) -> tuple[Union[int, str], ...]: + resolved = list[Union[int, str]]() for dim in self.dims: if isinstance(dim, str) and dim in bindings: resolved.append(bindings[dim]) @@ -159,7 +158,7 @@ class TensorSchema: def validate(self) -> None: type_hints = get_type_hints(self.__class__, include_extras=True) - shape_env = {} + shape_env = dict[str, int]() for field_name, field_type in type_hints.items(): # Check if field is missing From ac243886b02399d327314b2d8bbf9989fd07f247 Mon Sep 17 00:00:00 2001 From: Sara-KS <50249410+Sara-KS@users.noreply.github.com> Date: Mon, 22 Sep 2025 09:29:54 -0500 Subject: [PATCH 096/185] [Kernel] MI-300X triton moe configs (#23445) Signed-off-by: Sara Kokkila Schumacher --- ...N=128,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=256,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=512,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=192,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=384,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=768,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ 6 files changed, 1200 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json 
b/vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..40d86ff8ba324 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + 
"3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..6014d827d7417 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + 
"num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..3622659f3e915 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + 
"kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..311d2e829a050 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, 
+ "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..91c4b916b8649 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..8fee30ec70660 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} From c10101a3eba053d22842edd95941d49e4324c727 Mon Sep 17 00:00:00 2001 From: Csrayz Date: Mon, 22 Sep 2025 22:53:13 +0800 Subject: [PATCH 097/185] [Bugfix] Fix several issues with p2p xPyD in GET type (#23993) Signed-off-by: Csrayz Signed-off-by: ivyilike Co-authored-by: ivyilike --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 5 +++- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 26 ++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git 
a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index ec72905a0d3ec..3dadfa595ef1e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -178,6 +178,9 @@ class P2pNcclConnector(KVConnectorBase_V1): # Load the KV for each request each layer for request in metadata.requests: + request_id = request.request_id + ip, port = self.parse_request_id(request_id, False) + remote_address = ip + ":" + str(port + self._rank) for layer_name in forward_context.no_compile_layers: layer = forward_context.no_compile_layers[layer_name] @@ -191,7 +194,7 @@ class P2pNcclConnector(KVConnectorBase_V1): layer = kv_cache[forward_context.virtual_engine] kv_cache = self.p2p_nccl_engine.recv_tensor( - request.request_id + "#" + layer_name) + request.request_id + "#" + layer_name, remote_address) if kv_cache is None: logger.warning("🚧kv_cache is None, %s", request.request_id) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index fa7cc66ab654d..959bf0277a3f5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -134,7 +134,6 @@ class P2pNcclEngine: # PUT or PUT_ASYNC # tensor_id: torch.Tensor self.send_queue: deque[SendQueueItem] = deque() - self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} if self.send_type == "PUT_ASYNC": self._send_thread = threading.Thread(target=self.send_async, daemon=True) @@ -143,6 +142,7 @@ class P2pNcclEngine: # tensor_id: torch.Tensor/(addr, dtype, shape) self.recv_store: dict[str, Any] = {} self.recv_request_id_to_tensor_ids: dict[str, set[str]] = {} + self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} self.socks: dict[str, Any] = {} # remote_address: client socket self.comms: dict[str, Any] = {} # remote_address: (ncclComm_t, rank) @@ -223,18 +223,26 @@ class P2pNcclEngine: # GET with self.send_store_cv: tensor_size = tensor.element_size() * tensor.numel() + if tensor_size > self.buffer_size_threshold: + logger.warning( + "❗[GET]tensor_id:%s, tensor_size:%d, is greater than" + "buffer size threshold :%d, skip send to %s, rank:%d", + tensor_id, tensor_size, self.buffer_size_threshold, + remote_address, self.rank) + return False while (self.buffer_size + tensor_size > self.buffer_size_threshold): - oldest_tenser_id = next(iter(self.send_store)) - oldest_tenser = self.send_store.pop(oldest_tenser_id) - oldest_tenser_size = oldest_tenser.element_size( - ) * oldest_tenser.numel() - self.buffer_size -= oldest_tenser_size - logger.info( + assert len(self.send_store) > 0 + oldest_tensor_id = next(iter(self.send_store)) + oldest_tensor = self.send_store.pop(oldest_tensor_id) + oldest_tensor_size = oldest_tensor.element_size( + ) * oldest_tensor.numel() + self.buffer_size -= oldest_tensor_size + logger.debug( "⛔[GET]Send to %s, tensor_id:%s, tensor_size:%d," - " buffer_size:%d, oldest_tenser_size:%d, rank:%d", + " buffer_size:%d, oldest_tensor_size:%d, rank:%d", remote_address, tensor_id, tensor_size, self.buffer_size, - oldest_tenser_size, self.rank) + oldest_tensor_size, self.rank) self.send_store[tensor_id] = tensor self.buffer_size += tensor_size From 175811e3b53f8f13eb3e8ac6aae02050f6e1412f Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 22 Sep 2025 
17:20:28 +0200 Subject: [PATCH 098/185] [V1][Attention] Split triton_attn in triton-only and rocm specific backends (#24648) Signed-off-by: Burkhard Ringlein --- vllm/engine/arg_utils.py | 1 + vllm/platforms/interface.py | 1 + vllm/platforms/rocm.py | 10 + vllm/v1/attention/backends/rocm_attn.py | 426 ++++++++++++++++++++++ vllm/v1/attention/backends/triton_attn.py | 167 +++------ 5 files changed, 482 insertions(+), 123 deletions(-) create mode 100644 vllm/v1/attention/backends/rocm_attn.py diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b09d43f705580..d4d801b155e17 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1494,6 +1494,7 @@ class EngineArgs: "FLEX_ATTENTION", "TREE_ATTN", "XFORMERS_VLLM_V1", + "ROCM_ATTN_VLLM_V1", ] if (envs.is_set("VLLM_ATTENTION_BACKEND") and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 8a05c84d4242e..cad04ea14c01c 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -67,6 +67,7 @@ class _Backend(enum.Enum): FLEX_ATTENTION = enum.auto() TREE_ATTN = enum.auto() XFORMERS_VLLM_V1 = enum.auto() + ROCM_ATTN_VLLM_V1 = enum.auto() class PlatformEnum(enum.Enum): diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 0c7b9c2a4abf6..6a49bd4a33866 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -231,7 +231,17 @@ class RocmPlatform(Platform): logger.info("Using Flash Attention backend on V1 engine.") return ("vllm.v1.attention.backends." "rocm_aiter_fa.AiterFlashAttentionBackend") + elif (envs.VLLM_ROCM_USE_AITER and + envs.VLLM_USE_AITER_UNIFIED_ATTENTION) or \ + envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION or \ + selected_backend == _Backend.ROCM_ATTN_VLLM_V1: + # rocm specific backend, with aiter and/or + # triton prefix-prefill + logger.info("Using Rocm/Aiter Attention backend on V1 engine.") + return ("vllm.v1.attention.backends." + "rocm_attn.RocmAttentionBackend") else: + # default case, using triton unified attention logger.info("Using Triton Attention backend on V1 engine.") return ("vllm.v1.attention.backends." 
"triton_attn.TritonAttentionBackend") diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py new file mode 100644 index 0000000000000..365df5f0d6eca --- /dev/null +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -0,0 +1,426 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Attention layer with PagedAttention and Triton prefix prefill.""" +from dataclasses import dataclass +from functools import cache +from typing import ClassVar, Optional + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.ops.chunked_prefill_paged_decode import ( + chunked_prefill_paged_decode) +from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, kFp8StaticTensorSym) +from vllm.platforms import current_platform +from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec + +logger = init_logger(__name__) + + +@dataclass +class RocmAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + num_actual_tokens: int # Number of tokens excluding padding. + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + # For cascade attention. + use_cascade: bool + common_prefix_len: int + cu_prefix_query_lens: Optional[torch.Tensor] + prefix_kv_lens: Optional[torch.Tensor] + suffix_kv_lens: Optional[torch.Tensor] + + # Optional aot scheduling + scheduler_metadata: Optional[torch.Tensor] = None + prefix_scheduler_metadata: Optional[torch.Tensor] = None + + +class RocmAttentionMetadataBuilder( + AttentionMetadataBuilder[RocmAttentionMetadata]): + cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + + self.block_size = kv_cache_spec.block_size + + model_config = vllm_config.model_config + self.num_heads_q = model_config.get_num_attention_heads( + vllm_config.parallel_config) + self.num_heads_kv = model_config.get_num_kv_heads( + vllm_config.parallel_config) + self.headdim = model_config.get_head_size() + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata + ) -> RocmAttentionMetadata: + attn_metadata = self.build(0, common_attn_metadata) + # When doing full graph capture, setting seq_lens to + # max_model_len will cause graph capture to be extremely + # slow, so here we set it to 1. 
+ attn_metadata.seq_lens.fill_(1) + return attn_metadata + + def build(self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False) -> RocmAttentionMetadata: + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len + + max_seq_len = common_attn_metadata.max_seq_len + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + block_table_tensor = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + use_cascade = common_prefix_len > 0 + + if use_cascade: + cu_prefix_query_lens = torch.tensor([0, num_actual_tokens], + dtype=torch.int32, + device=self.device) + prefix_kv_lens = torch.tensor([common_prefix_len], + dtype=torch.int32, + device=self.device) + suffix_kv_lens = (common_attn_metadata.seq_lens_cpu - + common_prefix_len) + suffix_kv_lens = suffix_kv_lens.to(self.device) + else: + cu_prefix_query_lens = None + prefix_kv_lens = None + suffix_kv_lens = None + prefix_scheduler_metadata = None + + attn_metadata = RocmAttentionMetadata( + num_actual_tokens=num_actual_tokens, + max_query_len=max_query_len, + query_start_loc=query_start_loc, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table=block_table_tensor, + slot_mapping=slot_mapping, + use_cascade=use_cascade, + common_prefix_len=common_prefix_len, + cu_prefix_query_lens=cu_prefix_query_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, + prefix_scheduler_metadata=prefix_scheduler_metadata, + ) + return attn_metadata + + +class RocmAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + + @classmethod + def validate_head_size(cls, head_size: int) -> None: + supported_head_sizes = cls.get_supported_head_sizes() + if head_size not in supported_head_sizes: + attn_type = cls.__name__.removesuffix("Backend") + raise ValueError( + f"Head size {head_size} is not supported by {attn_type}. " + f"Supported head sizes are: {supported_head_sizes}. 
" + "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " + "FlexAttention backend which supports all head sizes.") + + @staticmethod + def get_name() -> str: + return "ROCM_ATTN_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["RocmAttentionImpl"]: + return RocmAttentionImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return RocmAttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + @staticmethod + def get_builder_cls() -> type["RocmAttentionMetadataBuilder"]: + return RocmAttentionMetadataBuilder + + +@cache +def use_aiter_unified_attention() -> bool: + """Check if aiter unified attention should be used.""" + # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set + # to 1 as default + return envs.VLLM_ROCM_USE_AITER \ + and envs.VLLM_USE_AITER_UNIFIED_ATTENTION + + +class RocmAttentionImpl(AttentionImpl): + + def fused_output_quant_supported(self, quant_key: QuantKey): + return quant_key == kFp8StaticTensorSym + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, + sinks: Optional[torch.Tensor] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + if sliding_window is None: + self.sliding_window = (-1, -1) + else: + self.sliding_window = (sliding_window - 1, 0) + self.kv_cache_dtype = kv_cache_dtype + if logits_soft_cap is None: + # In flash-attn, setting logits_soft_cap as 0 means no soft cap. + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + RocmAttentionBackend.validate_head_size(head_size) + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "RocmAttentionImpl") + + self.fp8_dtype = current_platform.fp8_dtype() + self.force_prefill_decode_attn = \ + envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + + if not self.force_prefill_decode_attn: + # If not using prefill decode attention, we use the Triton + # unified attention implementation. + if use_aiter_unified_attention(): + logger.info_once( + "Using aiter unified attention for RocmAttentionImpl") + from aiter.ops.triton.unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + else: + logger.info_once( + "Using vllm unified attention for RocmAttentionImpl") + from vllm.attention.ops.triton_unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + + self.sinks = sinks + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. 
Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}.") + + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: FlashAttentionMetadata, + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + output_block_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with FlashAttention. + + Args: + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert output is not None, "Output tensor must be provided." + + if output_block_scale is not None: + raise NotImplementedError( + "fused block_scale output quantization is not yet supported" + " for RocmAttentionImpl") + + if attn_metadata is None: + # Profiling run. + return output + + assert attn_metadata.use_cascade is False + + # IMPORTANT! + # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in + # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead + # in this method. For example, `view` and `slice` (or `[:n]`) operations + # are surprisingly slow even in the case they do not invoke any GPU ops. + # Minimize the PyTorch ops in this method as much as possible. + # Whenever making a change in this method, please benchmark the + # performance to make sure it does not introduce any overhead. + + use_prefill_decode_attn = self.force_prefill_decode_attn + num_actual_tokens = attn_metadata.num_actual_tokens + + if use_prefill_decode_attn: + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + else: + key_cache, value_cache = kv_cache.unbind(0) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. + if use_prefill_decode_attn: + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + else: + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + if self.kv_cache_dtype.startswith("fp8"): + key_cache = key_cache.view(self.fp8_dtype) + value_cache = value_cache.view(self.fp8_dtype) + num_tokens, num_heads, head_size = query.shape + assert layer._q_scale_float == 1.0, \ + "A non 1.0 q_scale is not currently supported." + if current_platform.is_cuda(): + # Skip Q quantization on ROCm and XPU, enable this on cuda + # only, since dequantizing back to f32 in the attention kernel + # is not supported. + query, _ = ops.scaled_fp8_quant( + query.reshape( + (num_tokens, num_heads * head_size)).contiguous(), + layer._q_scale) + query = query.reshape((num_tokens, num_heads, head_size)) + + cu_seqlens_q = attn_metadata.query_start_loc + seqused_k = attn_metadata.seq_lens + max_seqlen_q = attn_metadata.max_query_len + max_seqlen_k = attn_metadata.max_seq_len + block_table = attn_metadata.block_table + + if use_prefill_decode_attn: + # Compute attention and update output up to `num_actual_tokens`. 
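+            # This is the split prefill/decode paged-attention path taken when
+            # VLLM_V1_USE_PREFILL_DECODE_ATTENTION is set; the else branch below
+            # calls the unified attention kernel selected in __init__ instead.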
+ chunked_prefill_paged_decode( + query=query[:num_actual_tokens], + key=key[:num_actual_tokens], + value=value[:num_actual_tokens], + output=output[:num_actual_tokens], + kv_cache_dtype=self.kv_cache_dtype, + key_cache=key_cache, + value_cache=value_cache, + block_table=block_table, + query_start_loc=cu_seqlens_q, + seq_lens=seqused_k, + max_seq_len=max_seqlen_k, + max_query_len=max_seqlen_q, + k_scale=layer._k_scale, + v_scale=layer._v_scale, + alibi_slopes=self.alibi_slopes, + sliding_window=self.sliding_window[0], + sm_scale=self.scale, + output_scale=output_scale, + sinks=self.sinks, + ) + + else: + descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) + + self.unified_attention( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + seqused_k=seqused_k, + max_seqlen_k=max_seqlen_k, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + sinks=self.sinks, + output_scale=output_scale) + + return output diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 784912a122f68..722c23f150cd3 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -1,24 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with PagedAttention and Triton prefix prefill.""" +"""High-Performance Triton-only Attention layer.""" from dataclasses import dataclass -from functools import cache from typing import ClassVar, Optional import torch -from vllm import envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) -from vllm.attention.ops.chunked_prefill_paged_decode import ( - chunked_prefill_paged_decode) -from vllm.attention.ops.paged_attn import PagedAttention +from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8StaticTensorSym) from vllm.platforms import current_platform -from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import (AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata) @@ -144,20 +139,15 @@ class TritonAttentionBackend(AttentionBackend): @classmethod def get_supported_dtypes(cls) -> list[torch.dtype]: - return [torch.float16, torch.bfloat16] - - @classmethod - def get_supported_head_sizes(cls) -> list[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] + return [torch.float16, torch.bfloat16, torch.float32] @classmethod def validate_head_size(cls, head_size: int) -> None: - supported_head_sizes = cls.get_supported_head_sizes() - if head_size not in supported_head_sizes: - attn_type = cls.__name__.removesuffix("Backend") + # Triton Attention supports any head size above 32 + if head_size < 32: raise ValueError( - f"Head size {head_size} is not supported by {attn_type}. " - f"Supported head sizes are: {supported_head_sizes}. " + f"Head size {head_size} is not supported by TritonAttention." + f"Head sizes need to be larger or equal 32 for this backend. 
" "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " "FlexAttention backend which supports all head sizes.") @@ -182,7 +172,7 @@ class TritonAttentionBackend(AttentionBackend): ) -> tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") - return (2, num_blocks, block_size, num_kv_heads, head_size) + return (num_blocks, 2, block_size, num_kv_heads, head_size) @staticmethod def use_cascade_attention(*args, **kwargs) -> bool: @@ -193,15 +183,6 @@ class TritonAttentionBackend(AttentionBackend): return TritonAttentionMetadataBuilder -@cache -def use_aiter_unified_attention() -> bool: - """Check if aiter unified attention should be used.""" - # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set - # to 1 as default - return envs.VLLM_ROCM_USE_AITER \ - and envs.VLLM_USE_AITER_UNIFIED_ATTENTION - - class TritonAttentionImpl(AttentionImpl): def fused_output_quant_supported(self, quant_key: QuantKey): @@ -250,24 +231,6 @@ class TritonAttentionImpl(AttentionImpl): "TritonAttentionImpl") self.fp8_dtype = current_platform.fp8_dtype() - self.force_prefill_decode_attn = \ - envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION - - if not self.force_prefill_decode_attn: - # If not using prefill decode attention, we use the Triton - # unified attention implementation. - if use_aiter_unified_attention(): - logger.info_once( - "Using aiter unified attention for TritonAttentionImpl") - from aiter.ops.triton.unified_attention import ( - unified_attention) - self.unified_attention = unified_attention - else: - logger.info_once( - "Using vllm unified attention for TritonAttentionImpl") - from vllm.attention.ops.triton_unified_attention import ( - unified_attention) - self.unified_attention = unified_attention self.sinks = sinks if sinks is not None: @@ -283,19 +246,19 @@ class TritonAttentionImpl(AttentionImpl): key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_metadata: FlashAttentionMetadata, + attn_metadata: TritonAttentionMetadata, output: Optional[torch.Tensor] = None, output_scale: Optional[torch.Tensor] = None, output_block_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: - """Forward pass with FlashAttention. + """Forward pass with Paged Attention impl. in Triton. Args: query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] kv_cache: shape = - [2, num_blocks, block_size, num_kv_heads, head_size] + [num_blocks, 2, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -322,40 +285,22 @@ class TritonAttentionImpl(AttentionImpl): # Whenever making a change in this method, please benchmark the # performance to make sure it does not introduce any overhead. - use_prefill_decode_attn = self.force_prefill_decode_attn num_actual_tokens = attn_metadata.num_actual_tokens - - if use_prefill_decode_attn: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - else: - key_cache, value_cache = kv_cache.unbind(0) + key_cache, value_cache = kv_cache.unbind(1) if self.kv_sharing_target_layer_name is None: # Reshape the input keys and values and store them in the cache. # Skip this if sharing KV cache with an earlier attention layer. 
- if use_prefill_decode_attn: - PagedAttention.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - else: - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(self.fp8_dtype) @@ -379,52 +324,28 @@ class TritonAttentionImpl(AttentionImpl): max_seqlen_k = attn_metadata.max_seq_len block_table = attn_metadata.block_table - if use_prefill_decode_attn: - # Compute attention and update output up to `num_actual_tokens`. - chunked_prefill_paged_decode( - query=query[:num_actual_tokens], - key=key[:num_actual_tokens], - value=value[:num_actual_tokens], - output=output[:num_actual_tokens], - kv_cache_dtype=self.kv_cache_dtype, - key_cache=key_cache, - value_cache=value_cache, - block_table=block_table, - query_start_loc=cu_seqlens_q, - seq_lens=seqused_k, - max_seq_len=max_seqlen_k, - max_query_len=max_seqlen_q, - k_scale=layer._k_scale, - v_scale=layer._v_scale, - alibi_slopes=self.alibi_slopes, - sliding_window=self.sliding_window[0], - sm_scale=self.scale, - output_scale=output_scale, - sinks=self.sinks, - ) + descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - else: - descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - - self.unified_attention( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=cu_seqlens_q, - max_seqlen_q=max_seqlen_q, - seqused_k=seqused_k, - max_seqlen_k=max_seqlen_k, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=block_table, - softcap=self.logits_soft_cap, - q_descale=None, # Not supported - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - sinks=self.sinks, - output_scale=output_scale) + unified_attention( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + seqused_k=seqused_k, + max_seqlen_k=max_seqlen_k, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + sinks=self.sinks, + output_scale=output_scale, + ) return output From 06a41334c7e5c23b9a053f866c02524945d7d1db Mon Sep 17 00:00:00 2001 From: Bowen Wang Date: Mon, 22 Sep 2025 09:31:05 -0700 Subject: [PATCH 099/185] [EPLB] Reduce EPLB Inference Overhead (#24573) Signed-off-by: Bowen Wang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Tyler Michael Smith --- .../layers/fused_moe/fused_moe.py | 73 +++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 69 +++++------------- 2 files changed, 92 insertions(+), 50 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6c2a5bda7cbaa..0e334fdf24045 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py 
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1017,6 +1017,79 @@ def grouped_topk( return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +def eplb_map_to_physical_and_record( + topk_ids: torch.Tensor, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + indices_type: Optional[torch.dtype] = None) -> torch.Tensor: + ''' + Map the logical expert ids to physical expert ids + and record the expert load metrics. + + This will select a pseudo-random replica for each logical expert. + Only used for EPLB. + + Args: + topk_ids: The logical expert ids. + expert_load_view: The expert load view. + logical_to_physical_map: The logical to physical map. + logical_replica_count: The logical replica count. + indices_type: The indices type. + + Returns: + The physical expert ids. + ''' + + # 1. Convert the logical expert ids to physical expert ids + # Directly select a random replica for each logical expert + + # In case `indices_type` is not `torch.long` or `torch.int`, + # e.g. `torch.uint32` as required by dispatch/combine kernels + topk_ids_long = topk_ids.long() + # Use (token position) modulo (replica count) + # to deterministically choose a replica + replica_count = logical_replica_count[topk_ids_long] + # Flatten-position based index, reshaped back to `topk_ids` shape + pos_indices = torch.arange(topk_ids.numel(), + device=topk_ids.device, + dtype=torch.long).reshape_as(topk_ids) + # Compute pseudo-random indices by modulo + replica_indices = (pos_indices % replica_count).unsqueeze(-1) + physical_ids = logical_to_physical_map[topk_ids_long].gather( + -1, replica_indices).squeeze(-1) + + topk_ids = physical_ids + + # 2. Record expert load metrics. + + # TODO(bowen): When using `FusedMoEModularKernel`, this + # can be done in a more unified way, since + # `FusedMoEPrepareAndFinalize` will return the expert + # token count, in some cases directly from the kernel. + # However, now there are many code paths not using + # the modular kernel, e.g. calling `fused_experts`, + # so we decide to keep the logic here. + # + # If later refactor moved all the MoE kernel calls + # to the modular kernel, we can move this logic there + # to achieve better efficiency. + + # `expert_load_view`: (num_physical_experts,) + + # `torch.bincount` is not compilable, so use `scatter_add_` instead. 
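# [Editor's sketch, not part of the patch] The scatter_add_ below is a
# torch.compile-friendly replacement for bincount: adding 1 at every selected
# expert id accumulates the same per-expert token counts. Example values are
# made up for illustration.
import torch

topk_ids = torch.tensor([[0, 2], [2, 1], [3, 2]])
num_physical_experts = 4
expert_load_view = torch.zeros(num_physical_experts, dtype=torch.long)
flat = topk_ids.flatten()
expert_load_view.scatter_add_(0, flat.long(),
                              torch.ones_like(flat).to(expert_load_view))
expected = torch.bincount(flat, minlength=num_physical_experts)
assert torch.equal(expert_load_view, expected)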
+ topk_ids_flatten = topk_ids.flatten() + expert_load_view.scatter_add_( + dim=0, + index=topk_ids_flatten.long(), + src=torch.ones_like(topk_ids_flatten).to(expert_load_view)) + + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) + return topk_ids + + def fused_grouped_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index da513d75da4da..17ad75584a3f6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -43,7 +43,8 @@ from vllm.v1.worker.ubatching import dbo_current_ubatch_id if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts - from .fused_moe import TritonExperts, fused_experts + from .fused_moe import (TritonExperts, eplb_map_to_physical_and_record, + fused_experts) if has_pplx(): from .pplx_prepare_finalize import (PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes) @@ -55,6 +56,16 @@ else: fused_experts = None # type: ignore FusedMoEPermuteExpertsUnpermute = None # type: ignore FusedMoEPrepareAndFinalize = None # type: ignore + + def eplb_map_to_physical_and_record( + topk_ids: torch.Tensor, expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + indices_type: Optional[torch.dtype]) -> torch.Tensor: + # CPU fallback: no EPLB so just return as is + return topk_ids + + if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_grouped_topk as grouped_topk) @@ -1616,55 +1627,13 @@ class FusedMoE(CustomOp): assert logical_to_physical_map is not None assert logical_replica_count is not None - # 1. Convert the logical expert ids to physical expert ids - # Directly select a random replica for each logical expert - - # TODO: maybe optimize this by using specified kernels, - # or compute pseudo-random indices by modulo - - # In case `indices_type` is not `torch.long` or `torch.int`, - # e.g. `torch.uint32` as required by dispatch/combine kernels - topk_ids_long = topk_ids.long() - replica_indices = ( - torch.rand_like(topk_ids, dtype=torch.float) * - logical_replica_count[topk_ids_long]).long().unsqueeze(-1) - physical_ids = logical_to_physical_map[topk_ids_long].gather( - -1, replica_indices).squeeze(-1) - - topk_ids = physical_ids - - # 2. Record expert load metrics. - - # TODO(bowen): When using `FusedMoEModularKernel`, this - # can be done in a more unified way, since - # `FusedMoEPrepareAndFinalize` will return the expert - # token count, in some cases directly from the kernel. - # However, now there are many code paths not using - # the modular kernel, e.g. calling `fused_experts`, - # so we decide to keep the logic here. - # - # If later refactor moved all the MoE kernel calls - # to the modular kernel, we can move this logic there - # to achieve better efficiency. 
- - # `expert_load_view`: (num_physical_experts,) - - topk_ids_flatten = topk_ids.flatten() - - # Performance optimization: - # `masked_fill` is significantly faster than `masked_select` - invalid_mask = topk_ids_flatten < 0 - # Replace invalid expert ids with 0 (just a dummy position) - # to avoid out-of-bounds errors in scatter_add_ - index = topk_ids_flatten.masked_fill_(invalid_mask, 0) - # `src` is the valid mask, which is 1 for valid and 0 for invalid - src = ~invalid_mask - - expert_load_view.scatter_add_(dim=0, - index=index.long(), - src=src.to(expert_load_view)) - - topk_ids = topk_ids.to(dtype=indices_type) + topk_ids = eplb_map_to_physical_and_record( + topk_ids=topk_ids, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + indices_type=indices_type, + ) assert topk_ids.dtype == indices_type or indices_type is None From cfbee3d0e72582b0b0d910ecbd9a7c1028a7205c Mon Sep 17 00:00:00 2001 From: Daisy-Ma-coder Date: Mon, 22 Sep 2025 10:37:43 -0700 Subject: [PATCH 100/185] [CLI env var] Add VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH in env variables (#25274) Signed-off-by: qqma Signed-off-by: DarkLight1337 Co-authored-by: qqma Co-authored-by: Cyrus Leung --- tests/compile/piecewise/test_full_cudagraph.py | 11 +++++++++-- tests/v1/cudagraph/test_cudagraph_mode.py | 11 +++++++++-- vllm/envs.py | 8 ++++++++ vllm/v1/attention/backends/flash_attn.py | 7 +++---- vllm/v1/attention/backends/mla/flashattn_mla.py | 8 +++----- 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 2454f85342eba..780a0d6b5c0e4 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -46,7 +46,10 @@ backend_configs = { # FA3 on Hopper "FA3": BackendConfig(name="FA3", - env_vars={"VLLM_FLASH_ATTN_VERSION": "3"}, + env_vars={ + "VLLM_FLASH_ATTN_VERSION": "3", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + }, comp_config={ "cudagraph_mode": "FULL", }, @@ -66,6 +69,7 @@ backend_configs = { BackendConfig(name="FlashAttentionMLA", env_vars={ "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", }, comp_config={ "cudagraph_mode": "FULL_DECODE_ONLY", @@ -89,7 +93,10 @@ backend_configs = { # FA2 "FA2": BackendConfig(name="FA2", - env_vars={"VLLM_FLASH_ATTN_VERSION": "2"}, + env_vars={ + "VLLM_FLASH_ATTN_VERSION": "2", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + }, comp_config={ "cudagraph_mode": "FULL", }), diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 25e01806f4956..1ae9185fafbdd 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -47,7 +47,10 @@ backend_configs = { # FA3 on Hopper "FA3": BackendConfig(name="FA3", - env_vars={"VLLM_FLASH_ATTN_VERSION": "3"}, + env_vars={ + "VLLM_FLASH_ATTN_VERSION": "3", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + }, comp_config={ "cudagraph_mode": "FULL", }, @@ -67,6 +70,7 @@ backend_configs = { BackendConfig(name="FlashAttentionMLA", env_vars={ "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", }, comp_config={ "cudagraph_mode": "FULL_DECODE_ONLY", @@ -75,7 +79,10 @@ backend_configs = { # FA2 "FA2": BackendConfig(name="FA2", - env_vars={"VLLM_FLASH_ATTN_VERSION": "2"}, + env_vars={ + 
"VLLM_FLASH_ATTN_VERSION": "2", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + }, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }), diff --git a/vllm/envs.py b/vllm/envs.py index cbd1d5474e60f..e517088c52903 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -119,6 +119,7 @@ if TYPE_CHECKING: VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False + VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16 VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_CUDART_SO_PATH: Optional[str] = None @@ -946,6 +947,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), + # If set, vLLM will pick up the provided Flash Attention MLA + # max number splits for cuda graph decode + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": + lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", + "16")), + # Number of GPUs per worker in Ray, if it is set to be a fraction, # it allows ray to schedule multiple actors on a single GPU, # so that users can colocate other actors on the same GPUs as vLLM. @@ -1379,6 +1386,7 @@ def compute_hash() -> str: environment_variables_to_hash = [ "VLLM_PP_LAYER_PARTITION", "VLLM_MLA_DISABLE", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", "VLLM_USE_TRITON_FLASH_ATTN", "VLLM_USE_TRITON_AWQ", "VLLM_DP_RANK", diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 20f1904b3be6f..d564cf9988ea6 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -8,6 +8,7 @@ import numpy as np import torch from vllm import _custom_ops as ops +from vllm import envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType, is_quantized_kv_cache) @@ -33,9 +34,6 @@ from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) -# NOTE(woosuk): This is an arbitrary number. Tune it if needed. -_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16 - class FlashAttentionBackend(AttentionBackend): @@ -215,7 +213,8 @@ class FlashAttentionMetadataBuilder( # When using cuda graph, we need to set the upper bound of the # number of splits so that large enough intermediate buffers are # pre-allocated during capture. - self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + self.max_num_splits = ( + envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH) # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. 
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 472095e13615b..4ad9a13b61d8e 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -6,6 +6,7 @@ from typing import ClassVar, Optional, Union import torch +from vllm import envs from vllm.attention.backends.abstract import (AttentionLayer, AttentionType, is_quantized_kv_cache) from vllm.attention.utils.fa_utils import (flash_attn_supports_mla, @@ -24,10 +25,6 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata logger = init_logger(__name__) -# NOTE(matt): This is an arbitrary number, copied from -# woosuk's implementation in standard FlashAttention backend -_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16 - class FlashAttnMLABackend(MLACommonBackend): @@ -97,7 +94,8 @@ class FlashAttnMLAMetadataBuilder( # When using cuda graph, we need to set the upper bound of the # number of splits so that large enough intermediate buffers are # pre-allocated during capture. - self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + self.max_num_splits = ( + envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH) # TODO(lucas): Until we add support for the DCP custom masking we need # to restrict decodes to q_len == 1 when DCP is enabled. From 1d7f95b85c9b1bdaa576a5275851995b75a9420d Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 22 Sep 2025 19:37:46 +0200 Subject: [PATCH 101/185] [Compiler] Disable Inductor standalone compile by default (#25391) Signed-off-by: ElizaWszola --- vllm/envs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index e517088c52903..8941be125ed08 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -126,6 +126,7 @@ if TYPE_CHECKING: VLLM_DP_RANK: int = 0 VLLM_DP_RANK_LOCAL: int = -1 VLLM_DP_SIZE: int = 1 + VLLM_USE_STANDALONE_COMPILE: bool = False VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 VLLM_MOE_DP_CHUNK_SIZE: int = 256 @@ -437,9 +438,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is - # enabled by default. + # disabled by default. 
"VLLM_USE_STANDALONE_COMPILE": - lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1", + lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "0") == "1", # local rank of the process in the distributed setting, used to determine # the GPU device id From 239ef0c1ac0dfe68d8d2e28c54ecf9aa9bcd945b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 22 Sep 2025 14:27:51 -0400 Subject: [PATCH 102/185] [CI Failure] Fix fp8 kv cache on --- vllm/platforms/cuda.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 7baa5a9742f44..b10bc03ee16c6 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -286,6 +286,9 @@ class CudaPlatformBase(Platform): TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501 + use_fp8_kv_cache = (kv_cache_dtype is not None + and kv_cache_dtype.startswith("fp8")) + if selected_backend == _Backend.FLASHINFER: logger.info_once("Using FlashInfer backend on V1 engine.") if cls.has_device_capability(100): @@ -334,10 +337,11 @@ class CudaPlatformBase(Platform): # FlashAttention is the default for SM 8.0+ GPUs if cls.has_device_capability(80): - if has_sink and not cls.is_device_capability(90): + if (has_sink or + use_fp8_kv_cache) and not cls.is_device_capability(90): logger.info_once("Using Triton backend on V1 engine.") return TRITON_ATTN_VLLM_V1 - if is_default_backend_supported := is_attn_backend_supported( + elif is_default_backend_supported := is_attn_backend_supported( FLASH_ATTN_V1, head_size, dtype, allow_import_error=False): logger.info_once("Using Flash Attention backend on " From 922979bfcc461cf0069789728f4afdf1284265fb Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:06:05 -0700 Subject: [PATCH 103/185] [DP] support torchrun external launcher with Data Parallelism (#24899) Signed-off-by: Lu Fang Signed-off-by: Zhuohan Li Co-authored-by: Zhuohan Li --- .buildkite/test-pipeline.yaml | 12 ++- .../offline_inference/torchrun_dp_example.py | 81 +++++++++++++++++++ .../distributed/test_torchrun_example_moe.py | 81 +++++++++++++++++++ vllm/config/parallel.py | 13 ++- vllm/distributed/parallel_state.py | 4 +- vllm/v1/engine/llm_engine.py | 18 ++++- 6 files changed, 202 insertions(+), 7 deletions(-) create mode 100644 examples/offline_inference/torchrun_dp_example.py create mode 100644 tests/distributed/test_torchrun_example_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fe4796b35786c..c4ea4b675649c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -165,10 +165,18 @@ steps: - tests/v1/test_hybrid_lb_dp.py - tests/v1/engine/test_engine_core_client.py commands: - # test with tp=2 and external_dp=2 + # test with torchrun tp=2 and external_dp=2 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with tp=2 and pp=2 + # test with torchrun tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 
distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py new file mode 100644 index 0000000000000..8e888a100254e --- /dev/null +++ b/examples/offline_inference/torchrun_dp_example.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +experimental support for data-parallel inference with torchrun +Note the data load balancing and distribution is done out of the vllm engine, +no internal lb supported in external_launcher mode. +""" + +from vllm import LLM, SamplingParams + +# Create prompts, the same across all ranks +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] * 50 + +# Create sampling parameters, the same across all ranks +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Use `distributed_executor_backend="external_launcher"` so that +# this llm engine/instance only creates one worker. +# it is important to set an explicit seed to make sure that +# all ranks have the same random seed, so that sampling can be +# deterministic across ranks. +llm = LLM( + model="microsoft/Phi-mini-MoE-instruct", + tensor_parallel_size=1, + data_parallel_size=2, + pipeline_parallel_size=1, + enable_expert_parallel=False, + distributed_executor_backend="external_launcher", + max_model_len=4096, + gpu_memory_utilization=0.6, + seed=1, +) + +dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank +dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size + +prompts = [ + f"{idx}.{prompt}" for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank +] + +outputs = llm.generate(prompts, sampling_params) + + +# all ranks will have the same outputs +print("-" * 50) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n") + print("-" * 50) +""" +Further tips: + +1. to communicate control messages across all ranks, use the cpu group, +a PyTorch ProcessGroup with GLOO backend. + +```python +from vllm.distributed.parallel_state import get_world_group +cpu_group = get_world_group().cpu_group +torch_rank = dist.get_rank(group=cpu_group) +if torch_rank == 0: + # do something for rank 0, e.g. saving the results to disk. +``` + +2. to communicate data across all ranks, use the model's device group, +a PyTorch ProcessGroup with NCCL backend. +```python +from vllm.distributed.parallel_state import get_world_group +device_group = get_world_group().device_group +``` + +3. 
to access the model directly in every rank, use the following code: +```python +llm.llm_engine.model_executor.driver_worker.worker.model_runner.model +``` +""" diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py new file mode 100644 index 0000000000000..2d6b930fcc07e --- /dev/null +++ b/tests/distributed/test_torchrun_example_moe.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# unit test for `examples/offline_inference/torchrun_example.py` +import os +import random + +import torch.distributed as dist + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import get_tp_group, get_world_group + +dist.init_process_group(backend="gloo") + +# Create prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] * 10 +dp_size = int(os.getenv("DP_SIZE", "1")) +dp_rank = int(os.getenv("DP_RANK", "0")) + +if dp_size > 1: + # distribute the prompts across the data parallel ranks + prompts = [ + prompt for idx, prompt in enumerate(prompts) + if idx % dp_size == dp_rank + ] + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# to test if all ranks agree on the same kv cache configuration. +llm = LLM(model="microsoft/Phi-mini-MoE-instruct", + tensor_parallel_size=int(os.getenv("TP_SIZE", "1")), + pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")), + enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1, + distributed_executor_backend="external_launcher", + gpu_memory_utilization=random.uniform(0.7, 0.9), + swap_space=random.randint(1, 4), + seed=0) + +outputs = llm.generate(prompts, sampling_params) + +group = get_world_group() if dp_size == 1 else get_tp_group() +cpu_group = group.cpu_group +group_rank = dist.get_rank(group=cpu_group) + + +def test_consistent_across_ranks(obj): + if group_rank == 0: + dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group) + else: + container = [None] + dist.broadcast_object_list(container, + src=group.ranks[0], + group=cpu_group) + assert container[0] == obj + + +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) + +# make sure we can access the model parameters from the calling process +# of the `LLM` instance. +params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner. 
+ model.parameters()) +test_consistent_across_ranks(len(params)) + +# all ranks should have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + test_consistent_across_ranks(prompt) + test_consistent_across_ranks(generated_text) + print(f"Rank {group_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 37a41bf6de71a..a84d882430166 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib +import os from dataclasses import field from typing import TYPE_CHECKING, Any, Literal, Optional, Union @@ -351,6 +352,10 @@ class ParallelConfig: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size + if self.distributed_executor_backend == "external_launcher": + logger.info("Using external launcher for distributed inference.") + self.world_size *= self.data_parallel_size + if self.data_parallel_size_local > self.data_parallel_size: raise ValueError( f"data_parallel_size_local ({self.data_parallel_size_local}) " @@ -358,6 +363,13 @@ class ParallelConfig: if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: # Data parallel was specified in the engine args. + if self.distributed_executor_backend == "external_launcher": + # For external launcher, + # we need to set the data parallel rank automatically + self.data_parallel_rank = int(os.environ["RANK"]) \ + // (self.world_size // self.data_parallel_size) + logger.info("Set data_parallel_rank to %d automatically.", + self.data_parallel_rank) if not self._data_parallel_master_port_list: self._data_parallel_master_port_list = get_open_ports_list(5) self.data_parallel_master_port = \ @@ -380,7 +392,6 @@ class ParallelConfig: "be set when data_parallel_size > 1") if self.distributed_executor_backend == "external_launcher": - import os os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" logger.info("Disabling V1 multiprocessing for external launcher.") diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 12571afaa4c13..895971893a661 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1032,7 +1032,9 @@ def init_distributed_environment(world_size: int = -1, distributed_init_method, backend) from vllm.config import get_current_vllm_config config = get_current_vllm_config() - if config is not None and config.parallel_config.data_parallel_size > 1: + if config is not None and config.parallel_config.data_parallel_size > 1 \ + and config.parallel_config.distributed_executor_backend \ + != "external_launcher": parallel_config = config.parallel_config # adjust to take into account data parallelism # offset the rank by the data parallel rank diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 907656d1b24cb..92c861d9e91fe 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -11,6 +11,7 @@ from typing_extensions import TypeVar import vllm.envs as envs from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group +from vllm.distributed.parallel_state import get_dp_group from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType from vllm.logger import init_logger @@ -77,10 +78,15 @@ class LLMEngine: if self.log_stats: self.stat_logger = PrometheusStatLogger(vllm_config) + 
executor_backend = ( + self.vllm_config.parallel_config.distributed_executor_backend) + parallel_config = vllm_config.parallel_config + self.external_launcher_dp = (parallel_config.data_parallel_size > 1 and + executor_backend == "external_launcher") # important: init dp group before init the engine_core # In the decoupled engine case this is handled in EngineCoreProc. - parallel_config = vllm_config.parallel_config - if not multiprocess_mode and parallel_config.data_parallel_size > 1: + if not multiprocess_mode and parallel_config.data_parallel_size > 1 \ + and not self.external_launcher_dp: self.dp_group = parallel_config.stateless_init_dp_group() else: self.dp_group = None @@ -120,6 +126,11 @@ class LLMEngine: # for v0 compatibility self.model_executor = self.engine_core.engine_core.model_executor # type: ignore + if self.external_launcher_dp: + # If we use DP in external launcher mode, we reuse the + # existing DP group used for data communication. + self.dp_group = get_dp_group().cpu_group + # Don't keep the dummy data in memory self.reset_mm_cache() @@ -331,5 +342,6 @@ class LLMEngine: return self.collective_rpc("apply_model", args=(func, )) def __del__(self): - if dp_group := getattr(self, "dp_group", None): + if dp_group := getattr(self, "dp_group", + None) and not self.external_launcher_dp: stateless_destroy_torch_distributed_process_group(dp_group) From 8d0ee5a56452bfb671ae00e96d10841a8054b823 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 22 Sep 2025 12:16:59 -0700 Subject: [PATCH 104/185] [misc] Remove RFC review hours reference (#25416) --- .github/ISSUE_TEMPLATE/750-RFC.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index 7ee57c42895ca..c0e009855964a 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -43,10 +43,6 @@ body: Any other things you would like to mention. validations: required: false -- type: markdown - attributes: - value: > - Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit). 
- type: checkboxes id: askllm attributes: From d5e0fca264f7d277ed372f7c075827ed9a0c5e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Mon, 22 Sep 2025 15:30:05 -0400 Subject: [PATCH 105/185] [torch.compile] Cleanup compilation tests and custom passes, add debug utils, fix DCE bug (#23091), fix test (#24376), and prep for custom op matching (#24604) (#24542) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič Signed-off-by: luka Signed-off-by: Luka Govedič --- tests/compile/backend.py | 28 +- tests/compile/test_async_tp.py | 2 + tests/compile/test_config.py | 10 +- tests/compile/test_functionalization.py | 10 +- tests/compile/test_fusion.py | 17 +- tests/compile/test_fusion_all_reduce.py | 6 +- tests/compile/test_fusion_attn.py | 20 +- tests/compile/test_sequence_parallelism.py | 21 +- tests/compile/test_silu_mul_quant_fusion.py | 13 +- vllm/compilation/activation_quant_fusion.py | 18 +- vllm/compilation/collective_fusion.py | 29 +-- vllm/compilation/fix_functionalization.py | 9 +- vllm/compilation/fusion.py | 273 ++------------------ vllm/compilation/fusion_attn.py | 22 +- vllm/compilation/multi_output_match.py | 109 -------- vllm/compilation/noop_elimination.py | 5 +- vllm/compilation/pass_manager.py | 49 +++- vllm/compilation/post_cleanup.py | 20 ++ vllm/compilation/sequence_parallelism.py | 18 +- vllm/compilation/vllm_inductor_pass.py | 110 +++++++- vllm/config/__init__.py | 7 +- vllm/config/compilation.py | 48 +++- vllm/envs.py | 6 + vllm/utils/__init__.py | 15 +- 24 files changed, 404 insertions(+), 461 deletions(-) delete mode 100644 vllm/compilation/multi_output_match.py create mode 100644 vllm/compilation/post_cleanup.py diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 2c4287950dcfe..f25c367433f41 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import weakref from collections.abc import Sequence from copy import deepcopy from typing import Callable, Union @@ -10,7 +11,26 @@ from torch._ops import OpOverload from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.inductor_pass import InductorPass -from vllm.config import get_current_vllm_config +from vllm.compilation.pass_manager import with_pattern_match_debug +from vllm.compilation.vllm_inductor_pass import VllmInductorPass +from vllm.config import VllmConfig, get_current_vllm_config + + +class LazyInitPass(InductorPass): + """ + If there's a pass that we want to initialize lazily in a test, + we can wrap it in LazyInitPass, which will initialize the pass when invoked + and then immediately invoke it. 
+ """ + + def __init__(self, pass_cls: type[VllmInductorPass], + vllm_config: VllmConfig): + self.pass_cls = pass_cls + self.vllm_config = weakref.proxy(vllm_config) # avoid cycle + + def __call__(self, graph: fx.Graph) -> None: + self.pass_ = self.pass_cls(self.vllm_config) + self.pass_(graph) class TestBackend: @@ -40,10 +60,16 @@ class TestBackend: example_inputs, config_patches=self.inductor_config) + @with_pattern_match_debug def post_pass(self, graph: fx.Graph): self.graph_pre_pass = deepcopy(graph) + + VllmInductorPass.dump_prefix = 0 for pass_ in self.custom_passes: pass_(graph) + VllmInductorPass.dump_prefix += 1 + + VllmInductorPass.dump_prefix = None self.graph_post_pass = deepcopy(graph) # assign by reference, will reflect the final state of the graph diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 9a51e6b3514f4..1dc21365d5577 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -294,6 +294,8 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states) + assert async_tp_pass.matched_count == 1 + # In pre-nodes, all gather or reduce scatter should exist, # fused_matmul_reduce_scatter or fused_all_gather_matmul should not backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 90e8e0ff95858..7afd6251bbbd5 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -4,7 +4,7 @@ import pytest import vllm from vllm.compilation.counter import compilation_counter -from vllm.config import VllmConfig +from vllm.config import CompilationConfig, VllmConfig from vllm.utils import _is_torch_equal_or_newer @@ -26,6 +26,14 @@ def test_use_cudagraphs_dynamic(monkeypatch): assert not vllm_config.compilation_config.use_cudagraph +def test_custom_op(): + # proper syntax + _ = CompilationConfig(custom_ops=["+quant_fp8", "-silu_and_mul"]) + + with pytest.raises(ValueError, match="Invalid syntax '"): + _ = CompilationConfig(custom_ops=["quant_fp8"]) + + # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked # NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 0c7e6fbccf20c..2ee9aa7476beb 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -8,9 +8,10 @@ import vllm.envs as envs from vllm import LLM, SamplingParams from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fusion import FUSED_OPS, FusionPass +from vllm.compilation.fusion import FUSED_OPS, RMSNormQuantFusionPass from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym) @@ -58,11 +59,12 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, vllm_config.compilation_config = CompilationConfig( pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True)) noop_pass = 
NoOpEliminationPass(vllm_config) - fusion_pass = FusionPass.instance(vllm_config) + fusion_pass = RMSNormQuantFusionPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config) - passes = [noop_pass, fusion_pass, act_quant_fusion_pass - ] if do_fusion else [noop_pass] + passes = [noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass + ] if do_fusion else [noop_pass, cleanup_pass] func_pass = FixFunctionalizationPass(vllm_config) backend_func = TestBackend(*passes, func_pass) backend_no_func = TestBackend(*passes) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index eedb9bdcd5299..3d8897d3f18b8 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -4,11 +4,11 @@ import pytest import torch -import vllm.envs as envs import vllm.plugins from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey, - FusionPass) + RMSNormQuantFusionPass) from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import (CompilationConfig, CompilationLevel, PassConfig, VllmConfig) from vllm.model_executor.layers.layernorm import RMSNorm @@ -79,15 +79,15 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) -@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) +@pytest.mark.parametrize("hidden_size", [64]) +@pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("static", [True, False]) # cuda_force_torch used to test torch code path on platforms that # cutlass_fp8_supported() == True. 
@pytest.mark.parametrize("cuda_force_torch", [True, False] if cutlass_fp8_supported() else [True]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], +@pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test on CUDA and ROCm") def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, cuda_force_torch): @@ -104,9 +104,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, with vllm.config.set_current_vllm_config(vllm_config): # Reshape pass is needed for the fusion pass to work noop_pass = NoOpEliminationPass(vllm_config) - fusion_pass = FusionPass.instance(vllm_config) + fusion_pass = RMSNormQuantFusionPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) - backend = TestBackend(noop_pass, fusion_pass) + backend = TestBackend(noop_pass, fusion_pass, cleanup_pass) model = TestModel(hidden_size, eps, static, cuda_force_torch) # First dimension dynamic @@ -128,6 +129,8 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) + assert fusion_pass.matched_count == 2 + # In pre-nodes, fp8 quant should be there and fused kernels should not backend.check_before_ops(model.ops_in_model_before()) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index dd31e0db1f59f..60f32c863208d 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -9,6 +9,7 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AllReduceFusionPass from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig, ModelConfig, PassConfig, VllmConfig) from vllm.distributed import tensor_model_parallel_all_reduce @@ -215,8 +216,10 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, all_reduce_fusion_pass = AllReduceFusionPass(vllm_config) noop_pass = NoOpEliminationPass(vllm_config) func_pass = FixFunctionalizationPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) - backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass) + backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass, + cleanup_pass) token_num = batch_size * seq_len model = test_model_cls(hidden_size, token_num) @@ -227,6 +230,7 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states, residual) + assert all_reduce_fusion_pass.matched_count == 1 backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False) backend.check_after_ops(model.ops_in_model_after()) del all_reduce_fusion_pass diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index c3f1c7481d1b3..c4cac95531926 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -6,18 +6,19 @@ from typing import Optional import pytest import torch._dynamo -from tests.compile.backend import TestBackend +from tests.compile.backend import LazyInitPass, TestBackend from tests.models.utils import check_outputs_equal from tests.v1.attention.utils import (BatchSpec, _Backend, create_common_attn_metadata) from vllm import LLM, SamplingParams from vllm._custom_ops import cutlass_scaled_fp4_mm, 
scaled_fp4_quant -from vllm.attention import Attention +from vllm.attention import Attention, AttentionMetadata from vllm.attention.selector import global_force_attn_backend_context_manager from vllm.compilation.fusion import QUANT_OPS from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel, ModelConfig, PassConfig, SchedulerConfig, VllmConfig, set_current_vllm_config) @@ -104,7 +105,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str, # AttnFusionPass needs attention layers to be registered in config upon init # so we initialize it during compilation. - attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw) + attn_pass = LazyInitPass(AttnFusionPass, vllm_config) backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass) llm2 = LLM(model, enforce_eager=True, @@ -197,7 +198,8 @@ class AttentionQuantPatternModel(torch.nn.Module): device=self.device, ) - def build_attn_metadata(self, batch_size: int, use_hnd: bool): + def build_attn_metadata(self, batch_size: int, use_hnd: bool) \ + -> AttentionMetadata: """Initialize attention metadata.""" # Create common attn metadata @@ -447,9 +449,10 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, # Create test backend with fusion passes enabled noop_pass = NoOpEliminationPass(vllm_config) - attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw - ) - test_backend = TestBackend(noop_pass, attn_pass) + attn_pass = LazyInitPass(AttnFusionPass, vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) + + test_backend = TestBackend(noop_pass, attn_pass, cleanup_pass) # Compile model with fusion enabled model_compiled = torch.compile(model_fused, @@ -485,6 +488,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=True) + # access the underlying `AttnFusionPass` on the `LazyInitPass` + assert attn_pass.pass_.matched_count == sum(attn_fusion_supported) + # Check attention ops in the graph before and after fusion attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass)) attn_nodes_post = list(find_op_nodes(ATTN_OP, diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index fb9f9dde22799..b2734e915bbbf 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -6,10 +6,12 @@ import torch import vllm.envs as envs from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fusion import FusionPass +from vllm.compilation.fusion import RMSNormQuantFusionPass from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.sequence_parallelism import SequenceParallelismPass +from vllm.compilation.vllm_inductor_pass import VllmInductorPass from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, PassConfig, VllmConfig) from vllm.distributed import tensor_model_parallel_all_reduce @@ -104,7 +106,7 @@ class TestQuantModel(torch.nn.Module): # Initialize weights torch.nn.init.normal_(self.gate_proj, std=0.02) - 
self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False) + self.fp8_linear = Fp8LinearOp(act_quant_static=True) self.scale = torch.rand(1, dtype=torch.float32) # Create a weight that is compatible with torch._scaled_mm, @@ -137,8 +139,7 @@ class TestQuantModel(torch.nn.Module): # layer normalization norm_output, residual_output = self.norm(all_reduce, residual) - # for static input quantization - # self.fp8_linear is initialized with use_per_token_if_dynamic=False + # scaled_mm with static input quantization fp8_linear_result = self.fp8_linear.apply(norm_output, self.w, self.wscale, @@ -253,16 +254,20 @@ def sequence_parallelism_pass_on_test_model( dtype=dtype, seed=42) - sequence_parallelism_pass = SequenceParallelismPass(vllm_config) noop_pass = NoOpEliminationPass(vllm_config) + sequence_parallelism_pass = SequenceParallelismPass(vllm_config) func_pass = FixFunctionalizationPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) - passes_for_backend = [noop_pass, sequence_parallelism_pass] + passes_for_backend: list[VllmInductorPass] = \ + [noop_pass, sequence_parallelism_pass] if enable_fusion: - fusion_pass = FusionPass.instance(vllm_config) + fusion_pass = RMSNormQuantFusionPass(vllm_config) passes_for_backend.append(fusion_pass) + passes_for_backend.append(cleanup_pass) + backend_no_func = TestBackend(*passes_for_backend) backend_func = TestBackend(*passes_for_backend, func_pass) @@ -279,6 +284,8 @@ def sequence_parallelism_pass_on_test_model( compiled_model_func = torch.compile(model, backend=backend_func) compiled_model_func(hidden_states, residual) + assert sequence_parallelism_pass.matched_count == 1 + # In pre-nodes, all reduce should be there, # reduce scatter and all gather should not backend_no_func.check_before_ops(model.ops_in_model_before()) diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index ae190d25cad62..c445f4dde2cc4 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -15,6 +15,7 @@ from vllm.compilation.activation_quant_fusion import ( # yapf: enable from vllm.compilation.fusion import QUANT_OPS from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -69,6 +70,10 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs): super().__init__() + from vllm.compilation.activation_quant_fusion import ( + silu_and_mul_nvfp4_quant_supported) + assert silu_and_mul_nvfp4_quant_supported + self.silu_and_mul = SiluAndMul() # create nvfp4 weight @@ -127,7 +132,11 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class, pass_config=PassConfig(enable_fusion=True, enable_noop=True)) fusion_pass = ActivationQuantFusionPass(config) - backend = TestBackend(NoOpEliminationPass(config), fusion_pass) + passes = [ + NoOpEliminationPass(config), fusion_pass, + PostCleanupPass(config) + ] + backend = TestBackend(*passes) model = model_class(hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x) @@ -151,6 +160,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class, atol=atol, rtol=rtol) + assert fusion_pass.matched_count == 1 + # In pre-nodes, quant op should be present and fused 
kernels should not backend.check_before_ops(model.ops_in_model_before()) diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py index f2fbb1200eecc..74462fb37ca97 100644 --- a/vllm/compilation/activation_quant_fusion.py +++ b/vllm/compilation/activation_quant_fusion.py @@ -17,7 +17,7 @@ from vllm.platforms import current_platform from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) @@ -152,7 +152,7 @@ class SiluMulNvfp4QuantPattern(ActivationQuantPattern): register_replacement(pattern, replacement, inputs, fwd_only, pm_pass) -class ActivationQuantFusionPass(VllmInductorPass): +class ActivationQuantFusionPass(VllmPatternMatcherPass): """ This pass fuses a pre-defined set of custom ops into fused ops. It uses the torch pattern matcher to find the patterns and replace them. @@ -176,16 +176,12 @@ class ActivationQuantFusionPass(VllmInductorPass): pattern_silu_mul_nvfp4 = SiluMulNvfp4QuantPattern() pattern_silu_mul_nvfp4.register(self.patterns) + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.Graph): - self.begin() - self.dump_graph(graph, "before_act_quant_fusion") - - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns in ActivationQuantFusionPass", - count) - - self.dump_graph(graph, "after_act_quant_fusion") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) def uuid(self): return VllmInductorPass.hash_source(self, ActivationQuantPattern, diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 0658b59a2e215..331cd8a873929 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -20,7 +20,7 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass FP8_DTYPE = current_platform.fp8_dtype() @@ -348,7 +348,7 @@ class AllGatherCutlassScaledMMPattern(BasePattern): pm.fwd_only, pm_pass) -class AsyncTPPass(VllmInductorPass): +class AsyncTPPass(VllmPatternMatcherPass): @enable_fake_mode def __init__(self, config: VllmConfig): @@ -378,18 +378,17 @@ class AsyncTPPass(VllmInductorPass): AllGatherCutlassScaledMMPattern( self.model_dtype, self.device).register(self.patterns) + self.dump_patterns(config, self.patterns) + def is_applicable_for_shape(self, shape: Optional[int]) -> bool: # only do replace for specific shapes tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): - self.begin() - self.dump_graph(graph, "before_async_tp_pass") - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns with async TP pass.", count) - self.dump_graph(graph, "after_async_tp_pass") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) if flashinfer_comm is not None: @@ -1068,7 +1067,7 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern): pm.fwd_only, pm_pass) -class 
AllReduceFusionPass(VllmInductorPass): +class AllReduceFusionPass(VllmPatternMatcherPass): def __init__(self, config: VllmConfig): super().__init__(config) @@ -1124,6 +1123,7 @@ class AllReduceFusionPass(VllmInductorPass): fuse_rms_quant=config.compilation_config.pass_config.enable_fusion) self.register_patterns() + self.dump_patterns(config, self.patterns) @enable_fake_mode def register_patterns(self): @@ -1172,15 +1172,14 @@ class AllReduceFusionPass(VllmInductorPass): self.disabled = False + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): if self.disabled: + logger.debug("AllReduceFusionPass disabled") return - self.begin() - self.dump_graph(graph, "before_all_reduce_fusion_pass") - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", count) - self.dump_graph(graph, "after_all_reduce_fusion_pass") - self.end_and_log() + + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) def __del__(self): if getattr(self, "disabled", True): diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 6bc721eec3d45..54403c1f7ca3d 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -26,6 +26,7 @@ class FixFunctionalizationPass(VllmInductorPass): To add new nodes to defunctionalize, add to the if-elif chain in __call__. """ + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.Graph): # XPU does not support auto-functionalization yet. # Will enable this when switch to vllm-xpu-kernels. @@ -34,9 +35,6 @@ class FixFunctionalizationPass(VllmInductorPass): "pass currently.") return - self.begin() - self.dump_graph(graph, "before_fix_functionalization") - self.nodes_to_remove: list[torch.fx.Node] = [] count = 0 for node in graph.nodes: @@ -111,7 +109,7 @@ class FixFunctionalizationPass(VllmInductorPass): count += 1 - self.dump_graph(graph, "before_fix_functionalization_cleanup") + self.dump_graph(graph, "before_cleanup") # Remove the nodes all at once count_removed = len(self.nodes_to_remove) @@ -120,8 +118,7 @@ class FixFunctionalizationPass(VllmInductorPass): logger.debug("De-functionalized %s nodes, removed %s nodes", count, count_removed) - self.dump_graph(graph, "after_fix_functionalization") - self.end_and_log() + self.nodes_to_remove.clear() def _remove(self, node_or_nodes: Union[torch.fx.Node, Iterable[torch.fx.Node]]): diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index afa739c966a5b..3034b6eaeaca1 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable, NamedTuple, Optional +from typing import Any, NamedTuple import torch import torch._inductor.pattern_matcher as pm @@ -16,10 +16,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( kFp8StaticTensorSym, kNvfp4Quant, kStaticTensorScale) from vllm.platforms import current_platform -from .fx_utils import find_getitem_maybe from .inductor_pass import enable_fake_mode -from .multi_output_match import MultiOutputMatch -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) FP8_DTYPE = current_platform.fp8_dtype() @@ -50,8 +48,7 @@ QUANT_OPS: dict[QuantKey, OpOverload] = { torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501 } 
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): - QUANT_OPS[ - kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default # noqa: E501 + QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default class FusedRMSQuantKey(NamedTuple): @@ -80,68 +77,6 @@ FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = { } -class QuantMultiOutputMatch(MultiOutputMatch): - - def __init__(self, match: pm.Match, quant_op, fused_op): - super().__init__(match) - assert isinstance(quant_op, OpOverload) - assert isinstance(fused_op, OpOverload) - self.QUANT_OP = quant_op # in-place quant op - self.FUSED_OP = fused_op # in-place fused quant op - - def insert_fused_node(self, fused_return_mapping: dict[int, tuple[fx.Node, - int]], - **kwargs): - """ - This utility function inserts an auto-functionalized node for FUSED_OP. - It also correctly sets its meta value and rebinds the users of the - unfused nodes to use the fused node instead. - - :param fused_return_mapping: A dictionary, mapping from getitem indices - of the fused node result to a tuple of the old node and a getitem index. - :param kwargs: kwargs that get directly forwarded to the auto_fn node - - Example: - If we want to replace this graph: - _, x1, x2 = auto_fn(op1) - _, y1, y2 = auto_fn(op2) - - with - _, x1, y2, x2 = auto_fn(FUSED_OP) - - we would call: - insert_fused_node({1: (op1_node, 1), 2: (op2_node, 2), 3: (op1_node, 2)} - - Note that the 0th element is None for auto-functionalized in-place ops. - Hence, others appear 1-indexed. - """ - fused_node = self.insert_auto_fn(self.FUSED_OP, kwargs) - indices = fused_return_mapping.keys() - getitem_nodes = self.insert_getitems(fused_node, indices) - - # Prepare the meta value, use a list so it's mutable - meta_val = [None] * (max(indices) + 1) - - # Iterate through elements of the tuple produced by fused_node - for idx, getitem_node in zip(indices, getitem_nodes): - old_node, old_idx = fused_return_mapping[idx] - - # If the old value was never used, the old_getitem might not exist - old_getitem = find_getitem_maybe(old_node, old_idx) - if old_getitem is not None: - # Rebind the users of match getitem nodes to use the new nodes. - # The old nodes will be removed by DCE at the end of the pass. 
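The `insert_fused_node`/`insert_getitems` helpers being removed above existed to stitch multi-output replacements together by hand. For readers unfamiliar with that machinery, below is a tiny self-contained illustration of the core trick only, inserting `operator.getitem` nodes right after a tuple-producing node in an FX graph; `torch.topk` is used purely as a convenient multi-output example op, and this is not the vLLM code itself:

```python
# Standalone sketch of the getitem-insertion step that MultiOutputMatch
# performed; torch.topk stands in for a multi-output (tuple) op.
import operator

import torch
from torch import fx


def fn(x: torch.Tensor):
    return torch.topk(x, k=2)


gm = fx.symbolic_trace(fn)
graph = gm.graph

topk_node = next(node for node in graph.nodes
                 if node.op == "call_function" and node.target is torch.topk)

# Insert getitem nodes right after the tuple-producing node, the same way
# insert_getitems did for auto_functionalized results.
with graph.inserting_after(topk_node):
    values = graph.call_function(operator.getitem, (topk_node, 0))
    indices = graph.call_function(operator.getitem, (topk_node, 1))

graph.lint()  # new nodes are topologically valid; they are unused here
```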
- old_getitem.replace_all_uses_with(getitem_node) - getitem_node.meta["val"] = old_getitem.meta["val"] - - # Extract the appropriate meta value - # It is present even if the getitem node does not exist - meta_val[idx] = old_node.meta["val"][old_idx] - - # Fix the meta value on the new fused node - fused_node.meta["val"] = tuple(meta_val) - - class RMSNormQuantPattern: def __init__(self, epsilon: float, key: FusedRMSQuantKey): @@ -224,8 +159,7 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): symmetric=symmetric)) super().__init__(epsilon, key) - def register(self, pm_pass: PatternMatcherPass, - record_match: Callable[[MultiOutputMatch], bool]): + def register(self, pm_pass: PatternMatcherPass): def pattern(result: torch.Tensor, input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, @@ -271,36 +205,7 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): inputs, pm.fwd_only, pm_pass, - extra_check=lambda m: record_match( - self.Match(m, self.QUANT_OP, self.FUSED_OP))) - - class Match(QuantMultiOutputMatch): - - def process(self): - # Find the nodes in the match that we need to rebind - rms_node = self.find_auto_fn(RMS_ADD_OP) - quant_node = self.find_auto_fn(self.QUANT_OP) - - assert len(rms_node.users) == 2 - assert len(quant_node.users) == 1 - - # First, insert a new auto_functionalized node for the fused op, - # as well as getitem nodes to extract the result and residual. - # The auto_fn node returns a tuple of (None, result, residual). - # - # The resulting graph looks like this: - # at = auto_functionalized(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, ...) # noqa - # result_node_new = at[1] - # residual_node_new = at[2] - with self.inserting_after_match(): - # Missing epsilon, scalars cannot be inputs to the pattern - kwargs = self.match.kwargs.copy() - - # 0 is always None - fused_return_mapping = {1: (quant_node, 1), 2: (rms_node, 2)} - self.insert_fused_node(fused_return_mapping, - **kwargs, - epsilon=rms_node.kwargs["epsilon"]) + ) class RMSNormDynamicQuantPattern(RMSNormQuantPattern): @@ -317,8 +222,7 @@ class RMSNormDynamicQuantPattern(RMSNormQuantPattern): symmetric=symmetric)) super().__init__(epsilon, key) - def register(self, pm_pass: PatternMatcherPass, - record_match: Callable[[MultiOutputMatch], bool]): + def register(self, pm_pass: PatternMatcherPass): def pattern(result: torch.Tensor, result_rms: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, @@ -366,39 +270,7 @@ class RMSNormDynamicQuantPattern(RMSNormQuantPattern): inputs, pm.fwd_only, pm_pass, - extra_check=lambda m: record_match( - self.Match(m, self.QUANT_OP, self.FUSED_OP))) - - class Match(QuantMultiOutputMatch): - - def process(self): - # Find the nodes in the match that we need to rebind - rms_node = self.find_auto_fn(RMS_OP) - quant_node = self.find_auto_fn(self.QUANT_OP) - - assert len(rms_node.users) == 1 - assert len(quant_node.users) == 2 - - # First, insert a new auto_functionalized node for the fused op, - # as well as getitem nodes to extract the result and scale. - # The auto_fn node returns a tuple of (None, result, scale). - # - # The resulting graph looks like this: - # at = auto_functionalized(torch.ops._C.rms_norm_dynamic_per_token_quant.default, ...) 
# noqa - # result_node_new = at[1] - # scale_node_new = at[2] - with self.inserting_after_match(): - # Missing epsilon, scalars cannot be inputs to the pattern - kwargs = self.match.kwargs.copy() - del kwargs["result_rms"] # not used in the fused op - - fused_return_mapping = {1: (quant_node, 1), 2: (quant_node, 2)} - self.insert_fused_node( - fused_return_mapping, - epsilon=rms_node.kwargs["epsilon"], - scale_ub=None, # not used but required - residual=None, # not used but required - **kwargs) + ) class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): @@ -415,8 +287,7 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): symmetric=symmetric)) super().__init__(epsilon, key) - def register(self, pm_pass: PatternMatcherPass, - record_match: Callable[[MultiOutputMatch], bool]): + def register(self, pm_pass: PatternMatcherPass): def pattern(result: torch.Tensor, input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, @@ -464,137 +335,49 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): inputs, pm.fwd_only, pm_pass, - extra_check=lambda m: record_match( - self.Match(m, self.QUANT_OP, self.FUSED_OP))) - - class Match(QuantMultiOutputMatch): - - def process(self): - # Find the nodes in the match that we need to rebind - rms_node = self.find_auto_fn(RMS_ADD_OP) - quant_node = self.find_auto_fn(self.QUANT_OP) - - assert len(rms_node.users) == 2 - assert len(quant_node.users) == 2 - - # First, insert a new auto_functionalized node for the fused op, - # as well as getitem nodes to extract result, scale, and residual. - # The auto_fn node returns a tuple (None, result, scale, residual). - # - # The resulting graph looks like this: - # at = auto_functionalized(torch.ops._C.rms_norm_dynamic_per_token_quant.default, ...) # noqa - # result_node_new = at[1] - # scale_node_new = at[2] - # residual_node_new = at[3] - with self.inserting_after_match(): - # Missing epsilon, scalars cannot be inputs to the pattern - kwargs = self.match.kwargs.copy() - - fused_return_mapping = { - 1: (quant_node, 1), # result - 2: (quant_node, 2), # scale - 3: (rms_node, 2), # residual - } - self.insert_fused_node( - fused_return_mapping, - epsilon=rms_node.kwargs["epsilon"], - scale_ub=None, # not used but required - **kwargs) + ) -class FusionPass(VllmInductorPass): +class RMSNormQuantFusionPass(VllmPatternMatcherPass): """ - This pass fuses a pre-defined set of custom ops into fused ops. - It uses the torch pattern matcher to find the patterns and replace them. - It also manually processes multi-output matches, as those are broken in - the torch pattern matcher. - - Because patterns can only be registered once, the pass is a singleton. - This will be addressed in a future version of PyTorch: - https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 + This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op. + It also supports fused_add_rms_norm. """ - _instance: 'Optional[FusionPass]' = None - - @classmethod - def instance(cls, config: VllmConfig): - """ - Get the singleton instance of the FusionPass. - If the instance exists, the config is updated but - initialization is not repeated. 
- """ - if cls._instance is None: - cls._instance = FusionPass(config) - else: - cls._instance.pass_config = config.compilation_config.pass_config - return cls._instance - @enable_fake_mode def __init__(self, config: VllmConfig): - assert self.__class__._instance is None, \ - "FusionPass singleton instance already exists" super().__init__(config) - self.matches: list[MultiOutputMatch] = [] self.patterns: PatternMatcherPass = PatternMatcherPass( - pass_name="fusion_pass") + pass_name="rmsnorm_quant_fusion_pass") for epsilon in [1e-5, 1e-6]: # Fuse rms_norm + static fp8 quant RMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(self.patterns) - # Matches for patterns below have 2 or more outputs, - # so we need to process them manually (see process_matches) - - # Fuse rms_norm + static fp8 quant + # Fuse fused_add_rms_norm + static fp8 quant FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register( - self.patterns, self.record_match) + self.patterns) # Fuse rms_norm + dynamic per-token fp8 quant - RMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register( - self.patterns, self.record_match) + RMSNormDynamicQuantPattern(epsilon, + FP8_DTYPE).register(self.patterns) # Fuse fused_add_rms_norm + dynamic per-token fp8 quant FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register( - self.patterns, self.record_match) + self.patterns) - # WARNING: This is a hack to clear the pattern matcher cache - # and allow multiple values of epsilon. - torch._inductor.pattern_matcher._seen_patterns.clear() - - def record_match(self, match: MultiOutputMatch) -> bool: - # Hijack the extra_check to record the match and - # save it for post-processing. - self.matches.append(match) - - # Return False to prevent automatic replacement. - return False - - def process_matches(self, graph: fx.Graph): - """ - Manually process multi-output matches and replace them with fused nodes. - See MultiOutputMatch for more details. 
- """ - for match in self.matches: - match.process() - - # Finally, remove matched nodes - graph.eliminate_dead_code() - assert all(node not in graph.nodes for match in self.matches - for node in match.match.nodes) + self.dump_patterns(config, self.patterns) + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): - self.begin() - self.dump_graph(graph, "before_fusion") + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", count) - self.dump_graph(graph, "after_pattern_match") - - # Manually process multi-output matches (and run DCE) - self.process_matches(graph) - logger.debug("Post-processed %s matches", len(self.matches)) - self.dump_graph(graph, "after_fusion") - self.matches.clear() - self.end_and_log() + def uuid(self) -> Any: + return self.hash_source(self, RMSNormQuantPattern, + RMSNormStaticQuantPattern, + RMSNormDynamicQuantPattern, + FusedAddRMSNormStaticQuantPattern, + FusedAddRMSNormDynamicQuantPattern) diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index e3677b3dd62d8..2c6cf8f12fdc1 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -18,7 +18,7 @@ from vllm.utils import round_up from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) @@ -245,7 +245,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): pm_pass) -class AttnFusionPass(VllmInductorPass): +class AttnFusionPass(VllmPatternMatcherPass): """ This pass fuses post-attention quantization onto attention if supported. @@ -282,20 +282,12 @@ class AttnFusionPass(VllmInductorPass): "were found in CompilationConfig.static_forward_context " "so no fusion patterns were registered.") + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.graph.Graph) -> None: - self.begin() - self.dump_graph(graph, "before_attn_fusion") - - count = self.patterns.apply(graph) - - # TODO: Move this to pass_manager.py after the fx graph broken issue - # has been resolved. 
- # see https://github.com/vllm-project/vllm/issues/23091 - graph.eliminate_dead_code() - - logger.debug("Fused quantization onto %s attention nodes", count) - self.dump_graph(graph, "after_attn_fusion") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Fused quant onto %s attention nodes", self.matched_count) def uuid(self): return VllmInductorPass.hash_source(self, AttentionQuantPattern, diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py deleted file mode 100644 index 6d1893777cec6..0000000000000 --- a/vllm/compilation/multi_output_match.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import abc -import operator -from abc import abstractmethod -from collections.abc import Iterable - -from torch import fx -from torch._higher_order_ops.auto_functionalize import auto_functionalized -from torch._inductor import pattern_matcher as pm -from torch._ops import OpOverload -from torch.fx import Node - -from vllm.compilation.fx_utils import find_auto_fn - - -class MultiOutputMatch(abc.ABC): - """ - This class provides utilities to process multi-output matches and - manually insert replacements. - - This is necessary because the automatic replacement for multi-output - matches is broken: https://github.com/pytorch/pytorch/issues/137280 - """ - - def __init__(self, match: pm.Match): - self.match = match - - @abstractmethod - def process(self): - """ - Process a multi-output match and manually insert the replacement. - - This method should: - 1. Insert the replacement nodes after the last node in the match. - 2. Rebind the users of nodes in the match to use the new nodes. - 3. Set meta["val"] for de-functionalization. - - The result of an auto-functionalized node is a tuple of tensors. - The first element is the return value of the function, usually None. - The remaining elements are the mutated args of the function. - - All auto-functionalized nodes must contain a proper meta["val"], - as it is used by de-functionalization. meta["val"] has to contain the - value of the node (tuple of tensors) that would be returned by the - functionalized node during tracing. - - Existing nodes in the graph all have this property set, but we have - to set it manually for new nodes we insert. - - Example: - # op schema: foo(a: Tensor!, b: Tensor, c: Tensor!) -> None - at = auto_functionalized(torch.ops._C.foo.default, a, b, c) - # at.meta["val"] = (None, a, c) - """ - raise NotImplementedError - - @property - def nodes(self) -> list[fx.Node]: - return self.match.nodes - - @property - def graph(self) -> fx.Graph: - return self.match.graph - - def find_auto_fn(self, op) -> fx.Node: - """ - Find the first auto_functionalized node with the given op in the match. - """ - return find_auto_fn(self.nodes, op) - - def inserting_after_match(self): - """ - Insert nodes after the last node in the match. - This is done to avoid use-before-definition errors after inserting - replacement nodes. - """ - - # match.nodes is not guaranteed to be sorted. - # Find the last node in the match. 
- for last_node_in_match in reversed(self.graph.nodes): - if last_node_in_match in self.match.nodes: - break - else: - raise ValueError("No nodes in graph") - - return self.graph.inserting_after(last_node_in_match) - - def insert_getitems(self, tuple_node: fx.Node, - indices: Iterable[int]) -> tuple[fx.Node, ...]: - """ - Insert operator.getitem nodes to extract elements from a tuple node. - - :param tuple_node: The tuple node to extract elements from. - :param indices: The indices of the elements to extract. - :return: Tuple of the new getitem nodes, corresponding to the indices. - """ - with self.graph.inserting_after(tuple_node): - return tuple( - self.graph.call_function(operator.getitem, (tuple_node, idx)) - for idx in indices) - - def insert_auto_fn(self, op: OpOverload, kwargs) -> Node: - """ - Insert an auto_functionalized node with the given op and kwargs. - """ - return self.graph.call_function(auto_functionalized, (op, ), - kwargs=kwargs) diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 17e85e70218da..2c453daf873d2 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -64,9 +64,8 @@ class NoOpEliminationPass(VllmInductorPass): out: "f16[s0, 4096]" = at[1] """ + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.Graph): - self.begin() - self.dump_graph(graph, "before_noop_elimination") count = 0 # Remove no-op reshapes/views: for node in graph.nodes: @@ -121,8 +120,6 @@ class NoOpEliminationPass(VllmInductorPass): count += 1 logger.debug("Removed %s no-op reshapes and slices", count) - self.dump_graph(graph, "after_noop_elimination") - self.end_and_log() # ---------------------- Reshape helpers ---------------------- def reshape_dims_equivalent(self, dim: Union[int, torch.fx.Node], diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 1b1cbe4fa12c2..e323fa1f77349 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,15 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools from torch import fx as fx +from vllm import envs from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils import set_env_var + +from .post_cleanup import PostCleanupPass +from .vllm_inductor_pass import VllmInductorPass if current_platform.is_cuda_alike(): from .activation_quant_fusion import ActivationQuantFusionPass - from .fusion import FusionPass + from .fusion import RMSNormQuantFusionPass from .fusion_attn import AttnFusionPass if current_platform.is_cuda(): @@ -19,11 +25,28 @@ from .fix_functionalization import FixFunctionalizationPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context from .noop_elimination import NoOpEliminationPass from .sequence_parallelism import SequenceParallelismPass -from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) +def with_pattern_match_debug(fn): + """ + Function decorator that turns on inductor pattern match debug + for the duration of the call. + Used to avoid logging builtin Inductor pattern matching. 
+ """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + if (debug_val := envs.VLLM_PATTERN_MATCH_DEBUG) is not None: + # optionally check rank here + with set_env_var("TORCHINDUCTOR_PATTERN_MATCH_DEBUG", debug_val): + return fn(*args, **kwargs) + return fn(*args, **kwargs) + + return wrapper + + class PostGradPassManager(CustomGraphPass): """ The pass manager for post-grad passes. @@ -40,16 +63,26 @@ class PostGradPassManager(CustomGraphPass): """ def __init__(self): - self.passes: list[VllmInductorPass] = [] + self.passes: list[InductorPass] = [] + @with_pattern_match_debug def __call__(self, graph: fx.Graph): + VllmInductorPass.dump_prefix = 0 # reset dump index + shape = get_pass_context().runtime_shape for pass_ in self.passes: if pass_.is_applicable_for_shape(shape): pass_(graph) + VllmInductorPass.dump_prefix += 1 + + # post-cleanup goes before fix_functionalization + # because it requires a functional graph + self.post_cleanup(graph) + VllmInductorPass.dump_prefix += 1 # always run fix_functionalization last self.fix_functionalization(graph) + VllmInductorPass.dump_prefix = None # Cleanup index def configure(self, config: VllmConfig): self.pass_config = config.compilation_config.pass_config @@ -61,14 +94,18 @@ class PostGradPassManager(CustomGraphPass): if self.pass_config.enable_async_tp: self.passes += [AsyncTPPass(config)] + if self.pass_config.enable_fi_allreduce_fusion: + self.passes += [AllReduceFusionPass(config)] + if self.pass_config.enable_fusion: - self.passes += [FusionPass.instance(config)] + self.passes += [RMSNormQuantFusionPass(config)] self.passes += [ActivationQuantFusionPass(config)] if self.pass_config.enable_attn_fusion: self.passes += [AttnFusionPass(config)] - if self.pass_config.enable_fi_allreduce_fusion: - self.passes += [AllReduceFusionPass(config)] + + # needs a functional graph + self.post_cleanup = PostCleanupPass(config) self.fix_functionalization = FixFunctionalizationPass(config) def add(self, pass_: InductorPass): diff --git a/vllm/compilation/post_cleanup.py b/vllm/compilation/post_cleanup.py new file mode 100644 index 0000000000000..6a31f3935da7c --- /dev/null +++ b/vllm/compilation/post_cleanup.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from torch import fx + +from vllm.compilation.vllm_inductor_pass import VllmInductorPass + + +class PostCleanupPass(VllmInductorPass): + """ + This pass performs cleanup after custom passes. + It topologically sorts the graph and removes unused nodes. + This is needed because the pattern matcher does not guarantee producing + a topologically sorted graph, and there may be unused nodes left around. 
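The pass manager above numbers each pass via the class-level `VllmInductorPass.dump_prefix` so that per-pass debug dumps sort in execution order. A rough standalone sketch of that pattern follows; the class names here are made up and the print is a stand-in for the real graph dump:

```python
# Sketch of ordering debug dumps with a class-level counter, similar in
# spirit to VllmInductorPass.dump_prefix; names are illustrative only.
from typing import ClassVar, Optional


class DemoPass:
    dump_prefix: ClassVar[Optional[int]] = None

    def __init__(self, name: str) -> None:
        self.name = name

    def __call__(self) -> None:
        i = DemoPass.dump_prefix
        stage = "" if i is None else f".{i}"
        print(f"post_grad{stage}.{self.name}")  # stand-in for a graph dump


class DemoPassManager:
    def __init__(self, passes: list[DemoPass]) -> None:
        self.passes = passes

    def __call__(self) -> None:
        DemoPass.dump_prefix = 0  # reset the dump index per compilation
        for p in self.passes:
            p()
            DemoPass.dump_prefix += 1
        DemoPass.dump_prefix = None  # clean up the index


DemoPassManager([DemoPass("noop"), DemoPass("fusion")])()
```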
+ """ + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph) -> None: + from torch._inductor.pattern_matcher import stable_topological_sort + stable_topological_sort(graph) + graph.eliminate_dead_code() diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 1758ed4c86d27..a6ca50c925a2a 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) @@ -417,7 +417,7 @@ class LastAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): pm.fwd_only, pm_pass) -class SequenceParallelismPass(VllmInductorPass): +class SequenceParallelismPass(VllmPatternMatcherPass): """ This pass enables sequence parallelism for models. It identifies patterns where an AllReduce operation is followed by @@ -466,19 +466,13 @@ class SequenceParallelismPass(VllmInductorPass): LastAllReduceRMSNormPattern(epsilon, self.model_dtype, self.device).register(self.patterns) - - # WARNING: This is a hack to clear the pattern matcher cache - # and allow multiple values of epsilon. - torch._inductor.pattern_matcher._seen_patterns.clear() + self.dump_patterns(config, self.patterns) def is_applicable_for_shape(self, shape: Optional[int]) -> bool: tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): - self.begin() - self.dump_graph(graph, "before_sequence_parallelism_pass") - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns with sequence parallelism", count) - self.dump_graph(graph, "after_sequence_parallelism_pass") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index b822b05b0f1ec..837770d181993 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import functools +import operator import time +from pathlib import Path +from typing import ClassVar, Optional +import regex as re import torch from torch._dynamo.utils import lazy_format_graph_code +from torch._inductor.pattern_matcher import (PatternMatcherPass, + PatternPrettyPrinter) from vllm.config import VllmConfig from vllm.logger import init_logger @@ -19,6 +25,8 @@ class VllmInductorPass(InductorPass): An inductor pass with access to vLLM PassConfig. It provides timing, logging, and dumping utilities. 
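The new `PostCleanupPass` above performs exactly two operations: re-sort the graph topologically and drop dead nodes. A minimal demonstration of those same two calls on a toy FX graph, assuming `stable_topological_sort` is importable from the installed torch's Inductor (as the pass itself assumes):

```python
# Toy demonstration of the cleanup performed by PostCleanupPass:
# topologically sort the graph, then eliminate dead code.
import torch
from torch import fx
from torch._inductor.pattern_matcher import stable_topological_sort


def fn(x: torch.Tensor) -> torch.Tensor:
    unused = x * 3  # dead: never contributes to the output
    return torch.relu(x)


gm = fx.symbolic_trace(fn)
print(len(gm.graph.nodes))   # includes the dead multiplication

stable_topological_sort(gm.graph)
gm.graph.eliminate_dead_code()
gm.recompile()

print(len(gm.graph.nodes))   # the dead node is gone
```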
""" + dump_prefix: ClassVar[Optional[int]] = None + """Keep track of pass index for debug dump ordering.""" def __init__(self, config: VllmConfig): self.pass_config = config.compilation_config.pass_config @@ -28,8 +36,24 @@ class VllmInductorPass(InductorPass): else None self.pass_name = self.__class__.__name__ + @staticmethod + def time_and_log(call_fn): + + @functools.wraps(call_fn) + def wrapped(self: VllmInductorPass, graph: torch.fx.Graph): + self.begin() + self.dump_graph(graph, "before") + call_fn(self, graph) + self.dump_graph(graph, "after") + self.end_and_log() + + return wrapped + def dump_graph(self, graph: torch.fx.Graph, stage: str): - lazy_format_graph_code(stage, graph.owning_module) + i = VllmInductorPass.dump_prefix + i_str = "" if i is None else f".{i}" + lazy_format_graph_code(f"post_grad{i_str}.{self.pass_name}.{stage}", + graph.owning_module) def begin(self): self._start_time = time.perf_counter_ns() @@ -40,6 +64,88 @@ class VllmInductorPass(InductorPass): logger.debug("%s completed in %.1f ms", self.pass_name, duration_ms) +class VllmPatternMatcherPass(VllmInductorPass): + """ + A VllmInductorPass that uses the Inductor pattern matcher. + Its main use is providing the dump_patterns utility that dumps the + Inductor pattern matcher patterns into a file, which greatly aids debugging. + + TODO(luka) move more utilities to this pass. + """ + matched_count: int = 0 + """The number of matched patterns in the pass.""" + + _OP_OVERLOAD_PATTERN: ClassVar[re.Pattern] = re.compile( + r"") + + def _replace_op_overloads(self, string: str) -> str: + """Replace with nicer formulations""" + return self._OP_OVERLOAD_PATTERN.sub( + lambda m: f"torch.ops.{m.group(1)}.{m.group(2)}", + string, + ) + + def dump_patterns(self, config: VllmConfig, pm_pass: PatternMatcherPass): + """ + If debug dumping is enabled, dump the Inductor pattern-matcher patterns + into the debug_dump_path folder next to the dumped fx graphs. + + This method does its best to print something that looks like Python code + for easier debugging and potentially navigation. If any errors appear in + the output, please add to this method. + + TODO(luka): use pattern object to manually produce pattern graph + """ + debug_dump_path = config.compilation_config.debug_dump_path + if not debug_dump_path: + return + + rank = config.parallel_config.rank + debug_dump_path = Path(debug_dump_path) / f"rank_{rank}" + debug_dump_path.mkdir(parents=True, exist_ok=True) + + from vllm.utils import unique_filepath + file_path = unique_filepath( + lambda i: debug_dump_path / f"patterns.{self.pass_name}.{i}.py") + + with file_path.open("w") as f: + print( + f'# This file was produced by VllmPatternMatcherPass.' 
+ f'dump_patterns for {self.pass_name}.\n' + f'# It does its best to produce valid-Python-looking code but' + f' please add to dump_patterns if there are any errors.\n\n' + f'from torch._higher_order_ops.auto_functionalize import ' + f'auto_functionalized as auto_functionalized\n' + f'from torch._inductor.pattern_matcher import *', + file=f) + + for node, patterns in pm_pass.patterns.items(): + # fix the operator.getitem repr + if node[1] == operator.getitem: + node_repr = f"({repr(node[0])}, operator.getitem)" + else: + node_repr = repr(node) + + node_repr = self._replace_op_overloads(node_repr) + + print(f"\n\n# Patterns for op: {node_repr}", file=f) + for i, pattern in enumerate(patterns): + # reserve auto_functionalized ahead of time + pp = PatternPrettyPrinter() + pp.namespace.create_name("auto_functionalized", None) + + # Assemble pattern + out_node = pp.pretty_print(pattern.pattern) + pattern_repr = "\n".join([f"def pattern_{i}():"] + [ + f"{pp.memoized_objs_names[key]} = " + f"{pp.memoized_objs_pp[key]}" + for key in pp.memoized_objs_names + ] + [f"return {out_node}"]).replace("\n", "\n ") + + pattern_repr = self._replace_op_overloads(pattern_repr) + print(f"{pattern_repr}\n", file=f) + + class PrinterInductorPass(VllmInductorPass): def __init__(self, name: str, config: VllmConfig): diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 9bb8087a511d5..92fc68f8927ca 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -905,10 +905,9 @@ def set_current_vllm_config(vllm_config: VllmConfig, except Exception: raise else: - logger.debug("enabled custom ops: %s", - vllm_config.compilation_config.enabled_custom_ops) - logger.debug("disabled custom ops: %s", - vllm_config.compilation_config.disabled_custom_ops) + if check_compile: + vllm_config.compilation_config.custom_op_log_check() + if check_compile and \ vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 22b38daf46c39..34fa7fcfe7e87 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -487,6 +487,12 @@ class CompilationConfig: "supported with torch>=2.9.0.dev. Set " "use_inductor_graph_partition=False instead.") + for op in self.custom_ops: + if op[0] not in {'+', '-'} and op not in {'all', 'none'}: + raise ValueError(f"Invalid syntax '{op}' for custom op, " + "must be 'all', 'none', '+op' or '-op' " + "(where 'op' is the registered op name)") + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -532,8 +538,8 @@ class CompilationConfig: for x in self.compile_sizes: if isinstance(x, str): assert x == "cudagraph_capture_sizes", \ - "Unrecognized size type in compile_sizes, " \ - f"expect 'cudagraph_capture_sizes', got {x}" + "Unrecognized size type in compile_sizes, " \ + f"expect 'cudagraph_capture_sizes', got {x}" computed_compile_sizes.extend(self.cudagraph_capture_sizes) else: assert isinstance(x, int) @@ -628,3 +634,41 @@ class CompilationConfig: return use_fx_graph_piecewise_compilation or \ use_inductor_piecewise_compilation + + def custom_op_log_check(self): + """ + This method logs the enabled/disabled custom ops and checks that the + passed custom_ops field only contains relevant ops. + It is called at the end of set_current_vllm_config, + after the custom ops have been instantiated. 
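`dump_patterns` above writes each pass's patterns into a fresh `patterns.<pass_name>.<i>.py` file obtained from `unique_filepath`. A hedged sketch of what such a helper can look like is shown below; the actual helper lives in `vllm.utils` and may differ in detail:

```python
# Sketch of a unique_filepath-style helper: given a callable mapping an
# index to a Path, return the first path that does not exist yet.
import tempfile
from pathlib import Path
from typing import Callable


def unique_filepath(fn: Callable[[int], Path]) -> Path:
    i = 0
    while fn(i).exists():
        i += 1
    return fn(i)


dump_dir = Path(tempfile.mkdtemp()) / "rank_0"
dump_dir.mkdir(parents=True, exist_ok=True)
path = unique_filepath(lambda i: dump_dir / f"patterns.DemoPass.{i}.py")
path.write_text("# pretty-printed patterns would go here\n")
print(path)
```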
+ """ + + if len(self.enabled_custom_ops) + len(self.disabled_custom_ops) == 0: + logger.debug("No custom ops found in model.") + return + + logger.debug("enabled custom ops: %s", self.enabled_custom_ops) + logger.debug("disabled custom ops: %s", self.disabled_custom_ops) + + all_ops_in_model = (self.enabled_custom_ops | self.disabled_custom_ops) + for op in self.custom_ops: + if op in {"all", "none"}: + continue + + assert op[0] in {'+', '-'}, "Invalid custom op syntax " \ + "(should be checked during init)" + + # check if op name exists in model + op_name = op[1:] + if op_name not in all_ops_in_model: + from vllm.model_executor.custom_op import CustomOp + + # Does op exist at all or is it just not present in this model? + # Note: Only imported op classes appear in the registry. + missing_str = "doesn't exist (or wasn't imported/registered)" \ + if op_name not in CustomOp.op_registry \ + else "not present in model" + + enable_str = "enabling" if op[0] == '+' else "disabling" + logger.warning_once("Op '%s' %s, %s with '%s' has no effect", + op_name, missing_str, enable_str, op) diff --git a/vllm/envs.py b/vllm/envs.py index 8941be125ed08..eaee2f6cc7716 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -190,6 +190,7 @@ if TYPE_CHECKING: VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" + VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None def get_default_cache_root(): @@ -442,6 +443,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_STANDALONE_COMPILE": lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "0") == "1", + # Debug pattern matching inside custom passes. + # Should be set to the fx.Node name (e.g. 'getitem_34' or 'scaled_mm_3'). + "VLLM_PATTERN_MATCH_DEBUG": + lambda: os.environ.get("VLLM_PATTERN_MATCH_DEBUG", None), + # local rank of the process in the distributed setting, used to determine # the GPU device id "LOCAL_RANK": diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 022e35a399c53..3399d00fbabbd 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3392,7 +3392,7 @@ def length_from_prompt_token_ids_or_embeds( prompt_token_ids: Optional[list[int]], prompt_embeds: Optional[torch.Tensor], ) -> int: - """Calculate the request length (in number of tokens) give either + """Calculate the request length (in number of tokens) give either prompt_token_ids or prompt_embeds. 
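The config changes above reject malformed `custom_ops` entries at init time and later warn when an override names an op that never appears in the model. A standalone sketch of the same syntax check ('all', 'none', '+op', '-op'), kept close to the added validation but not identical to it:

```python
# Sketch of the custom_ops override syntax check: entries must be 'all',
# 'none', '+<op_name>' or '-<op_name>'.
def validate_custom_ops(custom_ops: list[str]) -> None:
    for op in custom_ops:
        if not op or (op[0] not in {"+", "-"} and op not in {"all", "none"}):
            raise ValueError(
                f"Invalid syntax '{op}' for custom op, must be 'all', "
                "'none', '+op' or '-op' (where 'op' is the registered "
                "op name)")


validate_custom_ops(["none", "+rms_norm", "-silu_and_mul"])  # passes
try:
    validate_custom_ops(["rms_norm"])  # rejected: missing '+'/'-' prefix
except ValueError as exc:
    print(exc)
```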
""" prompt_token_len = None if prompt_token_ids is None else len( @@ -3413,3 +3413,16 @@ def length_from_prompt_token_ids_or_embeds( f" prompt_token_ids={prompt_token_len}" f" prompt_embeds={prompt_embeds_len}") return prompt_token_len + + +@contextlib.contextmanager +def set_env_var(key, value): + old = os.environ.get(key) + os.environ[key] = value + try: + yield + finally: + if old is None: + del os.environ[key] + else: + os.environ[key] = old From 8db29392899b12843787b0a03880f70a6e6ac88e Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Mon, 22 Sep 2025 22:30:36 +0300 Subject: [PATCH 106/185] [KV offload][5/N] Add `CPUOffloadingSpec` (#24251) Signed-off-by: Or Ozeri --- docs/features/disagg_prefill.md | 6 ++ .../{test_cpu.py => test_cpu_manager.py} | 0 tests/v1/kv_offload/test_cpu_offloading.py | 62 +++++++++++++++ vllm/v1/kv_offload/cpu.py | 75 +++++++++++++++++++ vllm/v1/kv_offload/factory.py | 3 + 5 files changed, 146 insertions(+) rename tests/v1/kv_offload/{test_cpu.py => test_cpu_manager.py} (100%) create mode 100644 tests/v1/kv_offload/test_cpu_offloading.py create mode 100644 vllm/v1/kv_offload/cpu.py diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index 996ef00a6b960..69f70b8ff5ac3 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -31,6 +31,12 @@ Now supports 5 types of connectors: --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}' ``` +- **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker): + + ```bash + --kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "num_cpu_blocks": 1000}}' + ``` + ## Benchmarks Please refer to for disaggregated prefilling benchmarks. diff --git a/tests/v1/kv_offload/test_cpu.py b/tests/v1/kv_offload/test_cpu_manager.py similarity index 100% rename from tests/v1/kv_offload/test_cpu.py rename to tests/v1/kv_offload/test_cpu_manager.py diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py new file mode 100644 index 0000000000000..fc8ca09bea3de --- /dev/null +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import time + +import pytest + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +CPU_BLOCK_SIZES = [16, 48] + + +@pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES) +def test_cpu_offloading(cpu_block_size: int) -> None: + """ + Tests OffloadingConnector with CPUOffloadingSpec. 
+ """ + + # configure OffloadingConnector (spec_name=CPUOffloadingSpec by default) + kv_transfer_config = KVTransferConfig( + kv_connector="OffloadingConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "num_cpu_blocks": 100, + "block_size": cpu_block_size + }, + ) + + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + gpu_memory_utilization=0.5, + kv_transfer_config=kv_transfer_config, + ) + + prompts = ["Hi " * 100] + sampling_params = SamplingParams(temperature=0, max_tokens=20) + + # run generation - this should trigger saving KV cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cold_time = time.time() - start_time + + # run generation again - should hit the GPU prefix cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + gpu_hit_time = time.time() - start_time + + # reset prefix cache to avoid GPU hit. + llm.reset_prefix_cache() + + # sleep for a sec to make sure CPU finished storing + time.sleep(1) + + # run generation again - this should trigger loading from CPU + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cpu_hit_time = time.time() - start_time + + print("Generation times:") + print(f" Cold: {cold_time * 1000:.2f}ms") + print(f" GPU hit: {gpu_hit_time * 1000:.2f}ms") + print(f" CPU hit: {cpu_hit_time * 1000:.2f}ms") diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py new file mode 100644 index 0000000000000..b85d375fe63e2 --- /dev/null +++ b/vllm/v1/kv_offload/cpu.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterator +from typing import Optional + +import torch + +from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.platforms import current_platform +from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager +from vllm.v1.kv_offload.backends.cpu import CPUBackend +from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec +from vllm.v1.kv_offload.spec import OffloadingSpec +from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler +from vllm.v1.kv_offload.worker.worker import OffloadingHandler + + +class CPUOffloadingSpec(OffloadingSpec): + + def __init__(self, vllm_config: VllmConfig): + super().__init__(vllm_config) + + num_cpu_blocks = self.extra_config.get("num_cpu_blocks") + if not num_cpu_blocks: + raise Exception("num_cpu_blocks must be specified " + "in kv_connector_extra_config") + self.num_cpu_blocks: int = num_cpu_blocks + + # scheduler-side + self._manager: Optional[OffloadingManager] = None + + # worker-side + self._handler: Optional[OffloadingHandler] = None + + def get_manager(self) -> OffloadingManager: + if not self._manager: + kv_events_config = self.vllm_config.kv_events_config + enable_events = (kv_events_config is not None + and kv_events_config.enable_kv_cache_events) + self._manager = LRUOffloadingManager(CPUBackend( + block_size=self.offloaded_block_size, + num_blocks=self.num_cpu_blocks), + enable_events=enable_events) + return self._manager + + def get_handlers( + self, kv_caches: dict[str, torch.Tensor] + ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], + OffloadingHandler]]: + if not self._handler: + if not current_platform.is_cuda(): + raise Exception("CPU Offloading is 
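`CPUOffloadingSpec` above wires the CPU backend into an LRU-based manager. As a rough illustration of the eviction idea only (this is not the `LRUOffloadingManager` API), an ordered dict can track offloaded block hashes and evict the least recently touched ones once the CPU pool is full:

```python
# Rough sketch of LRU bookkeeping for offloaded blocks; the real manager
# lives in vllm/v1/kv_offload/lru_manager.py and differs in detail.
from collections import OrderedDict


class LRUBlockPool:
    def __init__(self, num_blocks: int) -> None:
        self.num_blocks = num_blocks
        self.blocks: OrderedDict[int, None] = OrderedDict()

    def touch(self, block_hash: int) -> None:
        """Mark a block as most recently used, inserting it if needed."""
        self.blocks.pop(block_hash, None)
        self.blocks[block_hash] = None
        while len(self.blocks) > self.num_blocks:
            evicted, _ = self.blocks.popitem(last=False)
            print(f"evicting block {evicted}")


pool = LRUBlockPool(num_blocks=2)
for h in (1, 2, 1, 3):   # block 2 becomes least recently used -> evicted
    pool.touch(h)
assert list(pool.blocks) == [1, 3]
```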
currently only supported" + " on CUDA GPUs") + + layer_names = list(kv_caches.keys()) + layers = get_layers_from_vllm_config(self.vllm_config, + AttentionLayerBase, + layer_names) + attn_backends = { + layer_name: layers[layer_name].get_attn_backend() + for layer_name in layer_names + } + + self._handler = CpuGpuOffloadingHandler( + attn_backends=attn_backends, + gpu_block_size=self.gpu_block_size, + cpu_block_size=self.offloaded_block_size, + num_cpu_blocks=self.num_cpu_blocks, + gpu_caches=kv_caches) + + assert self._handler is not None + yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler + yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py index 6365ab4a6db75..f9bef6cea9038 100644 --- a/vllm/v1/kv_offload/factory.py +++ b/vllm/v1/kv_offload/factory.py @@ -51,3 +51,6 @@ class OffloadingSpecFactory: # Register various specs here. +OffloadingSpecFactory.register_spec("CPUOffloadingSpec", + "vllm.v1.kv_offload.cpu", + "CPUOffloadingSpec") From f552d5e578077574276aa9d83139b91e1d5ae163 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 23 Sep 2025 04:18:24 +0800 Subject: [PATCH 107/185] [CI/Build] Skip Qwen3-VL initialization tests until models are actually released (#25394) Signed-off-by: DarkLight1337 --- tests/models/registry.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 29b6980aaa42a..6047a7a3e98dc 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -562,10 +562,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct", # noqa: E501 max_model_len=4096, - min_transformers_version="4.57"), # noqa: E501 + min_transformers_version="4.57", + is_available_online=False), "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", # noqa: E501 - max_model_len=4096, - min_transformers_version="4.57"), + max_model_len=4096, + min_transformers_version="4.57", + is_available_online=False), "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", trust_remote_code=True), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", From 8bed1791091510012c33f563ee8ff02e13bf6297 Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:14:44 -0700 Subject: [PATCH 108/185] [TPU] update torch_xla dependency for PyPI compatibility (#25278) Signed-off-by: Johnny Yang Co-authored-by: Chengji Yao --- requirements/tpu.txt | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 7ea239b48ea26..4241cbb2b0333 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -14,14 +14,4 @@ nixl==0.3.0 tpu_info==0.4.0 # Install torch_xla ---pre ---extra-index-url https://download.pytorch.org/whl/nightly/cpu ---find-links https://storage.googleapis.com/libtpu-wheels/index.html ---find-links https://storage.googleapis.com/libtpu-releases/index.html ---find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html ---find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.9.0.dev20250730 -torchvision==0.24.0.dev20250730 -torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" - +torch_xla[tpu, pallas]==2.8.0 \ No newline at end of file From 45d7d852d362b055b7c02eb967c4ea00d6f9e130 Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Mon, 22 Sep 2025 19:38:19 -0400 Subject: [PATCH 109/185] [Frontend] Responses API MCP tools for built in tools and to pass through headers (#24628) Signed-off-by: Alec Solder Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com> Co-authored-by: Alec Solder Co-authored-by: Ye (Charlotte) Qi --- .../openai/test_response_api_mcp_tools.py | 106 +++++++++ .../openai/test_response_api_with_harmony.py | 9 +- tests/test_envs.py | 216 ++++++++++++++++++ vllm/entrypoints/context.py | 37 ++- vllm/entrypoints/harmony_utils.py | 4 +- vllm/entrypoints/openai/serving_responses.py | 25 +- vllm/entrypoints/tool_server.py | 29 ++- vllm/envs.py | 66 +++++- 8 files changed, 463 insertions(+), 29 deletions(-) create mode 100644 tests/entrypoints/openai/test_response_api_mcp_tools.py create mode 100644 tests/test_envs.py diff --git a/tests/entrypoints/openai/test_response_api_mcp_tools.py b/tests/entrypoints/openai/test_response_api_mcp_tools.py new file mode 100644 index 0000000000000..b0eb84712c199 --- /dev/null +++ b/tests/entrypoints/openai/test_response_api_mcp_tools.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import pytest_asyncio +from openai import OpenAI + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "openai/gpt-oss-20b" + + +@pytest.fixture(scope="module") +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module") +def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch): + args = ["--enforce-eager", "--tool-server", "demo"] + + with monkeypatch_module.context() as m: + m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") + m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv") + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="function") +def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch): + args = ["--enforce-eager", "--tool-server", "demo"] + + with monkeypatch_module.context() as m: + m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") + m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv") + m.setenv("GPT_OSS_SYSTEM_TOOL_MCP_LABELS", + "code_interpreter,container") + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def mcp_disabled_client(mcp_disabled_server): + async with mcp_disabled_server.get_async_client() as async_client: + yield async_client + + +@pytest_asyncio.fixture +async def mcp_enabled_client(mcp_enabled_server): + async with mcp_enabled_server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") +async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, + model_name: str): + response = await 
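These MCP tests exercise the `GPT_OSS_SYSTEM_TOOL_MCP_LABELS` flag: an MCP tool whose `server_label` matches one of the configured labels is treated as a built-in system tool, otherwise it is ignored (hence the `tool_output_tokens` assertions differ between the enabled and disabled fixtures). A hedged sketch of that gating logic; the real check lives in the responses serving path and may differ:

```python
# Sketch of env-gated MCP server labels: only labels listed in
# GPT_OSS_SYSTEM_TOOL_MCP_LABELS are treated as built-in system tools.
# This mirrors the behavior the tests check, not the exact vLLM code.
import os


def allowed_mcp_labels() -> set[str]:
    raw = os.environ.get("GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "")
    return {label.strip() for label in raw.split(",") if label.strip()}


def is_builtin_mcp_tool(tool: dict) -> bool:
    return (tool.get("type") == "mcp"
            and tool.get("server_label") in allowed_mcp_labels())


os.environ["GPT_OSS_SYSTEM_TOOL_MCP_LABELS"] = "code_interpreter,container"
tool = {"type": "mcp", "server_label": "code_interpreter",
        "server_url": "http://localhost:8888"}
assert is_builtin_mcp_tool(tool)
```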
mcp_enabled_client.responses.create( + model=model_name, + # TODO: Ideally should be able to set max tool calls + # to prevent multi-turn, but it is not currently supported + # would speed up the test + input=("What's the first 4 digits after the decimal point of " + "cube root of `19910212 * 20250910`? " + "Show only the digits. The python interpreter is not stateful " + "and you must print to see the output."), + tools=[{ + "type": "mcp", + "server_label": "code_interpreter", + # URL unused for DemoToolServer + "server_url": "http://localhost:8888" + }], + ) + assert response is not None + assert response.status == "completed" + assert response.usage.output_tokens_details.tool_output_tokens > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") +async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, + model_name: str): + response = await mcp_disabled_client.responses.create( + model=model_name, + # TODO: Ideally should be able to set max tool calls + # to prevent multi-turn, but it is not currently supported + # would speed up the test + input=("What's the first 4 digits after the decimal point of " + "cube root of `19910212 * 20250910`? " + "Show only the digits. The python interpreter is not stateful " + "and you must print to see the output."), + tools=[{ + "type": "mcp", + "server_label": "code_interpreter", + # URL unused for DemoToolServer + "server_url": "http://localhost:8888" + }], + ) + assert response is not None + assert response.status == "completed" + assert response.usage.output_tokens_details.tool_output_tokens == 0 diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index f3c3148577b85..23d8373d97809 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -454,7 +454,13 @@ async def test_web_search(client: OpenAI, model_name: str): async def test_code_interpreter(client: OpenAI, model_name: str): response = await client.responses.create( model=model_name, - input="Multiply 64548*15151 using builtin python interpreter.", + # TODO: Ideally should be able to set max tool calls + # to prevent multi-turn, but it is not currently supported + # would speed up the test + input=("What's the first 4 digits after the decimal point of " + "cube root of `19910212 * 20250910`? " + "Show only the digits. 
The python interpreter is not stateful " + "and you must print to see the output."), tools=[{ "type": "code_interpreter", "container": { @@ -464,6 +470,7 @@ async def test_code_interpreter(client: OpenAI, model_name: str): ) assert response is not None assert response.status == "completed" + assert response.usage.output_tokens_details.tool_output_tokens > 0 def get_weather(latitude, longitude): diff --git a/tests/test_envs.py b/tests/test_envs.py new file mode 100644 index 0000000000000..f81a6e2e415cd --- /dev/null +++ b/tests/test_envs.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +from unittest.mock import patch + +import pytest + +from vllm.envs import env_list_with_choices, env_with_choices + + +class TestEnvWithChoices: + """Test cases for env_with_choices function.""" + + def test_default_value_returned_when_env_not_set(self): + """Test default is returned when env var is not set.""" + env_func = env_with_choices("NONEXISTENT_ENV", "default", + ["option1", "option2"]) + assert env_func() == "default" + + def test_none_default_returned_when_env_not_set(self): + """Test that None is returned when env not set and default is None.""" + env_func = env_with_choices("NONEXISTENT_ENV", None, + ["option1", "option2"]) + assert env_func() is None + + def test_valid_value_returned_case_sensitive(self): + """Test that valid value is returned in case sensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "option1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=True) + assert env_func() == "option1" + + def test_valid_lowercase_value_returned_case_insensitive(self): + """Test that lowercase value is accepted in case insensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "option1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["OPTION1", "OPTION2"], + case_sensitive=False) + assert env_func() == "option1" + + def test_valid_uppercase_value_returned_case_insensitive(self): + """Test that uppercase value is accepted in case insensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "OPTION1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=False) + assert env_func() == "OPTION1" + + def test_invalid_value_raises_error_case_sensitive(self): + """Test that invalid value raises ValueError in case sensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "invalid"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=True) + with pytest.raises(ValueError, + match="Invalid value 'invalid' for TEST_ENV"): + env_func() + + def test_case_mismatch_raises_error_case_sensitive(self): + """Test that case mismatch raises ValueError in case sensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "OPTION1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=True) + with pytest.raises(ValueError, + match="Invalid value 'OPTION1' for TEST_ENV"): + env_func() + + def test_invalid_value_raises_error_case_insensitive(self): + """Test that invalid value raises ValueError when case insensitive.""" + with patch.dict(os.environ, {"TEST_ENV": "invalid"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=False) + with pytest.raises(ValueError, + match="Invalid value 'invalid' for TEST_ENV"): + env_func() + + def 
test_callable_choices_resolved_correctly(self): + """Test that callable choices are resolved correctly.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1"}): + env_func = env_with_choices("TEST_ENV", "default", get_choices) + assert env_func() == "dynamic1" + + def test_callable_choices_with_invalid_value(self): + """Test that callable choices raise error for invalid values.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "invalid"}): + env_func = env_with_choices("TEST_ENV", "default", get_choices) + with pytest.raises(ValueError, + match="Invalid value 'invalid' for TEST_ENV"): + env_func() + + +class TestEnvListWithChoices: + """Test cases for env_list_with_choices function.""" + + def test_default_list_returned_when_env_not_set(self): + """Test that default list is returned when env var is not set.""" + env_func = env_list_with_choices("NONEXISTENT_ENV", + ["default1", "default2"], + ["option1", "option2"]) + assert env_func() == ["default1", "default2"] + + def test_empty_default_list_returned_when_env_not_set(self): + """Test that empty default list is returned when env not set.""" + env_func = env_list_with_choices("NONEXISTENT_ENV", [], + ["option1", "option2"]) + assert env_func() == [] + + def test_single_valid_value_parsed_correctly(self): + """Test that single valid value is parsed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": "option1"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1"] + + def test_multiple_valid_values_parsed_correctly(self): + """Test that multiple valid values are parsed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option2"] + + def test_values_with_whitespace_trimmed(self): + """Test that values with whitespace are trimmed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": " option1 , option2 "}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option2"] + + def test_empty_values_filtered_out(self): + """Test that empty values are filtered out.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,,option2,"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option2"] + + def test_empty_string_returns_default(self): + """Test that empty string returns default.""" + with patch.dict(os.environ, {"TEST_ENV": ""}): + env_func = env_list_with_choices("TEST_ENV", ["default"], + ["option1", "option2"]) + assert env_func() == ["default"] + + def test_only_commas_returns_default(self): + """Test that string with only commas returns default.""" + with patch.dict(os.environ, {"TEST_ENV": ",,,"}): + env_func = env_list_with_choices("TEST_ENV", ["default"], + ["option1", "option2"]) + assert env_func() == ["default"] + + def test_case_sensitive_validation(self): + """Test case sensitive validation.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,OPTION2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"], + case_sensitive=True) + with pytest.raises(ValueError, + match="Invalid value 'OPTION2' in TEST_ENV"): + env_func() + + def test_case_insensitive_validation(self): + """Test case insensitive validation.""" + with patch.dict(os.environ, {"TEST_ENV": 
"OPTION1,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"], + case_sensitive=False) + assert env_func() == ["OPTION1", "option2"] + + def test_invalid_value_in_list_raises_error(self): + """Test that invalid value in list raises ValueError.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,invalid,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + with pytest.raises(ValueError, + match="Invalid value 'invalid' in TEST_ENV"): + env_func() + + def test_callable_choices_resolved_correctly(self): + """Test that callable choices are resolved correctly.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1,dynamic2"}): + env_func = env_list_with_choices("TEST_ENV", [], get_choices) + assert env_func() == ["dynamic1", "dynamic2"] + + def test_callable_choices_with_invalid_value(self): + """Test that callable choices raise error for invalid values.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1,invalid"}): + env_func = env_list_with_choices("TEST_ENV", [], get_choices) + with pytest.raises(ValueError, + match="Invalid value 'invalid' in TEST_ENV"): + env_func() + + def test_duplicate_values_preserved(self): + """Test that duplicate values in the list are preserved.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option1", "option2"] diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 8619452f2445f..ea81fdbcd825e 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -8,6 +8,7 @@ from abc import ABC, abstractmethod from contextlib import AsyncExitStack from typing import TYPE_CHECKING, Optional, Union +from openai.types.responses.tool import Mcp from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm.entrypoints.harmony_utils import ( @@ -21,6 +22,24 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) +# This is currently needed as the tool type doesn't 1:1 match the +# tool namespace, which is what is used to look up the +# connection to the tool server +_TOOL_NAME_TO_TYPE_MAP = { + "browser": "web_search_preview", + "python": "code_interpreter", + "container": "container", +} + + +def _map_tool_name_to_tool_type(tool_name: str) -> str: + if tool_name not in _TOOL_NAME_TO_TYPE_MAP: + available_tools = ', '.join(_TOOL_NAME_TO_TYPE_MAP.keys()) + raise ValueError( + f"Built-in tool name '{tool_name}' not defined in mapping. 
" + f"Available tools: {available_tools}") + return _TOOL_NAME_TO_TYPE_MAP[tool_name] + class TurnTokens: """Tracks token counts for a single conversation turn.""" @@ -59,8 +78,8 @@ class ConversationContext(ABC): @abstractmethod async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack, - request_id: str) -> None: + exit_stack: AsyncExitStack, request_id: str, + mcp_tools: dict[str, Mcp]) -> None: pass @abstractmethod @@ -96,8 +115,8 @@ class SimpleContext(ConversationContext): raise NotImplementedError("Should not be called.") async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack, - request_id: str) -> None: + exit_stack: AsyncExitStack, request_id: str, + mcp_tools: dict[str, Mcp]) -> None: pass async def cleanup_session(self) -> None: @@ -318,13 +337,17 @@ class HarmonyContext(ConversationContext): ] async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack, - request_id: str) -> None: + exit_stack: AsyncExitStack, request_id: str, + mcp_tools: dict[str, Mcp]): if tool_server: for tool_name in self.available_tools: if tool_name not in self._tool_sessions: + tool_type = _map_tool_name_to_tool_type(tool_name) + headers = mcp_tools[ + tool_type].headers if tool_type in mcp_tools else None tool_session = await exit_stack.enter_async_context( - tool_server.new_session(tool_name, request_id)) + tool_server.new_session(tool_name, request_id, + headers)) self._tool_sessions[tool_name] = tool_session exit_stack.push_async_exit(self.cleanup_session) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 1364a41be950d..57e4bb1e1da52 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -126,8 +126,10 @@ def get_developer_message( function_tools: list[Union[Tool, ChatCompletionToolsParam]] = [] for tool in tools: if tool.type in ("web_search_preview", "code_interpreter", - "container"): + "container", "mcp"): # These are built-in tools that are added to the system message. + # Adding in MCP for now until we support MCP tools executed + # server side pass elif tool.type == "function": diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 6e243671af242..99bb464db1d13 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -460,8 +460,12 @@ class OpenAIServingResponses(OpenAIServing): async with AsyncExitStack() as exit_stack: try: + mcp_tools = { + tool.server_label: tool + for tool in request.tools if tool.type == "mcp" + } await context.init_tool_sessions(self.tool_server, exit_stack, - request.request_id) + request.request_id, mcp_tools) async for _ in result_generator: pass except asyncio.CancelledError: @@ -748,11 +752,16 @@ class OpenAIServingResponses(OpenAIServing): # New conversation. 
reasoning_effort = (request.reasoning.effort if request.reasoning else None) - # Temporary: OpenAI types doesn't have container tool - # so we used MCP to cover that, up for change tool_types = [tool.type for tool in request.tools] - if envs.VLLM_GPT_OSS_USE_CONTAINER_TOOL: - tool_types.append("container") + + # Allow the MCP Tool type to enable built in tools if the + # server_label is allowlisted in + # envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS + if envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS: + for tool in request.tools: + if (tool.type == "mcp" and tool.server_label + in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS): + tool_types.append(tool.server_label) enable_browser = ("web_search_preview" in tool_types and self.tool_server is not None and self.tool_server.has_tool("browser")) @@ -1653,8 +1662,12 @@ class OpenAIServingResponses(OpenAIServing): async with AsyncExitStack() as exit_stack: processer = None if self.use_harmony: + mcp_tools = { + tool.server_label: tool + for tool in request.tools if tool.type == "mcp" + } await context.init_tool_sessions(self.tool_server, exit_stack, - request.request_id) + request.request_id, mcp_tools) processer = self._process_harmony_streaming_events else: processer = self._process_simple_streaming_events diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 056a571fb2fd1..4c627b865ef92 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -18,7 +18,6 @@ if TYPE_CHECKING: async def list_server_and_tools(server_url: str): from mcp import ClientSession from mcp.client.sse import sse_client - async with sse_client(url=server_url) as streams, ClientSession( *streams) as session: initialize_response = await session.initialize() @@ -86,8 +85,12 @@ class ToolServer(ABC): pass @abstractmethod - def new_session(self, tool_name: str, - session_id: str) -> AbstractAsyncContextManager[Any]: + def new_session( + self, + tool_name: str, + session_id: str, + headers: Optional[dict[str, str]] = None + ) -> AbstractAsyncContextManager[Any]: """ Create a session for the tool. 
""" @@ -144,16 +147,21 @@ class MCPToolServer(ToolServer): return self.harmony_tool_descriptions.get(tool_name) @asynccontextmanager - async def new_session(self, tool_name: str, session_id: str): + async def new_session(self, + tool_name: str, + session_id: str, + headers: Optional[dict[str, str]] = None): from mcp import ClientSession from mcp.client.sse import sse_client url = self.urls.get(tool_name) - headers = {"x-session-id": session_id} + request_headers = {"x-session-id": session_id} + if headers is not None: + request_headers.update(headers) if not url: raise KeyError(f"Tool '{tool_name}' is not supported") - async with sse_client(url=url, - headers=headers) as streams, ClientSession( - *streams) as session: + async with sse_client( + url=url, headers=request_headers) as streams, ClientSession( + *streams) as session: await session.initialize() yield session @@ -189,7 +197,10 @@ class DemoToolServer(ToolServer): raise ValueError(f"Unknown tool {tool_name}") @asynccontextmanager - async def new_session(self, tool_name: str, session_id: str): + async def new_session(self, + tool_name: str, + session_id: str, + headers: Optional[dict[str, str]] = None): if tool_name not in self.tools: raise KeyError(f"Tool '{tool_name}' is not supported") yield self.tools[tool_name] diff --git a/vllm/envs.py b/vllm/envs.py index eaee2f6cc7716..ee5efff8bcd92 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -185,11 +185,11 @@ if TYPE_CHECKING: VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False - VLLM_GPT_OSS_USE_CONTAINER_TOOL: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" + GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = [] VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None @@ -261,6 +261,58 @@ def env_with_choices( return _get_validated_env +def env_list_with_choices( + env_name: str, + default: list[str], + choices: Union[list[str], Callable[[], list[str]]], + case_sensitive: bool = True) -> Callable[[], list[str]]: + """ + Create a lambda that validates environment variable + containing comma-separated values against allowed choices + + Args: + env_name: Name of the environment variable + default: Default list of values if not set + choices: List of valid string options or callable that returns list + case_sensitive: Whether validation should be case sensitive + + Returns: + Lambda function for environment_variables + dict that returns list of strings + """ + + def _get_validated_env_list() -> list[str]: + value = os.getenv(env_name) + if value is None: + return default + + # Split comma-separated values and strip whitespace + values = [v.strip() for v in value.split(",") if v.strip()] + + if not values: + return default + + # Resolve choices if it's a callable (for lazy loading) + actual_choices = choices() if callable(choices) else choices + + # Validate each value + for val in values: + if not case_sensitive: + check_value = val.lower() + check_choices = [choice.lower() for choice in actual_choices] + else: + check_value = val + check_choices = actual_choices + + if check_value not in check_choices: + raise ValueError(f"Invalid value '{val}' in {env_name}. 
" + f"Valid options: {actual_choices}.") + + return values + + return _get_validated_env_list + + def get_vllm_port() -> Optional[int]: """Get the port from VLLM_PORT environment variable. @@ -1320,10 +1372,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), - # Allows vllm use container tool - "VLLM_GPT_OSS_USE_CONTAINER_TOOL": - lambda: bool(int(os.getenv("VLLM_GPT_OSS_USE_CONTAINER_TOOL", "0"))), - # Allows harmony instructions to be injected on system messages "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool( @@ -1343,6 +1391,14 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: os.getenv("VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME", "VLLM_OBJECT_STORAGE_SHM_BUFFER"), + + # Valid values are container,code_interpreter,web_search_preview + # ex GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter + "GPT_OSS_SYSTEM_TOOL_MCP_LABELS": + env_list_with_choices("GPT_OSS_SYSTEM_TOOL_MCP_LABELS", [], + ["container", + "code_interpreter", + "web_search_preview"]), } # --8<-- [end:env-vars-definition] From d588cd24061011f76da721c89f9e2171a2b2c4c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Mon, 22 Sep 2025 20:07:43 -0400 Subject: [PATCH 110/185] [Bugfix] fix custom op test (#25429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič --- .../model_executor/test_enabled_custom_ops.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 86139d598582d..92ce10a9efc0b 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional import pytest import torch @@ -34,15 +35,15 @@ class Relu3(ReLUSquaredActivation): [ # Default values based on compile level # - All by default (no Inductor compilation) - ("", 0, False, [True] * 4, True), - ("", 1, True, [True] * 4, True), - ("", 2, False, [True] * 4, True), + (None, 0, False, [True] * 4, True), + (None, 1, True, [True] * 4, True), + (None, 2, False, [True] * 4, True), # - None by default (with Inductor) - ("", 3, True, [False] * 4, False), - ("", 4, True, [False] * 4, False), + (None, 3, True, [False] * 4, False), + (None, 4, True, [False] * 4, False), # - All by default (without Inductor) - ("", 3, False, [True] * 4, True), - ("", 4, False, [True] * 4, True), + (None, 3, False, [True] * 4, True), + (None, 4, False, [True] * 4, True), # Explicitly enabling/disabling # # Default: all @@ -54,7 +55,7 @@ class Relu3(ReLUSquaredActivation): # All but SiluAndMul ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True), # All but ReLU3 (even if ReLU2 is on) - ("-relu3,relu2", 3, False, [1, 1, 1, 0], True), + ("-relu3,+relu2", 3, False, [1, 1, 1, 0], True), # RMSNorm and SiluAndMul ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False), # All but RMSNorm @@ -67,12 +68,13 @@ class Relu3(ReLUSquaredActivation): # All but RMSNorm ("all,-rms_norm", 4, True, [0, 1, 1, 1], True), ]) -def test_enabled_ops(env: str, torch_level: int, use_inductor: bool, +def test_enabled_ops(env: Optional[str], torch_level: int, use_inductor: bool, ops_enabled: list[int], default_on: bool): + custom_ops = 
env.split(',') if env else [] vllm_config = VllmConfig( compilation_config=CompilationConfig(use_inductor=bool(use_inductor), level=torch_level, - custom_ops=env.split(","))) + custom_ops=custom_ops)) with set_current_vllm_config(vllm_config): assert CustomOp.default_on() == default_on From f31ff874607c311dda7aebe52f73c20f5cad923d Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 22 Sep 2025 20:09:52 -0400 Subject: [PATCH 111/185] [Core] Drop overly aggressive whisper assertion (#25408) Signed-off-by: Russell Bryant --- vllm/v1/core/sched/scheduler.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ef77d9e2d3ffd..7fc4776b02611 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -463,10 +463,6 @@ class Scheduler(SchedulerInterface): # always padded to the maximum length. If we support other # encoder-decoder models, this will need to be updated if we # want to only allocate what is needed. - assert ("whisper" - in self.vllm_config.model_config.model.lower()), ( - "Whisper is the only supported " - "encoder-decoder model.") num_encoder_tokens =\ self.scheduler_config.max_num_encoder_input_tokens else: From 090197034faf3b193c4467cedeb9281e3078892d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 23 Sep 2025 02:10:59 +0200 Subject: [PATCH 112/185] [Bugfix] Fix missing `clear_connector_metadata` (#25397) Signed-off-by: NickLucche --- .../unit/test_kv_connector_lifecyle.py | 59 +++++++++++++++++++ .../worker/kv_connector_model_runner_mixin.py | 1 + 2 files changed, 60 insertions(+) create mode 100644 tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py new file mode 100644 index 0000000000000..fe6296cf12ea0 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa: E501 + SharedStorageConnectorMetadata) +from vllm.distributed.kv_transfer.kv_transfer_state import ( + ensure_kv_transfer_initialized, get_kv_transfer_group) +from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput +from vllm.v1.worker.kv_connector_model_runner_mixin import ( + KVConnectorModelRunnerMixin) + +# Importing utils registers TestSharedStorageConnector with the factory +from .utils import create_vllm_config + + +def _make_empty_scheduler_output(): + return SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=CachedRequestData.make_empty(), + num_scheduled_tokens={}, + total_num_scheduled_tokens=0, + scheduled_spec_decode_tokens={}, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=[], + finished_req_ids=set(), + free_encoder_mm_hashes=[], + structured_output_request_ids={}, + grammar_bitmask=None, + kv_connector_metadata=SharedStorageConnectorMetadata(), + ) + + +def test_kv_connector_mixin_clears_metadata(): + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_connector = "TestSharedStorageConnector" + vllm_config.kv_transfer_config.kv_role = "kv_both" + vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = ("unit") + + # Initialize the global connector instance + ensure_kv_transfer_initialized(vllm_config) + + try: + # Minimal scheduler output 
with empty metadata; mixin should still + # bind/clear metadata even if no loads happen + scheduler_output = _make_empty_scheduler_output() + + # Invoke the no-forward path which uses the mixin context manager + KVConnectorModelRunnerMixin.kv_connector_no_forward( + scheduler_output, vllm_config) + + # Verify clear_connector_metadata was called on the connector + connector = get_kv_transfer_group() + assert connector._connector_metadata is None + # Test connector wrapper records method calls + assert connector.call_record.get("bind_connector_metadata", 0) == 1 + assert connector.call_record.get("clear_connector_metadata", 0) == 1 + finally: + # Ensure we clean up the global connector between tests + KVConnectorModelRunnerMixin.ensure_kv_transfer_shutdown() diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 016a90c196ba3..7eaff924ecc1f 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -123,6 +123,7 @@ class KVConnectorModelRunnerMixin: output.kv_connector_stats = KVConnectorModelRunnerMixin.\ get_kv_connector_stats() + kv_connector.clear_connector_metadata() @staticmethod def get_kv_connector_stats() -> Optional[KVConnectorStats]: From ac0048c0aea977e520aa19f3b9155fc19d696df7 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 22 Sep 2025 20:26:17 -0400 Subject: [PATCH 113/185] [BugFix] [DP/EP] Fix slow execution when BS <= DP (#25407) Signed-off-by: Matthew Bonanni Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Chris Bamford --- vllm/v1/worker/gpu_model_runner.py | 7 ++++--- vllm/v1/worker/gpu_worker.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b0cd0f4133079..89b9a3c34f2ac 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -55,7 +55,7 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, check_use_alibi, get_dtype_size, + GiB_bytes, cdiv, check_use_alibi, get_dtype_size, is_pin_memory_available, length_from_prompt_token_ids_or_embeds, round_up, supports_dynamo) @@ -2913,12 +2913,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Note: Overriding max_query_len to be the prefill tokens max_query_len = num_prefill_tokens elif uniform_decode: - num_reqs = num_tokens // max_query_len + assert not create_mixed_batch + num_reqs = cdiv(num_tokens, max_query_len) assert num_reqs <= max_num_reqs, \ "Do not capture num_reqs > max_num_reqs for uniform batch" num_scheduled_tokens_list = [max_query_len] * num_reqs if num_tokens % max_query_len != 0: - num_scheduled_tokens_list[-1] += num_tokens % max_query_len + num_scheduled_tokens_list[-1] = num_tokens % max_query_len else: num_reqs = min(num_tokens, max_num_reqs) min_tokens_per_req = num_tokens // num_reqs diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 8b1e1bb8f45ca..ca8734d28b459 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -487,7 +487,7 @@ class Worker(WorkerBase): sort_by="self_cuda_time_total")) def execute_dummy_batch(self) -> None: - self.model_runner._dummy_run(1) + self.model_runner._dummy_run(1, 
uniform_decode=True) def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) From 0b7bed9c386d41945b990d3ac5311ac7d9c8b499 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Mon, 22 Sep 2025 21:20:53 -0400 Subject: [PATCH 114/185] [Performance] Remove input pads in cutlass_mla and optimize v_proj output handling (#25184) Signed-off-by: Alexander Matveev --- vllm/v1/attention/backends/mla/common.py | 45 ++++++++++++++++--- vllm/v1/attention/backends/mla/cutlass_mla.py | 30 +++++++------ 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 5b307810de930..9bca87c81d179 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -942,6 +942,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): qk_head_dim: int, v_head_dim: int, kv_b_proj: ColumnParallelLinear, + q_pad_num_heads: Optional[int] = None, ) -> None: if kv_sharing_target_layer_name is not None: raise NotImplementedError("KV sharing is not supported for MLA") @@ -959,6 +960,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): self.qk_head_dim = qk_head_dim self.v_head_dim = v_head_dim self.kv_b_proj = kv_b_proj + self.q_pad_num_heads = q_pad_num_heads if use_flashinfer_prefill(): logger.debug_once("Using FlashInfer prefill for MLA") @@ -1134,7 +1136,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): True, #Indicates actual_seq_lens are on GPU or CPU. ) - def _v_up_proj(self, x): + def _v_up_proj(self, x: torch.Tensor, out: torch.Tensor): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) if is_rocm_aiter_fp8bmm_enabled(): @@ -1146,12 +1148,23 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): transpose_bm=True) # Convert from (B, N, V) to (B, N * V) x = x.reshape(-1, self.num_heads * self.v_head_dim) + # Copy result + out.copy_(x) else: + # Convert from (B, N * V) to (N, B, V) + out = out.view(-1, self.num_heads, self.v_head_dim).transpose(0, 1) + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) - x = torch.bmm(x, self.W_UV) + torch.bmm(x, self.W_UV, out=out) # Reuse "out" to make it "hot" + # Convert from (N, B, V) to (B, N * V) - x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) - return x + out_new = out.transpose(0, 1).reshape( + -1, self.num_heads * self.v_head_dim) + + # Adjust output buffer shape back to the original (B, N * V) + N, B, V = out.shape + out.resize_((B, N * V)) + out.copy_(out_new) # Copy result def process_weights_after_loading(self, act_dtype: torch.dtype): @@ -1559,6 +1572,15 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): # Convert from (B, N, P) to (N, B, P) decode_q_nope = decode_q_nope.transpose(0, 1) + # Pads the head_dim if necessary (for the underlying kernel) + if self.q_pad_num_heads is not None: + B, N, L = decode_q_pe.shape + decode_pe_padded = decode_q_pe.new_empty( + (B, self.q_pad_num_heads, L)) + decode_pe_padded.resize_((B, N, L)) + decode_pe_padded.copy_(decode_q_pe) + decode_q_pe = decode_pe_padded + if is_rocm_aiter_fp8bmm_enabled(): # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) decode_ql_nope = aiter_triton_fp8_bmm(decode_q_nope, @@ -1567,8 +1589,19 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): group_size=128, transpose_bm=True) else: + # Pads the head_dim if necessary (for the underlying kernel) + N, B, P = 
decode_q_nope.shape + _, _, L = self.W_UK_T.shape + if self.q_pad_num_heads is not None: + decode_ql_nope = decode_q_nope.new_empty( + (self.q_pad_num_heads, B, L)) + decode_ql_nope.resize_((N, B, L)) + + else: + decode_ql_nope = decode_q_nope.new_empty((N, B, L)) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) + torch.bmm(decode_q_nope, self.W_UK_T, out=decode_ql_nope) # Convert from (N, B, L) to (B, N, L) decode_ql_nope = decode_ql_nope.transpose(0, 1) @@ -1603,5 +1636,5 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group()) # v_up projection - output[:num_decode_tokens] = self._v_up_proj(attn_out) + self._v_up_proj(attn_out, out=output[:num_decode_tokens]) return output_padded diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index ae534f3207b51..d44e20f2cb6be 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -74,6 +74,8 @@ class SM100Workspace: g_sm100_workspace = SM100Workspace(128 * 1024 * 1024) # 128MB +MAX_HEADS = 128 + class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): can_return_lse_for_decode: bool = True @@ -92,10 +94,18 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) + super().__init__(num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + q_pad_num_heads=MAX_HEADS, + **mla_args) unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): @@ -157,14 +167,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): MAX_HEADS = 128 assert H <= MAX_HEADS, f"H must be <= {MAX_HEADS}, but got {H}" - if H < MAX_HEADS: - q_nope_padded = q_nope.new_empty((B_q, MAX_HEADS, D_q_nope)) - q_nope_padded[:, :H] = q_nope - q_nope = q_nope_padded - - q_pe_padded = q_pe.new_empty((B_q, MAX_HEADS, D_q_pe)) - q_pe_padded[:, :H] = q_pe - q_pe = q_pe_padded assert len(page_table.shape) == 2 B_block_table, block_num = page_table.shape @@ -206,9 +208,9 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): ) if H < MAX_HEADS: + # Extract the subsets of the outputs + lse = lse[:, :H] if self.need_to_return_lse_for_decode else lse out = out[:, :H] - if self.need_to_return_lse_for_decode: - lse = lse[:, :H].contiguous() return out, lse From 9949aa2ef1cc77721dea9077e0f76b9deb6fd066 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 22 Sep 2025 21:42:45 -0400 Subject: [PATCH 115/185] [Perf] Apply torch.compile for `per_block_cast_to_fp8` (#24611) Signed-off-by: yewentao256 --- vllm/utils/deep_gemm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 38d92f01192b1..4083193d7650e 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -135,7 +135,7 @@ DEFAULT_BLOCK_SIZE = [128, 128] # Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38 -# TODO(wentao): optimize this function, using triton or cuda kernel 
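
# --- Illustrative example (not part of the diff) ---------------------------
# A standalone sketch of what the @torch.compile change below buys: wrapping a
# block-wise helper with dynamic=True so one compiled artifact serves varying
# input shapes. The helper here is a simplified stand-in and is not the actual
# per_block_cast_to_fp8 implementation (which also selects a platform-specific
# compile backend).
import torch


@torch.compile(dynamic=True)
def per_block_absmax(x: torch.Tensor, block: int = 128) -> torch.Tensor:
    # One max-abs value per (block x block) tile of the zero-padded 2D input.
    m, n = x.shape
    pad_m = (block - m % block) % block
    pad_n = (block - n % block) % block
    x = torch.nn.functional.pad(x, (0, pad_n, 0, pad_m))
    tiles = x.view(x.shape[0] // block, block, x.shape[1] // block, block)
    return tiles.abs().amax(dim=(1, 3))


if __name__ == "__main__":
    # dynamic=True avoids recompiling when the row count changes.
    for rows in (100, 257):
        print(per_block_absmax(torch.randn(rows, 512)).shape)
# ----------------------------------------------------------------------------
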
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def per_block_cast_to_fp8( x: torch.Tensor, block_size: list[int] = DEFAULT_BLOCK_SIZE, @@ -187,4 +187,4 @@ __all__ = [ "is_deep_gemm_e8m0_used", "is_deep_gemm_supported", "should_use_deepgemm_for_fp8_linear", -] +] \ No newline at end of file From 6fa78d8f23b88da47ce6e13815e5e82226f2af00 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 23 Sep 2025 09:48:12 +0800 Subject: [PATCH 116/185] [V0 deprecation] Remove platform v1 controling interface (#25410) Signed-off-by: Isotr0py --- tests/v1/test_async_llm_dp.py | 5 ----- vllm/engine/arg_utils.py | 27 --------------------------- vllm/platforms/cpu.py | 17 ----------------- vllm/platforms/cuda.py | 4 ---- vllm/platforms/interface.py | 14 -------------- vllm/platforms/rocm.py | 5 ----- vllm/platforms/tpu.py | 5 ----- vllm/platforms/xpu.py | 4 ---- 8 files changed, 81 deletions(-) diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 32da58011be98..cef0f362cff86 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -13,7 +13,6 @@ from vllm import SamplingParams from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import PromptType -from vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import DPAsyncMPClient @@ -29,10 +28,6 @@ engine_args = AsyncEngineArgs( data_parallel_size=DP_SIZE, ) -if not current_platform.supports_v1(engine_args.create_model_config()): - pytest.skip(reason="Requires V1-supporting platform.", - allow_module_level=True) - async def generate( engine: AsyncLLM, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d4d801b155e17..17df82c081aeb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1502,12 +1502,6 @@ class EngineArgs: _raise_or_fallback(feature_name=name, recommend_to_remove=True) return False - # Platforms must decide if they can support v1 for this model - if not current_platform.supports_v1(model_config=model_config): - _raise_or_fallback( - feature_name=f"device type={current_platform.device_type}", - recommend_to_remove=False) - return False ############################################################# # Experimental Features - allow users to opt in. @@ -1524,12 +1518,6 @@ class EngineArgs: recommend_to_remove=False) return False - # The platform may be supported on V1, but off by default for now. - if not current_platform.default_v1( # noqa: SIM103 - model_config=model_config) and _warn_or_fallback( - current_platform.device_name): - return False - if (current_platform.is_cpu() and model_config.get_sliding_window() is not None): _raise_or_fallback(feature_name="sliding window (CPU backend)", @@ -1796,21 +1784,6 @@ def _raise_or_fallback(feature_name: str, recommend_to_remove: bool): logger.warning(msg) -def _warn_or_fallback(feature_name: str) -> bool: - if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - logger.warning( - "Detected VLLM_USE_V1=1 with %s. Usage should " - "be considered experimental. Please report any " - "issues on Github.", feature_name) - should_exit = False - else: - logger.info( - "%s is experimental on VLLM_USE_V1=1. " - "Falling back to V0 Engine.", feature_name) - should_exit = True - return should_exit - - def human_readable_int(value): """Parse human-readable integers like '1k', '2M', etc. Including decimal values with decimal multipliers. 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cd41832bc2ea4..1e15dc6a91aa0 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -328,23 +328,6 @@ class CpuPlatform(Platform): def supports_structured_output(cls) -> bool: return True - @classmethod - def supports_v1(cls, model_config) -> bool: - """Returns whether the current platform can support v1 for the supplied - model configuration. - """ - return True - - @classmethod - def default_v1(cls, model_config) -> bool: - """Returns whether the current platform can use v1 by default for the - supplied model configuration. - """ - arch = cls.get_cpu_architecture() - return (cls.supports_v1(model_config) - and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC, - CpuArchEnum.ARM, CpuArchEnum.S390X)) - @classmethod def opaque_attention_op(cls) -> bool: return True diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b10bc03ee16c6..d5f3599acb1cc 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -384,10 +384,6 @@ class CudaPlatformBase(Platform): def supports_fp8(cls) -> bool: return cls.has_device_capability(89) - @classmethod - def supports_v1(cls, model_config: "ModelConfig") -> bool: - return True - @classmethod def use_custom_allreduce(cls) -> bool: return True diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index cad04ea14c01c..3f13ae72fe4d7 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -482,20 +482,6 @@ class Platform: or parallel_config.distributed_executor_backend == "external_launcher") - @classmethod - def supports_v1(cls, model_config: ModelConfig) -> bool: - """Returns whether the current platform can support v1 for the supplied - model configuration. - """ - return False - - @classmethod - def default_v1(cls, model_config: ModelConfig) -> bool: - """ - Returns whether the current platform supports v1 by default. 
- """ - return cls.supports_v1(model_config) - @classmethod def use_custom_allreduce(cls) -> bool: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 6a49bd4a33866..878718489fa88 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -396,11 +396,6 @@ class RocmPlatform(Platform): else: return torch.float8_e4m3fn - @classmethod - def supports_v1(cls, model_config: "ModelConfig") -> bool: - # V1 support on AMD gpus is experimental - return True - @classmethod def use_custom_allreduce(cls) -> bool: # We only enable custom allreduce for MI300 series diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 9852d948bc4b8..e4c73b1bae6fb 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -174,11 +174,6 @@ class TpuPlatform(Platform): def use_all_gather(cls) -> bool: return True - @classmethod - def supports_v1(cls, model_config: ModelConfig) -> bool: - # V1 support on TPU is experimental - return True - @classmethod def validate_request( cls, diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index eb591ae4454e4..034e039006c48 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -194,10 +194,6 @@ class XPUPlatform(Platform): def get_device_communicator_cls(cls) -> str: return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator" # noqa - @classmethod - def supports_v1(cls, model_config: ModelConfig) -> bool: - return True - @classmethod def device_count(cls) -> int: return torch.xpu.device_count() From c625f9043c5beb5921e94c6a5b3ac18372bab4db Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 23 Sep 2025 09:52:09 +0800 Subject: [PATCH 117/185] [V0 deprecation] Remove `_set_default_args_v0` function (#25409) Signed-off-by: Isotr0py --- vllm/engine/arg_utils.py | 83 ++++++---------------------------------- 1 file changed, 11 insertions(+), 72 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 17df82c081aeb..8c7a1b413cdb7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1147,20 +1147,15 @@ class EngineArgs: else: envs.set_vllm_use_v1(use_v1) - # Set default arguments for V0 or V1 Engine. - if use_v1: - self._set_default_args_v1(usage_context, model_config) - # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1 - if current_platform.is_cpu( - ) and current_platform.get_cpu_architecture() in ( - CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM): - logger.info( - "Chunked prefill is not supported for ARM and POWER " - "and S390X CPUs; " - "disabling it for V1 backend.") - self.enable_chunked_prefill = False - else: - self._set_default_args_v0(model_config) + # Set default arguments for V1 Engine. 
+ self._set_default_args(usage_context, model_config) + # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1 + if current_platform.is_cpu() and current_platform.get_cpu_architecture( + ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM): + logger.info("Chunked prefill is not supported for ARM and POWER " + "and S390X CPUs; " + "disabling it for V1 backend.") + self.enable_chunked_prefill = False assert self.enable_chunked_prefill is not None sliding_window: Optional[int] = None @@ -1528,64 +1523,8 @@ class EngineArgs: return True - def _set_default_args_v0(self, model_config: ModelConfig) -> None: - """Set Default Arguments for V0 Engine.""" - - max_model_len = model_config.max_model_len - use_long_context = max_model_len > 32768 - if self.enable_chunked_prefill is None: - # Chunked prefill not supported for Multimodal or MLA in V0. - if model_config.is_multimodal_model or model_config.use_mla: - self.enable_chunked_prefill = False - - # Enable chunked prefill by default for long context (> 32K) - # models to avoid OOM errors in initial memory profiling phase. - elif use_long_context: - is_gpu = current_platform.is_cuda() - use_sliding_window = (model_config.get_sliding_window() - is not None) - use_spec_decode = self.speculative_config is not None - - if (is_gpu and not use_sliding_window and not use_spec_decode - and not self.enable_lora): - self.enable_chunked_prefill = True - logger.warning( - "Chunked prefill is enabled by default for models " - "with max_model_len > 32K. Chunked prefill might " - "not work with some features or models. If you " - "encounter any issues, please disable by launching " - "with --enable-chunked-prefill=False.") - - if self.enable_chunked_prefill is None: - self.enable_chunked_prefill = False - - if not self.enable_chunked_prefill and use_long_context: - logger.warning( - "The model has a long context length (%s). This may cause" - "OOM during the initial memory profiling phase, or result " - "in low performance due to small KV cache size. Consider " - "setting --max-model-len to a smaller value.", max_model_len) - - # Disable prefix caching for multimodal models for VLLM_V0. - if self.enable_prefix_caching and model_config.is_multimodal_model: - logger.warning( - "--enable-prefix-caching is not supported for multimodal " - "models in V0 and has been disabled.") - self.enable_prefix_caching = False - - if self.enable_prompt_embeds: - logger.warning( - "--enable-prompt-embeds and --enable-prefix-caching " - "are not supported together in V0. Prefix caching has " - "been disabled.") - self.enable_prefix_caching = False - - # Set max_num_seqs to 256 for VLLM_V0. 
- if self.max_num_seqs is None: - self.max_num_seqs = 256 - - def _set_default_args_v1(self, usage_context: UsageContext, - model_config: ModelConfig) -> None: + def _set_default_args(self, usage_context: UsageContext, + model_config: ModelConfig) -> None: """Set Default Arguments for V1 Engine.""" # V1 always uses chunked prefills and prefix caching From 4741239db7b75949d577f99eb1c3a9e463dc77f3 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 22 Sep 2025 22:04:15 -0400 Subject: [PATCH 118/185] [Bug] Fix Long Context OOM Issue (#25290) Signed-off-by: yewentao256 --- vllm/v1/attention/backends/mla/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 9bca87c81d179..a177117a50bd1 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -481,7 +481,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): # which would result in up-projected context being # 2*(192*128)*(64*1024) = 3gb # (assuming 192 QK head dim, 128 heads, and fp16) - 128 * 1024) + 64 * 1024) assert self.chunked_prefill_workspace_size >= \ scheduler_config.max_num_seqs * cache_config.block_size if self.dcp_world_size > 1: From fc97733da801f1d6464e706f41ebd5180f2abb51 Mon Sep 17 00:00:00 2001 From: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:04:47 +0800 Subject: [PATCH 119/185] [feat] Support MRoPE + YaRN (#25384) Signed-off-by: liuye.hj Co-authored-by: liuye.hj --- .../layers/rotary_embedding/__init__.py | 22 ++++++++++--- .../layers/rotary_embedding/mrope.py | 31 +++++++++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index c9653aa9e4405..3576368981c7c 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -153,11 +153,23 @@ def get_rope( if k in ("extrapolation_factor", "attn_factor", "beta_fast", "beta_slow") } - rotary_emb = YaRNScalingRotaryEmbedding(head_size, rotary_dim, - original_max_position, - base, is_neox_style, - scaling_factor, dtype, - **extra_kwargs) + if "mrope_section" in rope_scaling: + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_scaling["mrope_section"], + mrope_interleaved=rope_scaling.get("mrope_interleaved", + False), + scaling_factor=scaling_factor, + **extra_kwargs) + else: + rotary_emb = YaRNScalingRotaryEmbedding( + head_size, rotary_dim, original_max_position, base, + is_neox_style, scaling_factor, dtype, **extra_kwargs) elif scaling_type == "deepseek_yarn": scaling_factor = rope_scaling["factor"] original_max_position = rope_scaling[ diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 17d04a1ad715c..9bf0d6bd15e74 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -12,6 +12,7 @@ from vllm.triton_utils import tl, triton from .base import RotaryEmbedding from .common import apply_rotary_emb_dispatch +from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale @triton.jit @@ -213,7 +214,27 @@ class MRotaryEmbedding(RotaryEmbedding): dtype: torch.dtype, mrope_section: 
Optional[list[int]] = None, mrope_interleaved: bool = False, + # YaRN parameters. + *, + scaling_factor: Optional[float] = None, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, ) -> None: + + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + if self.scaling_factor is not None: + # Get n-d magnitude scaling corrected for interpolation + self.mscale = float( + yarn_get_mscale(self.scaling_factor) * attn_factor) + else: + self.mscale = 1.0 + # In Qwen2.5-VL, the maximum index value is related to the duration of # the input video. We enlarge max_position_embeddings to 4 times to get # a larger the cos and sin cache. @@ -226,6 +247,16 @@ class MRotaryEmbedding(RotaryEmbedding): if self.mrope_section: assert sum(self.mrope_section) == rotary_dim // 2 + def _compute_inv_freq(self, base: float) -> torch.Tensor: + if self.scaling_factor is None: + return super()._compute_inv_freq(base) + return YaRNScalingRotaryEmbedding._compute_inv_freq(self, base) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + if self.scaling_factor is None: + return super()._compute_cos_sin_cache() + return YaRNScalingRotaryEmbedding._compute_cos_sin_cache(self) + def forward_native( self, positions: torch.Tensor, From f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 23 Sep 2025 11:09:00 +0800 Subject: [PATCH 120/185] [XPU] Fix `compile_size` is `None` case. (#25433) Signed-off-by: Kunshang Ji --- vllm/platforms/xpu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 034e039006c48..af61db5e312a4 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -113,6 +113,8 @@ class XPUPlatform(Platform): # lazy import to avoid circular import from vllm.config import CompilationLevel, CUDAGraphMode compilation_config = vllm_config.compilation_config + if compilation_config.compile_sizes is None: + compilation_config.compile_sizes = [] assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE, \ "CUDA graph mode should be NONE on XPU" From eea178398997b94b6df5560b597e2b7895b2ae3a Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 22 Sep 2025 20:21:48 -0700 Subject: [PATCH 121/185] [benchmarks]allow skip ready check for bench serve (#25420) Signed-off-by: Lu Fang Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Lucia (Lu) Fang --- vllm/benchmarks/serve.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 7382782f11655..2a042802d0d54 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -531,18 +531,22 @@ async def benchmark( extra_body=extra_body, ) - test_output = await wait_for_endpoint( - request_func, - test_input, - session, - timeout_seconds=ready_check_timeout_sec, - ) - if not test_output.success: - raise ValueError( - "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. 
Error: {test_output.error}") + if ready_check_timeout_sec > 0: + test_output = await wait_for_endpoint( + request_func, + test_input, + session, + timeout_seconds=ready_check_timeout_sec, + ) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark " + "arguments are correctly specified. " + f"Error: {test_output.error}") + else: + print("Initial test run completed. Starting main benchmark run...") else: - print("Initial test run completed. Starting main benchmark run...") + print("Skipping endpoint ready check.") if lora_modules: # For each input request, choose a LoRA module at random. @@ -1151,7 +1155,8 @@ def add_cli_args(parser: argparse.ArgumentParser): type=int, default=600, help="Maximum time to wait for the endpoint to become ready " - "in seconds (default: 600 seconds / 10 minutes).", + "in seconds (default: 600 seconds / 10 minutes). If set to 0, " + "the ready check will be skipped." ) From 78237e43bf3ac70f31b27684dc27d7eca0fa3677 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 22 Sep 2025 23:26:32 -0400 Subject: [PATCH 122/185] [Bugfix] Remove contiguous output req for context parallel MLA (#25414) Signed-off-by: Michael Goin --- vllm/attention/ops/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 189b57e8e8b82..6253e1e56b0f1 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -134,6 +134,5 @@ def cp_lse_ag_out_rs(cp_attn_out: torch.Tensor, cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) - assert out.is_contiguous() out = cp_group.reduce_scatter(out, dim=1) return out From fafbe11af4f211abe02ef874f113bc28232eb0e8 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Tue, 23 Sep 2025 11:42:58 +0800 Subject: [PATCH 123/185] [Docs] Fix griffe warnings in vllm/lora/ops (#25369) Signed-off-by: windsonsea --- vllm/lora/ops/triton_ops/lora_kernel_metadata.py | 6 +++--- vllm/lora/ops/xla_ops/lora_ops.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py index 39e647b9b88a4..e27604728ed06 100644 --- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py +++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py @@ -83,8 +83,8 @@ class LoRAKernelMeta: Prepare kernel metadata tensors for the current forward pass. Args: - token_lora_tensor (torch.Tensor): Tensor containing lora indices - for each input token. + token_lora_mapping (torch.Tensor): Tensor containing lora indices + for each input token. """ self._reset() @@ -136,7 +136,7 @@ class LoRAKernelMeta: Args: token_nums (int): Number of input tokens in the current forward - pass. + pass of the kernel. """ return ( self.token_lora_mapping[:token_nums], diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py index 9118f3351ef0a..29bfd5753a588 100644 --- a/vllm/lora/ops/xla_ops/lora_ops.py +++ b/vllm/lora/ops/xla_ops/lora_ops.py @@ -93,7 +93,6 @@ def bgmv_shrink( inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. - output_tensor (torch.Tensor): (Unused) output tensor (placeholder). lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. 
scaling (float, optional): Scalar multiplier applied to the output. From e8db44f8834161259b25ebb13300bd502d37af9f Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 23 Sep 2025 00:01:09 -0400 Subject: [PATCH 124/185] [DP/EP][GPTOSS] Use triton matmul-ogs kernels for GPTOSS DP/EP (#24588) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../model_executor/layers/fused_moe/config.py | 22 ++- .../fused_moe/deepep_ht_prepare_finalize.py | 18 ++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 176 ++++++++++++++---- vllm/model_executor/layers/fused_moe/layer.py | 82 +++++--- .../layers/fused_moe/modular_kernel.py | 2 +- .../layers/quantization/mxfp4.py | 51 +++-- 6 files changed, 275 insertions(+), 76 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index b14bc06e913cf..3052bdb4dc1ec 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -288,7 +288,11 @@ class FusedMoEQuantConfig: @property def use_mxfp4_w4a4(self) -> bool: - return self.quant_dtype == "mxfp4" + return (self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4") + + @property + def use_mxfp4_w4a16(self) -> bool: + return (self._a1.dtype is None and self._w1.dtype == "mxfp4") @property def use_nvfp4_w4a4(self) -> bool: @@ -453,6 +457,22 @@ def int8_w8a8_moe_quant_config( ) +def mxfp4_w4a16_moe_quant_config( + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> FusedMoEQuantConfig: + """ + Construct a quant config for unquantized activations and mxfp4 weights. + """ + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias), + _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias), + ) + + def mxfp4_w4a4_moe_quant_config( w1_scale: Union[torch.Tensor, "PrecisionConfig"], w2_scale: Union[torch.Tensor, "PrecisionConfig"], diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index f390f0a25875e..a250a6218715e 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -11,6 +11,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceContiguous, TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) +from vllm.utils import round_up class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): @@ -18,6 +19,23 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): Prepare/Finalize using DeepEP High-Throughput kernels. """ + @staticmethod + def maybe_roundup_layer_hidden_size(hidden_size: int, + dtype: torch.dtype) -> int: + # Round up hidden size so it is compatible with DeepEP High Throughput + # kernels. + # DeepEP intranode kernels make copies in units of, + # 32(warp-size) int4 elements. Round up hidden size to respect this. + # For example, an input hidden size of 2880 with dtype torch.bfloat16 + # will be rounded up to 3072. 
+ hidden_size_bytes = hidden_size * dtype.itemsize + xfer_atom_size = 512 # 32 * 16 (size(int4)) + if hidden_size_bytes % xfer_atom_size == 0: + return hidden_size + + hidden_size_bytes = round_up(hidden_size_bytes, xfer_atom_size) + return hidden_size_bytes // dtype.itemsize + def __init__(self, buffer: deep_ep.Buffer, num_dispatchers: int, dp_size: int, rank_expert_offset: int): super().__init__() diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index f12d3807517ff..0e84a9241e905 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -9,7 +9,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( - TopKWeightAndReduceDelegate) + TopKWeightAndReduceNoOP) +from vllm.triton_utils import tl, triton from vllm.utils import has_triton_kernels logger = init_logger(__name__) @@ -19,13 +20,55 @@ if has_triton_kernels(): import triton_kernels.swiglu from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, matmul_ogs) - from triton_kernels.routing import routing + from triton_kernels.routing import (RoutingData, routing, + routing_from_bitmatrix) + from triton_kernels.tensor import Bitmatrix except (ModuleNotFoundError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " "version is compatible. Error: %s", e) +@triton.jit +def pack_bitmatrix( + bitmatrix, + topk_ids, + n_rows, # n_rows in bitmatrix / topk_ids + bm_cols: tl.constexpr, # n int32_t bitpacks in bitmatrix + n_expts_act, # num_topk + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + """ + Packs topk_ids into a bitmatrix. + code reference: + https://github.com/triton-lang/triton/blob/dd1bbc52b34d202dfe5ffea1e04fb16166c5c04e/python/triton_kernels/bench/distributed.py#L264 + """ + pid_m = tl.program_id(0) + offsets_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offsets_k = tl.arange(0, BLOCK_SIZE_K) + offsets = offsets_m[:, None] * n_expts_act + offsets_k[None, :] + mask = (offsets_m < n_rows)[:, None] & (offsets_k < n_expts_act)[None, :] + indices = tl.load(topk_ids + offsets, mask=mask, other=-1) + div = indices // 32 + rem = indices % 32 + one = tl.cast(1, tl.uint32) + + # Iterate through all the relevant bitmatrix columns. + for i in range(bm_cols): + # When BLOCK_SIZE_K=32, offs is just the column index. + offs = tl.arange(0, BLOCK_SIZE_K // 32) + i * (BLOCK_SIZE_K // 32) + # All topks that need to go into this column has the correct bit set. + # Other bits are 0. x is a 2D tensor. + x = tl.where(div[:, :, None] == offs[None, None, :], + (one << rem)[:, :, None], 0) + # Reduce x to get a single int32_t bitpack. 
+ y = tl.reduce_or(x, axis=1) + bitmatrix_ptrs = bitmatrix + offsets_m[:, + None] * bm_cols + offs[None, :] + tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows) + + def triton_kernel_moe_forward( hidden_states: torch.Tensor, w1, # Tensor or triton_kernels.Tensor @@ -124,34 +167,88 @@ def triton_kernel_fused_experts( return intermediate_cache3 -class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): +def make_routing_data( + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + num_local_experts: int, +) -> tuple["RoutingData", torch.Tensor, torch.Tensor]: - def __init__( - self, - max_num_tokens: int, - num_dispatchers: int, - quant_config: FusedMoEQuantConfig, - ): + topk_ids = topk_ids.to(torch.int16) + topk_weights = topk_weights.to(torch.bfloat16) + + n_rows, num_topk = topk_ids.size() + + BLOCK_SIZE_M = 512 + BLOCK_SIZE_K = 32 + + bm_cols = triton.cdiv(num_local_experts, BLOCK_SIZE_K) # n_bitpacks + bitmatrix = torch.zeros((n_rows, bm_cols), + dtype=torch.uint32, + device=topk_ids.device) + + grid = (triton.cdiv(n_rows, BLOCK_SIZE_M), ) + pack_bitmatrix[grid]( + bitmatrix, + topk_ids, + n_rows, + bm_cols, + num_topk, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_K=BLOCK_SIZE_K, + ) + + bitmatrix_shape = [n_rows, bm_cols * 32] + bitmatrix_shape_max = [n_rows, None] + bitmatrix = Bitmatrix(bitmatrix, + shape=bitmatrix_shape, + shape_max=bitmatrix_shape_max, + scratchpad=None) + + # matmul_ogs expects invalid topk_weights to be -1s + topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights) + routing_data, gather_indx, scatter_indx = routing_from_bitmatrix( + bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk) + + return routing_data, gather_indx, scatter_indx + + +class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self, quant_config: FusedMoEQuantConfig): + super().__init__(quant_config) + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Weight application and reduction happens in the fused_experts kernel. + return TopKWeightAndReduceNoOP() + + def _make_routing_data( + self, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + num_local_experts: int, + ) -> tuple["RoutingData", torch.Tensor, torch.Tensor]: + return make_routing_data(topk_ids, topk_weights, num_local_experts) + + +class OAITritonExperts(BaseOAITritonExperts): + + def __init__(self, quant_config: FusedMoEQuantConfig): + # TODO (varun) : Enable activation quantization + assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16" super().__init__(quant_config) - self.max_num_tokens = max_num_tokens - self.num_dispatchers = num_dispatchers @property def activation_formats( self ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: - return (mk.FusedMoEActivationFormat.BatchedExperts, - mk.FusedMoEActivationFormat.BatchedExperts) + return (mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard) def supports_chunking(self) -> bool: - return False - - def supports_expert_map(self) -> bool: - return False - - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let PrepareAndFinalize::finalize() decide the impl. 
- return TopKWeightAndReduceDelegate() + return True def workspace_shapes( self, a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int, @@ -159,13 +256,10 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): expert_tokens_meta: Optional[mk.ExpertTokensMetadata] ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: # workspace are allocated inside the kernel - assert a.dim() == 2 - num_dp = self.num_dispatchers - num_experts = local_num_experts - max_num_tokens = self.max_num_tokens - workspace2 = (0, 0, 0) - output = (num_experts, max_num_tokens * num_dp, N) - return (output, workspace2, output, a.dtype) + workspace1 = (M, K) + workspace2 = (0, 0) + output = (M, K) + return (workspace1, workspace2, output, a.dtype) def apply( self, @@ -185,17 +279,29 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): - return triton_kernel_fused_experts( - output, + if expert_map is not None: + topk_ids = expert_map[topk_ids] + + local_num_experts = w1.size(0) + if global_num_experts == -1: + global_num_experts = local_num_experts + + routing_data, gather_indx, scatter_indx = self._make_routing_data( + topk_ids, topk_weights, local_num_experts) + + experts_output = triton_kernel_fused_experts( + None, hidden_states, w1, w2, - routing_data=None, - gather_indx=None, - scatter_indx=None, + routing_data, + gather_indx, + scatter_indx, activation=activation, quant_config=self.quant_config, apply_router_weight_on_input=False, - global_num_experts=global_num_experts, - expert_map=expert_map, + global_num_experts=local_num_experts, + expert_map=None, # applied already a1q_scale=a1q_scale) + + output.copy_(experts_output, non_blocking=True) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 17ad75584a3f6..1f80e972b7f06 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -800,6 +800,49 @@ def get_compressed_expert_map(expert_map: torch.Tensor) -> str: for local_index, global_index in zip(local_indices, global_indices)) +def maybe_roundup_hidden_size( + hidden_size: int, act_dtype: torch.dtype, + quant_config: Optional[QuantizationConfig], + moe_parallel_config: FusedMoEParallelConfig) -> int: + """ + Given layer hidden size and MoE configurations, round up hidden_size + if necessary. + + Args: + hidden_size(int): Layer hidden-size + act_dtype: Data type of the layer activations. + quant_config(FusedMoEQuantConfig): Fused MoE quantization configuration. + moe_parallel_config(FusedMoEParallelConfig): Fused MoE parallelization + strategy configuration. + + Return: + Rounded up hidden_size if rounding up is required based on the configs. + Original hidden size otherwise. 
+ """ + + if (moe_parallel_config.use_deepep_ht_kernels): + hidden_size = ( + DeepEPHTPrepareAndFinalize.maybe_roundup_layer_hidden_size( + hidden_size, act_dtype)) + + # we are padding globally so EP buffer allocation works + if quant_config and quant_config.get_name() == "mxfp4": + + from vllm.model_executor.layers.quantization.mxfp4 import ( + Mxfp4Backend, get_mxfp4_backend) + current_mxfp4_backend = get_mxfp4_backend() + if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + or current_mxfp4_backend + == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS): + hidden_size = round_up(hidden_size, 128) + elif (current_platform.is_rocm() or current_mxfp4_backend + == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): + hidden_size = round_up(hidden_size, 256) + + return hidden_size + + @CustomOp.register("fused_moe") class FusedMoE(CustomOp): """FusedMoE layer for MoE models. @@ -856,6 +899,18 @@ class FusedMoE(CustomOp): params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype + vllm_config = get_current_vllm_config() + + # FIXME (varun): We should have a better way of inferring the activation + # datatype. This works for now as the tensor datatype entering the MoE + # operation is typically unquantized (i.e. float16/bfloat16). + if vllm_config.model_config is not None: + moe_in_dtype = vllm_config.model_config.dtype + else: + # TODO (bnell): This is a hack to get test_mixtral_moe to work + # since model_config is not set in the pytest test. + moe_in_dtype = params_dtype + tp_size_ = (tp_size if tp_size is not None else get_tensor_model_parallel_world_size()) dp_size_ = (dp_size @@ -865,7 +920,6 @@ class FusedMoE(CustomOp): if self.is_sequence_parallel: self.sp_size = tp_size_ - vllm_config = get_current_vllm_config() self.moe_parallel_config: FusedMoEParallelConfig = ( FusedMoEParallelConfig.make( tp_size_=tp_size_, @@ -874,19 +928,10 @@ class FusedMoE(CustomOp): self.global_num_experts = num_experts + num_redundant_experts - # we are padding globally so EP buffer allocation works - if quant_config and quant_config.get_name() == "mxfp4": - from vllm.model_executor.layers.quantization.mxfp4 import ( - Mxfp4Backend, get_mxfp4_backend) - current_mxfp4_backend = get_mxfp4_backend() - if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 - or current_mxfp4_backend - == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS): - hidden_size = round_up(hidden_size, 128) - elif (current_platform.is_rocm() or current_mxfp4_backend - == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or - current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): - hidden_size = round_up(hidden_size, 256) + # Round up hidden size if needed. + hidden_size = maybe_roundup_hidden_size(hidden_size, moe_in_dtype, + quant_config, + self.moe_parallel_config) # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config @@ -967,20 +1012,13 @@ class FusedMoE(CustomOp): raise ValueError("Only softmax scoring function is supported for " "non-grouped topk.") - if vllm_config.model_config is not None: - model_dtype = vllm_config.model_config.dtype - else: - # TODO (bnell): This is a hack to get test_mixtral_moe to work - # since model_config is not set in the pytest test. 
- model_dtype = params_dtype - moe = FusedMoEConfig( num_experts=self.global_num_experts, experts_per_token=top_k, hidden_dim=hidden_size, num_local_experts=self.local_num_experts, moe_parallel_config=self.moe_parallel_config, - in_dtype=model_dtype, + in_dtype=moe_in_dtype, max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, has_bias=has_bias, ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index a16c254fadf66..5fce24018e647 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -76,7 +76,7 @@ def _moe_problem_size( """ assert w1.dim() == 3 and w2.dim() == 3 E, N, _ = w1.size() - K = w2.size(1) + K = a1.size(-1) if a1.dim() == 2: # Make sure we are using the correct a1 (pre-permute). diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5c3f8a891276b..a71c8d32a22c7 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -13,7 +13,10 @@ from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.fused_moe import modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEQuantConfig, mxfp4_w4a4_moe_quant_config) + FusedMoEQuantConfig, mxfp4_w4a4_moe_quant_config, + mxfp4_w4a16_moe_quant_config) +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + OAITritonExperts) from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) @@ -578,9 +581,14 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer.w13_bias = Parameter(w13_bias, requires_grad=False) layer.w2_bias = Parameter(w2_bias, requires_grad=False) - # FIXME warp need to be adjusted based on batch size - # only apply to batched mode - if self.moe.use_ep: + # Ideally we'd use FusedMoEModularKernel.prepare_finalize object + # (stored in self.fused_experts) to determine if the MoE has a + # batched activation format. As self.fused_experts is not + # initialized at this point, we resort to checking the MoE config + # directly. 
+ is_batched_moe = (self.moe.use_pplx_kernels + or self.moe.use_deepep_ll_kernels) + if is_batched_moe: num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8 else: num_warps = 8 @@ -640,16 +648,21 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): if self.mxfp4_backend == Mxfp4Backend.TRITON: w1_scale = self.w13_precision_config w2_scale = self.w2_precision_config + return mxfp4_w4a16_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) else: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale - - return mxfp4_w4a4_moe_quant_config( - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_scale=w1_scale, - w2_scale=w2_scale, - ) + return mxfp4_w4a4_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) def select_gemm_impl( self, @@ -661,6 +674,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): raise NotImplementedError( "Mxfp4 does not support batched experts format for EP") else: + assert self.moe_quant_config is not None if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): # B200 code-path @@ -671,13 +685,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): # TODO(bnell): part of quant_config "max_capture_size": self.max_capture_size, } - assert self.moe_quant_config is not None return TrtLlmGenExperts(self.moe, self.moe_quant_config, **kwargs) else: - # Use matmul_ogs from triton_kernels here! - raise NotImplementedError( - "Mxfp4 does not support non-batched experts format for EP") + return OAITritonExperts(self.moe_quant_config) def _route_and_experts( self, @@ -722,10 +733,16 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): logical_to_physical_map=logical_to_physical_map, logical_replica_count=logical_replica_count) + w13_weight = (self.w13_weight_triton_tensor + if layer.w13_weight is None else layer.w13_weight) + w2_weight = (self.w2_weight_triton_tensor + if layer.w2_weight is None else layer.w2_weight) + assert all([w is not None for w in [w13_weight, w2_weight]]) + return self.fused_experts( hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, + w1=w13_weight, + w2=w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, From 5774b0a1da93492e60cc446e6e783a43da89e105 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Mon, 22 Sep 2025 23:17:42 -0500 Subject: [PATCH 125/185] [NIXL][OOT platform] support nixl_connector with oot platform and other nixl_backend (#25121) Signed-off-by: Chendi Xue --- docs/features/disagg_prefill.md | 6 +++ docs/serving/expert_parallel_deployment.md | 2 +- .../kv_connector/unit/test_nixl_connector.py | 52 ++++++++++++++++++- .../kv_connector/v1/nixl_connector.py | 33 +++++++++--- vllm/platforms/interface.py | 15 ++++++ 5 files changed, 99 insertions(+), 9 deletions(-) diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index 69f70b8ff5ac3..cb62213cc7af0 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -31,6 +31,12 @@ Now supports 5 types of connectors: --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}' ``` +For NixlConnector, you may also specify one or multiple NIXL_Backend. 
Such as: + + ```bash + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_buffer_device":"cuda", "kv_connector_extra_config":{"backend":["UCX", "GDS"]}' + ``` + - **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker): ```bash diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index 7489fc2609831..f823d33df80ea 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -193,7 +193,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok 1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip. -2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}` +2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`. Noted, you may also specify one or multiple NIXL_Backend. Such as: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backend":["UCX", "GDS"]}'` 3. **Client Orchestration**: Use the client-side script below to coordinate prefill/decode operations. We are actively working on routing solutions. diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 6e58d158c3f4b..fa698a2eabd91 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -27,6 +27,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, NixlConnectorWorker, NixlKVConnectorStats) from vllm.forward_context import ForwardContext +from vllm.platforms.interface import Platform from vllm.sampling_params import SamplingParams from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput @@ -56,7 +57,7 @@ class FakeNixlWrapper: def get_reg_descs(self, caches_data, memory_type: str) -> list: return [str(uuid.uuid4()) for _ in caches_data] - def register_memory(self, descs) -> None: + def register_memory(self, descs, backends) -> None: pass def get_xfer_descs(self, blocks_data, memory_type: str) -> list: @@ -855,3 +856,52 @@ def test_register_kv_caches(dist_init): assert block_len == expected_block_len, \ f"Block entry {i}: Expected block len {expected_block_len}, " \ f"got {block_len}" + + +class FakePlatform(Platform): + device_type: str = "oot" + + @classmethod + def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]: + """ + Returns a mapping from device_type to a tuple of supported + kv_buffer_device for nixl. + """ + return {'oot': ('oot', )} + + @classmethod + def get_nixl_memory_type(cls) -> Optional[str]: + """ + Returns the nixl memory type for the current platform. 
+ """ + return 'VRAM' + + +@pytest.mark.parametrize("kv_buffer_device, nixl_memory_type", [ + ("oot", "VRAM"), +]) +def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device, + nixl_memory_type): + """ + Test that register_kv_caches() passes the correct memory types from the + config to the nixl_wrapper. + """ + vllm_config = create_vllm_config() + # Override the default memory types in the config + vllm_config.kv_transfer_config.kv_buffer_device = kv_buffer_device + from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( + _NIXL_SUPPORTED_DEVICE) + _NIXL_SUPPORTED_DEVICE.update(FakePlatform.get_nixl_supported_devices()) + + with patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper"), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.current_platform", FakePlatform), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector._NIXL_SUPPORTED_DEVICE", _NIXL_SUPPORTED_DEVICE): # noqa: E501 + + # Create connector and replace its worker with a fake one for isolation + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + + # Verify get_reg_descs was called with the correct memory_type + assert connector.connector_worker.kv_buffer_device == kv_buffer_device + assert connector.connector_worker.nixl_memory_type == nixl_memory_type diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index d3a08af088c11..82b483447e33d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -58,6 +58,12 @@ except ImportError: logger.warning("NIXL is not available") NixlWrapper = None +try: + from nixl._api import nixl_agent_config +except ImportError: + nixl_agent_config = None + logger.warning("NIXL agent config is not available") + # Supported platforms and types of kv transfer buffer. # {device: tuple of supported kv buffer types} _NIXL_SUPPORTED_DEVICE = { @@ -65,6 +71,8 @@ _NIXL_SUPPORTED_DEVICE = { "tpu": ("cpu", ), "xpu": ("cpu", ), } +# support for oot platform by providing mapping in current_platform +_NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices()) class NixlAgentMetadata( @@ -448,8 +456,15 @@ class NixlConnectorWorker: self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size + self.nixl_backends = \ + vllm_config.kv_transfer_config.get_from_extra_config( + "backends", ["UCX"]) # Agent. - self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None) + non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"] + config = nixl_agent_config(backends=self.nixl_backends) if len( + non_ucx_backends) > 0 and nixl_agent_config is not None else None + + self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config) # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}. 
self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict) @@ -486,11 +501,15 @@ class NixlConnectorWorker: # used when device memory can not be registered under nixl self.host_xfer_buffers: dict[str, torch.Tensor] = {} self.use_host_buffer = self.kv_buffer_device == "cpu" - if self.kv_buffer_device == "cuda": - self.nixl_memory_type = "VRAM" - elif self.kv_buffer_device == "cpu": - self.nixl_memory_type = "DRAM" - else: + # support for oot platform which can't register nixl memory + # type based on kv_buffer_device + self.nixl_memory_type = current_platform.get_nixl_memory_type() + if self.nixl_memory_type is None: + if self.kv_buffer_device == "cuda": + self.nixl_memory_type = "VRAM" + elif self.kv_buffer_device == "cpu": + self.nixl_memory_type = "DRAM" + if self.nixl_memory_type is None: raise RuntimeError( f"{self.device_type} with {self.kv_buffer_device} kv_buffer " "is not supported.") @@ -766,7 +785,7 @@ class NixlConnectorWorker: descs = self.nixl_wrapper.get_reg_descs(caches_data, self.nixl_memory_type) logger.debug("Registering descs: %s", caches_data) - self.nixl_wrapper.register_memory(descs) + self.nixl_wrapper.register_memory(descs, backends=self.nixl_backends) logger.debug("Done registering descs") self._registered_descs.append(descs) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 3f13ae72fe4d7..7dd935d2eb31c 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -604,6 +604,21 @@ class Platform: return _synced_weight_loader + @classmethod + def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]: + """ + Returns a mapping from device_type to a tuple of supported + kv_buffer_device for nixl. + """ + return {} + + @classmethod + def get_nixl_memory_type(cls) -> Optional[str]: + """ + Returns the nixl memory type for the current platform. 
+ """ + return None + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED From c98be0a232764fb68353e2c9e26d3495a979044d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 23 Sep 2025 13:17:10 +0800 Subject: [PATCH 126/185] [Model] Enable DP for ViT in Qwen2-VL (#25445) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/qwen2_vl.py | 78 +++++++++++++++++++------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b3c42c2572566..88813490c0fbc 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -66,6 +66,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -217,17 +218,20 @@ class Qwen2VisionMLP(nn.Module): act_layer: type[nn.Module] = QuickGELU, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.fc1 = ColumnParallelLinear(in_features, hidden_features, quant_config=quant_config, - prefix=f"{prefix}.fc1") + prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel) self.act = act_layer() self.fc2 = RowParallelLinear(hidden_features, in_features, quant_config=quant_config, - prefix=f"{prefix}.fc2") + prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel) def forward(self, x: torch.Tensor) -> torch.Tensor: x_parallel, _ = self.fc1(x) @@ -293,25 +297,28 @@ class Qwen2VisionAttention(nn.Module): projection_size: int, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() # Per attention head and per partition values. - world_size = parallel_state.get_tensor_model_parallel_world_size() - self.tp_size = world_size + self.tp_size = (1 if use_data_parallel else + parallel_state.get_tensor_model_parallel_world_size()) self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( - num_heads, world_size) + num_heads, self.tp_size) self.qkv = ColumnParallelLinear(input_size=embed_dim, output_size=3 * projection_size, quant_config=quant_config, - prefix=f"{prefix}.qkv") + prefix=f"{prefix}.qkv", + disable_tp=use_data_parallel) self.proj = RowParallelLinear(input_size=projection_size, output_size=embed_dim, quant_config=quant_config, - prefix=f"{prefix}.proj") + prefix=f"{prefix}.proj", + disable_tp=use_data_parallel) # Detect attention implementation. 
self.attn_backend = get_vit_attn_backend( @@ -453,6 +460,7 @@ class Qwen2VisionBlock(nn.Module): norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() if norm_layer is None: @@ -465,12 +473,14 @@ class Qwen2VisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel) self.mlp = Qwen2VisionMLP(dim, mlp_hidden_dim, act_layer=act_layer, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) def forward( self, @@ -531,6 +541,7 @@ class Qwen2VisionPatchMerger(nn.Module): spatial_merge_size: int = 2, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.hidden_size = context_dim * (spatial_merge_size**2) @@ -542,13 +553,15 @@ class Qwen2VisionPatchMerger(nn.Module): self.hidden_size, bias=True, quant_config=quant_config, - prefix=f"{prefix}.mlp.0"), + prefix=f"{prefix}.mlp.0", + disable_tp=use_data_parallel), nn.GELU(), RowParallelLinear(self.hidden_size, d_model, bias=True, quant_config=quant_config, - prefix=f"{prefix}.mlp.2"), + prefix=f"{prefix}.mlp.2", + disable_tp=use_data_parallel), ]) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -600,6 +613,7 @@ class Qwen2VisionTransformer(nn.Module): norm_eps: float = 1e-6, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() @@ -613,6 +627,9 @@ class Qwen2VisionTransformer(nn.Module): num_heads = vision_config.num_heads mlp_ratio = vision_config.mlp_ratio + self.use_data_parallel = use_data_parallel + self.out_hidden_size = vision_config.hidden_size + self.spatial_merge_size = spatial_merge_size self.num_heads = num_heads self.embed_dim = embed_dim @@ -634,7 +651,8 @@ class Qwen2VisionTransformer(nn.Module): mlp_ratio=mlp_ratio, norm_layer=norm_layer, quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") + prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(depth) ]) self.merger = Qwen2VisionPatchMerger( @@ -643,6 +661,7 @@ class Qwen2VisionTransformer(nn.Module): norm_layer=norm_layer, quant_config=quant_config, prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype()) @@ -659,8 +678,9 @@ class Qwen2VisionTransformer(nn.Module): def device(self) -> torch.device: return self.patch_embed.proj.weight.device - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: pos_ids = [] + max_grid_size = 0 for t, h, w in grid_thw: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) @@ -678,8 +698,8 @@ class Qwen2VisionTransformer(nn.Module): ).permute(0, 2, 1, 3).flatten() pos_ids.append( torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + max_grid_size = max(max_grid_size, h, w) pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -698,7 +718,7 @@ class Qwen2VisionTransformer(nn.Module): def forward( self, x: 
torch.Tensor, - grid_thw: torch.Tensor, + grid_thw: list[list[int]], ) -> torch.Tensor: # patchify x = x.to(device=self.device, dtype=self.dtype) @@ -708,8 +728,9 @@ class Qwen2VisionTransformer(nn.Module): rotary_pos_emb = self.rot_pos_emb(grid_thw) # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, 0]).cumsum( + grid_thw_ = torch.tensor(grid_thw) + cu_seqlens = torch.repeat_interleave(grid_thw_[:, 1] * grid_thw_[:, 2], + grid_thw_[:, 0]).cumsum( dim=0, dtype=torch.int32) cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) @@ -1112,6 +1133,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + supports_encoder_tp_data = True + def get_mrope_input_positions( self, input_tokens: list[int], @@ -1239,6 +1262,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.config = config self.multimodal_config = multimodal_config @@ -1249,6 +1273,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, ) else: self.visual = None @@ -1357,7 +1382,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = image_input["image_embeds"] else: pixel_values = image_input["pixel_values"] - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size @@ -1377,7 +1410,14 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, video_embeds = video_input["video_embeds"] else: pixel_values_videos = video_input["pixel_values_videos"] - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d") + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw_list) # Split concatenated embeddings for each video item. 
merge_size = self.visual.spatial_merge_size From ba8d2165b655793ff24cf074d041e0bddfc5a0cc Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Tue, 23 Sep 2025 00:56:00 -0700 Subject: [PATCH 127/185] Handle triton kernel import exception (#25319) Signed-off-by: Ming Yang --- vllm/model_executor/layers/fused_moe/config.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 3052bdb4dc1ec..34bfe1c16aac7 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -14,11 +14,16 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.utils import cdiv, has_triton_kernels from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe -if has_triton_kernels(): - from triton_kernels.matmul_ogs import PrecisionConfig - logger = init_logger(__name__) +if has_triton_kernels(): + try: + from triton_kernels.matmul_ogs import PrecisionConfig + except ImportError: + logger.error( + "Failed to import Triton kernels. Please make sure your triton " + "version is compatible.") + def _get_config_dtype_str( dtype: torch.dtype, From 9383cd6f10c55875de7f932e1bf7b0c43f01fc67 Mon Sep 17 00:00:00 2001 From: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com> Date: Tue, 23 Sep 2025 16:07:27 +0800 Subject: [PATCH 128/185] [Frontend] Add a new xml-based tool parser for qwen3-coder (#25028) Signed-off-by: Zhikaiiii <1658973216@qq.com> --- docs/features/tool_calling.md | 9 + tests/tool_use/test_qwen3coder_tool_parser.py | 112 +- .../openai/tool_parsers/__init__.py | 2 + .../tool_parsers/qwen3xml_tool_parser.py | 1137 +++++++++++++++++ 4 files changed, 1238 insertions(+), 22 deletions(-) create mode 100644 vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 2a48596571d1d..291c313cd57af 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -319,6 +319,15 @@ Supported models: Flags: `--tool-call-parser glm45` +### Qwen3-Coder Models (`qwen3_xml`) + +Supported models: + +* `Qwen/Qwen3-480B-A35B-Instruct` +* `Qwen/Qwen3-Coder-30B-A3B-Instruct` + +Flags: `--tool-call-parser qwen3_xml` + ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. 
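To make the pythonic format described above concrete, here is a minimal, hedged sketch (not the actual `PythonicToolParser` implementation, and the exact output format is model-dependent) of how a Python-list style tool call, using the `get_current_weather` example that appears elsewhere in this series, could be decomposed into a function name and keyword arguments:

```python
import ast

# Hypothetical pythonic-style model output: a Python list literal instead of JSON.
model_output = '[get_current_weather(city="Dallas", state="TX", unit="fahrenheit")]'

# Minimal sketch: walk the parsed list of Call nodes and pull out (name, kwargs).
calls = ast.parse(model_output, mode="eval").body  # ast.List of ast.Call nodes
for call in calls.elts:
    name = call.func.id
    kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    print(name, kwargs)
    # -> get_current_weather {'city': 'Dallas', 'state': 'TX', 'unit': 'fahrenheit'}
```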
diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index f06fb2b9f2f04..57eaf84d36f23 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -13,6 +13,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ToolCall) from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( Qwen3CoderToolParser) +from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import ( + Qwen3XMLToolParser) from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer @@ -29,6 +31,21 @@ def qwen3_tool_parser(qwen3_tokenizer): return Qwen3CoderToolParser(qwen3_tokenizer) +@pytest.fixture +def qwen3_xml_tool_parser(qwen3_tokenizer): + return Qwen3XMLToolParser(qwen3_tokenizer) + + +@pytest.fixture(params=["original", "xml"]) +def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, + request): + """Parameterized fixture that provides both parser types for testing""" + if request.param == "original": + return qwen3_tool_parser + else: + return qwen3_xml_tool_parser + + @pytest.fixture def sample_tools(): return [ @@ -95,7 +112,7 @@ def assert_tool_calls(actual_tool_calls: list[ToolCall], def stream_delta_message_generator( - qwen3_tool_parser: Qwen3CoderToolParser, + qwen3_tool_parser, qwen3_tokenizer: AnyTokenizer, model_output: str, request: Optional[ChatCompletionRequest] = None @@ -144,9 +161,9 @@ def stream_delta_message_generator( read_offset = new_read_offset -def test_extract_tool_calls_no_tools(qwen3_tool_parser): +def test_extract_tool_calls_no_tools(qwen3_tool_parser_parametrized): model_output = "This is a test response without any tool calls" - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=None) # type: ignore[arg-type] assert not extracted_tool_calls.tools_called assert extracted_tool_calls.tool_calls == [] @@ -294,12 +311,13 @@ circle ], "Let me calculate that area for you."), ], ) -def test_extract_tool_calls(qwen3_tool_parser, sample_tools, model_output, - expected_tool_calls, expected_content): +def test_extract_tool_calls(qwen3_tool_parser_parametrized, sample_tools, + model_output, expected_tool_calls, + expected_content): request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) assert extracted_tool_calls.tools_called @@ -308,7 +326,8 @@ def test_extract_tool_calls(qwen3_tool_parser, sample_tools, model_output, assert extracted_tool_calls.content == expected_content -def test_extract_tool_calls_fallback_no_tags(qwen3_tool_parser, sample_tools): +def test_extract_tool_calls_fallback_no_tags(qwen3_tool_parser_parametrized, + sample_tools): """Test fallback parsing when XML tags are missing""" model_output = ''' @@ -322,7 +341,7 @@ TX request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) assert extracted_tool_calls.tools_called @@ -331,7 +350,7 @@ TX "get_current_weather") -def test_extract_tool_calls_type_conversion(qwen3_tool_parser): +def 
test_extract_tool_calls_type_conversion(qwen3_tool_parser_parametrized): """Test parameter type conversion based on tool schema""" tools = [ ChatCompletionToolsParam(type="function", @@ -381,7 +400,7 @@ hello world ''' request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) @@ -536,9 +555,10 @@ circle ], "Let me calculate that area for you."), ], ) -def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, - sample_tools, model_output, - expected_tool_calls, expected_content): +def test_extract_tool_calls_streaming(qwen3_tool_parser_parametrized, + qwen3_tokenizer, sample_tools, + model_output, expected_tool_calls, + expected_content): """Test incremental streaming behavior including typed parameters""" request = ChatCompletionRequest(model=MODEL, messages=[], @@ -548,7 +568,8 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, tool_states = {} # Track state per tool index for delta_message in stream_delta_message_generator( - qwen3_tool_parser, qwen3_tokenizer, model_output, request): + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, + request): # role should never be streamed from tool parser assert not delta_message.role @@ -609,7 +630,7 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, def test_extract_tool_calls_missing_closing_parameter_tag( - qwen3_tool_parser, sample_tools): + qwen3_tool_parser_parametrized, sample_tools): """Test handling of missing closing tag""" # Using get_current_weather from sample_tools but with malformed XML model_output = '''Let me check the weather for you: @@ -629,7 +650,7 @@ fahrenheit request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) # The parser should handle the malformed XML gracefully @@ -652,7 +673,7 @@ fahrenheit def test_extract_tool_calls_streaming_missing_closing_tag( - qwen3_tool_parser, qwen3_tokenizer, sample_tools): + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools): """Test streaming with missing closing tag""" # Using get_current_weather from sample_tools but with malformed XML model_output = '''Let me check the weather for you: @@ -677,7 +698,8 @@ fahrenheit tool_states = {} for delta_message in stream_delta_message_generator( - qwen3_tool_parser, qwen3_tokenizer, model_output, request): + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, + request): if delta_message.content: other_content += delta_message.content @@ -727,9 +749,8 @@ fahrenheit assert args["unit"] == "fahrenheit" -def test_extract_tool_calls_streaming_incremental(qwen3_tool_parser, - qwen3_tokenizer, - sample_tools): +def test_extract_tool_calls_streaming_incremental( + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools): """Test that streaming is truly incremental""" model_output = '''I'll check the weather. 
@@ -748,7 +769,8 @@ TX chunks = [] for delta_message in stream_delta_message_generator( - qwen3_tool_parser, qwen3_tokenizer, model_output, request): + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, + request): chunks.append(delta_message) # Should have multiple chunks @@ -784,3 +806,49 @@ TX parsed_args = json.loads(full_args) assert parsed_args["city"] == "Dallas" assert parsed_args["state"] == "TX" + + +def test_extract_tool_calls_complex_type_with_single_quote( + qwen3_tool_parser_parametrized): + """Test parameter type conversion based on tool schema""" + tools = [ + ChatCompletionToolsParam(type="function", + function={ + "name": "test_types", + "parameters": { + "type": "object", + "properties": { + "int_param": { + "type": "integer" + }, + "float_param": { + "type": "float" + }, + "bool_param": { + "type": "boolean" + }, + "str_param": { + "type": "string" + }, + "obj_param": { + "type": "object" + } + } + } + }) + ] + + model_output = ''' + + +{'key': 'value'} + + +''' + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( + model_output, request=request) + + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert args["obj_param"] == {"key": "value"} diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 35096b0461361..5e77c406b8d92 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -20,6 +20,7 @@ from .openai_tool_parser import OpenAIToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser from .pythonic_tool_parser import PythonicToolParser from .qwen3coder_tool_parser import Qwen3CoderToolParser +from .qwen3xml_tool_parser import Qwen3XMLToolParser from .seed_oss_tool_parser import SeedOssToolParser from .step3_tool_parser import Step3ToolParser from .xlam_tool_parser import xLAMToolParser @@ -45,6 +46,7 @@ __all__ = [ "HunyuanA13BToolParser", "Glm4MoeModelToolParser", "Qwen3CoderToolParser", + "Qwen3XMLToolParser", "SeedOssToolParser", "Step3ToolParser", "OpenAIToolParser", diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py new file mode 100644 index 0000000000000..4ab67dfea104c --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -0,0 +1,1137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import json +import uuid +from collections.abc import Sequence +from typing import Any, Optional, Union +from xml.parsers.expat import ParserCreate + +import regex as re + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + ChatCompletionToolsParam, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class StreamingXMLToolCallParser: + """ + Simplified streaming XML tool call parser + Supports streaming input, parsing, and output + """ + + def __init__(self): + self.reset_streaming_state() + + # Tool configuration information + self.tools: Union[list[ChatCompletionToolsParam], None] = None + 
self.tool_call_start_token: str = '' + self.tool_call_end_token: str = '' + self.function_start_token: str = ' DeltaMessage: + """ + Parse single streaming XML chunk and return Delta response + This is the actual streaming interface that receives chunks + one by one and maintains internal state + + Args: + xml_chunk: Single XML chunk string + Returns: + DeltaMessage: Contains delta information generated by this chunk, + returns empty response if no complete elements + """ + # Record delta count before processing + initial_delta_count = len(self.deltas) + + self.streaming_buffer += xml_chunk + + found_elements = self._process_complete_xml_elements() + + if found_elements: + # If complete elements found, check if end events were missed + # some tags may not have been triggered + try: + new_deltas = self.deltas[initial_delta_count:] + # If this chunk contains + # but didn't generate '}', then complete it + if (self.current_call_id is not None + and self.function_end_token in xml_chunk): + + # - Added '}' (non-empty parameter ending) + # - Added '{}' (empty parameter function) + has_function_close = any((td.tool_calls and any( + (tc.function and tc.id == self.current_call_id + and isinstance(tc.function.arguments, str) and + (tc.function.arguments in ('}', '{}'))) + for tc in td.tool_calls)) for td in new_deltas) + if not has_function_close: + # Close potentially unclosed element + if self.current_param_name: + self._end_element('parameter') + if self.current_function_name: + self._end_element('function') + # If this chunk contains + # but didn't generate final empty delta, then complete it + if (self.current_call_id is not None + and self.tool_call_end_token in xml_chunk): + has_toolcall_close = any((td.tool_calls and any( + (tc.type == 'function' and tc.function and tc.function. 
+ arguments == '' and tc.id == self.current_call_id) + for tc in td.tool_calls)) for td in new_deltas) + if not has_toolcall_close: + # Close potentially unclosed element + if self.current_param_name: + self._end_element('parameter') + if self.current_function_name: + self._end_element('function') + self._end_element('tool_call') + except Exception as e: + logger.warning("Error with fallback parsing: %s", e) + # Merge newly generated deltas into single response + result_delta = self._merge_new_deltas_to_single_response( + initial_delta_count) + return result_delta + else: + # No complete elements, check if there's unoutput text content + if self.text_content_buffer and self.tool_call_index == 0: + # Has text content but no tool_call yet, output text content + text_delta = DeltaMessage(content=self.text_content_buffer) + self._emit_delta(text_delta) + # Clear buffer to avoid duplicate output + self.text_content_buffer = '' + return text_delta + + # If this chunk contains end tags but wasn't triggered by parser, + # manually complete end events + # Only execute when still on the same call as when entered, + # to prevent accidentally closing new calls + # in multi scenarios + if (self.current_call_id is not None + and (self.function_end_token in xml_chunk + or self.tool_call_end_token in xml_chunk)): + # Close potentially unclosed element + if self.current_param_name: + self._end_element('parameter') + if self.function_end_token in xml_chunk and \ + self.current_function_name: + self._end_element('function') + if self.tool_call_end_token in xml_chunk: + self._end_element('tool_call') + # Return the merged delta result generated by this fallback + result_delta = self._merge_new_deltas_to_single_response( + initial_delta_count) + return result_delta + + # No complete elements, return empty response + return DeltaMessage(content=None) + + def _escape_xml_special_chars(self, text: str) -> str: + """ + Escape XML special characters + Args: + text: Original text + Returns: + Escaped text + """ + xml_escapes = { + '&': '&', + '<': '<', + '>': '>', + '"': '"', + "'": ''' + } + + for char, escape in xml_escapes.items(): + text = text.replace(char, escape) + + return text + + def _process_complete_xml_elements(self) -> bool: + """ + Process complete XML elements in buffer + + Returns: + bool: Whether complete elements were found and processed + """ + found_any = False + + while self.last_processed_pos < len(self.streaming_buffer): + # Find next complete xml element + element, end_pos = self._find_next_complete_element( + self.last_processed_pos) + if element is None: + # No complete element found, wait for more data + break + + # Check if this element should be skipped + if self._should_skip_element(element): + self.last_processed_pos = end_pos + continue + + # Found complete XML element, process it + try: + preprocessed_element = self._preprocess_xml_chunk(element) + # Check if this is the first tool_call start + if ((preprocessed_element.strip().startswith('') or + preprocessed_element.strip().startswith('') + and self.tool_call_index > 0 and self.current_call_id): + # Reset parser state but preserve generated deltas + if self.current_param_name: + self._end_element('parameter') + if self.current_function_open or self.current_function_name: + self._end_element('function') + # Output final tool_call tail delta + final_delta = DeltaMessage( + role=None, + content=None, + reasoning_content=None, + tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', 
+ function=DeltaFunctionCall( + name=None, arguments='')) + ]) + self._emit_delta(final_delta) + # Reset XML parser and current call state + self._reset_xml_parser_after_tool_call() + # Parse preprocessed element + self.parser.Parse(preprocessed_element, False) + found_any = True + + except Exception as e: + logger.warning("Error when parsing XML elements: %s", e) + + # Update processed position + self.last_processed_pos = end_pos + + return found_any + + def _should_skip_element(self, element: str) -> bool: + """ + Determine whether an element should be skipped + + Args: + element: Element to evaluate + + Returns: + bool: True means should skip, False means should process + """ + + # If it's a tool_call XML tag, don't skip + if element.startswith( + self.tool_call_start_token) or element.startswith( + self.function_start_token) or element.startswith( + self.parameter_start_token): + return False + + # If currently not parsing tool calls and not blank, + # collect this text instead of skipping + # Only process other XML elements after tool_call appears, + # otherwise treat as plain text + if self.current_call_id is None and element: + # Collect text content to buffer + self.text_content_buffer += element + return True # Still skip, but content has been collected + + # If currently parsing tool calls, + # this might be parameter value, don't skip + if self.current_call_id is not None: + return False + + # Skip blank content + return not element + + def _find_next_complete_element( + self, start_pos: int) -> tuple[Optional[str], int]: + """ + Find next complete XML element from specified position + + Args: + start_pos: Position to start searching + + Returns: + (Complete element string, element end position), + returns (None, start_pos) if no complete element found + """ + buffer = self.streaming_buffer[start_pos:] + + if not buffer: + return None, start_pos + + if buffer.startswith('<'): + # Need to ensure no new < appears, + # find the nearest one between < and > + tag_end = buffer.find('<', 1) + tag_end2 = buffer.find('>', 1) + if tag_end != -1 and tag_end2 != -1: + # Next nearest is < + if tag_end < tag_end2: + return buffer[:tag_end], start_pos + tag_end + # Next nearest is >, means found XML element + else: + return buffer[:tag_end2 + 1], start_pos + tag_end2 + 1 + elif tag_end != -1: + return buffer[:tag_end], start_pos + tag_end + elif tag_end2 != -1: + return buffer[:tag_end2 + 1], start_pos + tag_end2 + 1 + else: + # If currently not parsing tool calls (entering a tool_call), + # check if starts with + if self.current_call_id is None: + # Check if might be start of + if buffer == ''[:len(buffer)]: + # Might be start of , wait for more data + return None, start_pos + else: + # Not start of , treat as text + return buffer, start_pos + len(buffer) + else: + # When parsing tool calls, + # wait for more data to get complete tag + return None, start_pos + else: + # Find text content (until next < or buffer end) + next_tag_pos = buffer.find('<') + if next_tag_pos != -1: + # Found text content + text_content = buffer[:next_tag_pos] + return text_content, start_pos + next_tag_pos + else: + # Buffer end is all text, process + # (no longer wait for more data) + remaining = buffer + return remaining, start_pos + len(remaining) + + def _merge_new_deltas_to_single_response( + self, initial_count: int) -> DeltaMessage: + """ + Merge newly generated deltas from this processing + into a single DeltaMessage + + Args: + initial_count: Delta count before processing + + Returns: + Merged DeltaMessage 
containing all newly generated delta information + """ + if len(self.deltas) <= initial_count: + return DeltaMessage(content=None) + + # Get newly generated deltas + new_deltas = self.deltas[initial_count:] + + if len(new_deltas) == 1: + # Only one new delta, return directly + return new_deltas[0] + + # Merge multiple new deltas + merged_tool_calls: list[DeltaToolCall] = [] + merged_content: str = '' + + for delta in new_deltas: + if delta.content: + merged_content += delta.content + if delta.tool_calls: + # For tool_calls, we need to intelligently merge arguments + for tool_call in delta.tool_calls: + # Find if there's already a tool_call with the same call_id + existing_call = None + for existing in merged_tool_calls: + if existing.id == tool_call.id: + existing_call = existing + break + + if existing_call and existing_call.function: + # Merge to existing tool_call + if tool_call.function and tool_call.function.name: + existing_call.function.name = \ + tool_call.function.name + if tool_call.function \ + and tool_call.function.arguments is not None: + if existing_call.function.arguments is None: + existing_call.function.arguments = '' + + # For streaming JSON parameters, + # simply concatenate in order + new_args = tool_call.function.arguments + existing_call.function.arguments += new_args + if tool_call.type: + existing_call.type = tool_call.type + else: + # Add new tool_call + merged_tool_calls.append(tool_call) + + return DeltaMessage(content=merged_content if merged_content else None, + tool_calls=merged_tool_calls) + + def _preprocess_xml_chunk(self, chunk: str) -> str: + """ + Preprocess XML chunk, handle non-standard formats, + and escape special characters + + Args: + chunk: Original XML chunk + + Returns: + Processed XML chunk + """ + + # Check if this is a tool_call related element + is_tool_call = False + if chunk.startswith(self.tool_call_start_token) or chunk.startswith( + self.tool_call_end_token): + is_tool_call = True + if chunk.startswith(self.function_start_token) or chunk.startswith( + self.function_end_token): + is_tool_call = True + if chunk.startswith(self.parameter_start_token) or chunk.startswith( + self.parameter_end_token): + is_tool_call = True + # Handle format -> + processed = re.sub(r']+)>', r'', + chunk) + # Handle format -> + processed = re.sub(r']+)>', r'', + processed) + + original_chunk = chunk + # If in parameter value accumulation mode + if self._pre_inside_parameter: + # Parameter end: output accumulated raw text + # safely then return + if processed.startswith(''): + body_text = self._pre_param_buffer + # Trigger deferred parsing mode + # literal_eval+json output in end_element + self.defer_current_parameter = True + self.deferred_param_raw_value = body_text + # Clean up state + self._pre_inside_parameter = False + self._pre_param_buffer = "" + self._pre_current_param_name = None + safe_text = self._escape_xml_special_chars(body_text) + return f"{safe_text}" + else: + # If this is the first block of content after entering parameter + # evaluate if deferred parsing is needed; + # If not needed, exit accumulation mode + # and pass through directly + if self._pre_param_buffer == "": + # Get current parameter type + param_type = self._get_param_type( + self._pre_current_param_name + ) if self._pre_current_param_name else 'string' + # Only these types need deferred parsing to + # handle Python literals containing single quotes + is_object_type = param_type in ["object"] + is_complex_type = (param_type + in ["array", "arr", "sequence"] + or 
param_type.startswith("dict") + or param_type.startswith("list")) + + # Only delay when contains container symbols + # and has single quotes and is complex type + has_container_hint = ('[' in original_chunk) or ( + '{' in original_chunk) or ('(' in original_chunk) + + # Determine if deferred parsing is needed + need_defer = False + if is_complex_type: + # Complex type, always need deferred parsing + need_defer = True + elif is_object_type and has_container_hint and ( + "'" in original_chunk): + # Object type with container symbols + # and single quotes, need deferred parsing + need_defer = True + + if not need_defer: + # No need for deferred parsing, + # exit parameter mode directly + self._pre_inside_parameter = False + return self._escape_xml_special_chars(original_chunk) + self._pre_param_buffer += original_chunk + return "" + + # Parameter start: enable accumulation + if processed.startswith('', processed) + if m: + self._pre_current_param_name = m.group(1) + self._pre_inside_parameter = True + self._pre_param_buffer = "" + return processed + + # If processed doesn't contain special_token, escape processed + # This is because XML parsing encounters special characters + # and reports errors, so escaping is needed + if not is_tool_call: + processed = self._escape_xml_special_chars(processed) + return processed + + def _emit_delta(self, delta: DeltaMessage): + """Emit Delta response (streaming output)""" + self.deltas.append(delta) + + def _auto_close_open_parameter_if_needed(self, + incoming_tag: Optional[str] = None + ): + """Before starting to process new elements, + if there are unclosed tags from before, + automatically complete their endings to the parser. + - If there are unclosed parameters, + it's equivalent to feeding `` + - When about to start a new function or tool_call, + if there are unclosed functions, complete ``. + - When about to start a new tool_call, + if there are unclosed tool_calls, complete ``. 
+ """ + # First close unclosed parameters + if self.current_param_name: + self._end_element('parameter') + + # If about to start new function or tool_call, + # and there are unclosed functions, close function first + if incoming_tag in ('function', + 'tool_call') and self.current_function_name: + self._end_element('function') + + # If about to start new tool_call, + # and there are unclosed tool_calls, close tool_call first + if incoming_tag == 'tool_call' and self.current_call_id: + self._end_element('tool_call') + + def _start_element(self, name: str, attrs: dict[str, str]): + """Handle XML start element events""" + + if name == 'root': + return + + if name == 'tool_call': + # Before opening new tool_call, + # automatically complete previous unclosed tags + self._auto_close_open_parameter_if_needed('tool_call') + + self.parameters = {} + self.current_call_id = self._get_next_call_id() + self.current_param_is_first = True + self.tool_call_index += 1 + elif name.startswith('function') or (name == 'function'): + # If missing tool_call, manually complete + if not self.current_call_id: + self._start_element('tool_call', {}) + # Before opening new function, + # automatically complete previous unclosed tags (parameter/function) + self._auto_close_open_parameter_if_needed('function') + function_name = self._extract_function_name(name, attrs) + self.current_function_name = function_name + self.current_function_open = True + if function_name: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=function_name, arguments='')) + ]) + self._emit_delta(delta) + elif name.startswith('parameter') or (name == 'parameter'): + # If previous parameter hasn't ended normally, + # complete its end first, then start new parameter + self._auto_close_open_parameter_if_needed('parameter') + param_name = self._extract_parameter_name(name, attrs) + self.current_param_name = param_name + self.current_param_value = '' + self.current_param_value_converted = '' + self.start_quote_emitted = False # Reset start quote flag + + # Only output parameter name and colon, + # don't output quotes + # decide after parameter value type is determined + if param_name: + if not self.parameters: + # First parameter + # start JSON, only output parameter name and colon + json_start = f'{{"{param_name}": ' + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments=json_start)) + ]) + self._emit_delta(delta) + self.current_param_is_first = True + else: + # Subsequent parameters + # add comma and parameter name, no quotes + json_continue = f', "{param_name}": ' + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments=json_continue)) + ]) + self._emit_delta(delta) + self.current_param_is_first = False + + def _char_data(self, data: str): + """Handle XML character data events""" + if data and self.current_param_name: + # If preprocessing stage determines deferred parsing is needed, + # only cache character data, no streaming output + if self.defer_current_parameter: + original_data = data + if self.should_emit_end_newline: + original_data = '\n' + original_data + self.should_emit_end_newline = False + if original_data.endswith('\n'): + self.should_emit_end_newline = True + original_data 
= original_data[:-1] + self.current_param_value += original_data + return + + param_type = self._get_param_type(self.current_param_name) + + # Check if this is the first time receiving data for this parameter + # If this is the first packet of data and starts with \n, remove \n + if not self.current_param_value and data.startswith('\n'): + data = data[1:] + + # Output start quote for string type (if not already output) + if (param_type + in ['string', 'str', 'text', 'varchar', 'char', 'enum'] + and not self.start_quote_emitted): + quote_delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='"')) + ]) + self._emit_delta(quote_delta) + self.start_quote_emitted = True + + if not data: + return + + original_data = data + # Delay output of trailing newline + if self.should_emit_end_newline: + original_data = '\n' + original_data + self.should_emit_end_newline = False + if original_data.endswith('\n'): + self.should_emit_end_newline = True + original_data = original_data[:-1] + self.current_param_value += original_data + + # convert parameter value by param_type + converted_value = self._convert_param_value( + self.current_param_value, param_type) + output_data = self._convert_for_json_streaming( + converted_value, param_type) + + delta_data = output_data[len(self.current_param_value_converted):] + self.current_param_value_converted = output_data + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments=delta_data)) + ]) + self._emit_delta(delta) + + def _end_element(self, name: str): + """Handle XML end element events""" + + if name == 'root': + return + + # If function or tool_call ends and there are still unclosed parameters, + # complete parameter end first + if (name.startswith('function') or name == 'function' + or name == 'tool_call') and self.current_param_name: + self._auto_close_open_parameter_if_needed() + + if (name.startswith('parameter') + or name == 'parameter') and self.current_param_name: + # End current parameter + param_name = self.current_param_name + param_value = self.current_param_value + + # If in deferred parsing mode, + # perform overall parsing on raw content + # accumulated in preprocessing stage and output once + if self.defer_current_parameter: + raw_text = self.deferred_param_raw_value \ + if self.deferred_param_raw_value else param_value + parsed_value = None + output_arguments = None + try: + # If previously delayed trailing newline, + # add it back before parsing + if self.should_emit_end_newline: + raw_for_parse = raw_text + '\n' + else: + raw_for_parse = raw_text + parsed_value = ast.literal_eval(raw_for_parse) + output_arguments = json.dumps(parsed_value, + ensure_ascii=False) + except Exception: + # Fallback: output as string as-is + output_arguments = json.dumps(raw_text, ensure_ascii=False) + parsed_value = raw_text + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments=output_arguments)) + ]) + self._emit_delta(delta) + + # Clean up and store + self.should_emit_end_newline = False + self.parameters[param_name] = parsed_value + self.current_param_name = None + self.current_param_value = "" + self.current_param_value_converted = "" + self.start_quote_emitted = False + 
self.defer_current_parameter = False + self.deferred_param_raw_value = "" + return + + param_type = self._get_param_type(param_name) + + # convert complete parameter value by param_type + converted_value = self._convert_param_value( + param_value, param_type) + + # Decide whether to add end quote based on parameter type + if param_type in [ + 'string', 'str', 'text', 'varchar', 'char', 'enum' + ]: + # For empty string parameters, need special handling + if not param_value and not self.start_quote_emitted: + # No start quote output, + # directly output complete empty string + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments='""')) + ]) + self._emit_delta(delta) + else: + # Non-empty parameter value, output end quote + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments='"')) + ]) + self._emit_delta(delta) + + self.should_emit_end_newline = False + # Store converted value + self.parameters[param_name] = converted_value + self.current_param_name = None + self.current_param_value = '' + self.current_param_value_converted = '' + self.start_quote_emitted = False + + elif name.startswith('function') or name == 'function': + # if there are parameters, close JSON object + if self.parameters: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='}')) + ]) + self._emit_delta(delta) + # return empty object + else: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='{}')) + ]) + self._emit_delta(delta) + self.current_function_open = False + + elif name == 'tool_call': + # Before ending tool_call, + # ensure function is closed to complete missing right brace + if self.current_function_open: + # If there are still unclosed parameters, close them first + if self.current_param_name: + self._end_element('parameter') + # Close function, ensure output '}' or '{}' + self._end_element('function') + # Final Delta + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='')) + ]) + self._emit_delta(delta) + + # Check if there's text content to output (between tool_calls) + if self.text_content_buffer.strip(): + text_delta = DeltaMessage(content=self.text_content_buffer) + self._emit_delta(text_delta) + + self._reset_xml_parser_after_tool_call() + + def setup_parser(self): + """Set up XML parser event handlers""" + self.parser.buffer_text = True + self.parser.StartElementHandler = self._start_element + self.parser.EndElementHandler = self._end_element + self.parser.CharacterDataHandler = self._char_data + + def set_tools(self, tools: Union[list[ChatCompletionToolsParam], None]): + """Set tool configuration information""" + self.tools = tools + + def _get_next_call_id(self): + """Generate unique call ID""" + return f'call_{uuid.uuid4().hex[:24]}' + + def _extract_function_name(self, name: str, + attrs: dict[str, str]) -> Optional[str]: + """Extract function name from various formats""" + if attrs and 'name' in attrs: + return attrs['name'] + + if '=' in name: + parts = 
name.split('=', 1) + if len(parts) == 2 and parts[0] == 'function': + return parts[1] + + return None + + def _extract_parameter_name(self, name: str, + attrs: dict[str, str]) -> Optional[str]: + """Extract parameter name from various formats""" + if attrs and 'name' in attrs: + return attrs['name'] + + if '=' in name: + parts = name.split('=', 1) + if len(parts) == 2 and parts[0] == 'parameter': + return parts[1] + + return None + + def _get_param_type(self, param_name: str) -> str: + """Get parameter type based on tool configuration, defaults to string + Args: + param_name: Parameter name + + Returns: + Parameter type + """ + if not self.tools or not self.current_function_name: + return 'string' + + for tool in self.tools: + if not hasattr(tool, 'type') or not (hasattr( + tool, 'function') and hasattr(tool.function, 'name')): + continue + if tool.type == 'function' and \ + tool.function.name == self.current_function_name: + if not hasattr(tool.function, 'parameters'): + return 'string' + params = tool.function.parameters + if isinstance(params, dict) and 'properties' in params: + properties = params['properties'] + if param_name in properties and isinstance( + properties[param_name], dict): + return self.repair_param_type( + str(properties[param_name].get('type', 'string'))) + elif isinstance(params, dict) and param_name in params: + param_config = params[param_name] + if isinstance(param_config, dict): + return self.repair_param_type( + str(param_config.get('type', 'string'))) + break + return 'string' + + def repair_param_type(self, param_type: str) -> str: + """Repair unknown parameter types by treating them as string + Args: + param_type: Parameter type + + Returns: + Repaired parameter type + """ + if param_type in [ + 'string', 'str', 'text', 'varchar', 'char', 'enum' + ] or param_type.startswith('int') or param_type.startswith( + 'uint' + ) or param_type.startswith('long') or param_type.startswith( + 'short' + ) or param_type.startswith('unsigned') or param_type.startswith( + 'num') or param_type.startswith('float') or param_type in [ + 'boolean', 'bool', 'binary' + ] or (param_type in ["object", "array", "arr", "sequence"] + or param_type.startswith("dict") + or param_type.startswith("list")): + return param_type + else: + return 'string' + + def _convert_param_value(self, param_value: str, param_type: str) -> Any: + """Convert value based on parameter type + Args: + param_value: Parameter value + param_type: Parameter type + + Returns: + Converted value + """ + if param_value.lower() == 'null': + return None + + param_type = param_type.strip().lower() + if param_type in ['string', 'str', 'text', 'varchar', 'char', 'enum']: + return param_value + elif (param_type.startswith('int') or param_type.startswith('uint') + or param_type.startswith('long') + or param_type.startswith('short') + or param_type.startswith('unsigned')): + try: + return int(param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not an integer " + "in tool '%s', degenerating to string.", param_value) + return param_value + elif param_type.startswith('num') or param_type.startswith('float'): + try: + float_param_value: float = float(param_value) + return float_param_value if float_param_value - int( + float_param_value) != 0 else int(float_param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not a float " + "in tool '%s', degenerating to string.", param_value) + return param_value + elif param_type in 
['boolean', 'bool', 'binary']: + param_value = param_value.lower() + return param_value == 'true' + else: + return param_value + + def _convert_for_json_streaming(self, converted_value: Any, + param_type: str) -> str: + """Convert converted_value based on + whether it's empty and if type is string + Args: + converted_value: Converted value + param_type: Parameter type + + Returns: + Converted string for streaming output + """ + # Check if value is empty, but exclude numeric 0 + if converted_value is None or converted_value == '': + return '' + + if param_type in ['string', 'str', 'text', 'varchar', 'char', 'enum']: + # String type, remove double quotes + return json.dumps(converted_value, ensure_ascii=False)[1:-1] + else: + # Non-string type, return complete JSON string + if not isinstance(converted_value, str): + return json.dumps(converted_value, ensure_ascii=False) + else: + return converted_value + + def _reset_xml_parser_after_tool_call(self): + """ + Each tool_call is treated as a separate XML document, + so we need to reset the parser after each tool_call. + """ + + # recreate XML parser + self.parser = ParserCreate() + self.setup_parser() + + # Reset current tool_call state + if self.current_call_id: + self.last_completed_call_id = self.current_call_id + self.current_call_id = None + self.current_function_name = None + self.current_function_open = False + self.parameters = {} + self.current_param_name = None + self.current_param_value = '' + self.current_param_value_converted = '' + self.current_param_is_first = False + self.should_emit_end_newline = False + self.start_quote_emitted = False + self.text_content_buffer = '' + + # Reset preprocessing and deferred parsing state + self._pre_inside_parameter = False + self._pre_param_buffer = "" + self._pre_current_param_name = None + self.defer_current_parameter = False + self.deferred_param_raw_value = "" + + +@ToolParserManager.register_module("qwen3_xml") +class Qwen3XMLToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + self.parser = StreamingXMLToolCallParser() + + logger.info("vLLM Successfully import tool parser %s !", + self.__class__.__name__) + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + self.parser.reset_streaming_state() + if request: + self.parser.set_tools(request.tools) + result = self.parser.parse_single_streaming_chunks(model_output) + if not result.tool_calls: + return ExtractedToolCallInformation( + tool_calls=[], + tools_called=False, + content=result.content, + ) + else: + tool_calls = [] + for tool_call in result.tool_calls: + if tool_call.function and tool_call.function.name: + tool_calls.append( + ToolCall( + id=tool_call.id, + type=tool_call.type, + function=FunctionCall( + name=tool_call.function.name, + arguments=tool_call.function.arguments, + ), + )) + return ExtractedToolCallInformation( + tool_calls=tool_calls, + tools_called=len(tool_calls) > 0, + content=result.content, + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + if not previous_text: + self.parser.reset_streaming_state() + if request: + self.parser.set_tools(request.tools) + + # Model sometimes outputs separately causing delta_text to be empty. 
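+        # (e.g. a special token such as </tool_call> can arrive as its own
+        # chunk that decodes to an empty string while delta_token_ids is
+        # non-empty).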
+ # If there were tool_calls before and all current tool_calls have ended, + # return an empty tool_call for outer streaming output + # to correctly output tool_call field + if not delta_text and delta_token_ids: + open_calls = current_text.count( + self.parser.tool_call_start_token) - current_text.count( + self.parser.tool_call_end_token) + if open_calls == 0 and self.parser.tool_call_index > 0: + # If current_call_id is None, use last_completed_call_id + call_id = self.parser.current_call_id or \ + self.parser.last_completed_call_id + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.parser.tool_call_index - 1, + id=call_id, + function=DeltaFunctionCall(arguments=''), + type='function', + ) + ]) + + return self.parser.parse_single_streaming_chunks(delta_text) From babad6e5dde380b453be716c7ebc2b6928658c18 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 23 Sep 2025 17:20:52 +0800 Subject: [PATCH 129/185] [Misc] Move DP for ViT code inside model executor dir (#25459) Signed-off-by: DarkLight1337 --- tests/models/test_vision.py | 424 ++++++++++++++++- tests/multimodal/test_utils.py | 426 +----------------- vllm/model_executor/models/glm4_1v.py | 3 +- .../models/idefics2_vision_model.py | 3 +- vllm/model_executor/models/intern_vit.py | 3 +- vllm/model_executor/models/kimi_vl.py | 2 +- vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/model_executor/models/qwen2_vl.py | 3 +- vllm/model_executor/models/qwen3_vl.py | 6 +- vllm/model_executor/models/step3_vl.py | 2 +- vllm/model_executor/models/vision.py | 281 +++++++++++- vllm/multimodal/utils.py | 293 +----------- 13 files changed, 721 insertions(+), 730 deletions(-) diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py index 310d3a3719b65..8744bcbd3a2a6 100644 --- a/tests/models/test_vision.py +++ b/tests/models/test_vision.py @@ -1,10 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math import pytest import torch +import torch.multiprocessing as mp -from vllm.model_executor.models.vision import resolve_visual_encoder_outputs +from tests.utils import multi_gpu_test +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import (init_distributed_environment, + initialize_model_parallel) +from vllm.model_executor.models.vision import ( + get_load_balance_assignment, resolve_visual_encoder_outputs, + run_dp_sharded_mrope_vision_model, run_dp_sharded_vision_model) +from vllm.platforms import current_platform +from vllm.utils import get_open_port, update_environment_variables @pytest.mark.parametrize( @@ -33,3 +43,415 @@ def test_resolve_visual_encoder_outputs(feature_sample_layers, post_layer_norm=None, max_possible_layers=max_possible_layers) assert torch.equal(torch.tensor(expected_features), output_tensor) + + +class SimpleLinearModel(torch.nn.Module): + """A simple linear vision model for testing.""" + + def __init__(self, input_dim: int = 3 * 224 * 224, output_dim: int = 32): + super().__init__() + self.flatten = torch.nn.Flatten() + self.linear = torch.nn.Linear(input_dim, output_dim) + + def forward(self, x: torch.Tensor): + # Flatten the input and apply linear transformation + x = self.flatten(x) + return self.linear(x) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "batch_size", + [ + 1, # Single image + 4, # Small batch + 5, # Odd batch size (for testing padding) + ], +) +def 
test_run_dp_sharded_vision_model(batch_size: int): + world_size = 2 + # Launch processes + mp.spawn( + run_dp_sharded_vision_model_vs_direct, + args=( + world_size, + batch_size, + get_open_port(), + ), + nprocs=world_size, + ) + + +def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int, + batch_size: int, master_port: int): + """ + Test that run_dp_sharded_vision_model produces the same results as + calling the model directly. + """ + + # Set random seed for reproducibility + current_platform.seed_everything(0) + + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create a test input tensor + image_input = torch.randn(batch_size, 3, 224, 224) + + # Create a simple linear model + vision_model = SimpleLinearModel() + + # Run the model directly on the full input + with torch.inference_mode(): + direct_output = vision_model(image_input) + + # Run the model through the sharded function + with torch.inference_mode(): + sharded_output = run_dp_sharded_vision_model(image_input, vision_model) + + # Check that the world size is set up correctly + assert get_tensor_model_parallel_world_size() == world_size + + # Check that the outputs have the same shape + assert direct_output.shape == sharded_output.shape + + # Check that the outputs are close (they should be identical) + assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5) + + +@pytest.mark.parametrize( + "sizes,num_gpus,expected_shuffle_indices,expected_gpu_sample_counts," + "expected_grouped_sizes_per_gpu,test_description", + [ + # Empty input + ([], 2, [], [0, 0], [0, 0], "empty input"), + + # Fewer samples than GPUs + ([100, 200], 4, [1, 0], [1, 1, 0, 0], [200, 100, 0, 0 + ], "fewer samples than GPUs"), + + # Single GPU + ([100, 200, 300], 1, [2, 1, 0], [3], [600], "single GPU"), + + # Balanced assignment + ([100, 100, 100, 100 + ], 2, [0, 2, 1, 3], [2, 2], [200, 200], "balanced assignment"), + + # Unbalanced sizes - this one is trickier since the algorithm is greedy + ([1000, 100, 200, 50], 2, [0, 2, 1, 3 + ], [1, 3], [1000, 350], "unbalanced sizes"), + ], +) +def test_get_load_balance_assignment_cases(sizes, num_gpus, + expected_shuffle_indices, + expected_gpu_sample_counts, + expected_grouped_sizes_per_gpu, + test_description): + """Test get_load_balance_assignment with various input cases.""" + result = get_load_balance_assignment(sizes, num_gpus=num_gpus) + (shuffle_indices, gpu_sample_counts, grouped_sizes_per_gpu) = result + + # Common assertions for all cases + assert len(shuffle_indices) == len(sizes) + assert len(gpu_sample_counts) == num_gpus + assert len(grouped_sizes_per_gpu) == num_gpus + assert sum(gpu_sample_counts) == len(sizes) + + assert shuffle_indices == expected_shuffle_indices + + assert gpu_sample_counts == expected_gpu_sample_counts + assert grouped_sizes_per_gpu == expected_grouped_sizes_per_gpu + + +class SimpleMRopeVisionModel(torch.nn.Module): + """A simple vision model for testing mrope functionality.""" + + def __init__(self, spatial_merge_size: int = 2, out_hidden_size: int = 64): + super().__init__() + self.spatial_merge_size = spatial_merge_size + 
self.out_hidden_size = out_hidden_size + self.linear = torch.nn.Linear(768, out_hidden_size) + + def forward(self, pixel_values: torch.Tensor, + grid_thw_list: list[list[int]]): + """Simple forward pass that simulates spatial merging.""" + # Apply linear transformation + embeddings = self.linear(pixel_values) + + # Simulate spatial merging by reducing the number of patches + merge_factor = self.spatial_merge_size * self.spatial_merge_size + + # Group patches and merge spatially + merged_embeddings = [] + start_idx = 0 + + for grid_thw in grid_thw_list: + num_patches = math.prod(grid_thw) + end_idx = start_idx + num_patches + + # Get patches for this image + image_patches = embeddings[start_idx:end_idx] + + # Simulate spatial merging by averaging groups of patches + merged_patches = num_patches // merge_factor + if merged_patches > 0: + # Reshape and average to simulate merging + reshaped = image_patches[:merged_patches * merge_factor].view( + merged_patches, merge_factor, -1) + merged = reshaped.mean(dim=1) + merged_embeddings.append(merged) + + start_idx = end_idx + + if merged_embeddings: + return torch.cat(merged_embeddings, dim=0) + else: + return torch.empty((0, self.out_hidden_size), + device=pixel_values.device, + dtype=pixel_values.dtype) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "batch_size", + [ + 1, # Single image + 3, # Small batch + 5, # Odd batch size (for testing padding) + ], +) +def test_run_dp_sharded_mrope_vision_model(batch_size: int): + world_size = 2 + # Launch processes + mp.spawn( + run_dp_sharded_mrope_vision_model_vs_direct, + args=( + world_size, + batch_size, + get_open_port(), + ), + nprocs=world_size, + ) + + +def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int, + world_size: int, + batch_size: int, + master_port: int): + """ + Test that run_dp_sharded_mrope_vision_model produces the same results as + calling the model directly. 
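+    The unsharded forward pass on rank 0 serves as the reference; the sharded
+    outputs are gathered, concatenated and compared against it on rank 0 only.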
+ """ + # Set random seed for reproducibility + current_platform.seed_everything(0) + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create test data + grid_thw_list = [] + pixel_values_list = [] + + for i in range(batch_size): + # Varying image sizes for better testing + t, h, w = 1, 4 + i, 4 + i + grid_thw_list.append([t, h, w]) + + num_patches = t * h * w + # Create random pixel values for this image + image_pixels = torch.randn(num_patches, 768) + pixel_values_list.append(image_pixels) + + # Concatenate all pixel values + pixel_values = torch.cat(pixel_values_list, dim=0) + + # Create a simple mrope vision model + vision_model = SimpleMRopeVisionModel() + + # Run the model directly on the full input (only on rank 0) + if local_rank == 0: + with torch.inference_mode(): + direct_output = vision_model(pixel_values, grid_thw_list) + + # Run the model through the sharded function + with torch.inference_mode(): + sharded_output = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + sharded_output = torch.cat(sharded_output, dim=0) + + # Check that the world size is set up correctly + assert get_tensor_model_parallel_world_size() == world_size + + # Compare outputs (only on rank 0) + if local_rank == 0: + # Check that the outputs have the same shape + assert direct_output.shape == sharded_output.shape + # Check that the outputs are close (they should be identical) + assert torch.allclose(direct_output, + sharded_output, + rtol=1e-5, + atol=1e-5) + + +@multi_gpu_test(num_gpus=2) +def test_run_dp_sharded_mrope_vision_model_empty_input(): + world_size = 2 + mp.spawn( + run_dp_sharded_mrope_vision_model_empty_input_worker, + args=(world_size, get_open_port()), + nprocs=world_size, + ) + + +def run_dp_sharded_mrope_vision_model_empty_input_worker( + local_rank: int, world_size: int, master_port: int): + """Test run_dp_sharded_mrope_vision_model with empty input.""" + # Set up distributed environment + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create empty inputs + pixel_values = torch.empty((0, 768)) + grid_thw_list: list[list[int]] = [] + + vision_model = SimpleMRopeVisionModel() + + # Should handle empty input gracefully + with torch.inference_mode(): + output = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + + assert len(output) == 0 + + +@multi_gpu_test(num_gpus=4) +def test_run_dp_sharded_mrope_vision_model_uneven_load(): + world_size = 4 + mp.spawn( + run_dp_sharded_mrope_vision_model_uneven_load_worker, + args=(world_size, get_open_port()), + nprocs=world_size, + ) + + +def run_dp_sharded_mrope_vision_model_uneven_load_worker( + local_rank: int, world_size: int, master_port: int): + 
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution.""" + # Set up distributed environment + current_platform.seed_everything(123) + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create images with very different sizes + grid_thw_list = [ + [1, 2, 2], # Small: 4 patches + [1, 8, 8], # Large: 64 patches + [1, 3, 3], # Medium: 9 patches + ] + + pixel_values_list = [] + for grid_thw in grid_thw_list: + num_patches = math.prod(grid_thw) + image_pixels = torch.randn(num_patches, 768) + pixel_values_list.append(image_pixels) + + pixel_values = torch.cat(pixel_values_list, dim=0) + vision_model = SimpleMRopeVisionModel() + + # Should handle uneven distribution without errors + with torch.inference_mode(): + output_tuple = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + + # Verify output shape is reasonable + merge_factor = vision_model.spatial_merge_size**2 + expected_output_patches = list( + math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list) + + for i, output in enumerate(output_tuple): + assert output.shape[0] == expected_output_patches[i] + assert output.shape[1] == vision_model.out_hidden_size + + +@pytest.mark.parametrize("spatial_merge_size", [2, 4]) +def test_simple_mrope_vision_model_spatial_merge(spatial_merge_size: int): + """Test SimpleMRopeVisionModel with different spatial merge sizes.""" + device = current_platform.device_type + + grid_thw_list = [[1, 4, 4], [1, 6, 6]] # Two images + pixel_values_list = [] + + for grid_thw in grid_thw_list: + num_patches = math.prod(grid_thw) + image_pixels = torch.randn(num_patches, 768, device=device) + pixel_values_list.append(image_pixels) + + pixel_values = torch.cat(pixel_values_list, dim=0) + vision_model = SimpleMRopeVisionModel( + spatial_merge_size=spatial_merge_size).to(device) + + with torch.inference_mode(): + output = vision_model(pixel_values, grid_thw_list) + + # Verify output dimensions based on spatial merging + total_patches = sum(math.prod(grid_thw) for grid_thw in grid_thw_list) + merge_factor = spatial_merge_size**2 + expected_output_patches = total_patches // merge_factor + + assert output.shape[0] == expected_output_patches + assert output.shape[1] == vision_model.out_hidden_size diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index e1e8282dd66d4..f36d94ca01551 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 -import math import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory @@ -10,22 +9,11 @@ from typing import TYPE_CHECKING, NamedTuple import numpy as np import pytest -import torch -import torch.multiprocessing as mp from PIL import Image, ImageChops -from tests.utils import multi_gpu_test -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.distributed.parallel_state import (init_distributed_environment, - initialize_model_parallel) from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange 
-from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions, - get_load_balance_assignment, - run_dp_sharded_mrope_vision_model, - run_dp_sharded_vision_model) -from vllm.platforms import current_platform -from vllm.utils import get_open_port, update_environment_variables +from vllm.multimodal.utils import MediaConnector, argsort_mm_positions if TYPE_CHECKING: from vllm.multimodal.inputs import MultiModalPlaceholderDict @@ -404,415 +392,3 @@ def test_argsort_mm_positions(): modality_idxs = argsort_mm_positions(mm_positions) assert modality_idxs == expected_modality_idxs - - -class SimpleLinearModel(torch.nn.Module): - """A simple linear vision model for testing.""" - - def __init__(self, input_dim: int = 3 * 224 * 224, output_dim: int = 32): - super().__init__() - self.flatten = torch.nn.Flatten() - self.linear = torch.nn.Linear(input_dim, output_dim) - - def forward(self, x: torch.Tensor): - # Flatten the input and apply linear transformation - x = self.flatten(x) - return self.linear(x) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "batch_size", - [ - 1, # Single image - 4, # Small batch - 5, # Odd batch size (for testing padding) - ], -) -def test_run_dp_sharded_vision_model(batch_size: int): - world_size = 2 - # Launch processes - mp.spawn( - run_dp_sharded_vision_model_vs_direct, - args=( - world_size, - batch_size, - get_open_port(), - ), - nprocs=world_size, - ) - - -def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int, - batch_size: int, master_port: int): - """ - Test that run_dp_sharded_vision_model produces the same results as - calling the model directly. - """ - - # Set random seed for reproducibility - current_platform.seed_everything(0) - - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - # initialize distributed - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create a test input tensor - image_input = torch.randn(batch_size, 3, 224, 224) - - # Create a simple linear model - vision_model = SimpleLinearModel() - - # Run the model directly on the full input - with torch.inference_mode(): - direct_output = vision_model(image_input) - - # Run the model through the sharded function - with torch.inference_mode(): - sharded_output = run_dp_sharded_vision_model(image_input, vision_model) - - # Check that the world size is set up correctly - assert get_tensor_model_parallel_world_size() == world_size - - # Check that the outputs have the same shape - assert direct_output.shape == sharded_output.shape - - # Check that the outputs are close (they should be identical) - assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5) - - -@pytest.mark.parametrize( - "sizes,num_gpus,expected_shuffle_indices,expected_gpu_sample_counts," - "expected_grouped_sizes_per_gpu,test_description", - [ - # Empty input - ([], 2, [], [0, 0], [0, 0], "empty input"), - - # Fewer samples than GPUs - ([100, 200], 4, [1, 0], [1, 1, 0, 0], [200, 100, 0, 0 - ], "fewer samples than GPUs"), - - # Single GPU - ([100, 200, 300], 1, [2, 1, 0], [3], [600], "single GPU"), - - # Balanced assignment - ([100, 100, 100, 100 - ], 2, [0, 2, 1, 3], [2, 2], [200, 200], "balanced assignment"), - - # Unbalanced sizes - this one is 
trickier since the algorithm is greedy - ([1000, 100, 200, 50], 2, [0, 2, 1, 3 - ], [1, 3], [1000, 350], "unbalanced sizes"), - ], -) -def test_get_load_balance_assignment_cases(sizes, num_gpus, - expected_shuffle_indices, - expected_gpu_sample_counts, - expected_grouped_sizes_per_gpu, - test_description): - """Test get_load_balance_assignment with various input cases.""" - result = get_load_balance_assignment(sizes, num_gpus=num_gpus) - (shuffle_indices, gpu_sample_counts, grouped_sizes_per_gpu) = result - - # Common assertions for all cases - assert len(shuffle_indices) == len(sizes) - assert len(gpu_sample_counts) == num_gpus - assert len(grouped_sizes_per_gpu) == num_gpus - assert sum(gpu_sample_counts) == len(sizes) - - assert shuffle_indices == expected_shuffle_indices - - assert gpu_sample_counts == expected_gpu_sample_counts - assert grouped_sizes_per_gpu == expected_grouped_sizes_per_gpu - - -class SimpleMRopeVisionModel(torch.nn.Module): - """A simple vision model for testing mrope functionality.""" - - def __init__(self, spatial_merge_size: int = 2, out_hidden_size: int = 64): - super().__init__() - self.spatial_merge_size = spatial_merge_size - self.out_hidden_size = out_hidden_size - self.linear = torch.nn.Linear(768, out_hidden_size) - - def forward(self, pixel_values: torch.Tensor, - grid_thw_list: list[list[int]]): - """Simple forward pass that simulates spatial merging.""" - # Apply linear transformation - embeddings = self.linear(pixel_values) - - # Simulate spatial merging by reducing the number of patches - merge_factor = self.spatial_merge_size * self.spatial_merge_size - - # Group patches and merge spatially - merged_embeddings = [] - start_idx = 0 - - for grid_thw in grid_thw_list: - num_patches = math.prod(grid_thw) - end_idx = start_idx + num_patches - - # Get patches for this image - image_patches = embeddings[start_idx:end_idx] - - # Simulate spatial merging by averaging groups of patches - merged_patches = num_patches // merge_factor - if merged_patches > 0: - # Reshape and average to simulate merging - reshaped = image_patches[:merged_patches * merge_factor].view( - merged_patches, merge_factor, -1) - merged = reshaped.mean(dim=1) - merged_embeddings.append(merged) - - start_idx = end_idx - - if merged_embeddings: - return torch.cat(merged_embeddings, dim=0) - else: - return torch.empty((0, self.out_hidden_size), - device=pixel_values.device, - dtype=pixel_values.dtype) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "batch_size", - [ - 1, # Single image - 3, # Small batch - 5, # Odd batch size (for testing padding) - ], -) -def test_run_dp_sharded_mrope_vision_model(batch_size: int): - world_size = 2 - # Launch processes - mp.spawn( - run_dp_sharded_mrope_vision_model_vs_direct, - args=( - world_size, - batch_size, - get_open_port(), - ), - nprocs=world_size, - ) - - -def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int, - world_size: int, - batch_size: int, - master_port: int): - """ - Test that run_dp_sharded_mrope_vision_model produces the same results as - calling the model directly. 
- """ - # Set random seed for reproducibility - current_platform.seed_everything(0) - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - # initialize distributed - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create test data - grid_thw_list = [] - pixel_values_list = [] - - for i in range(batch_size): - # Varying image sizes for better testing - t, h, w = 1, 4 + i, 4 + i - grid_thw_list.append([t, h, w]) - - num_patches = t * h * w - # Create random pixel values for this image - image_pixels = torch.randn(num_patches, 768) - pixel_values_list.append(image_pixels) - - # Concatenate all pixel values - pixel_values = torch.cat(pixel_values_list, dim=0) - - # Create a simple mrope vision model - vision_model = SimpleMRopeVisionModel() - - # Run the model directly on the full input (only on rank 0) - if local_rank == 0: - with torch.inference_mode(): - direct_output = vision_model(pixel_values, grid_thw_list) - - # Run the model through the sharded function - with torch.inference_mode(): - sharded_output = run_dp_sharded_mrope_vision_model(vision_model, - pixel_values, - grid_thw_list, - rope_type="rope_3d") - sharded_output = torch.cat(sharded_output, dim=0) - - # Check that the world size is set up correctly - assert get_tensor_model_parallel_world_size() == world_size - - # Compare outputs (only on rank 0) - if local_rank == 0: - # Check that the outputs have the same shape - assert direct_output.shape == sharded_output.shape - # Check that the outputs are close (they should be identical) - assert torch.allclose(direct_output, - sharded_output, - rtol=1e-5, - atol=1e-5) - - -@multi_gpu_test(num_gpus=2) -def test_run_dp_sharded_mrope_vision_model_empty_input(): - world_size = 2 - mp.spawn( - run_dp_sharded_mrope_vision_model_empty_input_worker, - args=(world_size, get_open_port()), - nprocs=world_size, - ) - - -def run_dp_sharded_mrope_vision_model_empty_input_worker( - local_rank: int, world_size: int, master_port: int): - """Test run_dp_sharded_mrope_vision_model with empty input.""" - # Set up distributed environment - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create empty inputs - pixel_values = torch.empty((0, 768)) - grid_thw_list: list[list[int]] = [] - - vision_model = SimpleMRopeVisionModel() - - # Should handle empty input gracefully - with torch.inference_mode(): - output = run_dp_sharded_mrope_vision_model(vision_model, - pixel_values, - grid_thw_list, - rope_type="rope_3d") - - assert len(output) == 0 - - -@multi_gpu_test(num_gpus=4) -def test_run_dp_sharded_mrope_vision_model_uneven_load(): - world_size = 4 - mp.spawn( - run_dp_sharded_mrope_vision_model_uneven_load_worker, - args=(world_size, get_open_port()), - nprocs=world_size, - ) - - -def run_dp_sharded_mrope_vision_model_uneven_load_worker( - local_rank: int, world_size: int, master_port: int): - 
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution.""" - # Set up distributed environment - current_platform.seed_everything(123) - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create images with very different sizes - grid_thw_list = [ - [1, 2, 2], # Small: 4 patches - [1, 8, 8], # Large: 64 patches - [1, 3, 3], # Medium: 9 patches - ] - - pixel_values_list = [] - for grid_thw in grid_thw_list: - num_patches = math.prod(grid_thw) - image_pixels = torch.randn(num_patches, 768) - pixel_values_list.append(image_pixels) - - pixel_values = torch.cat(pixel_values_list, dim=0) - vision_model = SimpleMRopeVisionModel() - - # Should handle uneven distribution without errors - with torch.inference_mode(): - output_tuple = run_dp_sharded_mrope_vision_model(vision_model, - pixel_values, - grid_thw_list, - rope_type="rope_3d") - - # Verify output shape is reasonable - merge_factor = vision_model.spatial_merge_size**2 - expected_output_patches = list( - math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list) - - for i, output in enumerate(output_tuple): - assert output.shape[0] == expected_output_patches[i] - assert output.shape[1] == vision_model.out_hidden_size - - -@pytest.mark.parametrize("spatial_merge_size", [2, 4]) -def test_simple_mrope_vision_model_spatial_merge(spatial_merge_size: int): - """Test SimpleMRopeVisionModel with different spatial merge sizes.""" - device = current_platform.device_type - - grid_thw_list = [[1, 4, 4], [1, 6, 6]] # Two images - pixel_values_list = [] - - for grid_thw in grid_thw_list: - num_patches = math.prod(grid_thw) - image_pixels = torch.randn(num_patches, 768, device=device) - pixel_values_list.append(image_pixels) - - pixel_values = torch.cat(pixel_values_list, dim=0) - vision_model = SimpleMRopeVisionModel( - spatial_merge_size=spatial_merge_size).to(device) - - with torch.inference_mode(): - output = vision_model(pixel_values, grid_thw_list) - - # Verify output dimensions based on spatial merging - total_patches = sum(math.prod(grid_thw) for grid_thw in grid_thw_list) - merge_factor = spatial_merge_size**2 - expected_output_patches = total_patches // merge_factor - - assert output.shape[0] == expected_output_patches - assert output.shape[1] == vision_model.out_hidden_size diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 56ec634386909..b088e0c0dd241 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -69,7 +69,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -83,7 +82,7 @@ from .qwen2_vl import (_create_qwen2vl_field_factory, from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from 
.vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 76737a4428232..2f0c4240413be 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -34,7 +34,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import run_dp_sharded_vision_model + +from .vision import run_dp_sharded_vision_model class Idefics2VisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 892188c047228..2c341d2839719 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -28,7 +28,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import run_dp_sharded_vision_model + +from .vision import run_dp_sharded_vision_model NORM2FN = { 'rms_norm': RMSNorm, diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index f554077935bf3..503627865c4a5 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -76,13 +76,13 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config from vllm.utils.tensor_schema import TensorSchema, TensorShape from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix +from .vision import run_dp_sharded_mrope_vision_model # For dummy input only diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 131a66b713235..50521b5937862 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -50,7 +50,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -58,6 +57,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llama4 import Llama4ForCausalLM from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) +from .vision import run_dp_sharded_vision_model class Llama4ImagePatchInputs(TensorSchema): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 73b27572a8ebd..b740e6d87b745 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -59,7 +59,6 @@ from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -74,7 +73,7 @@ from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 88813490c0fbc..472e8b061a9e1 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -66,7 +66,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -78,7 +77,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE, from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 98d65dea27393..ee6703f7229e5 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -83,7 +83,7 @@ from .qwen2_vl import Qwen2VLProcessingInfo from .qwen3 import Qwen3ForCausalLM, Qwen3Model from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1214,8 +1214,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) if self.use_data_parallel: - from vllm.multimodal.utils import ( - run_dp_sharded_mrope_vision_model) return run_dp_sharded_mrope_vision_model(self.visual, pixel_values, grid_thw_list, @@ -1245,8 +1243,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) if self.use_data_parallel: - from vllm.multimodal.utils import ( - run_dp_sharded_mrope_vision_model) return run_dp_sharded_mrope_vision_model(self.visual, pixel_values_videos, grid_thw_list, diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index f667266b77bfa..5f6ad58850439 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -31,7 +31,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) 
from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Step3VisionEncoderConfig from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -40,6 +39,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) +from .vision import run_dp_sharded_vision_model class Step3VLImagePixelInputs(TypedDict): diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 81f86db7e1875..08ad8fbeb4246 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,12 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +import math from abc import ABC, abstractmethod -from typing import Final, Generic, Optional, Protocol, TypeVar, Union +from typing import Final, Generic, Literal, Optional, Protocol, TypeVar, Union import torch from transformers import PretrainedConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform @@ -123,3 +128,277 @@ def resolve_visual_encoder_outputs( if post_layer_norm is not None and uses_last_layer: hs_pool[-1] = post_layer_norm(encoder_outputs) return torch.cat(hs_pool, dim=-1) + + +def run_dp_sharded_vision_model(image_input: torch.Tensor, + vision_model: torch.nn.Module) -> torch.Tensor: + """Run a vision model with data parallelism (DP) sharding. The function + will shard the input image tensor on the first dimension and run the vision + model + + Args: + image_input (torch.Tensor): Image input tensor. + vision_model (torch.nn.Module): Vision model. + Returns: + torch.Tensor: Output image embeddings + """ + + num_chunks = image_input.shape[0] + mp_world_size = get_tensor_model_parallel_world_size() + num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size + num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks + pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks) + image_input_padded = torch.nn.functional.pad(image_input, pad) + rank = get_tensor_model_parallel_rank() + image_input_per_rank = image_input_padded[rank * + num_chunks_per_rank:(rank + 1) * + num_chunks_per_rank, ...] + + vision_embeddings = vision_model(image_input_per_rank) + # Ensure tensor is contiguous before all_gather + vision_embeddings = vision_embeddings.contiguous() + vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings, + dim=0) + vision_embeddings = vision_embeddings[:num_chunks, ...] + return vision_embeddings + + +def get_load_balance_assignment( + sizes: list[int], + num_gpus: int = 2, +) -> tuple[list[int], list[int], list[int]]: + """ + Generate load balancing assignment and metadata + for distributing data across GPUs. + The load is determined by the total image sizes, + not the number of images. 
+ + Args: + sizes: The size of each image + num_gpus: Number of GPUs to balance across + + Returns: + shuffle_indices: + Indices to reorder data for balanced loading + gpu_sample_counts: + Number of samples assigned to each GPU + grouped_sizes_per_gpu: + Total size assigned to each GPU + + Example: + ``` + sizes = [1000, 100, 200, 50] + num_gpus=2 + ``` + + """ + + n_samples = len(sizes) + + # Handle edge cases + if n_samples == 0: + return [], [0] * num_gpus, [0] * num_gpus + + # Use greedy algorithm - balance by total size, not sample count + gpu_assignments = [list[int]() for _ in range(num_gpus)] + gpu_loads = [0] * num_gpus # This tracks total SIZE, not sample count + + # Sort indices by size (largest first for better load balancing) + # sizes = [1000, 100, 200, 50] + # large_to_small_indices = [0, 2, 1, 3] + large_to_small_indices = sorted(range(n_samples), + key=lambda i: sizes[i], + reverse=True) + + for idx in large_to_small_indices: + # Find GPU with minimum current load (by total size) + min_gpu = min(range(num_gpus), key=lambda i: gpu_loads[i]) + gpu_assignments[min_gpu].append(idx) + gpu_loads[min_gpu] += sizes[idx] + + # Create shuffle indices and counts + shuffle_indices = list[int]() + gpu_sample_counts = list[int]() + for gpu_id in range(num_gpus): + # GPU_0 = [1000] = [0] + # GPU_1 = [200, 100, 50] = [2, 1, 3] + # shuffle_indices = [0, 2, 1, 3] + shuffle_indices.extend(gpu_assignments[gpu_id]) + # GPU_0 = [1] + # GPU_1 = [3] + # gpu_sample_counts = [1, 3] + gpu_sample_counts.append(len(gpu_assignments[gpu_id])) + + return (shuffle_indices, gpu_sample_counts, gpu_loads) + + +def run_dp_sharded_mrope_vision_model( + vision_model: torch.nn.Module, + pixel_values: torch.Tensor, + grid_thw_list: list[list[int]], + *, + rope_type: Literal["rope_3d", "rope_2d"], +) -> tuple[torch.Tensor, ...]: + """Run a vision model with data parallelism (DP) sharding. + The function will shard the input image tensor on the + first dimension and run the vision model. + This function is used to run the vision model with mrope. + + Args: + vision_model (torch.nn.Module): Vision model. + pixel_values (torch.Tensor): Image/Video input tensor. + grid_thw_list: List of grid dimensions for each image + rope_type: Type of rope used in the vision model. + Different rope types have different dimension to do ViT. 
+ "rope_3d" for 3D rope (e.g., Qwen2.5-VL) + "rope_2d" for 2D rope (e.g., Kimi-VL) + Returns: + torch.Tensor: Output image embeddings + + Example: + ``` + vision_model.out_hidden_size = 64 + vision_model.spatial_merge_size = 2 + pixel_values.shape = (1350, channel) + grid_thw_list = [[1, 10, 100], [1, 10, 10], [1, 10, 20], [1, 50]] + tp_size=2 + ``` + + """ + tp_size = get_tensor_model_parallel_world_size() + + # GPU_0 tp_rank_local = 0 + # GPU_1 tp_rank_local = 1 + tp_rank_local = get_tensor_model_parallel_rank() + + # patches_per_image = [1000, 100, 200, 50] + patches_per_image = [math.prod(grid_thw) for grid_thw in grid_thw_list] + # patches_per_image = [0, 1000, 1100, 1300, 1350] + cum_patches_per_image = [0, *itertools.accumulate(patches_per_image)] + + # Get load balancing assignment with all metadata + # image_to_tp_rank = [0, 2, 1, 3] + # gpu_sample_counts = [1, 3] + # grouped_pixel_values_len = [1000, 350] + (image_to_tp_rank, gpu_sample_counts, + grouped_pixel_values_len) = get_load_balance_assignment( + patches_per_image, tp_size) + + # cu_gpu_sample_counts = [0, 1, 4] + cum_gpu_sample_counts = [0, *itertools.accumulate(gpu_sample_counts)] + + # GPU_0 image_idxs_local = [0] + # GPU_1 image_idxs_local = [2, 1, 3] + image_idxs_local = image_to_tp_rank[cum_gpu_sample_counts[tp_rank_local]: + cum_gpu_sample_counts[tp_rank_local + + 1]] + + # Get the pixel values for the local images based on the image_idxs_local + if len(image_idxs_local) > 0: + pixel_values_local = torch.cat([ + pixel_values[cum_patches_per_image[i]:cum_patches_per_image[i + 1]] + for i in image_idxs_local + ]) + else: + # Handle case where this rank has no images + pixel_values_local = torch.empty((0, pixel_values.shape[1]), + device=pixel_values.device, + dtype=pixel_values.dtype) + # embed_dim_reduction_factor = 2 * 2 + if rope_type == "rope_2d": + embed_dim_reduction_factor = (vision_model.merge_kernel_size[0] * + vision_model.merge_kernel_size[1]) + else: + embed_dim_reduction_factor = (vision_model.spatial_merge_size * + vision_model.spatial_merge_size) + + # Find the max length across all ranks + # The output embedding of every DP rank has to be + # padded to this length for tensor_model_parallel_all_gather + # to work + max_len_per_rank = max( + grouped_pixel_values_len) // embed_dim_reduction_factor + local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] + + # Run the vision model on the local pixel_values_local + if rope_type == "rope_2d": + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model( + pixel_values_local, torch.tensor(local_grid_thw_list)) + if isinstance(image_embeds_local, list): + image_embeds_local = torch.cat(image_embeds_local, dim=0) + else: + out_dim = getattr(vision_model.config, "hidden_size", None) + image_embeds_local = torch.empty( + (0, embed_dim_reduction_factor, out_dim), + device=pixel_values.device, + dtype=pixel_values.dtype) + else: + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model(pixel_values_local, + local_grid_thw_list) + else: + # Handle empty case + image_embeds_local = torch.empty((0, vision_model.out_hidden_size), + device=pixel_values.device, + dtype=pixel_values.dtype) + + # Pad the output based on max_len_per_rank + # for tensor_model_parallel_all_gather to work + current_len = image_embeds_local.shape[0] + if current_len < max_len_per_rank: + padding_size = max_len_per_rank - current_len + if rope_type == "rope_2d": + padding = torch.empty((padding_size, image_embeds_local.shape[1], + 
image_embeds_local.shape[2]), + dtype=image_embeds_local.dtype, + device=image_embeds_local.device) + else: + padding = torch.empty((padding_size, image_embeds_local.shape[1]), + dtype=image_embeds_local.dtype, + device=image_embeds_local.device) + image_embeds_local_padded = torch.cat([image_embeds_local, padding], + dim=0) + else: + image_embeds_local_padded = image_embeds_local + + # Do all_gather to collect embeddings from all ranks + gathered_embeds = tensor_model_parallel_all_gather( + image_embeds_local_padded, dim=0) + + # Remove padding and reconstruct per-rank embeddings + rank_embeddings = list[torch.Tensor]() + for rank in range(tp_size): + start_idx = rank * max_len_per_rank + end_idx = start_idx + (grouped_pixel_values_len[rank] // + embed_dim_reduction_factor) + rank_embeddings.append(gathered_embeds[start_idx:end_idx]) + + patches_per_output_image = [(patch_size // embed_dim_reduction_factor) + for patch_size in patches_per_image] + + # Reconstruct embeddings in the original order + original_order_embeddings = [None] * len(grid_thw_list) + current_idx = 0 + for rank in range(tp_size): + count = gpu_sample_counts[rank] + if count > 0: + # Get images assigned to this rank in shuffled order + # GPU_0 = image_idxs_local [0] + # GPU_1 = image_idxs_local [2, 1, 3] + rank_images = image_to_tp_rank[current_idx:current_idx + count] + + rank_embed = rank_embeddings[rank] + # Split rank embeddings back to individual images + embed_start = 0 + for img_idx in rank_images: + img_patches = patches_per_output_image[img_idx] + original_order_embeddings[img_idx] = rank_embed[ + embed_start:embed_start + img_patches] + embed_start += img_patches + current_idx += count + out_embeddings = tuple(embed for embed in original_order_embeddings + if embed is not None) + assert len(out_embeddings) == len( + original_order_embeddings), "Found unassigned embeddings" + return out_embeddings diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f4e2ed72e2d7d..0f8aeceb39448 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,13 +3,11 @@ import asyncio import atexit -import itertools -import math from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse from urllib.request import url2pathname @@ -21,9 +19,6 @@ from typing_extensions import deprecated import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather) from .audio import AudioMediaIO from .base import MediaIO @@ -33,12 +28,10 @@ from .video import VideoMediaIO _M = TypeVar("_M") if TYPE_CHECKING: - from .inputs import (BatchedTensorInputs, MultiModalKwargs, - MultiModalKwargsItem, MultiModalKwargsItems, - MultiModalPlaceholderDict) + from .inputs import (BatchedTensorInputs, MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalPlaceholderDict) else: BatchedTensorInputs = Any - MultiModalKwargs = Any MultiModalKwargsItem = Any MultiModalKwargsItems = Any MultiModalPlaceholderDict = Any @@ -93,7 +86,7 @@ class MediaConnector: self, url_spec: ParseResult, media_io: MediaIO[_M], - ) -> _M: + ) -> _M: # type: ignore[type-var] data_spec, data = url_spec.path.split(",", 1) 
media_type, data_type = data_spec.split(";", 1) @@ -107,7 +100,7 @@ class MediaConnector: self, url_spec: ParseResult, media_io: MediaIO[_M], - ) -> _M: + ) -> _M: # type: ignore[type-var] allowed_local_media_path = self.allowed_local_media_path if allowed_local_media_path is None: raise RuntimeError("Cannot load local files without " @@ -127,7 +120,7 @@ class MediaConnector: media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, - ) -> _M: + ) -> _M: # type: ignore[type-var] url_spec = urlparse(url) if url_spec.scheme.startswith("http"): @@ -434,280 +427,6 @@ def group_mm_kwargs_by_modality( yield modality, len(items_lst), mm_kwargs_group -def run_dp_sharded_vision_model(image_input: torch.Tensor, - vision_model: torch.nn.Module) -> torch.Tensor: - """Run a vision model with data parallelism (DP) sharding. The function - will shard the input image tensor on the first dimension and run the vision - model - - Args: - image_input (torch.Tensor): Image input tensor. - vision_model (torch.nn.Module): Vision model. - Returns: - torch.Tensor: Output image embeddings - """ - - num_chunks = image_input.shape[0] - mp_world_size = get_tensor_model_parallel_world_size() - num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size - num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks - pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks) - image_input_padded = torch.nn.functional.pad(image_input, pad) - rank = get_tensor_model_parallel_rank() - image_input_per_rank = image_input_padded[rank * - num_chunks_per_rank:(rank + 1) * - num_chunks_per_rank, ...] - - vision_embeddings = vision_model(image_input_per_rank) - # Ensure tensor is contiguous before all_gather - vision_embeddings = vision_embeddings.contiguous() - vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings, - dim=0) - vision_embeddings = vision_embeddings[:num_chunks, ...] - return vision_embeddings - - -def get_load_balance_assignment( - sizes: list[int], - num_gpus: int = 2, -) -> tuple[list[int], list[int], list[int]]: - """ - Generate load balancing assignment and metadata - for distributing data across GPUs. - The load is determined by the total image sizes, - not the number of images. 
- - Args: - sizes: The size of each image - num_gpus: Number of GPUs to balance across - - Returns: - shuffle_indices: - Indices to reorder data for balanced loading - gpu_sample_counts: - Number of samples assigned to each GPU - grouped_sizes_per_gpu: - Total size assigned to each GPU - - Example: - ``` - sizes = [1000, 100, 200, 50] - num_gpus=2 - ``` - - """ - - n_samples = len(sizes) - - # Handle edge cases - if n_samples == 0: - return [], [0] * num_gpus, [0] * num_gpus - - # Use greedy algorithm - balance by total size, not sample count - gpu_assignments = [list[int]() for _ in range(num_gpus)] - gpu_loads = [0] * num_gpus # This tracks total SIZE, not sample count - - # Sort indices by size (largest first for better load balancing) - # sizes = [1000, 100, 200, 50] - # large_to_small_indices = [0, 2, 1, 3] - large_to_small_indices = sorted(range(n_samples), - key=lambda i: sizes[i], - reverse=True) - - for idx in large_to_small_indices: - # Find GPU with minimum current load (by total size) - min_gpu = min(range(num_gpus), key=lambda i: gpu_loads[i]) - gpu_assignments[min_gpu].append(idx) - gpu_loads[min_gpu] += sizes[idx] - - # Create shuffle indices and counts - shuffle_indices = list[int]() - gpu_sample_counts = list[int]() - for gpu_id in range(num_gpus): - # GPU_0 = [1000] = [0] - # GPU_1 = [200, 100, 50] = [2, 1, 3] - # shuffle_indices = [0, 2, 1, 3] - shuffle_indices.extend(gpu_assignments[gpu_id]) - # GPU_0 = [1] - # GPU_1 = [3] - # gpu_sample_counts = [1, 3] - gpu_sample_counts.append(len(gpu_assignments[gpu_id])) - - return (shuffle_indices, gpu_sample_counts, gpu_loads) - - -def run_dp_sharded_mrope_vision_model( - vision_model: torch.nn.Module, - pixel_values: torch.Tensor, - grid_thw_list: list[list[int]], - *, - rope_type: Literal["rope_3d", "rope_2d"], -) -> tuple[torch.Tensor, ...]: - """Run a vision model with data parallelism (DP) sharding. - The function will shard the input image tensor on the - first dimension and run the vision model. - This function is used to run the vision model with mrope. - - Args: - vision_model (torch.nn.Module): Vision model. - pixel_values (torch.Tensor): Image/Video input tensor. - grid_thw_list: List of grid dimensions for each image - rope_type: Type of rope used in the vision model. - Different rope types have different dimension to do ViT. 
- "rope_3d" for 3D rope (e.g., Qwen2.5-VL) - "rope_2d" for 2D rope (e.g., Kimi-VL) - Returns: - torch.Tensor: Output image embeddings - - Example: - ``` - vision_model.out_hidden_size = 64 - vision_model.spatial_merge_size = 2 - pixel_values.shape = (1350, channel) - grid_thw_list = [[1, 10, 100], [1, 10, 10], [1, 10, 20], [1, 50]] - tp_size=2 - ``` - - """ - tp_size = get_tensor_model_parallel_world_size() - - # GPU_0 tp_rank_local = 0 - # GPU_1 tp_rank_local = 1 - tp_rank_local = get_tensor_model_parallel_rank() - - # patches_per_image = [1000, 100, 200, 50] - patches_per_image = [math.prod(grid_thw) for grid_thw in grid_thw_list] - # patches_per_image = [0, 1000, 1100, 1300, 1350] - cum_patches_per_image = [0, *itertools.accumulate(patches_per_image)] - - # Get load balancing assignment with all metadata - # image_to_tp_rank = [0, 2, 1, 3] - # gpu_sample_counts = [1, 3] - # grouped_pixel_values_len = [1000, 350] - (image_to_tp_rank, gpu_sample_counts, - grouped_pixel_values_len) = get_load_balance_assignment( - patches_per_image, tp_size) - - # cu_gpu_sample_counts = [0, 1, 4] - cum_gpu_sample_counts = [0, *itertools.accumulate(gpu_sample_counts)] - - # GPU_0 image_idxs_local = [0] - # GPU_1 image_idxs_local = [2, 1, 3] - image_idxs_local = image_to_tp_rank[cum_gpu_sample_counts[tp_rank_local]: - cum_gpu_sample_counts[tp_rank_local + - 1]] - - # Get the pixel values for the local images based on the image_idxs_local - if len(image_idxs_local) > 0: - pixel_values_local = torch.cat([ - pixel_values[cum_patches_per_image[i]:cum_patches_per_image[i + 1]] - for i in image_idxs_local - ]) - else: - # Handle case where this rank has no images - pixel_values_local = torch.empty((0, pixel_values.shape[1]), - device=pixel_values.device, - dtype=pixel_values.dtype) - # embed_dim_reduction_factor = 2 * 2 - if rope_type == "rope_2d": - embed_dim_reduction_factor = (vision_model.merge_kernel_size[0] * - vision_model.merge_kernel_size[1]) - else: - embed_dim_reduction_factor = (vision_model.spatial_merge_size * - vision_model.spatial_merge_size) - - # Find the max length across all ranks - # The output embedding of every DP rank has to be - # padded to this length for tensor_model_parallel_all_gather - # to work - max_len_per_rank = max( - grouped_pixel_values_len) // embed_dim_reduction_factor - local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] - - # Run the vision model on the local pixel_values_local - if rope_type == "rope_2d": - if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model( - pixel_values_local, torch.tensor(local_grid_thw_list)) - if isinstance(image_embeds_local, list): - image_embeds_local = torch.cat(image_embeds_local, dim=0) - else: - out_dim = getattr(vision_model.config, "hidden_size", None) - image_embeds_local = torch.empty( - (0, embed_dim_reduction_factor, out_dim), - device=pixel_values.device, - dtype=pixel_values.dtype) - else: - if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model(pixel_values_local, - local_grid_thw_list) - else: - # Handle empty case - image_embeds_local = torch.empty((0, vision_model.out_hidden_size), - device=pixel_values.device, - dtype=pixel_values.dtype) - - # Pad the output based on max_len_per_rank - # for tensor_model_parallel_all_gather to work - current_len = image_embeds_local.shape[0] - if current_len < max_len_per_rank: - padding_size = max_len_per_rank - current_len - if rope_type == "rope_2d": - padding = torch.empty((padding_size, image_embeds_local.shape[1], - 
image_embeds_local.shape[2]), - dtype=image_embeds_local.dtype, - device=image_embeds_local.device) - else: - padding = torch.empty((padding_size, image_embeds_local.shape[1]), - dtype=image_embeds_local.dtype, - device=image_embeds_local.device) - image_embeds_local_padded = torch.cat([image_embeds_local, padding], - dim=0) - else: - image_embeds_local_padded = image_embeds_local - - # Do all_gather to collect embeddings from all ranks - gathered_embeds = tensor_model_parallel_all_gather( - image_embeds_local_padded, dim=0) - - # Remove padding and reconstruct per-rank embeddings - rank_embeddings = list[torch.Tensor]() - for rank in range(tp_size): - start_idx = rank * max_len_per_rank - end_idx = start_idx + (grouped_pixel_values_len[rank] // - embed_dim_reduction_factor) - rank_embeddings.append(gathered_embeds[start_idx:end_idx]) - - patches_per_output_image = [(patch_size // embed_dim_reduction_factor) - for patch_size in patches_per_image] - - # Reconstruct embeddings in the original order - original_order_embeddings = [None] * len(grid_thw_list) - current_idx = 0 - for rank in range(tp_size): - count = gpu_sample_counts[rank] - if count > 0: - # Get images assigned to this rank in shuffled order - # GPU_0 = image_idxs_local [0] - # GPU_1 = image_idxs_local [2, 1, 3] - rank_images = image_to_tp_rank[current_idx:current_idx + count] - - rank_embed = rank_embeddings[rank] - # Split rank embeddings back to individual images - embed_start = 0 - for img_idx in rank_images: - img_patches = patches_per_output_image[img_idx] - original_order_embeddings[img_idx] = rank_embed[ - embed_start:embed_start + img_patches] - embed_start += img_patches - current_idx += count - out_embeddings = tuple(embed for embed in original_order_embeddings - if embed is not None) - assert len(out_embeddings) == len( - original_order_embeddings), "Found unassigned embeddings" - return out_embeddings - - def fetch_audio( audio_url: str, audio_io_kwargs: Optional[dict[str, Any]] = None, From 4322c553a64c3a6c95a39216f31b022eda04a5ad Mon Sep 17 00:00:00 2001 From: Andreas Hartel Date: Tue, 23 Sep 2025 11:56:31 +0200 Subject: [PATCH 130/185] [Test]: Hermes tool parser stream output error in Qwen3 case (#25203) Signed-off-by: Andreas Hartel --- .../tool_parsers/test_hermes_tool_parser.py | 209 +++++++++++++++++- 1 file changed, 203 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index e0e6b2c07e171..1da06be2eba92 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -5,6 +5,11 @@ import json import pytest +from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import ( + Hermes2ProToolParser) +from vllm.transformers_utils.tokenizer import AnyTokenizer + from ....utils import RemoteOpenAIServer MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" @@ -37,7 +42,7 @@ TOOLS = [{ }, "unit": { "type": "string", - "enum": ["celsius", "fahrenheit"] + "enum": ["celsius", "fahrenheit"], }, }, "required": ["location"], @@ -75,7 +80,7 @@ PRODUCT_MESSAGES = [{ "user", "content": "Hi! Do you have any detailed information about the product id " - "7355608 and inserted true?" 
+ "7355608 and inserted true?", }] @@ -144,8 +149,8 @@ async def test_streaming_tool_call(): if tool_chunk.function.name: tool_call_chunks[index]["name"] += tool_chunk.function.name if tool_chunk.function.arguments: - tool_call_chunks[index][ - "arguments"] += tool_chunk.function.arguments + tool_call_chunks[index]["arguments"] += ( + tool_chunk.function.arguments) assert len(tool_call_chunks) == 1 reconstructed_tool_call = tool_call_chunks[0] @@ -234,8 +239,8 @@ async def test_streaming_product_tool_call(): if tool_chunk.function.name: tool_call_chunks[index]["name"] += tool_chunk.function.name if tool_chunk.function.arguments: - tool_call_chunks[index][ - "arguments"] += tool_chunk.function.arguments + tool_call_chunks[index]["arguments"] += ( + tool_chunk.function.arguments) assert len(tool_call_chunks) == 1 reconstructed_tool_call = tool_call_chunks[0] @@ -258,3 +263,195 @@ async def test_streaming_product_tool_call(): print("\n[Streaming Product Test Passed]") print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") print(f"Reconstructed Arguments: {arguments}") + + +@pytest.fixture +def qwen_tokenizer() -> AnyTokenizer: + from vllm.transformers_utils.tokenizer import get_tokenizer + + return get_tokenizer("Qwen/Qwen3-32B") + + +@pytest.fixture +def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser: + return Hermes2ProToolParser(qwen_tokenizer) + + +@pytest.fixture +def any_chat_request() -> ChatCompletionRequest: + return ChatCompletionRequest( + seed=42, + model="Qwen/Qwen3-32B", + messages=[], + ) + + +def test_hermes_parser_streaming_just_forward_text( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = ( + """This is some prior text that has nothing to do with tool calling.""" + ) + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + delta_text = qwen_tokenizer.decode([token]) + current_text = previous_text + delta_text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + delta_messages.append(delta) + + for delta in delta_messages: + assert delta is not None + assert not delta.tool_calls + + print(delta_messages) + assert "".join([delta.content for delta in delta_messages]) == text + + +def test_hermes_parser_streaming_failure_case_bug_19056( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """ +{"name": "final_answer", "arguments": {"trigger": true}} +""" + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + text = qwen_tokenizer.decode([token]) + current_text = previous_text + text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + if delta is not None: + delta_messages.append(delta) + + assert delta_messages[0].tool_calls[0].function.name == "final_answer" + tool_call_args = "".join(delta.tool_calls[0].function.arguments or "" + for delta in delta_messages) + assert tool_call_args == '{"trigger": true}' + + +def test_hermes_parser_streaming( + 
qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = '\ +{"name": "get_current_temperature",\ +"arguments": {"location":\ +"San Francisco, California, United States", "unit": "celsius"}}\ +' + + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + text = qwen_tokenizer.decode([token]) + current_text = previous_text + text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + if delta is not None: + delta_messages.append(delta) + print(delta_messages) + assert (delta_messages[0].tool_calls[0].function.name == + "get_current_temperature") + tool_call_args = "".join(delta.tool_calls[0].function.arguments or "" + for delta in delta_messages) + assert tool_call_args == ( + '{"location":"San Francisco, California, United States", ' + '"unit": "celsius"}') + + +def test_hermes_parser_non_streaming_no_tool_call( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """This is not a tool call.""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert not tool_call.tools_called + + +def test_hermes_parser_non_streaming_tool_call_between_tags( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """ +{"name": "final_answer", "arguments": {"trigger": true}} +""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert tool_call.tools_called + assert tool_call.tool_calls[0].function.name == "final_answer" + assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' + + +def test_hermes_parser_non_streaming_tool_call_until_eos( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """ +{"name": "final_answer", "arguments": {"trigger": true}}""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert tool_call.tools_called + assert tool_call.tool_calls[0].function.name == "final_answer" + assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' + + +def test_hermes_parser_non_streaming_tool_call_invalid_json( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + # Missing closing brace to trigger exception + text = """ +{"name": "final_answer", "arguments": {"trigger": true}""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert not tool_call.tools_called From 231c2c63e4decd0cbf863690dfffe88e1d97a003 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 23 Sep 2025 18:06:48 +0800 Subject: [PATCH 131/185] [Bugfix] Fix idefics3 `tie_word_embeddings` (#25454) Signed-off-by: Isotr0py --- vllm/model_executor/models/idefics3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 18446d126b51a..79e130119ae83 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -608,7 
+608,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.text_config.tie_word_embeddings: - self.lm_head.weight = self.model.text_model.wte.weight + self.lm_head.weight = self.model.text_model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.text_config.vocab_size) def _parse_and_validate_image_input( From 273690a50ac2a5fa79fa7acc5077e49aa1af427e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 23 Sep 2025 18:19:45 +0800 Subject: [PATCH 132/185] [Core] Optimize LoRA weight loading (#25403) Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 26 ++++---- tests/lora/test_lora_manager.py | 12 ++-- tests/lora/utils.py | 8 +-- vllm/lora/layers/base_linear.py | 10 +-- vllm/lora/layers/column_parallel_linear.py | 67 ++++++++++---------- vllm/lora/layers/logits_processor.py | 8 +-- vllm/lora/layers/row_parallel_linear.py | 4 +- vllm/lora/layers/vocal_parallel_embedding.py | 10 +-- vllm/lora/lora_weights.py | 4 +- vllm/lora/models.py | 17 ++--- 10 files changed, 83 insertions(+), 83 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 6735b7cd9e436..ced0afc50cb91 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -164,8 +164,8 @@ def populate_loras( weight=layer_weights, generate_embeddings_tensor=generate_embeddings_tensor, ) - sublora.lora_b = sublora.lora_b[:, (sublora_len * - i):(sublora_len * (i + 1))] + sublora.lora_b = sublora.lora_b[(sublora_len * + i):(sublora_len * (i + 1)), :] sublora.optimize() subloras.append(sublora) @@ -304,9 +304,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: result = embedding(input_) after_a = F.embedding( input_, - lora.lora_a, + lora.lora_a.T, ) - result += (after_a @ lora.lora_b) + result += (after_a @ lora.lora_b.T) expected_results.append(result) expected_result = torch.cat(expected_results) @@ -445,9 +445,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, result = expanded_embedding(input_) after_a = F.embedding( original_input_, - lora.lora_a, + lora.lora_a.T, ) - result += (after_a @ lora.lora_b) + result += (after_a @ lora.lora_b.T) expected_results.append(result) expected_result = torch.cat(expected_results) @@ -575,7 +575,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, lm_head=linear, embedding_bias=None) result[:, vocab_size + embeddings_tensor_len:] = float("-inf") - result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) logits_processor.org_vocab_size = vocab_size @@ -692,9 +692,10 @@ def test_linear_replicated( expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): + lora = lora_dict[lora_id] result = linear(input_)[0] - result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) @@ -817,7 +818,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] - result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = 
torch.cat(expected_results) @@ -965,9 +966,10 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, result = linear(input_)[0] subloras = sublora_dict[lora_id] for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * - (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * - sublora.scaling) + result[:, sublora.lora_b.shape[0] * i:sublora.lora_b.shape[0] * + (i + 1)] += ( + input_ @ sublora.lora_a.T @ sublora.lora_b.T * + sublora.scaling) expected_results.append(result) expected_result = torch.cat(expected_results) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index d7684fbf34abb..6f0a852314081 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -63,9 +63,9 @@ def test_from_lora_tensors(sql_lora_files, device): assert lora.lora_b is not None assert lora.lora_a.device == torch.device(device) assert lora.lora_b.device == torch.device(device) - assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] + assert (lora.lora_a.shape[0] == lora.lora_b.shape[1] ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" - assert lora.lora_a.shape[1] == 8 + assert lora.lora_a.shape[0] == 8 embeddings_module = next( (k for k in EMBEDDING_MODULES if k in module_name), None) if embeddings_module: @@ -86,8 +86,8 @@ def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str], name, 8, 16, - torch.rand([w.shape[1], 8], device=device), - torch.rand([8, w.shape[0]], device=device), + torch.rand([8, w.shape[1]], device=device), + torch.rand([w.shape[0], 8], device=device), ) return LoRAModel(lora_id, 8, loras) @@ -109,8 +109,8 @@ def create_packed_lora( replaced_module_name, 8, 16, - torch.rand([w.shape[1], 8], device=device), - torch.rand([8, w.shape[0] // len(replaced_module_names)], + torch.rand([8, w.shape[1]], device=device), + torch.rand([w.shape[0] // len(replaced_module_names), 8], device=device), ) return LoRAModel(lora_id, 8, loras) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index ab475904d4938..0432a1a9bba07 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -36,10 +36,10 @@ class DummyLoRAManager: module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([weight.shape[1], rank], + lora_a=torch.rand([rank, weight.shape[1]], dtype=weight.dtype, device=self._device), - lora_b=torch.rand([rank, weight.shape[0]], + lora_b=torch.rand([weight.shape[0], rank], dtype=weight.dtype, device=self._device), ) @@ -67,8 +67,8 @@ class DummyLoRAManager: module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([input_dim, rank], device="cuda"), - lora_b=torch.rand([rank, output_dim], device="cuda"), + lora_a=torch.rand([rank, input_dim], device="cuda"), + lora_b=torch.rand([output_dim, input_dim], device="cuda"), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index 85a1f86ce6bf2..6cf5815ef12da 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -121,18 +121,18 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): lora_bias = self.slice_bias(lora_bias) self.lora_a_stacked[0][index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) + 0, :lora_a.shape[0], :lora_a.shape[1]].copy_( + lora_a, non_blocking=True) self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + 0, :lora_b.shape[0], :lora_b.shape[1]].copy_( + lora_b, non_blocking=True) 
if lora_bias is not None: self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked) assert len(self.lora_bias_stacked) self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( - lora_bias.T, non_blocking=True) + lora_bias, non_blocking=True) def apply(self, x: torch.Tensor, diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 658fd23165da0..fa4eb272a69fe 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -99,13 +99,13 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): if self.is_merged_col_linear: tp_rank = get_tensor_model_parallel_rank() shard_size = self.output_size // 2 - offset = lora_b.shape[-1] // 2 + offset = lora_b.shape[0] // 2 - left_weight = lora_b[:, tp_rank * shard_size:(tp_rank + 1) * - shard_size] - right_weight = lora_b[:, offset + tp_rank * shard_size:offset + - (tp_rank + 1) * shard_size] - lora_b = torch.cat([left_weight, right_weight], dim=1) + left_weight = lora_b[tp_rank * shard_size:(tp_rank + 1) * + shard_size, :] + right_weight = lora_b[offset + tp_rank * shard_size:offset + + (tp_rank + 1) * shard_size, :] + lora_b = torch.cat([left_weight, right_weight], dim=0) # Applicable to cases where the base_layer is # ColumnParallelLinear. else: @@ -113,7 +113,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): shard_size = self.output_size start_idx = tensor_model_parallel_rank * shard_size end_idx = (tensor_model_parallel_rank + 1) * shard_size - lora_b = lora_b[:, start_idx:end_idx] + lora_b = lora_b[start_idx:end_idx, :] return lora_b def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: @@ -251,9 +251,8 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): for i, (shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (lora_b_i := lora_b[i]) is not None: - sliced_lora_b[i] = lora_b_i[:, - shard_size * shard_id:shard_size * - (shard_id + 1)] + sliced_lora_b[i] = lora_b_i[shard_size * shard_id:shard_size * + (shard_id + 1), :] return sliced_lora_b def slice_bias( @@ -285,12 +284,12 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): for i in range(self.n_slices): if (lora_a_i := lora_a[i]) is not None: self.lora_a_stacked[i][ - index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_( - lora_a_i.T, non_blocking=True) + index, 0, :lora_a_i.shape[0], :lora_a_i.shape[1]].copy_( + lora_a_i, non_blocking=True) if (lora_b_i := lora_b[i]) is not None: self.lora_b_stacked[i][ - index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_( - lora_b_i.T, non_blocking=True) + index, 0, :lora_b_i.shape[0], :lora_b_i.shape[1]].copy_( + lora_b_i, non_blocking=True) if lora_bias is not None: self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], @@ -299,7 +298,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): if (lora_bias_i := lora_bias[i]) is not None: self.lora_bias_stacked[i][index, 0, :lora_bias_i.shape[0]].copy_( - lora_bias_i.T, + lora_bias_i, non_blocking=True) @classmethod @@ -345,18 +344,18 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): tp_rank = get_tensor_model_parallel_rank() self.q_shard_id = tp_rank self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas - lora_b_q = lora_b[:, self.q_proj_shard_size * + lora_b_q = lora_b[self.q_proj_shard_size * self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] + (self.q_shard_id + 1), :] k_offset = self.q_proj_total_size - 
lora_b_k = lora_b[:, k_offset + + lora_b_k = lora_b[k_offset + self.kv_proj_shard_size * self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] + self.kv_proj_shard_size * (self.kv_shard_id + 1), :] v_offset = k_offset + self.kv_proj_total_size - lora_b_v = lora_b[:, v_offset + + lora_b_v = lora_b[v_offset + self.kv_proj_shard_size * self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + self.kv_proj_shard_size * (self.kv_shard_id + 1), :] + lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=0) return lora_b def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: @@ -465,7 +464,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): tp_rank = get_tensor_model_parallel_rank() shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size - lora_a = lora_a[:, start_idx:start_idx + shard_size] + lora_a = lora_a[start_idx:start_idx + shard_size, :] return lora_a def apply(self, @@ -508,10 +507,10 @@ class MergedColumnParallelLinearWithShardedLoRA( output_shard_size = self.lora_a_stacked[0].shape[2] output_start_idx = self.tp_rank * output_shard_size lora_a = [ - lora_a[0][:, output_start_idx:output_start_idx + - output_shard_size] if lora_a[0] is not None else None, - lora_a[1][:, output_start_idx:output_start_idx + - output_shard_size] if lora_a[1] is not None else None, + lora_a[0][output_start_idx:output_start_idx + + output_shard_size, :] if lora_a[0] is not None else None, + lora_a[1][output_start_idx:output_start_idx + + output_shard_size, :] if lora_a[1] is not None else None, ] return lora_a @@ -551,7 +550,7 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): tp_rank = get_tensor_model_parallel_rank() shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size - lora_a = lora_a[:, start_idx:start_idx + shard_size] + lora_a = lora_a[start_idx:start_idx + shard_size, :] return lora_a def apply(self, @@ -589,12 +588,12 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] start_idx = [self.tp_rank * shard_size[i] for i in range(3)] lora_a = [ - lora_a[0][:, start_idx[0]:start_idx[0] + - shard_size[0]] if lora_a[0] is not None else None, - lora_a[1][:, start_idx[1]:start_idx[1] + - shard_size[1]] if lora_a[1] is not None else None, - lora_a[2][:, start_idx[2]:start_idx[2] + - shard_size[2]] if lora_a[2] is not None else None, + lora_a[0][start_idx[0]:start_idx[0] + + shard_size[0], :] if lora_a[0] is not None else None, + lora_a[1][start_idx[1]:start_idx[1] + + shard_size[1], :] if lora_a[1] is not None else None, + lora_a[2][start_idx[2]:start_idx[2] + + shard_size[2], :] if lora_a[2] is not None else None, ] return lora_a diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index a50dcfa748f2f..b8fbad3a4af01 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -140,11 +140,11 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): ): self.reset_lora(index) self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) + 0, :lora_a.shape[0], :lora_a.shape[1]].copy_( + lora_a, non_blocking=True) self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + 0, :lora_b.shape[0], :lora_b.shape[1]].copy_( + lora_b, non_blocking=True) if 
embeddings_tensor is not None: self.embeddings_tensors[ index, diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 18ef6fd1ddd78..cac2c92136dca 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -39,7 +39,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): shard_size = self.input_size start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - lora_a = lora_a[start_idx:end_idx, :] + lora_a = lora_a[:,start_idx:end_idx] return lora_a def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: @@ -122,7 +122,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): shard_size = self.lora_b_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - lora_b = lora_b[:, start_idx:end_idx] + lora_b = lora_b[ start_idx:end_idx,:] return lora_b def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index 4d6218d970977..ca01c7e17fff4 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -95,11 +95,13 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( - lora_a, non_blocking=True) + # NOTE self.lora_a_stacked is row-major, and lora_a is col-major, + # so we need transpose here + self.lora_a_stacked[index, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + 0, :lora_b.shape[0], :lora_b.shape[1]].copy_( + lora_b, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index 958364fca592f..e3198fb3d3ae4 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -86,11 +86,11 @@ class LoRALayerWeights: embeddings_tensor_dim: Optional[int] = None, bias_enabled: Optional[bool] = False) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() - lora_a = torch.zeros([input_dim, rank], + lora_a = torch.zeros([rank, input_dim], dtype=dtype, device=device, pin_memory=pin_memory) - lora_b = torch.zeros([rank, output_dim], + lora_b = torch.zeros([output_dim, rank], dtype=dtype, device=device, pin_memory=pin_memory) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 9ea46be65cff3..cc64cc78affa7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -152,30 +152,29 @@ class LoRAModel: module_name, peft_helper, lora_embeddings_tensor) if is_bias: - loras[module_name].bias = tensor.to(device=device, - dtype=dtype).t() - bias = tensor.to(device=device, dtype=dtype).t() + loras[module_name].bias = tensor.to(device=device, dtype=dtype) + bias = tensor.to(device=device, dtype=dtype) if pin_memory: bias = bias.pin_memory() loras[module_name].bias = bias elif is_lora_a: loras[module_name].lora_a = tensor.to(device=device, - dtype=dtype).t() + dtype=dtype) if pin_memory: loras[module_name].lora_a = loras[ module_name].lora_a.pin_memory() else: loras[module_name].lora_b = tensor.to(device=device, - dtype=dtype).t() + dtype=dtype) assert embedding_padding_modules is not None if any(name in module_name for name in embedding_padding_modules ) and 
target_embedding_padding is not None: lora_b = loras[module_name].lora_b - assert target_embedding_padding >= lora_b.shape[1] - addition = target_embedding_padding - lora_b.shape[1] + assert target_embedding_padding >= lora_b.shape[0] + addition = target_embedding_padding - lora_b.shape[0] loras[module_name].lora_b = torch.nn.functional.pad( - lora_b, (0, addition)) + lora_b, (0, 0, 0, addition)) if pin_memory: loras[module_name].lora_b = loras[ module_name].lora_b.pin_memory() @@ -585,7 +584,6 @@ class LoRAModelManager: "cpu", bias_enabled=bias_enabled, ) - lora.optimize() else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] @@ -600,7 +598,6 @@ class LoRAModelManager: "cpu", bias_enabled=bias_enabled, ) - lora.optimize() subloras.append(lora) lora = PackedLoRALayerWeights.pack(subloras) model.loras[module_name] = lora From 0d9fe260dda994646b1e74f424b2c5e32190a78f Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 23 Sep 2025 21:05:11 +0800 Subject: [PATCH 133/185] [docs] Benchmark Serving Incorrect Arg (#25474) Signed-off-by: vllmellm --- docs/contributing/benchmarks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 2a03ce1dffd63..a97d1fa6a3a55 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -680,7 +680,7 @@ vllm bench serve \ --save-result \ --result-dir ~/vllm_benchmark_results \ --save-detailed \ - --endpoint /v1/chat/completion + --endpoint /v1/chat/completions ``` ##### Videos (ShareGPT4Video) @@ -707,7 +707,7 @@ vllm bench serve \ --save-result \ --result-dir ~/vllm_benchmark_results \ --save-detailed \ - --endpoint /v1/chat/completion + --endpoint /v1/chat/completions ``` ##### Synthetic Random Images (random-mm) From b6a136b58c10dde773045c471db762ed09fc7468 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 23 Sep 2025 21:05:46 +0800 Subject: [PATCH 134/185] [CI/Build] Fix disabled v1 attention backend selection test (#25471) Signed-off-by: Isotr0py --- tests/kernels/attention/test_attention_selector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 38ab40f88ae0b..a4e200775c09d 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -67,7 +67,6 @@ def generate_params(): return params -@pytest.mark.skip(reason="Skipped for now. 
Should be revisited.") @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) def test_env( @@ -189,7 +188,7 @@ def test_env( # FlashMLA only supports block_size == 64 pytest.skip("FlashMLA only supports block_size 64") else: - from vllm.attention.backends.flashmla import ( + from vllm.v1.attention.backends.mla.flashmla import ( # noqa: E501 is_flashmla_supported) is_supported, _ = is_flashmla_supported() if not is_supported: From 61d1b35561c07589331fedf5eba58eaaedcd4423 Mon Sep 17 00:00:00 2001 From: Joel Date: Tue, 23 Sep 2025 21:49:13 +0800 Subject: [PATCH 135/185] [BugFix] Register expert_map as named buffer for wake_up and sleep (#25458) Signed-off-by: wuxibin Signed-off-by: youkaichao Co-authored-by: youkaichao --- vllm/model_executor/layers/fused_moe/layer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 1f80e972b7f06..71cc2bcf174dd 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -972,12 +972,15 @@ class FusedMoE(CustomOp): "experts. Falling back to linear expert placement.") expert_placement_strategy = "linear" - self.local_num_experts, self.expert_map = determine_expert_map( + self.expert_map: Optional[torch.Tensor] + local_num_experts, expert_map = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts, expert_placement_strategy=expert_placement_strategy, ) + self.local_num_experts = local_num_experts + self.register_buffer("expert_map", expert_map) logger.info_once( "[EP Rank %s/%s] Expert parallelism is enabled. Expert " "placement strategy: %s. Local/global" @@ -1154,10 +1157,12 @@ class FusedMoE(CustomOp): # ep_size and ep_rank should already be updated assert self.expert_map is not None with self.expert_map.device: - self.local_num_experts, self.expert_map = determine_expert_map( + local_num_experts, expert_map = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts) + self.local_num_experts = local_num_experts + self.register_buffer("expert_map", expert_map) def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, From f05a4f0e345bbfd4a7cb3f421bd9412e1cc53e74 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 23 Sep 2025 22:08:02 +0800 Subject: [PATCH 136/185] [P/D] Support NIXL connector to disconnect during a clean shutdown (#24423) Signed-off-by: chaunceyjiang Co-authored-by: Mark McLoughlin --- .../kv_connector/unit/test_nixl_connector.py | 52 +++++++++++++++++++ .../kv_connector/v1/nixl_connector.py | 35 ++++++++++--- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index fa698a2eabd91..24cc83c28614b 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -60,6 +60,9 @@ class FakeNixlWrapper: def register_memory(self, descs, backends) -> None: pass + def deregister_memory(self, descs) -> None: + pass + def get_xfer_descs(self, blocks_data, memory_type: str) -> list: return [str(uuid.uuid4()) for _ in blocks_data] @@ -86,6 +89,12 @@ class FakeNixlWrapper: def release_xfer_handle(self, handle: int) -> None: pass + def release_dlist_handle(self, handle: int) -> None: + pass + + def remove_remote_agent(self, agent: str) -> None: + pass + def 
send_notif(self, agent_name: str, notif_msg: bytes) -> None: pass @@ -905,3 +914,46 @@ def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device, # Verify get_reg_descs was called with the correct memory_type assert connector.connector_worker.kv_buffer_device == kv_buffer_device assert connector.connector_worker.nixl_memory_type == nixl_memory_type + + +@patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper) +def test_shutdown_cleans_up_resources(dist_init): + """Test that shutdown() properly cleans up all resources.""" + vllm_config = create_vllm_config() + + worker = NixlConnectorWorker(vllm_config, + vllm_config.kv_transfer_config.engine_id) + nixl_wrapper = worker.nixl_wrapper + + with patch.object(worker, '_handshake_initiation_executor') as mock_exec, \ + patch.object(worker, '_nixl_handshake_listener_t') as mock_listener, \ + patch.object(nixl_wrapper, 'release_xfer_handle') as mock_rel_xfer, \ + patch.object(nixl_wrapper, 'release_dlist_handle') as mock_rel_dlist, \ + patch.object(nixl_wrapper, 'remove_remote_agent') as mock_rem_agent, \ + patch.object(nixl_wrapper, 'deregister_memory') as mock_dereg: + + worker._recving_transfers = {"req1": [(123, time.perf_counter())]} + worker.src_xfer_side_handle = 456 + worker.dst_xfer_side_handles = {"engine1": 789} + worker._remote_agents = {"engine1": {0: "agent1"}} + worker._registered_descs = ["desc1", "desc2"] + + worker.shutdown() + + # Test idempotency + worker.shutdown() + worker.shutdown() + + mock_exec.shutdown.assert_called_with(wait=False) + mock_listener.join.assert_called_once_with(timeout=0) + + mock_rel_xfer.assert_called_once_with(123) + assert mock_rel_dlist.call_count == 2 + mock_rel_dlist.assert_any_call(456) # src handle + mock_rel_dlist.assert_any_call(789) # dst handle + mock_rem_agent.assert_called_once_with("agent1") + assert mock_dereg.call_count == 2 + mock_dereg.assert_any_call("desc1") + mock_dereg.assert_any_call("desc2") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 82b483447e33d..64feddb591c27 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -250,6 +250,10 @@ class NixlConnector(KVConnectorBase_V1): self.connector_worker.copy_blocks: self.connector_worker.save_kv_to_host(self._connector_metadata) + def shutdown(self): + if self.connector_worker is not None: + self.connector_worker.shutdown() + class NixlConnectorScheduler: """Implementation of Scheduler side methods""" @@ -586,13 +590,6 @@ class NixlConnectorWorker: self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) self.xfer_stats = NixlKVConnectorStats() - def __del__(self): - """Cleanup background threads on destruction.""" - if executor := getattr(self, "_handshake_initiation_executor", None): - executor.shutdown(wait=False) - if listener_t := getattr(self, "_nixl_handshake_listener_t", None): - listener_t.join(timeout=0) - @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, ready_event: threading.Event, base_port: int, @@ -1346,6 +1343,30 @@ class NixlConnectorWorker: return self.xfer_stats.clone_and_reset() return None + def shutdown(self): + """Shutdown the connector worker.""" + self._handshake_initiation_executor.shutdown(wait=False) + if self._nixl_handshake_listener_t is not None: + self._nixl_handshake_listener_t.join(timeout=0) + 
self._nixl_handshake_listener_t = None + for handles in self._recving_transfers.values(): + for handle, _ in handles: + self.nixl_wrapper.release_xfer_handle(handle) + self._recving_transfers.clear() + if self.src_xfer_side_handle: + self.nixl_wrapper.release_dlist_handle(self.src_xfer_side_handle) + self.src_xfer_side_handle = 0 + for dst_xfer_side_handle in self.dst_xfer_side_handles.values(): + self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle) + self.dst_xfer_side_handles.clear() + for remote_agents in self._remote_agents.values(): + for agent_name in remote_agents.values(): + self.nixl_wrapper.remove_remote_agent(agent_name) + self._remote_agents.clear() + for desc in self._registered_descs: + self.nixl_wrapper.deregister_memory(desc) + self._registered_descs.clear() + @contextlib.contextmanager def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: From da5e7e432979254eb0d8a528c40ae73e121cca20 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Tue, 23 Sep 2025 22:23:22 +0800 Subject: [PATCH 137/185] [Docs] NixlConnector quickstart guide (#24249) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Pan Signed-off-by: Peter Pan Signed-off-by: Nicolò Lucchesi Co-authored-by: Nicolò Lucchesi --- docs/features/disagg_prefill.md | 2 +- docs/features/nixl_connector_usage.md | 159 ++++++++++++++++++ .../nixl_integration/run_accuracy_test.sh | 10 +- 3 files changed, 168 insertions(+), 3 deletions(-) create mode 100644 docs/features/nixl_connector_usage.md diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index cb62213cc7af0..2c69304db3393 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -23,7 +23,7 @@ Now supports 5 types of connectors: - **SharedStorageConnector**: refer to for the example usage of SharedStorageConnector disaggregated prefilling. - **LMCacheConnectorV1**: refer to for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission. -- **NixlConnector**: refer to for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. +- **NixlConnector**: refer to for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md). - **P2pNcclConnector**: refer to for the example usage of P2pNcclConnector disaggregated prefilling. - **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as: diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md new file mode 100644 index 0000000000000..de50f091df428 --- /dev/null +++ b/docs/features/nixl_connector_usage.md @@ -0,0 +1,159 @@ +# NixlConnector Usage Guide + +NixlConnector is a high-performance KV cache transfer connector for vLLM's disaggregated prefilling feature. It provides fully asynchronous send/receive operations using the NIXL library for efficient cross-process KV cache transfer. + +## Prerequisites + +### Installation + +Install the NIXL library: `uv pip install nixl`, as a quick start. 
+ +- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions +- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files + +### Transport Configuration + +NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables: + +```bash +# Example UCX configuration, adjust according to your enviroment +export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc +export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1" +``` + +!!! tip + When using UCX as the transport backend, NCCL environment variables (like `NCCL_IB_HCA`, `NCCL_SOCKET_IFNAME`) are not applicable to NixlConnector, so configure UCX-specific environment variables instead of NCCL variables. + +## Basic Usage (on the same host) + +### Producer (Prefiller) Configuration + +Start a prefiller instance that produces KV caches + +```bash +# 1st GPU as prefiller +CUDA_VISIBLE_DEVICES=0 \ +UCX_NET_DEVICES=all \ +VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \ +vllm serve Qwen/Qwen3-0.6B \ + --port 8100 \ + --enforce-eager \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' +``` + +### Consumer (Decoder) Configuration + +Start a decoder instance that consumes KV caches: + +```bash +# 2nd GPU as decoder +CUDA_VISIBLE_DEVICES=1 \ +UCX_NET_DEVICES=all \ +VLLM_NIXL_SIDE_CHANNEL_PORT=5601 \ +vllm serve Qwen/Qwen3-0.6B \ + --port 8200 \ + --enforce-eager \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' +``` + +### Proxy Server + +Use a proxy server to route requests between prefiller and decoder: + +```bash +python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ + --port 8192 \ + --prefiller-hosts localhost \ + --prefiller-ports 8100 \ + --decoder-hosts localhost \ + --decoder-ports 8200 +``` + +## Environment Variables + +- `VLLM_NIXL_SIDE_CHANNEL_PORT`: Port for NIXL handshake communication + - Default: 5600 + - **Required for both prefiller and decoder instances** + - Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine + - For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank (e.g., with `--tensor-parallel-size=4` and base_port=5600, tp_rank 0..3 use ports 5600, 5601, 5602, 5603 on that node). + - Used for the initial NIXL handshake between the prefiller and the decoder + +- `VLLM_NIXL_SIDE_CHANNEL_HOST`: Host for side channel communication + - Default: "localhost" + - Set when prefiller and decoder are on different machines + - Connection info is passed via KVTransferParams from prefiller to decoder for handshake + +- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional) + - Default: 120 + - If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely. 
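The port rule above is easy to sanity-check with a few lines of Python. This is a minimal illustrative sketch; `side_channel_port` is a hypothetical helper name for this example, not a vLLM API.

```python
# Sketch of the side-channel port layout: base_port + dp_rank * tp_size + tp_rank.
def side_channel_port(base_port: int, dp_rank: int, tp_size: int, tp_rank: int) -> int:
    return base_port + dp_rank * tp_size + tp_rank

# With --tensor-parallel-size=4 and base_port=5600, tp_rank 0..3 of dp_rank 0
# land on ports 5600, 5601, 5602 and 5603, as described above.
assert [side_channel_port(5600, 0, 4, tp) for tp in range(4)] == [5600, 5601, 5602, 5603]
```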
+ +## Multi-Instance Setup + +### Multiple Prefiller Instances on Different Machines + +```bash +# Prefiller 1 on Machine A (example IP: ${IP1}) +VLLM_NIXL_SIDE_CHANNEL_HOST=${IP1} \ +VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \ +UCX_NET_DEVICES=all \ +vllm serve Qwen/Qwen3-0.6B --port 8000 \ + --tensor-parallel-size 8 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' + +# Prefiller 2 on Machine B (example IP: ${IP2}) +VLLM_NIXL_SIDE_CHANNEL_HOST=${IP2} \ +VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \ +UCX_NET_DEVICES=all \ +vllm serve Qwen/Qwen3-0.6B --port 8000 \ + --tensor-parallel-size 8 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' +``` + +### Multiple Decoder Instances on Different Machines + +```bash +# Decoder 1 on Machine C (example IP: ${IP3}) +VLLM_NIXL_SIDE_CHANNEL_HOST=${IP3} \ +VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \ +UCX_NET_DEVICES=all \ +vllm serve Qwen/Qwen3-0.6B --port 8000 \ + --tensor-parallel-size 8 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' + +# Decoder 2 on Machine D (example IP: ${IP4}) +VLLM_NIXL_SIDE_CHANNEL_HOST=${IP4} \ +VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \ +UCX_NET_DEVICES=all \ +vllm serve Qwen/Qwen3-0.6B --port 8000 \ + --tensor-parallel-size 8 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' +``` + +### Proxy for Multiple Instances + +```bash +python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ + --port 8192 \ + --prefiller-hosts ${IP1} ${IP2} \ + --prefiller-ports 8000 8000 \ + --decoder-hosts ${IP3} ${IP4} \ + --decoder-ports 8000 8000 +``` + +### KV Role Options + +- **kv_producer**: For prefiller instances that generate KV caches +- **kv_consumer**: For decoder instances that consume KV caches from prefiller +- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined. + +!!! tip + NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`). + Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior. 
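Once the proxy from the earlier section is running on port 8192, requests can be sent to it much like to a regular vLLM server. The snippet below is a minimal sketch that assumes the toy proxy forwards the standard OpenAI-compatible `/v1/completions` route of the underlying prefiller and decoder instances; adjust the route and model name to your deployment.

```python
# Minimal client sketch against the toy proxy started on port 8192.
# Assumes the proxy forwards OpenAI-compatible routes; adjust if yours differs.
import requests

resp = requests.post(
    "http://localhost:8192/v1/completions",
    json={
        "model": "Qwen/Qwen3-0.6B",
        "prompt": "Explain disaggregated prefill in one sentence.",
        "max_tokens": 64,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```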
+ +## Example Scripts/Code + +Refer to these example scripts in the vLLM repository: + +- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) +- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py) +- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 9322410ec99e9..bc88370791096 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -85,7 +85,10 @@ run_tests_for_model() { echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \ + UCX_NET_DEVICES=all \ + VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \ + vllm serve $model_name \ --port $PORT \ --enforce-eager \ --gpu-memory-utilization 0.2 \ @@ -117,7 +120,10 @@ run_tests_for_model() { echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \ + UCX_NET_DEVICES=all \ + VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \ + vllm serve $model_name \ --port $PORT \ --enforce-eager \ --gpu-memory-utilization 0.2 \ From 4c966e440e1b70ff22348b23559b410c06e7b9a8 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Tue, 23 Sep 2025 22:32:57 +0800 Subject: [PATCH 138/185] [XPU] Fix MOE DP accuracy issue on XPU (#25465) --- examples/offline_inference/data_parallel.py | 11 ++++++++++- .../device_communicators/xpu_communicator.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 98fe36d0fb796..0076d4d30ee8e 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -101,6 +101,13 @@ def parse_args(): "--quantization", type=str, ) + parser.add_argument( + "--disable-expert-parallel", + dest="enable_expert_parallel", + action="store_false", + help="Disable expert parallel (default: enabled).", + ) + parser.set_defaults(enable_expert_parallel=True) return parser.parse_args() @@ -113,6 +120,7 @@ def main( dp_master_port, GPUs_per_dp_rank, enforce_eager, + enable_expert_parallel, trust_remote_code, max_num_seqs, max_model_len, @@ -168,7 +176,7 @@ def main( model=model, tensor_parallel_size=GPUs_per_dp_rank, enforce_eager=enforce_eager, - enable_expert_parallel=True, + enable_expert_parallel=enable_expert_parallel, trust_remote_code=trust_remote_code, max_num_seqs=max_num_seqs, max_model_len=max_model_len, @@ -229,6 +237,7 @@ if __name__ == "__main__": dp_master_port, tp_size, args.enforce_eager, + args.enable_expert_parallel, args.trust_remote_code, args.max_num_seqs, args.max_model_len, diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index 067315deb773d..b236bae261e03 100644 --- a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -25,6 +25,12 @@ class 
XpuCommunicator(DeviceCommunicatorBase): super().__init__(cpu_group, device, device_group, unique_name) if self.use_all2all: all2all_backend = envs.VLLM_ALL2ALL_BACKEND + if all2all_backend != "naive": + logger.warning( + "`%s` all2all manager is not supported on XPU." + "Falling back to `naive` all2all manager for XPU.", + all2all_backend) + all2all_backend = "naive" if all2all_backend == "naive": from .all2all import NaiveAll2AllManager self.all2all_manager = NaiveAll2AllManager(self.cpu_group) @@ -67,3 +73,16 @@ class XpuCommunicator(DeviceCommunicatorBase): def broadcast(self, input_: torch.Tensor, src: int = 0) -> None: dist.broadcast(input_, src=src, group=self.device_group) + + def dispatch( + self, hidden_states: torch.Tensor, + router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + assert self.all2all_manager is not None + hidden_states, router_logits = self.all2all_manager.dispatch( + hidden_states, router_logits) + return hidden_states, router_logits + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + assert self.all2all_manager is not None + hidden_states = self.all2all_manager.combine(hidden_states) + return hidden_states From 2c58742dff8613a3bd7496f2008ce927e18d38d1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Sep 2025 11:01:24 -0400 Subject: [PATCH 139/185] [UX] Change kv-cache-memory log level to debug (#25479) Signed-off-by: Michael Goin --- vllm/v1/worker/gpu_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index ca8734d28b459..ffea9bb35513a 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -389,7 +389,7 @@ class Worker(WorkerBase): f"utilize gpu memory. Current kv cache memory in use is " f"{int(self.available_kv_cache_memory_bytes)} bytes.") - logger.info(msg) + logger.debug(msg) # Warm up sampler and preallocate memory buffer for logits and other # sampling related tensors of max possible shape to avoid memory From a903669e10cc98507fa5c2ae099b7161f7140cf7 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 23 Sep 2025 17:26:13 +0200 Subject: [PATCH 140/185] [V1] Remove V0 code paths for Hybrid models (#25400) Signed-off-by: Thomas Parnell --- .../models/language/generation/test_hybrid.py | 55 +- tests/models/registry.py | 13 +- vllm/model_executor/layers/mamba/abstract.py | 5 +- .../layers/mamba/linear_attn.py | 105 +-- .../layers/mamba/mamba2_metadata.py | 177 ----- .../layers/mamba/mamba_mixer.py | 145 ++-- .../layers/mamba/mamba_mixer2.py | 170 ++-- .../layers/mamba/mamba_utils.py | 20 +- .../layers/mamba/ops/causal_conv1d.py | 3 - .../model_executor/layers/mamba/short_conv.py | 21 +- vllm/model_executor/models/bamba.py | 72 +- .../models/constant_size_cache.py | 137 ---- vllm/model_executor/models/falcon_h1.py | 65 +- .../model_executor/models/granitemoehybrid.py | 79 +- vllm/model_executor/models/jamba.py | 49 +- vllm/model_executor/models/lfm2.py | 7 - vllm/model_executor/models/mamba.py | 45 +- vllm/model_executor/models/mamba2.py | 63 +- vllm/model_executor/models/mamba_cache.py | 83 -- vllm/model_executor/models/minimax_cache.py | 36 - vllm/model_executor/models/minimax_text_01.py | 39 - vllm/model_executor/models/nemotron_h.py | 72 +- vllm/model_executor/models/phi4flash.py | 731 ------------------ vllm/model_executor/models/plamo2.py | 238 ++---- vllm/model_executor/models/qwen3_next.py | 36 +- vllm/model_executor/models/registry.py | 1 - vllm/model_executor/models/zamba2.py | 93 --- 
vllm/v1/attention/backends/gdn_attn.py | 8 +- vllm/v1/attention/backends/mamba2_attn.py | 15 +- vllm/v1/attention/backends/short_conv_attn.py | 14 +- vllm/v1/attention/backends/utils.py | 51 ++ 31 files changed, 352 insertions(+), 2296 deletions(-) delete mode 100644 vllm/model_executor/layers/mamba/mamba2_metadata.py delete mode 100644 vllm/model_executor/models/constant_size_cache.py delete mode 100644 vllm/model_executor/models/mamba_cache.py delete mode 100644 vllm/model_executor/models/minimax_cache.py delete mode 100644 vllm/model_executor/models/phi4flash.py diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 0b1f90e27db82..e60a86075b8bc 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -20,7 +20,9 @@ pytestmark = pytest.mark.hybrid_model SSM_MODELS = [ "state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev", - "yujiepan/mamba2-codestral-v0.1-tiny-random", + # mamba2-codestral in transformers is broken pending: + # https://github.com/huggingface/transformers/pull/40861 + #"yujiepan/mamba2-codestral-v0.1-tiny-random", ] HYBRID_MODELS = [ @@ -31,18 +33,7 @@ HYBRID_MODELS = [ "ibm-granite/granite-4.0-tiny-preview", "tiiuae/Falcon-H1-0.5B-Base", "LiquidAI/LFM2-1.2B", -] - -V1_SUPPORTED_MODELS = [ - "state-spaces/mamba-130m-hf", - "ai21labs/Jamba-tiny-dev", - "pfnet/plamo-2-1b", - "yujiepan/mamba2-codestral-v0.1-tiny-random", - "Zyphra/Zamba2-1.2B-instruct", - "hmellor/tiny-random-BambaForCausalLM", - "ibm-granite/granite-4.0-tiny-preview", - "tiiuae/Falcon-H1-0.5B-Base", - "LiquidAI/LFM2-1.2B", + "tiny-random/qwen3-next-moe", ] FULL_CUDA_GRAPH_MODELS = [ @@ -51,10 +42,6 @@ FULL_CUDA_GRAPH_MODELS = [ "Zyphra/Zamba2-1.2B-instruct", ] -V0_UNSUPPORTED_MODELS = [ - "LiquidAI/LFM2-1.2B", -] - FP32_STATE_MODELS = [ "state-spaces/mamba-130m-hf", "Zyphra/Zamba2-1.2B-instruct", @@ -88,20 +75,16 @@ def test_models( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - if model in V1_SUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v1_outputs = None + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - if model in V1_SUPPORTED_MODELS: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v1_outputs, - name_0="hf", - name_1="vllm-v1", - ) + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -299,14 +282,14 @@ def test_full_cuda_graph( example_prompts, max_tokens, num_logprobs) with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v1_outputs, + outputs_1_lst=vllm_outputs, name_0="hf", - name_1="vllm-v1", + name_1="vllm", ) @@ -340,12 +323,12 @@ def test_fp32_cache_state( with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + vllm_outputs = 
vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v1_outputs, + outputs_1_lst=vllm_outputs, name_0="hf", - name_1="vllm-v1", + name_1="vllm", ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 6047a7a3e98dc..8b62952ad5908 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -312,14 +312,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), - "Phi4FlashForCausalLM": _HfExamplesInfo("microsoft/Phi-4-mini-flash-reasoning", # noqa: E501 - trust_remote_code=True, - v0_only=True, - max_model_len=10240), "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True), "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b", - trust_remote_code=True), + max_transformers_version="4.55.4", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers", # noqa: E501 + trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", max_transformers_version="4.53", transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers", # noqa: E501 @@ -330,7 +328,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct", - min_transformers_version="4.56.2"), + extras={"tiny-random": "tiny-random/qwen3-next-moe"}, # noqa: E501 + min_transformers_version="4.56.3"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501 trust_remote_code=True, @@ -644,7 +643,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="XiaomiMiMo/MiMo-7B-RL"), "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct", - min_transformers_version="4.56.2"), + min_transformers_version="4.56.3"), } _TRANSFORMERS_BACKEND_MODELS = { diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py index a524e13405807..6da62b5426bb6 100644 --- a/vllm/model_executor/layers/mamba/abstract.py +++ b/vllm/model_executor/layers/mamba/abstract.py @@ -20,10 +20,7 @@ class MambaBase(AttentionLayerBase): # Contains the KV cache (mamba state) for the layer # in the shape specified by `self.get_state_shape`. - # The outer list is for v0 PP virtual engine. Though this code path - # only runs for v1, we have to do this to unify with the interface - # of Attention + v0 PP. - kv_cache: list[Iterable[torch.Tensor]] + kv_cache: tuple[torch.Tensor, ...] 
@abstractmethod def get_state_shape(self) -> Iterable[tuple[int, ...]]: diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py index 5fe37a6289e01..6a901b47b8b63 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -15,7 +15,6 @@ import torch.nn.functional as F from einops import rearrange from torch import nn -from vllm import envs from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed.communication_op import tensor_model_parallel_all_reduce @@ -42,8 +41,6 @@ if TYPE_CHECKING: import torch import torch.distributed -from vllm.model_executor.models.minimax_cache import MinimaxCacheParams - class MiniMaxText01RMSNormTP(CustomOp): name = "MiniMaxText01RMSNormTP" @@ -225,11 +222,10 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): self.tp_heads:(self.tp_rank + 1) * self.tp_heads].contiguous() - if envs.VLLM_USE_V1: - compilation_config = get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self @staticmethod def weight_direct_load(param: torch.Tensor, @@ -268,8 +264,7 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): break if _prefill_idx >= len(state_indices_tensor): break - # prefills are packed at end of batch in V1 - offset = attn_metadata.num_decode_tokens if envs.VLLM_USE_V1 else 0 + offset = attn_metadata.num_decode_tokens _start = attn_metadata.query_start_loc[offset + _prefill_idx] _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1] slot_id = state_indices_tensor[offset + _prefill_idx] @@ -291,10 +286,7 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): hidden_decode = self._decode_infer(q, k, v, kv_cache, state_indices_tensor, attn_metadata) - if envs.VLLM_USE_V1: - hidden.insert(0, hidden_decode) - else: - hidden.append(hidden_decode) + hidden.insert(0, hidden_decode) if not hidden: return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype) @@ -304,40 +296,28 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata): - if not envs.VLLM_USE_V1: - q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - num_prefills = getattr(attn_metadata, "num_prefills", 0) - slot_id = state_indices_tensor[num_prefills:] - else: - q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() - k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() - v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() - slot_id = state_indices_tensor[:attn_metadata.num_decodes] + q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + slot_id = state_indices_tensor[:attn_metadata.num_decodes] hidden = linear_decode_forward_triton(q, k, v, kv_cache, self.tp_slope, 
slot_id, 32) return hidden def forward(self, hidden_states: torch.Tensor, output: torch.Tensor, - positions: torch.Tensor, - kv_caches: MinimaxCacheParams) -> None: - if not envs.VLLM_USE_V1: - self._forward(hidden_states, output, positions, kv_caches) - else: - torch.ops.vllm.linear_attention( - hidden_states, - output, - positions, - self.prefix, - ) + positions: torch.Tensor) -> None: + torch.ops.vllm.linear_attention( + hidden_states, + output, + positions, + self.prefix, + ) def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor, - positions: torch.Tensor, - kv_caches: Optional[MinimaxCacheParams]) -> None: + positions: torch.Tensor) -> None: forward_context = get_forward_context() attn_metadata: AttentionMetadata = forward_context.attn_metadata - if envs.VLLM_USE_V1 and attn_metadata is not None: + if attn_metadata is not None: assert isinstance(attn_metadata, dict) attn_metadata = attn_metadata[self.prefix] assert isinstance(attn_metadata, LinearAttentionMetadata) @@ -351,32 +331,26 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): qkvact = torch.nn.functional.silu(qkv32) qkvact = qkvact.view((qkv.shape[0], self.tp_heads, -1)) q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1) - if envs.VLLM_USE_V1: - if attn_metadata is not None: - kv_cache = self.kv_cache[forward_context.virtual_engine][0] - state_indices_tensor = attn_metadata.state_indices_tensor + if attn_metadata is not None: + kv_cache = self.kv_cache[forward_context.virtual_engine][0] + state_indices_tensor = attn_metadata.state_indices_tensor - num_prefills = getattr(attn_metadata, "num_prefills", 0) - if num_prefills > 0: - num_decode_tokens = getattr(attn_metadata, - "num_decode_tokens", 0) - for prefill_idx in range(num_prefills): - q_start = attn_metadata.query_start_loc[ - num_decode_tokens + prefill_idx] - q_end = attn_metadata.query_start_loc[num_decode_tokens - + prefill_idx + - 1] - query_len = q_end - q_start - context_len = attn_metadata.seq_lens[ - num_decode_tokens + prefill_idx] - query_len - if context_len == 0: - block_to_clear = state_indices_tensor[ - num_decode_tokens + prefill_idx] - kv_cache[block_to_clear, ...] = 0 - else: - assert kv_caches is not None - kv_cache = kv_caches.minimax_cache - state_indices_tensor = kv_caches.state_indices_tensor + num_prefills = getattr(attn_metadata, "num_prefills", 0) + if num_prefills > 0: + num_decode_tokens = getattr(attn_metadata, "num_decode_tokens", + 0) + for prefill_idx in range(num_prefills): + q_start = attn_metadata.query_start_loc[num_decode_tokens + + prefill_idx] + q_end = attn_metadata.query_start_loc[num_decode_tokens + + prefill_idx + 1] + query_len = q_end - q_start + context_len = attn_metadata.seq_lens[ + num_decode_tokens + prefill_idx] - query_len + if context_len == 0: + block_to_clear = state_indices_tensor[num_decode_tokens + + prefill_idx] + kv_cache[block_to_clear, ...] 
= 0 decode_only = getattr(attn_metadata, "num_prefills", 0) == 0 if attn_metadata is None: @@ -410,8 +384,7 @@ def linear_attention( self = forward_context.no_compile_layers[layer_name] self._forward(hidden_states=hidden_states, output=output, - positions=positions, - kv_caches=None) + positions=positions) def linear_attention_fake( diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py deleted file mode 100644 index 7f376b70a7ae0..0000000000000 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ /dev/null @@ -1,177 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from dataclasses import dataclass -from typing import Optional, Union - -import numpy as np -import torch - -from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.backends.placeholder_attn import ( - PlaceholderAttentionMetadata) -from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.platforms import current_platform -from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata -from vllm.v1.attention.backends.mamba2_attn import ( - Mamba2AttentionMetadata, _query_start_loc_to_chunk_indices_offsets) - - -@dataclass -class Mamba2Metadata: - prep_initial_states: bool - chunk_size: int - - has_initial_states_p: torch.Tensor - seq_idx_p: torch.Tensor - chunk_indices_p: torch.Tensor - chunk_offsets_p: torch.Tensor - """ - With continuous batching layout of `x` in vLLM, to enable a Triton program - to handle a request in parallel, two supporting tensors are used - (batch_ptr, token_chunk_offset_ptr) - BLOCK_M = the # tokens to be handled by a Triton program - (can be customized for different hardware) - - nums_dict: - tracks the data associated with a given value of BLOCK_M - BLOCK_M = #tokens handled by a Triton program - cu_seqlen: total tokens per batch - (used as flag to update other data at each new input) - batch_ptr: tracks batch-id handled by the Triton program - token_chunk_offset_ptr: tracks token group_idx handled by the Triton program - (Triton implementation of causal_conv1d handles parallelism in 3-axes - - feature-axis - - batch-axis - - sequence-axis) - """ - nums_dict: Optional[dict] = None - cu_seqlen: Optional[int] = None - batch_ptr: Optional[torch.Tensor] = None - token_chunk_offset_ptr: Optional[torch.Tensor] = None - - -def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: - """Returns the appropriate metadata classes for the current platform.""" - if current_platform.is_rocm(): - from vllm.v1.attention.backends.rocm_aiter_fa import ( - AiterFlashAttentionMetadata) - from vllm.v1.attention.backends.triton_attn import ( - TritonAttentionMetadata) - return (AiterFlashAttentionMetadata, TritonAttentionMetadata, - PlaceholderAttentionMetadata) - if current_platform.is_cuda(): - from vllm.v1.attention.backends.flash_attn import ( - FlashAttentionMetadata) - from vllm.v1.attention.backends.xformers import ( - XFormersAttentionMetadata) - return (FlashAttentionMetadata, XFormersAttentionMetadata, - PlaceholderAttentionMetadata) - raise ValueError( - f"Unsupported platform for Mamba2: {current_platform.device_type}") - - -def prepare_mamba2_metadata( - chunk_size: int, - attn_metadata: AttentionMetadata, -) -> Mamba2Metadata: - - # compute number of prefill and decode requests - # NOTE: in V0 we assume prefills are before decodes - num_prefills = attn_metadata.num_prefills - num_prefill_tokens = 
attn_metadata.num_prefill_tokens - - seq_idx_p = None - chunk_indices_p, chunk_offsets_p = None, None - # Need flags to indicate if there are initial states - # currently we really only support the FlashAttention backend - has_initial_states_p = None - prep_initial_states = False - - # Compute seq_idx, chunk_indices and chunk_offsets for prefill only - if num_prefills > 0: - attn_metadata_instances = get_platform_metadata_classes() - if (isinstance(attn_metadata, attn_metadata_instances) - and attn_metadata.context_lens_tensor is not None): - # precompute flag to avoid device syncs later in mamba2 layer - # forwards - # prep is only needed for mamba2 ssd prefill processing - has_initial_states_p = ( - attn_metadata.context_lens_tensor[:num_prefills] > 0) - prep_initial_states = torch.any(has_initial_states_p).item() - query_start_loc_p = attn_metadata.query_start_loc[:num_prefills + 1] - seq_idx_p = torch.repeat_interleave(torch.arange( - num_prefills, dtype=torch.int32, device=query_start_loc_p.device), - query_start_loc_p.diff(), - output_size=num_prefill_tokens) - seq_idx_p.unsqueeze_(0) - - # We compute metadata for chunked prefill once at the top level model - # forward and reuse them in mamba layers. If not needed, they will be - # ignored inside mamba kernels. - if prep_initial_states: - chunk_indices_p, chunk_offsets_p = \ - _query_start_loc_to_chunk_indices_offsets( - query_start_loc_p, chunk_size, num_prefill_tokens) - - return Mamba2Metadata(has_initial_states_p=has_initial_states_p, - prep_initial_states=prep_initial_states, - chunk_size=chunk_size, - seq_idx_p=seq_idx_p, - chunk_indices_p=chunk_indices_p, - chunk_offsets_p=chunk_offsets_p) - - -def update_metadata(x: torch.Tensor, query_start_loc: torch.Tensor, - mamba2_metadata: Union[Mamba2Metadata, - Mamba2AttentionMetadata, - GDNAttentionMetadata]): - """ - this is triggered upon handling a new input at the first layer - """ - dim, cu_seqlen = x.shape - mamba2_metadata.cu_seqlen = cu_seqlen - seqlens = np.diff(query_start_loc.to('cpu')) - nums_dict = {} # type: ignore - for BLOCK_M in [8]: # cover all BLOCK_M values - nums = -(-seqlens // BLOCK_M) - nums_dict[BLOCK_M] = {} - nums_dict[BLOCK_M]['nums'] = nums - nums_dict[BLOCK_M]['tot'] = nums.sum().item() - mlist = torch.from_numpy(np.repeat(np.arange(len(nums)), nums)) - nums_dict[BLOCK_M]['mlist'] = mlist - mlist_len = len(nums_dict[BLOCK_M]['mlist']) - nums_dict[BLOCK_M]['mlist_len'] = mlist_len - MAX_NUM_PROGRAMS = max(1024, mlist_len) * 2 - offsetlist = [] # type: ignore - for idx, num in enumerate(nums): - offsetlist.extend(range(num)) - offsetlist = torch.tensor(offsetlist, dtype=torch.int32) - nums_dict[BLOCK_M]['offsetlist'] = offsetlist - - if mamba2_metadata.batch_ptr is None: - # Update default value after class definition - #mamba2_metadata.MAX_NUM_PROGRAMS *= 2 - mamba2_metadata.batch_ptr = torch.full((MAX_NUM_PROGRAMS, ), - PAD_SLOT_ID, - dtype=torch.int32, - device='cuda') - mamba2_metadata.token_chunk_offset_ptr = torch.full( - (MAX_NUM_PROGRAMS, ), - PAD_SLOT_ID, - dtype=torch.int32, - device='cuda') - else: - if mamba2_metadata.batch_ptr.nelement() < MAX_NUM_PROGRAMS: - mamba2_metadata.batch_ptr.resize_(MAX_NUM_PROGRAMS).fill_( - PAD_SLOT_ID) - mamba2_metadata.token_chunk_offset_ptr.resize_( # type: ignore - MAX_NUM_PROGRAMS).fill_(PAD_SLOT_ID) - - mamba2_metadata.batch_ptr[0:mlist_len].copy_(mlist) - mamba2_metadata.token_chunk_offset_ptr[ # type: ignore - 0:mlist_len].copy_(offsetlist) - nums_dict[BLOCK_M]['batch_ptr'] = mamba2_metadata.batch_ptr - 
nums_dict[BLOCK_M]['token_chunk_offset_ptr'] = ( - mamba2_metadata.token_chunk_offset_ptr) # type: ignore - mamba2_metadata.nums_dict = nums_dict - return mamba2_metadata diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index e704bfd451bce..a56ee13a63804 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -10,8 +10,6 @@ import torch from torch import nn from torch.nn.parameter import Parameter -from vllm import envs -from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) @@ -28,7 +26,6 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) -from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op @@ -149,16 +146,12 @@ class MambaMixer(MambaBase, CustomOp): has_weight=rms_norm_has_weight, ) if use_rms_norm else None - if envs.VLLM_USE_V1: - compilation_config = get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - # The outer list is for v0 PP virtual engine. Though this code path - # only runs for v1, we have to do this to unify with the interface - # of Attention + v0 PP. - # The inner tuple is (conv_state, ssm_state) - self.kv_cache = [(torch.tensor([]), torch.tensor([]))] + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The inner tuple is (conv_state, ssm_state) + self.kv_cache = (torch.tensor([]), torch.tensor([])) self.model_config = model_config self.cache_config = cache_config @@ -186,29 +179,18 @@ class MambaMixer(MambaBase, CustomOp): discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) return discrete_time_step, B, C - def forward(self, - hidden_states: torch.Tensor, - output: torch.Tensor, - mamba_cache_params: Optional[MambaCacheParams] = None): - if not envs.VLLM_USE_V1: - CustomOp.forward(self, hidden_states, output, mamba_cache_params) - else: - torch.ops.vllm.mamba_mixer( - hidden_states, - output, - self.prefix, - ) + def forward(self, hidden_states: torch.Tensor, output: torch.Tensor): + torch.ops.vllm.mamba_mixer( + hidden_states, + output, + self.prefix, + ) - def forward_native(self, - hidden_states: torch.Tensor, - output: torch.Tensor, - mamba_cache_params: Optional[MambaCacheParams] = None): + def forward_native(self, hidden_states: torch.Tensor, + output: torch.Tensor): pass - def forward_cuda(self, - hidden_states: torch.Tensor, - output: torch.Tensor, - mamba_cache_params: Optional[MambaCacheParams] = None): + def forward_cuda(self, hidden_states: torch.Tensor, output: torch.Tensor): """ Run the Mamba-1 SSM pipeline. 
@@ -234,31 +216,18 @@ class MambaMixer(MambaBase, CustomOp): forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata - if envs.VLLM_USE_V1: - if attn_metadata is not None: - assert isinstance(attn_metadata, dict) - attn_metadata = attn_metadata[self.prefix] - mamba1_metadata = attn_metadata - assert isinstance(mamba1_metadata, Mamba1AttentionMetadata) - query_start_loc = mamba1_metadata.query_start_loc - state_indices_tensor = mamba1_metadata.state_indices_tensor - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - conv_state = self_kv_cache[0].transpose(-1, -2) - ssm_state = self_kv_cache[1] - has_initial_states = mamba1_metadata.has_initial_states - num_padded_decodes = mamba1_metadata.num_padded_decodes - else: - assert isinstance(attn_metadata, AttentionMetadata) - assert mamba_cache_params is not None - conv_state = mamba_cache_params.conv_state - ssm_state = mamba_cache_params.ssm_state - state_indices_tensor = mamba_cache_params.state_indices_tensor - query_start_loc = attn_metadata.query_start_loc - context_lens_tensor = attn_metadata.context_lens_tensor - has_initial_states = None - if context_lens_tensor is not None: - has_initial_states = context_lens_tensor > 0 - num_padded_decodes = attn_metadata.num_decode_tokens + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + mamba1_metadata = attn_metadata + assert isinstance(mamba1_metadata, Mamba1AttentionMetadata) + query_start_loc = mamba1_metadata.query_start_loc + state_indices_tensor = mamba1_metadata.state_indices_tensor + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + has_initial_states = mamba1_metadata.has_initial_states + num_padded_decodes = mamba1_metadata.num_padded_decodes # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -267,7 +236,7 @@ class MambaMixer(MambaBase, CustomOp): conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) - if envs.VLLM_USE_V1 and attn_metadata is None: + if attn_metadata is None: # V1 profile run hidden_states_BC = hidden_states_BC.contiguous() return self.out_proj(hidden_states_BC.transpose(-2, -1))[0] @@ -368,10 +337,7 @@ class MambaMixer(MambaBase, CustomOp): out=scan_outputs_d) scan_outputs_d = scan_outputs_d.transpose(0, 1) - if envs.VLLM_USE_V1: - ssm_outputs.insert(0, scan_outputs_d) - else: - ssm_outputs.append(scan_outputs_d) + ssm_outputs.insert(0, scan_outputs_d) scan_outputs_combined = ssm_outputs[0] if len( ssm_outputs) == 1 else torch.cat(ssm_outputs, dim=-1) @@ -441,40 +407,27 @@ def split_batch_to_prefill_and_decode( num_decodes: int, num_padded_decodes: int, ) -> PrefillDecodeSplit: + num_actual_tokens = num_prefill_tokens + num_padded_decodes - if envs.VLLM_USE_V1: - # In v1, decode tokens come first, then prefill tokens. - hidden_states_BC_d, hidden_states_BC_p = torch.split( - hidden_states_BC[..., :num_actual_tokens], - [num_padded_decodes, num_prefill_tokens], - dim=-1) - gate_d, gate_p = torch.split(gate[..., :num_actual_tokens], - [num_padded_decodes, num_prefill_tokens], - dim=-1) + # In v1, decode tokens come first, then prefill tokens. 
+ hidden_states_BC_d, hidden_states_BC_p = torch.split( + hidden_states_BC[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], + dim=-1) + gate_d, gate_p = torch.split(gate[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], + dim=-1) - # num_padded_decodes accounts for CUDA graph padding when applicable - state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor[:num_padded_decodes + num_prefills], - [num_padded_decodes, num_prefills], - dim=0) - query_start_loc_p = (query_start_loc[-num_prefills - 1:] - - num_padded_decodes if num_prefills > 0 else None) - has_initial_states_p = has_initial_states[-num_prefills:] if ( - has_initial_states is not None and num_prefills > 0) else None - else: - # In v0, prefill tokens come first, then decode tokens. - hidden_states_BC_p, hidden_states_BC_d = torch.split( - hidden_states_BC, [num_prefill_tokens, num_decode_tokens], dim=-1) - gate_p, gate_d = torch.split(gate, - [num_prefill_tokens, num_decode_tokens], - dim=-1) - state_indices_tensor_p, state_indices_tensor_d = torch.split( - state_indices_tensor, [num_prefills, num_decodes], dim=0) - query_start_loc_p = (query_start_loc[:num_prefills + - 1] if num_prefills > 0 else None) - has_initial_states_p = has_initial_states[:num_prefills] if ( - has_initial_states is not None and num_prefills > 0) else None + # num_padded_decodes accounts for CUDA graph padding when applicable + state_indices_tensor_d, state_indices_tensor_p = torch.split( + state_indices_tensor[:num_padded_decodes + num_prefills], + [num_padded_decodes, num_prefills], + dim=0) + query_start_loc_p = (query_start_loc[-num_prefills - 1:] - + num_padded_decodes if num_prefills > 0 else None) + has_initial_states_p = has_initial_states[-num_prefills:] if ( + has_initial_states is not None and num_prefills > 0) else None return PrefillDecodeSplit( hidden_states_BC_p=hidden_states_BC_p, @@ -495,9 +448,7 @@ def mamba_mixer( ) -> None: forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - self.forward_cuda(hidden_states=hidden_states, - output=output, - mamba_cache_params=None) + self.forward_cuda(hidden_states=hidden_states, output=output) def mamba_mixer_fake( diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 02e6a9138c05f..047ce4c4c43d0 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -9,7 +9,6 @@ if TYPE_CHECKING: import torch from torch import nn -from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed import (divide, get_tensor_model_parallel_rank, @@ -22,8 +21,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.mamba.abstract import MambaBase -from vllm.model_executor.layers.mamba.mamba2_metadata import (Mamba2Metadata, - update_metadata) from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( @@ -36,7 +33,6 @@ from vllm.model_executor.layers.mamba.ops.ssd_combined import ( from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import ( LoaderFunction, 
composed_weight_loader, sharded_weight_loader) -from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op @@ -449,16 +445,12 @@ class MambaMixer2(MambaBase, CustomOp): self.use_rms_norm, eps=rms_norm_eps) - if envs.VLLM_USE_V1: - compilation_config = get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - # The outer list is for v0 PP virtual engine. Though this code path - # only runs for v1, we have to do this to unify with the interface - # of Attention + v0 PP. - # The inner tuple is (conv_state, ssm_state) - self.kv_cache = [(torch.tensor([]), torch.tensor([]))] + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The tuple is (conv_state, ssm_state) + self.kv_cache = (torch.tensor([]), torch.tensor([])) self.model_config = model_config self.cache_config = cache_config @@ -468,8 +460,6 @@ class MambaMixer2(MambaBase, CustomOp): self, hidden_states: torch.Tensor, output: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, mup_vector: Optional[torch.Tensor] = None, ): pass @@ -478,59 +468,43 @@ class MambaMixer2(MambaBase, CustomOp): self, hidden_states: torch.Tensor, output: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, mup_vector: Optional[torch.Tensor] = None, ): - if not envs.VLLM_USE_V1: - CustomOp.forward(self, hidden_states, output, mamba_cache_params, - mamba2_metadata, mup_vector) - else: - torch.ops.vllm.mamba_mixer2( - hidden_states, - output, - self.prefix, - mup_vector, - ) + torch.ops.vllm.mamba_mixer2( + hidden_states, + output, + self.prefix, + mup_vector, + ) def forward_cuda( self, hidden_states: torch.Tensor, output: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, mup_vector: Optional[torch.Tensor] = None, ): forward_context = get_forward_context() - # mamba2_metadata contains metadata necessary for the mamba2 triton + # attn_metadata contains metadata necessary for the mamba2 triton # kernels to operate in continuous batching and in chunked prefill # modes; they are computed at top-level model forward since they # stay the same and reused for all mamba layers in the same iteration attn_metadata: AttentionMetadata = forward_context.attn_metadata - if envs.VLLM_USE_V1: - if attn_metadata is not None: - assert isinstance(attn_metadata, dict) - attn_metadata = attn_metadata[self.prefix] - mamba2_metadata = attn_metadata - assert isinstance(attn_metadata, Mamba2AttentionMetadata) - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - # conv_state = (..., dim, width-1) yet contiguous along 'dim' - conv_state = self_kv_cache[0].transpose(-1, -2) - ssm_state = self_kv_cache[1] - state_indices_tensor = attn_metadata.state_indices_tensor - else: - conv_state = mamba_cache_params.conv_state - ssm_state = mamba_cache_params.ssm_state - state_indices_tensor = mamba_cache_params.state_indices_tensor - # Common members between V1 metadata and V0 metadata - if mamba2_metadata is not None: - has_initial_states_p = mamba2_metadata.has_initial_states_p - 
prep_initial_states = mamba2_metadata.prep_initial_states - chunk_size = mamba2_metadata.chunk_size - seq_idx_p = mamba2_metadata.seq_idx_p - chunk_indices_p = mamba2_metadata.chunk_indices_p - chunk_offsets_p = mamba2_metadata.chunk_offsets_p + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, Mamba2AttentionMetadata) + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + # conv_state = (..., dim, width-1) yet contiguous along 'dim' + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + state_indices_tensor = attn_metadata.state_indices_tensor + has_initial_states_p = attn_metadata.has_initial_states_p + prep_initial_states = attn_metadata.prep_initial_states + chunk_size = attn_metadata.chunk_size + seq_idx_p = attn_metadata.seq_idx_p + chunk_indices_p = attn_metadata.chunk_indices_p + chunk_offsets_p = attn_metadata.chunk_offsets_p # 1. Gated MLP's linear projection projected_states, _ = self.in_proj(hidden_states) @@ -562,8 +536,8 @@ class MambaMixer2(MambaBase, CustomOp): dim=-1, ) - if envs.VLLM_USE_V1 and attn_metadata is None: - # V1 profile run + if attn_metadata is None: + # profile run hidden_states_B_C = (hidden_states_B_C.transpose( 0, 1).clone().transpose(0, 1)).contiguous() hidden_states, _B, _C = split_hidden_states_B_C_fn( @@ -579,49 +553,27 @@ class MambaMixer2(MambaBase, CustomOp): has_decode = num_decodes > 0 num_actual_tokens = num_prefill_tokens + num_decodes - # NOTE: V0 put prefill before decode, v1 puts decode before prefill # Separate prefill and decode by splitting varlen input # Split along token dimension - if envs.VLLM_USE_V1: - hidden_states_B_C_d, hidden_states_B_C_p = torch.split( - hidden_states_B_C[:num_actual_tokens], - [num_decodes, num_prefill_tokens], - dim=0, - ) - dt_d, dt_p = torch.split( - dt[:num_actual_tokens], - [num_decodes, num_prefill_tokens], - dim=0, - ) - # Split along batch dimension - state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor[:num_actual_tokens], - [num_decodes, num_prefills], - dim=0, - ) - query_start_loc_p = ( - attn_metadata.query_start_loc[-num_prefills - 1:] - - num_decodes if has_prefill else None) - else: - hidden_states_B_C_p, hidden_states_B_C_d = torch.split( - hidden_states_B_C, - [num_prefill_tokens, num_decodes], - dim=0, - ) - dt_p, dt_d = torch.split( - dt, - [num_prefill_tokens, num_decodes], - dim=0, - ) - # Split along batch dimension - state_indices_tensor_p, state_indices_tensor_d = torch.split( - state_indices_tensor, - [num_prefills, num_decodes], - dim=0, - ) - query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills + - 1] - if has_prefill else None) + hidden_states_B_C_d, hidden_states_B_C_p = torch.split( + hidden_states_B_C[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0, + ) + dt_d, dt_p = torch.split( + dt[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0, + ) + # Split along batch dimension + state_indices_tensor_d, state_indices_tensor_p = torch.split( + state_indices_tensor[:num_actual_tokens], + [num_decodes, num_prefills], + dim=0, + ) + query_start_loc_p = ( + attn_metadata.query_start_loc[-num_prefills - 1:] - + num_decodes if has_prefill else None) # Preallocate output tensor to avoid memcpy cost for merging prefill # and decode outputs @@ -633,18 +585,11 @@ class MambaMixer2(MambaBase, CustomOp): dtype=hidden_states.dtype, device=hidden_states.device, ) - if envs.VLLM_USE_V1: - 
preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( - preallocated_ssm_out, - [num_decodes, num_prefill_tokens], - dim=0, - ) - else: - preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split( - preallocated_ssm_out, - [num_prefill_tokens, num_decodes], - dim=0, - ) + preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( + preallocated_ssm_out, + [num_decodes, num_prefill_tokens], + dim=0, + ) # Process prefill requests if has_prefill: @@ -653,9 +598,6 @@ class MambaMixer2(MambaBase, CustomOp): # pointed to by "state_indices_tensor" x = hidden_states_B_C_p.transpose( 0, 1) # this is the form that causal-conv see - if mamba2_metadata.cu_seqlen is None: - mamba2_metadata = update_metadata(x, query_start_loc_p, - mamba2_metadata) hidden_states_B_C_p = causal_conv1d_fn( x, conv_weights, @@ -664,7 +606,7 @@ class MambaMixer2(MambaBase, CustomOp): conv_states=conv_state, has_initial_state=has_initial_states_p, cache_indices=state_indices_tensor_p, - metadata=mamba2_metadata, + metadata=attn_metadata, query_start_loc=query_start_loc_p).transpose( 0, 1)[:num_prefill_tokens] @@ -806,8 +748,6 @@ def mamba_mixer2( self = forward_context.no_compile_layers[layer_name] self.forward_cuda(hidden_states=hidden_states, output=output, - mamba_cache_params=None, - mamba2_metadata=None, mup_vector=mup_vector) diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index a6c1af91de421..677a4b9d87fc7 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -100,7 +100,6 @@ class MambaStateShapeCalculator: intermediate_size: int, state_size: int, conv_kernel: int, - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int]]: conv_state_shape = (divide(intermediate_size, tp_world_size), conv_kernel - 1) @@ -108,11 +107,7 @@ class MambaStateShapeCalculator: temporal_state_shape = (divide(intermediate_size, tp_world_size), state_size) - # In V0, the conv_state shape was swapped during allocation in - # MambaCacheManager, but in V1 it needs to be determined here at the - # calculation level - if use_v1: - conv_state_shape = conv_state_shape[1], conv_state_shape[0] + conv_state_shape = conv_state_shape[1], conv_state_shape[0] return conv_state_shape, temporal_state_shape @@ -126,7 +121,6 @@ class MambaStateShapeCalculator: head_dim: int, state_size: int, conv_kernel: int, - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: # if n_groups is not divisible by world_size, need to extend the shards # to ensure all groups needed by a head is sharded along with it @@ -137,8 +131,6 @@ class MambaStateShapeCalculator: # contiguous along 'dim' axis conv_state_shape = (conv_kernel - 1, divide(conv_dim, tp_world_size)) - if not use_v1: - conv_state_shape = conv_state_shape[1], conv_state_shape[0] # These are not TP-ed as they depend on A, dt_bias, D # - they are typically small @@ -153,12 +145,9 @@ class MambaStateShapeCalculator: tp_world_size: int, intermediate_size: int, conv_kernel: int, - use_v1: bool = True, ) -> tuple[tuple[int, int]]: conv_dim = divide(intermediate_size, tp_world_size) conv_state_shape = (conv_kernel - 1, conv_dim) - if not use_v1: - conv_state_shape = conv_state_shape[1], conv_state_shape[0] return (conv_state_shape, ) @classmethod @@ -183,7 +172,6 @@ class MambaStateShapeCalculator: head_v_dim: int, conv_kernel_size: int, num_spec: int = 0, - use_v1: bool = True, ): conv_dim = (head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads) 
conv_state_shape = ( @@ -191,11 +179,7 @@ class MambaStateShapeCalculator: conv_kernel_size - 1 + num_spec, ) - # In V0, the conv_state shape was swapped during allocation in - # MambaCacheManager, but in V1 it needs to be determined here at the - # calculation level - if use_v1: - conv_state_shape = conv_state_shape[1], conv_state_shape[0] + conv_state_shape = conv_state_shape[1], conv_state_shape[0] temporal_state_shape = (divide(num_v_heads, tp_world_size), head_k_dim, head_v_dim) diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index 8cfd0962c5bfe..010fcdda156c2 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -420,9 +420,7 @@ def causal_conv1d_fn( x = x.to(conv_states.dtype) out = torch.empty_like(x) if metadata is not None: - cu_seqlen = metadata.cu_seqlen nums_dict = metadata.nums_dict - #x = metadata.x args = nums_dict batch_ptr = metadata.batch_ptr token_chunk_offset_ptr = metadata.token_chunk_offset_ptr @@ -926,7 +924,6 @@ def causal_conv1d_update( query_start_loc: Optional[torch.Tensor] = None, max_query_len: int = -1, pad_slot_id: int = PAD_SLOT_ID, - metadata=None, validate_data=False, ): """ diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index 335191a5c82c1..ffdcd702aab40 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -8,7 +8,6 @@ if TYPE_CHECKING: import torch -from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size @@ -18,7 +17,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.mamba.abstract import MambaBase -from vllm.model_executor.layers.mamba.mamba2_metadata import update_metadata from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( @@ -71,15 +69,11 @@ class ShortConv(MambaBase, CustomOp): prefix=f"{prefix}.out_proj", ) - assert envs.VLLM_USE_V1, ("ShortConv layers are only supported in V1") compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") compilation_config.static_forward_context[prefix] = self - # The outer list is for v0 PP virtual engine. Though this code path - # only runs for v1, we have to do this to unify with the interface - # of Attention + v0 PP. 
- self.kv_cache = [(torch.tensor([]), )] + self.kv_cache = (torch.tensor([]), ) self.model_config = model_config self.cache_config = cache_config @@ -89,7 +83,6 @@ class ShortConv(MambaBase, CustomOp): self, hidden_states: torch.Tensor, output: torch.Tensor, - conv_metadata: ShortConvAttentionMetadata, ): return @@ -97,7 +90,6 @@ class ShortConv(MambaBase, CustomOp): self, hidden_states: torch.Tensor, output: torch.Tensor, - conv_metadata: ShortConvAttentionMetadata, ): torch.ops.vllm.short_conv( hidden_states, @@ -109,7 +101,6 @@ class ShortConv(MambaBase, CustomOp): self, hidden_states: torch.Tensor, output: torch.Tensor, - conv_metadata: ShortConvAttentionMetadata, ): forward_context = get_forward_context() # ShortConvAttentionMetadata contains metadata necessary for the @@ -121,7 +112,6 @@ class ShortConv(MambaBase, CustomOp): if attn_metadata is not None: assert isinstance(attn_metadata, dict) attn_metadata = attn_metadata[self.prefix] - conv_metadata = attn_metadata assert isinstance(attn_metadata, ShortConvAttentionMetadata) self_kv_cache = self.kv_cache[forward_context.virtual_engine] conv_state = self_kv_cache[0].transpose(-1, -2) @@ -181,9 +171,6 @@ class ShortConv(MambaBase, CustomOp): if has_prefill: Bx_p = (B_p * x_p).transpose(0, 1) - if conv_metadata.cu_seqlen is None: - conv_metadata = update_metadata(Bx_p, query_start_loc_p, - conv_metadata) Bx = causal_conv1d_fn(Bx_p, conv_weights, self.conv.bias, @@ -191,7 +178,7 @@ class ShortConv(MambaBase, CustomOp): conv_states=conv_state, has_initial_state=has_initial_states_p, cache_indices=state_indices_tensor_p, - metadata=conv_metadata, + metadata=attn_metadata, query_start_loc=query_start_loc_p).transpose( 0, 1)[:num_prefill_tokens] @@ -248,9 +235,7 @@ def short_conv( ) -> None: forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - self.forward_cuda(hidden_states=hidden_states, - output=output, - conv_metadata=None) + self.forward_cuda(hidden_states=hidden_states, output=output) def short_conv_fake( diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 584981ef3ebfd..4a6154dc548aa 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -9,21 +9,17 @@ import torch from torch import nn from transformers import BambaConfig -from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) @@ -32,10 +28,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.sequence import IntermediateTensors -from vllm.utils import LayerBlockType from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, SupportsQuant) @@ -115,8 +108,6 @@ class BambaMixerDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): if residual is None: @@ -127,7 +118,7 @@ class BambaMixerDecoderLayer(nn.Module): hidden_states, residual) output = torch.empty_like(hidden_states) - self.mamba(hidden_states, output, mamba_cache_params, mamba2_metadata) + self.mamba(hidden_states, output) # Fully Connected hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) @@ -315,22 +306,10 @@ class BambaModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - attn_metadata = get_forward_context().attn_metadata - - if not envs.VLLM_USE_V1: - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) - else: - # v1 get mamba2_metadata from forward_context - mamba2_metadata = None - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds @@ -343,23 +322,11 @@ class BambaModel(nn.Module): residual = intermediate_tensors["residual"] residual = None - num_attn = 0 for i, layer in enumerate(self.layers): - if isinstance(layer, BambaAttentionDecoderLayer): - num_attn += 1 - - layer_mamba_cache_params = None - if isinstance(layer, - BambaMixerDecoderLayer) and mamba_cache_params: - layer_mamba_cache_params = mamba_cache_params.at_layer_idx( - i - num_attn) - hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, residual=residual, - mamba_cache_params=layer_mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) if not get_pp_group().is_last_rank: @@ -457,13 +424,11 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: @@ -482,7 +447,6 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, head_dim=hf_config.mamba_d_head, state_size=hf_config.mamba_d_state, conv_kernel=hf_config.mamba_d_conv, - use_v1=use_v1, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -515,8 +479,6 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, if not lora_config else lora_config.lora_vocab_padding_size, prefix=maybe_prefix(prefix, "lm_head"), ) - # Used to track and store by the Mamba cache between steps. 
- self.mamba_cache: Optional[MambaCacheManager] = None self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) @@ -534,39 +496,11 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - mamba_cache_params = None - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - num_mamba_layers = \ - self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, - LayerBlockType.mamba - ) - mamba_state_shape = \ - self.get_mamba_state_shape_from_config( - self.vllm_config, use_v1=False) - mamba_state_dtype = \ - self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_mamba_layers, - *mamba_state_shape, - *mamba_state_dtype) - - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - - hidden_states = self.model(input_ids, positions, mamba_cache_params, - intermediate_tensors, inputs_embeds) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - return self.mamba_cache.copy_inputs_before_cuda_graphs( - input_buffers, **kwargs) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py deleted file mode 100644 index f03c58a12932f..0000000000000 --- a/vllm/model_executor/models/constant_size_cache.py +++ /dev/null @@ -1,137 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from abc import ABC, abstractmethod -from typing import Any - -import torch - -from vllm.attention.backends.utils import PAD_SLOT_ID - - -class ConstantSizeCache(ABC): - """ - Abstract base class for managing constant size caches - like Mamba and Minimax. - """ - - def __init__(self, max_batch_size: int): - # Maps between the request id and a dict that maps between the seq_id - # and its index inside the cache - self.cache_indices_mapping: dict[str, dict[int, int]] = {} - self.free_cache_indices = list(range(max_batch_size)) - - @property - @abstractmethod - def cache(self) -> Any: - """Return the underlying cache tensor(s)""" - pass - - @abstractmethod - def _copy_cache(self, from_index: int, to_index: int): - """Copy cache data from one index to another""" - pass - - def current_run_tensors(self, **kwargs) -> tuple: - """ - Return the tensors for the current run's conv and ssm state. 
- """ - if "seqlen_agnostic_capture_inputs" not in kwargs: - # We get here only on Prefill/Eager mode runs - request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] - finished_requests_ids = kwargs["finished_requests_ids"] - - self._release_finished_requests(finished_requests_ids) - state_indices = self._prepare_current_run_cache( - request_ids_to_seq_ids, finished_requests_ids) - - state_indices_tensor = torch.as_tensor(state_indices, - dtype=torch.int32, - device="cuda") - cache_tensors = self.cache - else: - # CUDA graph capturing runs - cache_tensors, state_indices_tensor = kwargs[ - "seqlen_agnostic_capture_inputs"] - - return (cache_tensors, state_indices_tensor) - - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - """ - Copy the relevant state_indices into the CUDA graph input buffer - """ - assert all( - key in kwargs - for key in ["request_ids_to_seq_ids", "finished_requests_ids"]) - finished_requests_ids = kwargs["finished_requests_ids"] - request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] - assert "seqlen_agnostic_capture_inputs" in input_buffers - _, input_state_indices_buffer = input_buffers[ - "seqlen_agnostic_capture_inputs"] - - self._release_finished_requests(finished_requests_ids) - state_indices = self._prepare_current_run_cache( - request_ids_to_seq_ids, finished_requests_ids) - cuda_graph_pad_len = input_state_indices_buffer.shape[0] - len( - state_indices) - state_indices.extend([PAD_SLOT_ID] * cuda_graph_pad_len) - - input_state_indices_buffer.copy_( - torch.as_tensor(state_indices, dtype=torch.int32, device="cuda")) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - """ - Provide the CUDA graph capture runs with a buffer in adjusted size. - The buffer is used to maintain the Cache during the CUDA graph replay - runs. - """ - state_indices_tensor = torch.as_tensor([PAD_SLOT_ID] * batch_size, - dtype=torch.int32, - device="cuda") - return (self.cache, state_indices_tensor) - - def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int, - finished_requests_ids) -> int: - """ - Assign (req_id,seq_id) pair to a `destination_index` index, if - already occupied, move the occupying index to a free index. 
- """ - if cur_rid in finished_requests_ids: - # set as pad, do not allocate destination index - return PAD_SLOT_ID - elif cur_rid not in self.cache_indices_mapping: - destination_index = self.free_cache_indices.pop() - self.cache_indices_mapping[cur_rid] = {seq_id: destination_index} - return destination_index - elif seq_id not in (seq_ids2indices := - self.cache_indices_mapping[cur_rid]): - # parallel sampling , where n > 1, assume prefill have - # already happened, so we copy the - # existing cache into the siblings seq_ids caches - index_exists = next(iter(seq_ids2indices.values())) - # case of decoding n>1, copy prefill cache to decoding indices - destination_index = self.free_cache_indices.pop() - self._copy_cache(from_index=index_exists, - to_index=destination_index) - self.cache_indices_mapping[cur_rid][seq_id] = destination_index - return destination_index - else: - return self.cache_indices_mapping[cur_rid][seq_id] - - def _prepare_current_run_cache( - self, request_ids_to_seq_ids: dict[str, list[int]], - finished_requests_ids: list[str]) -> list[int]: - return [ - self._assign_seq_id_to_cache_index(req_id, seq_id, - finished_requests_ids) - for req_id, seq_ids in request_ids_to_seq_ids.items() - for seq_id in seq_ids - ] - - def _release_finished_requests(self, - finished_seq_groups_req_ids: list[str]): - for req_id in finished_seq_groups_req_ids: - if req_id in self.cache_indices_mapping: - for seq_id in self.cache_indices_mapping[req_id]: - self.free_cache_indices.append( - self.cache_indices_mapping[req_id][seq_id]) - self.cache_indices_mapping.pop(req_id) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 83efdd2e433fc..f382018e2222c 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -8,21 +8,17 @@ import torch from torch import nn from transformers import FalconH1Config -from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) @@ -31,8 +27,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP @@ -179,16 +173,12 @@ class FalconH1SSMDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: 
MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): output = torch.empty_like(hidden_states) self.mamba( hidden_states, output, - mamba_cache_params, - mamba2_metadata=mamba2_metadata, mup_vector=self.mup_vector, ) return output, residual @@ -364,8 +354,6 @@ class FalconH1ParallelHybrid(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): residual = hidden_states @@ -382,12 +370,10 @@ class FalconH1ParallelHybrid(nn.Module): # Process input through the SSM branch. # FalconH1SSMDecoderLayer expects hidden_states, attn_metadata, - # residual, mamba_cache_params, and sequence_idx. + # residual, and sequence_idx. ssm_hidden, _ = self.mamba( hidden_states=hidden_states * self.ssm_in_multiplier, residual=residual, - mamba_cache_params=mamba_cache_params, - mamba2_metadata=mamba2_metadata, **kwargs, ) # Sum the outputs from both branches. @@ -464,25 +450,10 @@ class FalconH1Model(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - # pass a sequence index tensor, that is required for - # proper continuous batching computation including - # chunked prefill - attn_metadata = get_forward_context().attn_metadata - - if not envs.VLLM_USE_V1: - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) - else: - # v1 get mamba2_metadata from forward_context - mamba2_metadata = None - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds * self.embedding_multiplier @@ -495,14 +466,9 @@ class FalconH1Model(nn.Module): for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - layer_mamba_cache_params = None - if mamba_cache_params: - layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i) hidden_states = layer( positions=positions, hidden_states=hidden_states, - mamba_cache_params=layer_mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -541,13 +507,11 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. 
Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: @@ -570,7 +534,6 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, head_dim=hf_config.mamba_d_head, state_size=hf_config.mamba_d_state, conv_kernel=hf_config.mamba_d_conv, - use_v1=use_v1, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -592,7 +555,6 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, prefix=maybe_prefix(prefix, "model")) self.tie_word_embeddings = config.tie_word_embeddings self.unpadded_vocab_size = config.vocab_size - self.mamba_cache: Optional[MambaCacheManager] = None if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size if get_pp_group().is_last_rank: @@ -637,40 +599,15 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, **kwargs, ): - mamba_cache_params = None - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - mamba_state_shape = \ - self.get_mamba_state_shape_from_config( - self.vllm_config, use_v1=False) - mamba_state_dtype = \ - self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager( - self.vllm_config, - self.config.num_hidden_layers, - *mamba_state_shape, - *mamba_state_dtype, - ) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - hidden_states = self.model( input_ids, positions, - mamba_cache_params, intermediate_tensors, inputs_embeds, ) return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - return self.mamba_cache.copy_inputs_before_cuda_graphs( - input_buffers, **kwargs) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index e89a1a4a0f7d3..f5751fe47bb8b 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -9,19 +9,15 @@ import torch from torch import nn from transformers import GraniteMoeHybridConfig -from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) @@ -30,10 +26,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.sequence import IntermediateTensors -from vllm.utils import 
LayerBlockType from .granitemoe import GraniteMoeMoE from .granitemoeshared import GraniteMoeSharedMLP @@ -102,14 +95,12 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) output = torch.empty_like(hidden_states) - self.mamba(hidden_states, output, mamba_cache_params, mamba2_metadata) + self.mamba(hidden_states, output) hidden_states = residual + output * self.residual_multiplier residual = hidden_states @@ -182,8 +173,6 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, ) -> torch.Tensor: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -366,22 +355,10 @@ class GraniteMoeHybridModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - attn_metadata = get_forward_context().attn_metadata - - if not envs.VLLM_USE_V1: - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) - else: - # v1 get mamba2_metadata from forward_context - mamba2_metadata = None - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds @@ -399,20 +376,9 @@ class GraniteMoeHybridModel(nn.Module): for i, layer in enumerate(self.layers): if isinstance(layer, GraniteMoeHybridAttentionDecoderLayer): num_attn += 1 - - layer_mamba_cache_params = None - if isinstance( - layer, - GraniteMoeHybridMambaDecoderLayer) and mamba_cache_params: - layer_mamba_cache_params = mamba_cache_params.at_layer_idx( - i - num_attn) - - hidden_states, residual = layer( - positions=positions, - hidden_states=hidden_states, - residual=residual, - mamba_cache_params=layer_mamba_cache_params, - mamba2_metadata=mamba2_metadata) + hidden_states, residual = layer(positions=positions, + hidden_states=hidden_states, + residual=residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -552,13 +518,11 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: @@ -577,7 +541,6 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, head_dim=hf_config.mamba_d_head, state_size=hf_config.mamba_d_state, conv_kernel=hf_config.mamba_d_conv, - use_v1=use_v1, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -620,9 +583,6 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, scale=1 / self.config.logits_scaling) - # Used to track and store by the Mamba cache between steps. 
- self.mamba_cache: Optional[MambaCacheManager] = None - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -636,38 +596,11 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - mamba_cache_params = None - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - num_mamba_layers = ( - self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, - LayerBlockType.mamba)) - mamba_state_shape = \ - self.get_mamba_state_shape_from_config( - self.vllm_config, use_v1=False) - mamba_state_dtype = \ - self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_mamba_layers, - *mamba_state_shape, - *mamba_state_dtype) - - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - - hidden_states = self.model(input_ids, positions, mamba_cache_params, - intermediate_tensors, inputs_embeds) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - return self.mamba_cache.copy_inputs_before_cuda_graphs( - input_buffers, **kwargs) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 12a49029195ff..e8277e259bc5b 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -9,7 +9,6 @@ import torch from torch import nn from transformers import JambaConfig -from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig @@ -30,10 +29,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaMLP as JambaMLP -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.sequence import IntermediateTensors -from vllm.utils import LayerBlockType from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, @@ -145,7 +141,6 @@ class JambaMambaDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, **kwargs, ): if residual is None: @@ -156,7 +151,7 @@ class JambaMambaDecoderLayer(nn.Module): hidden_states, residual) output = torch.empty_like(hidden_states) - self.mamba(hidden_states, output, mamba_cache_params) + self.mamba(hidden_states, output) # Fully Connected hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) @@ -333,7 +328,6 @@ class JambaModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -348,24 +342,11 @@ class JambaModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = 
intermediate_tensors["residual"] - kv_cache_index = 0 - mamba_cache_index = 0 for layer in islice(self.layers, self.start_layer, self.end_layer): - layer_mamba_cache_params = None - if isinstance(layer, JambaAttentionDecoderLayer): - kv_cache_index += 1 - if isinstance(layer, - JambaMambaDecoderLayer) and mamba_cache_params: - current_state_layer = mamba_cache_index - layer_mamba_cache_params = mamba_cache_params.at_layer_idx( - current_state_layer) - mamba_cache_index += 1 + hidden_states, residual = layer(positions=positions, + hidden_states=hidden_states, + residual=residual) - hidden_states, residual = layer( - positions=positions, - hidden_states=hidden_states, - residual=residual, - mamba_cache_params=layer_mamba_cache_params) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, @@ -503,8 +484,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, if not lora_config else lora_config.lora_vocab_padding_size, prefix=maybe_prefix(prefix, "lm_head"), ) - # Used to track and store by the Mamba cache between steps. - self.mamba_cache: Optional[MambaCacheManager] = None self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) @@ -521,24 +500,9 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - # NOTE: mamba_cache_params is not needed for v1 - mamba_cache_params = None - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - num_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - state_shape = self.get_mamba_state_shape_from_config( - self.vllm_config) - state_dtype = self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_layers, *state_shape, - *state_dtype) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - - hidden_states = self.model(input_ids, positions, mamba_cache_params, - intermediate_tensors, inputs_embeds) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) return hidden_states def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): @@ -574,7 +538,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, intermediate_size=hf_config.mamba_expand * hidden_size, state_size=hf_config.mamba_d_state, conv_kernel=hf_config.mamba_d_conv, - use_v1=envs.VLLM_USE_V1, ) def compute_logits( diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index dd97afbeb668a..53c36e4e52d81 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -8,7 +8,6 @@ import torch import torch.nn as nn from transformers import Lfm2Config -from vllm import envs from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig @@ -297,7 +296,6 @@ class Lfm2ShortConvDecoderLayer(nn.Module): self.conv( hidden_states, output, - conv_metadata=None, ) hidden_states, residual = self.ffn_norm(output, residual) hidden_states = self.feed_forward(hidden_states) @@ -459,13 +457,11 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int]]: """ Calculate shapes for LFM2's 
convolutional cache. Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: @@ -478,7 +474,6 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, tp_world_size=parallel_config.tensor_parallel_size, intermediate_size=hf_config.conv_dim, conv_kernel=hf_config.conv_L_cache, - use_v1=use_v1, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -489,8 +484,6 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, scheduler_config = vllm_config.scheduler_config assert (not cache_config.enable_prefix_caching ), "Lfm2 currently does not support prefix caching" - assert envs.VLLM_USE_V1, ( - "Lfm2ForCausalLM doesn't support vLLM v0. Please enable v1") super().__init__() self.config = config diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 36141a5d50641..5bd268291c7d9 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -8,7 +8,6 @@ import torch from torch import nn from transformers import MambaConfig -from vllm import envs from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group @@ -24,10 +23,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, IsAttentionFree, SupportsPP) -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.sequence import IntermediateTensors -from vllm.utils import LayerBlockType from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -72,7 +68,6 @@ class MambaDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, **kwargs, ): if residual is None: @@ -82,7 +77,7 @@ class MambaDecoderLayer(nn.Module): hidden_states, residual = self.norm(hidden_states, residual) output = torch.empty_like(hidden_states) - self.mixer(hidden_states, output, mamba_cache_params) + self.mixer(hidden_states, output) return output, residual @@ -134,7 +129,6 @@ class MambaModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: Optional[MambaCacheParams] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -151,17 +145,9 @@ class MambaModel(nn.Module): for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - - layer_cache_params = None - if mamba_cache_params is not None: - layer_cache_params = mamba_cache_params.at_layer_idx( - i - self.start_layer) - - hidden_states, residual = layer( - positions=positions, - hidden_states=hidden_states, - residual=residual, - mamba_cache_params=layer_cache_params) + hidden_states, residual = layer(positions=positions, + hidden_states=hidden_states, + residual=residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, @@ -225,9 +211,6 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): prefix=maybe_prefix(prefix, "lm_head"), ) - # Used to track and store by the Mamba cache between steps. 
- self.mamba_cache: Optional[MambaCacheManager] = None - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) @@ -244,22 +227,7 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - mamba_cache_params = None - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - num_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - state_shape = self.get_mamba_state_shape_from_config( - self.vllm_config) - state_dtype = self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_layers, *state_shape, - *state_dtype) - - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - - hidden_states = self.backbone(input_ids, positions, mamba_cache_params, + hidden_states = self.backbone(input_ids, positions, intermediate_tensors, inputs_embeds) return hidden_states @@ -288,8 +256,7 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): tp_world_size=parallel_config.tensor_parallel_size, intermediate_size=hf_config.intermediate_size, state_size=hf_config.state_size, - conv_kernel=hf_config.conv_kernel, - use_v1=envs.VLLM_USE_V1) + conv_kernel=hf_config.conv_kernel) def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): return self.mamba_cache.copy_inputs_before_cuda_graphs( diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 9c3108146d2e5..97e9c5785e726 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -8,16 +8,11 @@ import torch from torch import nn from transformers import MambaConfig -from vllm import envs -from vllm.attention.backends.abstract import AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) @@ -28,10 +23,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, IsAttentionFree) -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.sequence import IntermediateTensors -from vllm.utils import LayerBlockType from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -74,8 +66,6 @@ class Mamba2DecoderLayer(nn.Module): self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): if residual is None: @@ -85,7 +75,7 @@ class Mamba2DecoderLayer(nn.Module): hidden_states, residual = self.norm(hidden_states, residual) output = torch.empty_like(hidden_states) - self.mixer(hidden_states, output, mamba_cache_params, mamba2_metadata) + self.mixer(hidden_states, 
output) return output, residual @@ -137,7 +127,6 @@ class Mamba2Model(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -152,25 +141,10 @@ class Mamba2Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - attn_metadata: AttentionMetadata = get_forward_context().attn_metadata - - if not envs.VLLM_USE_V1: - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.chunk_size, - attn_metadata=attn_metadata, - ) - else: - # v1 get mamba2_metadata from forward_context - mamba2_metadata = None - for i, layer in enumerate(self.layers): - hidden_states, residual = layer( - positions=positions, - hidden_states=hidden_states, - residual=residual, - mamba_cache_params=mamba_cache_params.at_layer_idx( - i - self.start_layer) if mamba_cache_params else None, - mamba2_metadata=mamba2_metadata) + hidden_states, residual = layer(positions=positions, + hidden_states=hidden_states, + residual=residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -222,13 +196,11 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: @@ -247,7 +219,6 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): head_dim=hf_config.head_dim, state_size=hf_config.state_size, conv_kernel=hf_config.conv_kernel, - use_v1=use_v1, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -282,9 +253,6 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): if config.tie_word_embeddings: self.lm_head = self.lm_head.tie_weights(self.backbone.embeddings) - # Used to track and store by the Mamba cache between steps. 
- self.mamba_cache: Optional[MambaCacheManager] = None - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) @@ -300,29 +268,8 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - num_mamba_layers = ( - self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, - LayerBlockType.mamba)) - mamba_state_shape = \ - self.get_mamba_state_shape_from_config( - self.vllm_config, use_v1=False) - mamba_state_dtype = \ - self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_mamba_layers, - *mamba_state_shape, - *mamba_state_dtype) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - else: - # NOTE: mamba_cache_params is not needed for v1 - mamba_cache_params = None - - hidden_states = self.backbone(input_ids, positions, mamba_cache_params, + hidden_states = self.backbone(input_ids, positions, intermediate_tensors, inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py deleted file mode 100644 index 6b16e3ce7d984..0000000000000 --- a/vllm/model_executor/models/mamba_cache.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass - -import torch - -from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.config import VllmConfig -from vllm.model_executor.models.constant_size_cache import ConstantSizeCache - - -@dataclass -class MambaCacheParams: - conv_state: torch.Tensor = torch.Tensor() - ssm_state: torch.Tensor = torch.Tensor() - state_indices_tensor: torch.Tensor = torch.Tensor() - - def at_layer_idx(self, layer_idx): - return MambaCacheParams(self.conv_state[layer_idx], - self.ssm_state[layer_idx], - self.state_indices_tensor) - - -class MambaCacheManager(ConstantSizeCache): - - def __init__(self, vllm_config: VllmConfig, num_mamba_layers: int, - conv_state_shape: tuple[int, int], - temporal_state_shape: tuple[int, int], - conv_state_dtype: torch.dtype, - temporal_state_dtype: torch.dtype): - - self.conv_state_dtype = conv_state_dtype - self.temporal_state_dtype = temporal_state_dtype - - # Determine max batch size to set size of MambaCache - max_batch_size = vllm_config.scheduler_config.max_num_seqs - if not vllm_config.model_config.enforce_eager: - max_batch_size = vllm_config.pad_for_cudagraph(max_batch_size) - - # Initialize parent class - super().__init__(max_batch_size) - - # assume conv_state = (dim, state_len) - assert conv_state_shape[0] > conv_state_shape[1] - conv_state = torch.empty(size=(num_mamba_layers, max_batch_size) + - (conv_state_shape[1], conv_state_shape[0]), - dtype=self.conv_state_dtype, - device="cuda").transpose(-1, -2) - temporal_state = torch.empty(size=(num_mamba_layers, max_batch_size) + - temporal_state_shape, - dtype=self.temporal_state_dtype, - device="cuda") - - self._mamba_cache = (conv_state, temporal_state) - - @property - def cache(self): - return self._mamba_cache - - def _copy_cache(self, from_index: int, to_index: int): - for cache_t in self.cache: - cache_t[:, to_index].copy_(cache_t[:, from_index], - non_blocking=True) - - def current_run_tensors(self, **kwargs) -> MambaCacheParams: - """ - Return the tensors for 
the current run's conv and ssm state. - """ - cache_tensors, state_indices_tensor = super().current_run_tensors( - **kwargs) - return MambaCacheParams(cache_tensors[0], cache_tensors[1], - state_indices_tensor) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - """ - Provide the CUDA graph capture runs with a buffer in adjusted size. - The buffer is used to maintain the Mamba Cache during the CUDA graph - replay runs. - """ - return self._mamba_cache, torch.as_tensor([PAD_SLOT_ID] * batch_size, - dtype=torch.int32, - device="cuda") diff --git a/vllm/model_executor/models/minimax_cache.py b/vllm/model_executor/models/minimax_cache.py deleted file mode 100644 index 9164ac06a3b0a..0000000000000 --- a/vllm/model_executor/models/minimax_cache.py +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from dataclasses import dataclass - -import torch - -from vllm.model_executor.models.constant_size_cache import ConstantSizeCache - - -@dataclass -class MinimaxCacheParams: - minimax_cache: torch.Tensor = torch.Tensor() - state_indices_tensor: torch.Tensor = torch.Tensor() - - def at_layer_idx(self, layer_idx): - return MinimaxCacheParams(self.minimax_cache[layer_idx, ...], - self.state_indices_tensor) - - -class MinimaxCacheManager(ConstantSizeCache): - - def __init__(self, dtype, cache_shape): - super().__init__(cache_shape[1]) # max_batch_size is cache_shape[1] - self._minimax_cache = torch.empty(size=cache_shape, - dtype=dtype, - device="cuda") - - @property - def cache(self): - return self._minimax_cache - - def _copy_cache(self, from_index: int, to_index: int): - assert len(self.cache) > 0 - for cache_t in self.cache: - cache_t[:, to_index].copy_(cache_t[:, from_index], - non_blocking=True) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 1d2c7dea811e0..cc9a959f63313 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -14,7 +14,6 @@ import torch.distributed from torch import nn from transformers import MiniMaxConfig -from vllm import envs from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig @@ -44,7 +43,6 @@ from vllm.model_executor.models.utils import maybe_prefix from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid -from .minimax_cache import MinimaxCacheManager, MinimaxCacheParams from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers @@ -404,7 +402,6 @@ class MiniMaxText01DecoderLayer(nn.Module): def forward(self, hidden_states: torch.Tensor, positions: torch.Tensor, - kv_caches: Union[list[dict], Optional[torch.Tensor]], attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], is_warmup: bool = False, @@ -418,7 +415,6 @@ class MiniMaxText01DecoderLayer(nn.Module): hidden_states=layernorm_output, output=self_attention_output, positions=positions, - kv_caches=kv_caches, ) residual = residual * self.layernorm_attention_alpha @@ -563,10 +559,6 @@ class MiniMaxText01Model(nn.Module): self._dtype = _dummy.dtype del _dummy - if not envs.VLLM_USE_V1: - self.minimax_cache = MinimaxCacheManager( - dtype=torch.float32, cache_shape=self.cache_shape) - norm_kwargs = {} if hasattr(config, "rms_norm_eps"): norm_kwargs["eps"] = config.rms_norm_eps @@ -614,25 +606,6 @@ class 
MiniMaxText01Model(nn.Module): **kwargs) -> Union[torch.Tensor, IntermediateTensors]: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata - if not envs.VLLM_USE_V1 and attn_metadata is None: - return None - if not envs.VLLM_USE_V1: - if "request_ids_to_seq_ids" not in kwargs: - kwargs["request_ids_to_seq_ids"] = {} - if "finished_requests_ids" not in kwargs: - kwargs["finished_requests_ids"] = [] - ( - minimax_cache_tensors, - state_indices_tensor, - ) = self.minimax_cache.current_run_tensors(**kwargs) - if getattr(attn_metadata, "num_prefills", 0) > 0: - self._clear_prefill_cache(attn_metadata, minimax_cache_tensors, - **kwargs) - - minimax_cache_params = MinimaxCacheParams(minimax_cache_tensors, - state_indices_tensor) - else: - minimax_cache_params = None if get_pp_group().is_first_rank: if inputs_embeds is None: @@ -645,20 +618,10 @@ class MiniMaxText01Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - minimax_cache_index = 0 - for layer in islice(self.layers, self.start_layer, self.end_layer): - _caches = None - if not envs.VLLM_USE_V1 and isinstance( - layer.self_attn, MiniMaxText01LinearAttention): - current_state_layer = minimax_cache_index - _caches = minimax_cache_params.at_layer_idx( - current_state_layer) - minimax_cache_index += 1 hidden_states, residual = layer( hidden_states=hidden_states, positions=positions, - kv_caches=_caches, attn_metadata=attn_metadata, residual=residual, ) @@ -1003,13 +966,11 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, ...], ...]: """Calculate shape for MiniMaxText01LinearAttention cache. 
Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index ff571541a60a5..987920ecc3310 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -23,21 +23,17 @@ from typing import Optional import torch from torch import nn -from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import ReLUSquaredActivation from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) @@ -49,14 +45,11 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, SupportsQuant) -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import NemotronHConfig -from vllm.utils import LayerBlockType class NemotronHMLP(nn.Module): @@ -181,8 +174,6 @@ class NemotronHMambaDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): if residual is None: @@ -192,7 +183,7 @@ class NemotronHMambaDecoderLayer(nn.Module): hidden_states, residual = self.norm(hidden_states, residual) output = torch.empty_like(hidden_states) - self.mixer(hidden_states, output, mamba_cache_params, mamba2_metadata) + self.mixer(hidden_states, output) return output, residual @@ -370,22 +361,10 @@ class NemotronHModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - attn_metadata = get_forward_context().attn_metadata - - if not envs.VLLM_USE_V1: - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.chunk_size, - attn_metadata=attn_metadata, - ) - else: - # v1 get mamba2_metadata from forward_context - mamba2_metadata = None - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds @@ -398,22 +377,11 @@ class NemotronHModel(nn.Module): residual = intermediate_tensors["residual"] residual = None - num_non_mamba_layers = 0 for i, layer in enumerate(self.layers): - layer_mamba_cache_params = None - if isinstance(layer, - NemotronHMambaDecoderLayer) and mamba_cache_params: - layer_mamba_cache_params = mamba_cache_params.at_layer_idx( - i - 
num_non_mamba_layers) - else: - num_non_mamba_layers += 1 - hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, residual=residual, - mamba_cache_params=layer_mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) if not get_pp_group().is_last_rank: @@ -508,13 +476,11 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: @@ -533,7 +499,6 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, head_dim=hf_config.mamba_head_dim, state_size=hf_config.ssm_state_size, conv_kernel=hf_config.conv_kernel, - use_v1=use_v1, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -566,8 +531,6 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, if not lora_config else lora_config.lora_vocab_padding_size, prefix=maybe_prefix(prefix, "lm_head"), ) - # Used to track and store by the Mamba cache between steps. - self.mamba_cache: Optional[MambaCacheManager] = None self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) @@ -584,40 +547,11 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - mamba_cache_params = None - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - - num_mamba_layers = \ - self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, - LayerBlockType.mamba - ) - mamba_state_shape = \ - self.get_mamba_state_shape_from_config( - self.vllm_config, use_v1=False) - mamba_state_dtype = \ - self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_mamba_layers, - *mamba_state_shape, - *mamba_state_dtype) - - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - - hidden_states = self.model(input_ids, positions, mamba_cache_params, - intermediate_tensors, inputs_embeds) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - return self.mamba_cache.copy_inputs_before_cuda_graphs( - input_buffers, **kwargs) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py deleted file mode 100644 index ae153558e37aa..0000000000000 --- a/vllm/model_executor/models/phi4flash.py +++ /dev/null @@ -1,731 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math -from collections.abc import Iterable -from typing import Optional, Union - -import torch -import torch.nn as nn -from transformers.activations import ACT2FN - -import vllm.envs as envs -from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.selector import _Backend -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.forward_context import ForwardContext, 
get_forward_context -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( - causal_conv1d_fn, causal_conv1d_update) -from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( - selective_scan_fn, selective_state_update) -from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, - SupportsV0Only) -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) -from vllm.sequence import IntermediateTensors - -from .utils import make_layers, maybe_prefix - -logger = init_logger(__name__) - - -class SwiGLUActivation(nn.Module): - - def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - return x1 * nn.functional.silu(x2) - - -class SambaYMLP(nn.Module): - """Gated Linear Unit. - - Reference: - Language Modeling with Gated Convolutional Networks. - https://arxiv.org/pdf/1612.08083v3.pdf. - - """ - - def __init__(self, config): - super().__init__() - - self.config = config - self.fc1 = nn.Linear(config.hidden_size, - 2 * config.intermediate_size, - bias=False) - self.fc2 = nn.Linear(config.intermediate_size, - config.hidden_size, - bias=False) - - self.activation_fn = ACT2FN[config.hidden_act] - - def forward(self, hidden_states): - y = self.fc1(hidden_states) - gate, y = y.chunk(2, dim=-1) - y = y * self.activation_fn(gate) - return self.fc2(y) - - -def get_virtual_engine(): - forward_context: ForwardContext = get_forward_context() - return forward_context.virtual_engine - - -class SambaYAttention(nn.Module): - - def __init__(self, - config, - layer_idx: Optional[int] = None, - yoco_cross: bool = False, - cache_config: Optional[CacheConfig] = None, - prefix: str = ""): - super().__init__() - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing " - "a `layer_idx` is not recommended and will lead to errors " - "during the forward call if caching is used. 
Please make " - "sure to provide a `layer_idx` when creating this class.") - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.yoco_cross = yoco_cross - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError("hidden_size must be divisible by num_heads " - f"(got `hidden_size`: {self.hidden_size} and " - f"`num_heads`: {self.num_heads}).") - - op_size = self.num_heads * self.head_dim + 2 * ( - self.num_key_value_heads * self.head_dim) - self.out_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=True) - if yoco_cross: - self.Wqkv = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=True) - else: - self.Wqkv = nn.Linear(self.hidden_size, op_size, bias=True) - - # disable sliding window for the second half of the model - is_sliding = config.layer_types[layer_idx] == "sliding_attention" - sliding_window = config.sliding_window if is_sliding else None - - assert self.num_heads % 2 == 0, 'num_heads should be even' - assert self.num_key_value_heads % 2 == 0, 'num_heads should be even' - - self.lambda_init = self.lambda_init_fn(layer_idx) - self.lambda_q1 = nn.Parameter( - torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, - std=0.1)) - self.lambda_k1 = nn.Parameter( - torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, - std=0.1)) - self.lambda_q2 = nn.Parameter( - torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, - std=0.1)) - self.lambda_k2 = nn.Parameter( - torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, - std=0.1)) - self.subln = nn.RMSNorm(2 * self.head_dim, - eps=1e-5, - elementwise_affine=True) - - params = { - 'differential_flash_attention_config': { - 'lambda_init': self.lambda_init, - 'lambda_q1': self.lambda_q1, - 'lambda_k1': self.lambda_k1, - 'lambda_q2': self.lambda_q2, - 'lambda_k2': self.lambda_k2, - "subln": self.subln, - } - } - - if yoco_cross: - kv_shared_layer_index = config.num_hidden_layers // 2 + 1 - kv_sharing_target_layer_name = \ - f"model.layers.{kv_shared_layer_index}.self_attn.attn" - else: - kv_sharing_target_layer_name = None - - self.attn = Attention( - self.num_heads, - self.head_dim, - self.head_dim**-0.5, - num_kv_heads=self.num_key_value_heads, - cache_config=cache_config, - per_layer_sliding_window=sliding_window, - prefix=f"{prefix}.attn", - attn_type=AttentionType.DECODER, - kv_sharing_target_layer_name=kv_sharing_target_layer_name, - **params) - assert self.attn.backend == _Backend.DIFFERENTIAL_FLASH_ATTN,\ - "DIFFERENTIAL_FLASH_ATTN required" - - def lambda_init_fn(self, depth): - return 0.8 - 0.6 * math.exp(-0.3 * depth) - - def forward( - self, - hidden_states: torch.Tensor, - ): - - if not self.yoco_cross: # need to generate kv-cache - qkv = self.Wqkv(hidden_states) - q, k, v = qkv.split([ - self.hidden_size, self.num_key_value_heads * self.head_dim, - self.num_key_value_heads * self.head_dim - ], - dim=-1) - attn_output = self.attn(q, k, v) - else: # reuse the kv cache, full attention - q = self.Wqkv(hidden_states) - attn_output = self.attn(q, None, None) - attn_output = attn_output.view(-1, self.num_heads * self.head_dim) - return self.out_proj(attn_output) - - -class Phi4Mamba(nn.Module): - - def __init__( - self, - d_model, - d_state=16, - d_conv=4, - expand=2, - dt_rank="auto", - dt_min=0.001, - dt_max=0.1, - dt_init="random", # difference - dt_scale=1.0, # difference - 
dt_init_floor=1e-4, - conv_bias=True, - bias=False, - use_fast_path=True, # Fused kernel options - layer_idx=None, - device=None, - dtype=None, - yoco_cross=False, - yoco_kv=False, - ): - factory_kwargs = {"params_dtype": dtype} # difference - super().__init__() - self.yoco_cross = yoco_cross - self.yoco_kv = yoco_kv - self.d_model = d_model - self.d_state = d_state - self.d_conv = d_conv - self.expand = expand - self.d_inner = int(self.expand * self.d_model) - self.dt_rank = math.ceil(self.d_model / - 16) if dt_rank == "auto" else dt_rank - self.use_fast_path = use_fast_path - self.layer_idx = layer_idx - self.swiGluActivation = SwiGLUActivation() - if self.yoco_cross: - self.in_proj = MergedColumnParallelLinear(self.d_model, - [self.d_inner], - bias=bias, - **factory_kwargs) - self.out_proj = RowParallelLinear(self.d_inner, - self.d_model, - bias=bias, - **factory_kwargs) - return - self.conv1d = ColumnParallelLinear( - input_size=d_conv, - output_size=self.d_inner, - bias=conv_bias, - params_dtype=dtype, - ) - # unsqueeze to fit conv1d weights shape into the linear weights shape. - # Can't do this in `weight_loader` since it already exists in - # `ColumnParallelLinear` and `set_weight_attrs` - # doesn't allow to override it - self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) - - self.in_proj = MergedColumnParallelLinear( - self.d_model, - [self.d_inner] * 2, - bias=bias, - params_dtype=dtype, - ) - - # selective projection used to make dt, B and C input dependent - self.x_proj = RowParallelLinear( - self.d_inner, - self.dt_rank + self.d_state * 2, - bias=False, - params_dtype=dtype, - ) - - # time step projection (discretization) - - # In the forward we need to apply dt_proj without the bias, - # as the bias is added in the selective scan kernel. - self.dt_proj = ColumnParallelLinear( - self.dt_rank, - self.d_inner, - bias=True, - skip_bias_add=True, - params_dtype=dtype, - ) - - # # D "skip" parameter - # self.D = nn.Parameter(torch.ones(self.d_inner)) # Keep in fp32 - self.A = nn.Parameter( - torch.empty( - self.d_inner, - self.d_state, - dtype=torch.float32, - )) - self.D = nn.Parameter(torch.ones(self.d_inner, dtype=torch.float32)) - - self.out_proj = RowParallelLinear( - self.d_inner, - self.d_model, - bias=bias, - input_is_parallel=True, - params_dtype=dtype, - ) - self.activation = "silu" - - def forward(self, - hidden_states: torch.Tensor, - attn_metadata: AttentionMetadata, - mamba_cache_params: MambaCacheParams, - yoco_key_values=None) -> torch.Tensor: - - if self.yoco_cross: - out = self.in_proj(hidden_states)[0] - out = self.swiGluActivation(yoco_key_values, out) - out = self.out_proj(out) - return out[0], yoco_key_values - - # 1. Gated MLP's linear projection - # projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) - projected_states = self.in_proj( - hidden_states.to(self.in_proj.weight.dtype))[0].transpose(-2, -1) - hidden_states, gate = projected_states.chunk(2, dim=-2) - - # 2. 
Convolution sequence transformation - conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), - self.conv1d.weight.size(2)) - - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - hidden_states = causal_conv1d_fn( - hidden_states, - conv_weights, - self.conv1d.bias, - activation=self.activation, - conv_states=mamba_cache_params.conv_state, - has_initial_state=attn_metadata.context_lens_tensor > 0, - cache_indices=mamba_cache_params.state_indices_tensor, - query_start_loc=attn_metadata.query_start_loc) - else: - hidden_states = causal_conv1d_update( - hidden_states.transpose(0, 1), - mamba_cache_params.conv_state, - conv_weights, - self.conv1d.bias, - self.activation, - conv_state_indices=mamba_cache_params.state_indices_tensor) - hidden_states = hidden_states.transpose(0, 1) - - # 3. State Space Model sequence transformation - # 3.a. input varying initialization of time_step, B and C - ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] - - time_step, B, C = torch.split( - ssm_parameters, - [self.dt_rank, self.d_state, self.d_state], - dim=-1, - ) - - # Note that Jamba normalizes B, C, and time_step here but Mamba doesn't. - - discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) - # 3.c perform the recurrence y ← SSM(A, B, C)(x) - time_proj_bias = (self.dt_proj.bias.float() if hasattr( - self.dt_proj, "bias") else None) - - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: - scan_outputs = selective_scan_fn( - hidden_states, - mamba_cache_params.ssm_state, - discrete_time_step, - self.A, - B.transpose(-2, -1), - C.transpose(-2, -1), - self.D.float(), - # z, - None if self.yoco_kv else gate, - time_proj_bias, - delta_softplus=True, - cache_indices=mamba_cache_params.state_indices_tensor, - has_initial_state=attn_metadata.context_lens_tensor > 0, - query_start_loc=attn_metadata.query_start_loc) - else: - scan_outputs = torch.empty_like(hidden_states.transpose(0, 1)) - selective_state_update( - mamba_cache_params.ssm_state, - hidden_states.transpose(0, 1), - discrete_time_step.transpose(0, 1), - self.A, - B, - C, - self.D, - # z - # gate.transpose(0, 1), - None if self.yoco_kv else gate.transpose(0, 1), - time_proj_bias, - dt_softplus=True, - state_batch_indices=mamba_cache_params.state_indices_tensor, - out=scan_outputs) - scan_outputs = scan_outputs.transpose(0, 1) - - # 4. 
Final linear projection - if self.yoco_kv: - # gate = gate.transpose(-1,-2).contiguous() - yoco_key_values = scan_outputs.transpose(-2, -1) - scan_outputs = self.swiGluActivation(scan_outputs, gate) - - contextualized_states = self.out_proj(scan_outputs.transpose(-2, - -1))[0] - - return contextualized_states, yoco_key_values - - -class SambaYDecoderLayer(nn.Module): - - def __init__( - self, - config, - layer_idx, - cache_config, - prefix: str = "", - ) -> None: - super().__init__() - - self.config = config - self.layer_idx = layer_idx - - self.mlp = SambaYMLP(config) - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - - self.yoco_mb = False - self.yoco_cross = False - if layer_idx >= config.num_hidden_layers // 2: - self.yoco_mb = True - self.yoco_cross = (layer_idx - >= (config.num_hidden_layers // 2 + 2)) - self.use_mamba = config.mb_per_layer > 0 and \ - layer_idx % config.mb_per_layer == 0 - if self.use_mamba: - factory_kwargs = {"dtype": None} - self.attn = Phi4Mamba(config.hidden_size, - layer_idx=layer_idx, - yoco_cross=self.yoco_cross, - yoco_kv=self.yoco_mb, - **factory_kwargs) - else: - self.attn = SambaYAttention(config, - layer_idx=layer_idx, - yoco_cross=self.yoco_cross, - cache_config=cache_config, - prefix=f"{prefix}.self_attn") - self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - positions: torch.Tensor, - attn_metadata: AttentionMetadata, - mamba_cache_params: MambaCacheParams, - ssm_output: Optional[torch.LongTensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if self.use_mamba: - assert mamba_cache_params is not None - else: - assert mamba_cache_params is None - - residual = hidden_states - hidden_states = self.input_layernorm( - hidden_states.to(dtype=self.input_layernorm.weight.dtype)) - - if self.use_mamba: - attn_outputs, ssm_output = self.attn(hidden_states, - attn_metadata, - mamba_cache_params, - yoco_key_values=ssm_output) - residual = residual.to(torch.float32) - else: - attn_outputs = self.attn(hidden_states, ) - hidden_states = residual + attn_outputs - residual = hidden_states - hidden_states = self.post_attention_layernorm( - hidden_states.to(dtype=self.post_attention_layernorm.weight.dtype)) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - return hidden_states, ssm_output - - -class SambaYModel(nn.Module): - - def __init__(self, - config, - cache_config=None, - quant_config=None, - lora_config=None, - prefix: str = "") -> None: - super().__init__() - self.config = config - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - - # Pipeline parallel is not supported since the second half of - # the layers share the kv cache. 
- if get_pp_group().world_size != 1: - raise ValueError("Pipeline Parallel not supported") - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: SambaYDecoderLayer(config, - int(prefix.split('.')[-1]), - cache_config, - prefix=prefix), - prefix=f"{prefix}.layers") - self.final_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - attn_metadata: AttentionMetadata, - mamba_cache_params: MambaCacheParams, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - - mamba_state_idx = 0 - ssm_output = None - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] - if i == self.config.num_hidden_layers // 2 + 2: - # profile run - kv_cache_idx = self.config.num_hidden_layers // 2 + 1 - cache_layer = self.layers[kv_cache_idx] - kv_cache = cache_layer.attn.attn.kv_cache - if kv_cache[0].numel() == 0: - break - - # Starting from this layer, we do not need to calculate - # the kv cache since we reuse the kv cache from last layer. - # If in prefill phase, we can prune> truncate - # the hidden state to save computation cost. - if attn_metadata.prefill_metadata and not envs.VLLM_USE_V1: - selected_token_indices = torch.cumsum( - attn_metadata.seq_lens_tensor, dim=0) - 1 - hidden_states = hidden_states.index_select( - 0, selected_token_indices) - ssm_output = ssm_output.index_select( - 0, selected_token_indices) - - if layer.use_mamba: - if i < self.config.num_hidden_layers // 2 or \ - not layer.yoco_cross: - mamba_cache = mamba_cache_params.at_layer_idx( - mamba_state_idx) - mamba_state_idx += 1 - else: - mamba_cache = mamba_cache_params.at_layer_idx( - mamba_state_idx - 1) - - hidden_states, ssm_output = layer(hidden_states, - positions, - attn_metadata, - mamba_cache, - ssm_output=ssm_output) - else: - hidden_states, ssm_output = layer( - hidden_states, - positions, - attn_metadata, - None, # mamba_cache_params - ssm_output=ssm_output) - - hidden_states = self.final_layernorm( - hidden_states.to(dtype=self.final_layernorm.weight.dtype)) - return hidden_states - - -class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - lora_config = vllm_config.lora_config - quant_config = vllm_config.quant_config - scheduler_config = vllm_config.scheduler_config - self.compilation_config = vllm_config.compilation_config - self.vllm_config = vllm_config - # Prefix caching and chunked prefill is not supported for this model. 
- assert not cache_config.enable_prefix_caching, \ - "Phi4flash currently does not support prefix caching" - assert not scheduler_config.chunked_prefill_enabled, \ - "Phi4Flash currently does not support prefix caching" - super().__init__() - self.config = config - self.model_config = vllm_config.model_config - self.scheduler_config = scheduler_config - self.model = SambaYModel(config, - cache_config=cache_config, - prefix=maybe_prefix(prefix, "model")) - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=( - DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "lm_head"), - ) - self.embedding_bias = None - # Used to track and store by the Mamba cache between steps. - self.mamba_cache: Optional[MambaCacheManager] = None - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, - logits_as_input=False) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs, - ) -> Union[torch.Tensor, IntermediateTensors]: - if self.mamba_cache is None: - num_mamba_layers = self.config.num_hidden_layers \ - // 2 // self.config.mb_per_layer + 1 - self.mamba_cache = MambaCacheManager( - self.vllm_config, - num_mamba_layers, - *self._get_mamba_cache_shape(), - self.lm_head.weight.dtype, - self.lm_head.weight.dtype, - ) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - - attn_metadata = get_forward_context().attn_metadata - # input_ids and hidden_states isn't a one-to-one mapping in prefill - # stage due to YOCO optimization. 
- hidden_states = self.model(input_ids, positions, attn_metadata, - mamba_cache_params, intermediate_tensors, - inputs_embeds) - return hidden_states - - def _get_mamba_cache_shape( - self - ) -> tuple[Optional[tuple[int, int]], Optional[tuple[int, int]]]: - world_size = get_tensor_model_parallel_world_size() - hidden_size = self.config.hidden_size - mamba_expand = self.config.mamba_expand # 2 - mamba_d_conv = self.config.mamba_d_conv # 4 - mamba_d_state = self.config.mamba_d_state # 16 - conv_state_shape = ( - mamba_expand * hidden_size // world_size, - mamba_d_conv - 1, - ) - temporal_state_shape = ( - mamba_expand * hidden_size // world_size, - mamba_d_state, - ) - return conv_state_shape, temporal_state_shape - - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - return self.mamba_cache.copy_inputs_before_cuda_graphs( - input_buffers, **kwargs) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - - def compute_logits( - self, - hidden_states: torch.Tensor, - ) -> Optional[torch.Tensor]: - processed_logits = self.logits_processor( - self.lm_head, - hidden_states, - self.embedding_bias, - ) - return processed_logits - - def load_weights( - self, - weights: Iterable[tuple[str, torch.Tensor]], - ): - weights = {name: weight for name, weight in weights} - adjusted_weights = {} - for name, weight in weights.items(): - if "A_log" in name: - name = name.replace("A_log", "A") - weight = -torch.exp(weight.float()) - if "inner_cross_attn." in name: - name = name.replace("inner_cross_attn.", "") - adjusted_weights[name] = weight - adjusted_weights["lm_head.weight"] = weights[ - "model.embed_tokens.weight"] - loaded_params: set[str] = set() - for name, param in self.named_parameters(): - weight = adjusted_weights.get(name) - if weight is not None and weight.shape != param.shape: - logger.warning("Shape mismatch: %s %s %s", name, weight.shape, - param.shape) - loaded_params.add(name) - missing_keys, unexpected_keys = self.load_state_dict(adjusted_weights, - strict=False) - assert len(unexpected_keys) == 0, f"Unexpected keys: {unexpected_keys}" - assert len(missing_keys) == 0, f"Missing keys: {missing_keys}" - return loaded_params diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 33ee1cf44afd1..0292f3bf8317d 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -12,7 +12,6 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile @@ -29,8 +28,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.abstract import MambaBase -from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata, update_metadata) from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( @@ -47,15 +44,13 @@ from vllm.model_executor.model_loader.weight_utils import ( composed_weight_loader, default_weight_loader, sharded_weight_loader) from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, SupportsPP) -from 
vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.model_executor.models.utils import ( is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import LayerBlockType, direct_register_custom_op +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata @@ -194,17 +189,13 @@ class Plamo2MambaMixer(MambaBase, CustomOp): self.chunk_size = self.config.mamba_chunk_size - if envs.VLLM_USE_V1: - compilation_config = get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - # The outer list is for v0 PP virtual engine. Though this code path - # only runs for v1, we have to do this to unify with the interface - # of Attention + v0 PP. - # The inner tuple is (conv_state, ssm_state) - self.kv_cache = [(torch.tensor([]), torch.tensor([]))] - assert self.chunk_size != -1, "chunk_size must be set for v1" + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The tuple is (conv_state, ssm_state) + self.kv_cache = (torch.tensor([]), torch.tensor([])) + assert self.chunk_size != -1, "chunk_size must be set for v1" self.prefix = prefix @@ -227,8 +218,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp): self, hidden_states: torch.Tensor, output: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): pass @@ -237,59 +226,43 @@ class Plamo2MambaMixer(MambaBase, CustomOp): self, hidden_states: torch.Tensor, output: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): - if not envs.VLLM_USE_V1: - CustomOp.forward(self, hidden_states, output, mamba_cache_params, - mamba2_metadata) - else: - torch.ops.vllm.plamo2_mamba_mixer( - hidden_states, - output, - self.prefix, - ) + torch.ops.vllm.plamo2_mamba_mixer( + hidden_states, + output, + self.prefix, + ) def forward_cuda( self, hidden_states: torch.Tensor, output: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): forward_context = get_forward_context() - # mamba2_metadata contains metadata necessary for the mamba2 triton + # attn_metadata contains metadata necessary for the mamba2 triton # kernels to operate in continuous batching and in chunked prefill # modes; they are computed at top-level model forward since they # stay the same and reused for all mamba layers in the same iteration attn_metadata: AttentionMetadata = forward_context.attn_metadata - if envs.VLLM_USE_V1: - if attn_metadata is not None: - assert isinstance(attn_metadata, dict) - attn_metadata = attn_metadata[self.prefix] - mamba2_metadata = attn_metadata - assert isinstance(attn_metadata, Mamba2AttentionMetadata) - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - # conv_state = (..., dim, width-1) yet contiguous along 'dim' - conv_state = self_kv_cache[0].transpose(-1, -2) - ssm_state = self_kv_cache[1] - state_indices_tensor = attn_metadata.state_indices_tensor - else: - conv_state = 
mamba_cache_params.conv_state - ssm_state = mamba_cache_params.ssm_state - state_indices_tensor = mamba_cache_params.state_indices_tensor - # Common members between V1 metadata and V0 metadata - if mamba2_metadata is not None: - has_initial_states_p = mamba2_metadata.has_initial_states_p - prep_initial_states = mamba2_metadata.prep_initial_states - chunk_size = mamba2_metadata.chunk_size - seq_idx_p = mamba2_metadata.seq_idx_p - chunk_indices_p = mamba2_metadata.chunk_indices_p - chunk_offsets_p = mamba2_metadata.chunk_offsets_p + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, Mamba2AttentionMetadata) + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + # conv_state = (..., dim, width-1) yet contiguous along 'dim' + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + state_indices_tensor = attn_metadata.state_indices_tensor + has_initial_states_p = attn_metadata.has_initial_states_p + prep_initial_states = attn_metadata.prep_initial_states + chunk_size = attn_metadata.chunk_size + seq_idx_p = attn_metadata.seq_idx_p + chunk_indices_p = attn_metadata.chunk_indices_p + chunk_offsets_p = attn_metadata.chunk_offsets_p # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states) @@ -299,8 +272,8 @@ class Plamo2MambaMixer(MambaBase, CustomOp): conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) - if envs.VLLM_USE_V1 and attn_metadata is None: - # V1 profile run + if attn_metadata is None: + # profile run hidden_states = (hidden_states.transpose(0, 1).clone().transpose( 0, 1)).contiguous() output[:] = self.out_proj(hidden_states) @@ -316,42 +289,23 @@ class Plamo2MambaMixer(MambaBase, CustomOp): # NOTE: V0 put prefill before decode, v1 puts decode before prefill # Separate prefill and decode by splitting varlen input # Split along token dimension - if envs.VLLM_USE_V1: - hidden_states_d, hidden_states_p = torch.split( - hidden_states[:num_actual_tokens], - [num_decodes, num_prefill_tokens], - dim=0, - ) - gate_d, gate_p = torch.split(gate[:num_actual_tokens], - [num_decodes, num_prefill_tokens], - dim=0) - # Split along batch dimension - state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor, - [num_decodes, num_prefills], - dim=0, - ) - query_start_loc_p = ( - attn_metadata.query_start_loc[-num_prefills - 1:] - - num_decodes if has_prefill else None) - else: - hidden_states_p, hidden_states_d = torch.split( - hidden_states, - [num_prefill_tokens, num_decodes], - dim=0, - ) - gate_p, gate_d = torch.split(gate, - [num_prefill_tokens, num_decodes], - dim=0) - # Split along batch dimension - state_indices_tensor_p, state_indices_tensor_d = torch.split( - state_indices_tensor, - [num_prefills, num_decodes], - dim=0, - ) - query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills + - 1] - if has_prefill else None) + hidden_states_d, hidden_states_p = torch.split( + hidden_states[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0, + ) + gate_d, gate_p = torch.split(gate[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0) + # Split along batch dimension + state_indices_tensor_d, state_indices_tensor_p = torch.split( + state_indices_tensor, + [num_decodes, num_prefills], + dim=0, + ) + query_start_loc_p = ( + attn_metadata.query_start_loc[-num_prefills - 1:] - + num_decodes if has_prefill else None) # Preallocate output tensor to 
avoid memcpy cost for merging prefill # and decode outputs @@ -363,18 +317,11 @@ class Plamo2MambaMixer(MambaBase, CustomOp): dtype=hidden_states.dtype, device=hidden_states.device, ) - if envs.VLLM_USE_V1: - preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( - preallocated_ssm_out, - [num_decodes, num_prefill_tokens], - dim=0, - ) - else: - preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split( - preallocated_ssm_out, - [num_prefill_tokens, num_decodes], - dim=0, - ) + preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( + preallocated_ssm_out, + [num_decodes, num_prefill_tokens], + dim=0, + ) # Process prefill requests if has_prefill: @@ -383,9 +330,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp): # pointed to by "state_indices_tensor" x = hidden_states_p.transpose( 0, 1) # this is the form that causal-conv see - if mamba2_metadata.cu_seqlen is None: - mamba2_metadata = update_metadata(x, query_start_loc_p, - mamba2_metadata) hidden_states_p = causal_conv1d_fn( x, conv_weights, @@ -394,7 +338,7 @@ class Plamo2MambaMixer(MambaBase, CustomOp): conv_states=conv_state, has_initial_state=has_initial_states_p, cache_indices=state_indices_tensor_p, - metadata=mamba2_metadata, + metadata=attn_metadata, query_start_loc=query_start_loc_p) hidden_states_p = hidden_states_p.transpose(0, 1) hidden_states_p = hidden_states_p[:num_prefill_tokens] @@ -470,7 +414,7 @@ class Plamo2MambaMixer(MambaBase, CustomOp): -1, self.num_heads // self.tp_size, self.head_dim) # - the hidden is reshaped into (bs, num_heads, head_dim) - # - mamba_cache_params.ssm_state's slots will be selected + # - ssm_state's slots will be selected # using state_indices_tensor_d # NOTE: final output is an in-place update of out tensor @@ -530,10 +474,7 @@ def plamo2_mamba_mixer( ) -> None: forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - self.forward_cuda(hidden_states=hidden_states, - output=output, - mamba_cache_params=None, - mamba2_metadata=None) + self.forward_cuda(hidden_states=hidden_states, output=output) def plamo2_mamba_mixer_fake( @@ -731,8 +672,6 @@ class Plamo2DecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, **kwargs, ): if residual is None: @@ -747,8 +686,6 @@ class Plamo2DecoderLayer(nn.Module): output = torch.empty_like(hidden_states) mixer_kwargs = { "output": output, - "mamba_cache_params": mamba_cache_params, - "mamba2_metadata": mamba2_metadata, } else: mixer_kwargs = { @@ -790,23 +727,12 @@ class Plamo2Decoder(torch.nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, ) -> torch.Tensor: - mamba_cache_index = 0 for layer in islice(self.layers, self.start_layer, self.end_layer): - layer_mamba_cache_params = None - if layer.is_mamba and mamba_cache_params is not None: - layer_mamba_cache_params = mamba_cache_params.at_layer_idx( - mamba_cache_index) - mamba_cache_index += 1 - hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, residual=residual, - mamba_cache_params=layer_mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) return hidden_states, residual @@ -844,7 +770,6 @@ class Plamo2Model(torch.nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, intermediate_tensors: 
Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -859,23 +784,10 @@ class Plamo2Model(torch.nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - if not envs.VLLM_USE_V1: - attn_metadata: AttentionMetadata = get_forward_context( - ).attn_metadata - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) - else: - # v1 get mamba2_metadata from forward_context - mamba2_metadata = None - hidden_states, residual = self.layers( positions=positions, hidden_states=hidden_states, residual=residual, - mamba_cache_params=mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -925,9 +837,6 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): if self.config.tie_word_embeddings: self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) - # Used to track and store by the Mamba cache between steps. - self.mamba_cache: Optional[MambaCacheManager] = None - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.config.vocab_size) self.make_empty_intermediate_tensors = ( @@ -942,39 +851,11 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - num_mamba_layers = ( - self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, - LayerBlockType.mamba)) - mamba_state_shape = self.get_mamba_state_shape_from_config( - self.vllm_config, use_v1=False) - mamba_state_dtype = \ - self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_mamba_layers, - *mamba_state_shape, - *mamba_state_dtype) - - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - else: - # NOTE: mamba_cache_params is not needed for v1 - mamba_cache_params = None - - hidden_states = self.model(input_ids, positions, mamba_cache_params, - intermediate_tensors, inputs_embeds) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - return self.mamba_cache.copy_inputs_before_cuda_graphs( - input_buffers, **kwargs) - - def get_seqlen_agnostic_capture_inputs(self, batch_size: int): - return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - @classmethod def get_mamba_state_dtype_from_config( cls, @@ -991,12 +872,10 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. 
Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: - conv_state_shape: Shape for convolutional state cache @@ -1015,7 +894,6 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): head_dim=hf_config.hidden_size_per_head, state_size=hf_config.mamba_d_state, conv_kernel=hf_config.mamba_d_conv, - use_v1=use_v1, ) def compute_logits( diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 24cebc5bfdd82..ab23b494e561e 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -11,7 +11,6 @@ from einops import rearrange from torch import nn from transformers.activations import ACT2FN -from vllm import envs from vllm.attention import Attention, AttentionBackend, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig, @@ -35,7 +34,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.abstract import MambaBase -from vllm.model_executor.layers.mamba.mamba2_metadata import update_metadata from vllm.model_executor.layers.mamba.mamba_mixer2 import ( mamba_v2_sharded_weight_loader) from vllm.model_executor.layers.mamba.mamba_utils import ( @@ -51,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, sharded_weight_loader) -from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -198,14 +195,8 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: return MambaStateShapeCalculator.gated_delta_net_state_shape( - self.tp_size, - self.num_k_heads, - self.num_v_heads, - self.head_k_dim, - self.head_v_dim, - self.conv_kernel_size, - self.num_spec, - use_v1=True) + self.tp_size, self.num_k_heads, self.num_v_heads, self.head_k_dim, + self.head_v_dim, self.conv_kernel_size, self.num_spec) def __init__( self, @@ -394,7 +385,6 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): self, hidden_states: torch.Tensor, output: torch.Tensor, - cache_params: Optional[MambaCacheParams] = None, ): return torch.ops.vllm.gdn_attention( hidden_states, @@ -416,7 +406,6 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): assert isinstance(attn_metadata, dict) attn_metadata = attn_metadata[self.prefix] - conv_metadata = attn_metadata assert isinstance(attn_metadata, GDNAttentionMetadata) has_initial_state = attn_metadata.has_initial_state spec_query_start_loc = attn_metadata.spec_query_start_loc @@ -479,12 +468,8 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): # 2.2: process the remaining part if attn_metadata.num_prefills > 0: mixed_qkv_non_spec_T = mixed_qkv_non_spec.transpose(0, 1) - if conv_metadata.cu_seqlen is None: - conv_metadata = update_metadata(mixed_qkv_non_spec_T, - non_spec_query_start_loc, - conv_metadata) # - "cache_indices" updates the conv_state cache in positions - # pointed to by "mamba_cache_params.state_indices_tensor" + # pointed to by "state_indices_tensor" 
mixed_qkv_non_spec = causal_conv1d_fn( mixed_qkv_non_spec_T, conv_weights, @@ -494,7 +479,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): has_initial_state=has_initial_state, cache_indices=non_spec_state_indices_tensor, query_start_loc=non_spec_query_start_loc, - metadata=conv_metadata, + metadata=attn_metadata, ).transpose(0, 1) elif attn_metadata.num_decodes > 0: mixed_qkv_non_spec = causal_conv1d_update( @@ -1075,7 +1060,6 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ "Qwen3Next currently does not support prefix caching" - assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1" self.quant_config = vllm_config.quant_config super().__init__() @@ -1195,14 +1179,10 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, num_spec = (vllm_config.speculative_config.num_speculative_tokens if vllm_config.speculative_config else 0) return MambaStateShapeCalculator.gated_delta_net_state_shape( - tp_size, - hf_config.linear_num_key_heads, - hf_config.linear_num_value_heads, - hf_config.linear_key_head_dim, - hf_config.linear_value_head_dim, - hf_config.linear_conv_kernel_dim, - num_spec, - use_v1=True) + tp_size, hf_config.linear_num_key_heads, + hf_config.linear_num_value_heads, hf_config.linear_key_head_dim, + hf_config.linear_value_head_dim, hf_config.linear_conv_kernel_dim, + num_spec) def compute_logits( self, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 86123bc092b9b..6ab3fa902c387 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -134,7 +134,6 @@ _TEXT_GENERATION_MODELS = { "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), - "Phi4FlashForCausalLM": ("phi4flash", "Phi4FlashForCausalLM"), "Plamo2ForCausalLM": ("plamo2", "Plamo2ForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 4350e38e02f96..a0d93045b74cf 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -15,12 +15,10 @@ import torch from torch import nn from transformers import Zamba2Config -from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -29,8 +27,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) @@ -39,8 +35,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from 
vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.mamba_cache import (MambaCacheManager, - MambaCacheParams) from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid @@ -515,8 +509,6 @@ class Zamba2MambaDecoderLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, transformer_hidden_states: Optional[torch.Tensor] = None, positions: Optional[torch.Tensor] = None, original_hidden_states: Optional[torch.Tensor] = None, @@ -525,8 +517,6 @@ class Zamba2MambaDecoderLayer(nn.Module): Args: hidden_states: Input tensor [batch_size, seq_len, hidden_size] - mamba_cache_params: Parameters for Mamba's state caches - (one for conv, one for ssm) transformer_hidden_states: Optional output from transformer path Added to input if provided (used in hybrid architecture) positions: Optional position IDs (unused in Mamba) @@ -555,8 +545,6 @@ class Zamba2MambaDecoderLayer(nn.Module): self.mamba( hidden_states, output, - mamba_cache_params=mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) # residual connection after mamba @@ -607,8 +595,6 @@ class Zamba2HybridLayer(nn.Module): hidden_states: torch.Tensor, original_hidden_states: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, - mamba2_metadata: Mamba2Metadata, ) -> torch.Tensor: """Forward pass through the hybrid layer. @@ -623,8 +609,6 @@ class Zamba2HybridLayer(nn.Module): original_hidden_states: Original input for transformer residual connection positions: Position IDs for positional embeddings - mamba_cache_params: Parameters for Mamba's state caches - (one for conv, one for ssm) Returns: Output tensor combining transformer and Mamba representations @@ -644,8 +628,6 @@ class Zamba2HybridLayer(nn.Module): layer_outputs = self.mamba_decoder( hidden_states, transformer_hidden_states=transformer_hidden_states, - mamba_cache_params=mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) return layer_outputs @@ -752,7 +734,6 @@ class Zamba2Model(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: """Forward pass through the model. 
@@ -760,8 +741,6 @@ class Zamba2Model(nn.Module): Args: input_ids: Input token IDs positions: Position IDs for embeddings - mamba_cache_params: Parameters for Mamba's state caches - (one for conv, one for ssm) inputs_embeds: Optional pre-computed input embeddings Returns: @@ -773,33 +752,13 @@ class Zamba2Model(nn.Module): inputs_embeds = self.get_input_embeddings(input_ids) hidden_states = inputs_embeds - attn_metadata = get_forward_context().attn_metadata - - if not envs.VLLM_USE_V1: - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.chunk_size, - attn_metadata=attn_metadata, - ) - else: - # v1 get mamba2_metadata from forward_context - mamba2_metadata = None - # Process through layers original_hidden_states = torch.clone(hidden_states) for layer_idx, layer in enumerate(self.layers): - - layer_mamba_cache_params = None - if (isinstance(layer, (Zamba2HybridLayer, Zamba2MambaDecoderLayer)) - and mamba_cache_params): - layer_mamba_cache_params = mamba_cache_params.at_layer_idx( - layer_idx) - layer_outputs = layer( hidden_states, original_hidden_states=original_hidden_states, positions=positions, - mamba_cache_params=layer_mamba_cache_params, - mamba2_metadata=mamba2_metadata, ) hidden_states = layer_outputs @@ -870,13 +829,11 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): def get_mamba_state_shape_from_config( cls, vllm_config: "VllmConfig", - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: @@ -896,7 +853,6 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): head_dim=hf_config.mamba_headdim, state_size=hf_config.mamba_d_state, conv_kernel=hf_config.mamba_d_conv, - use_v1=use_v1, ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -945,9 +901,6 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): # Tie weights with input embeddings if using same dimensions self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) - # Used to track and store by the Mamba cache between steps. - self.mamba_cache: Optional[MambaCacheManager] = None - # Initialize logits processing and sampling self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) @@ -977,61 +930,15 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): Returns: Output hidden states """ - # Initialize Mamba cache if needed - mamba_cache_params = None - if not envs.VLLM_USE_V1: - if self.mamba_cache is None: - num_mamba_layers = self.config.num_hidden_layers - mamba_state_shape = \ - self.get_mamba_state_shape_from_config( - self.vllm_config, use_v1=False) - mamba_state_dtype = \ - self.get_mamba_state_dtype_from_config( - self.vllm_config) - self.mamba_cache = MambaCacheManager(self.vllm_config, - num_mamba_layers, - *mamba_state_shape, - *mamba_state_dtype) - - # Get cache parameters for current run - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - # Forward pass through model hidden_states = self.model( input_ids, positions, - mamba_cache_params, inputs_embeds, ) return hidden_states - def copy_inputs_before_cuda_graphs( - self, input_buffers: dict[str, torch.Tensor], - **kwargs: Any) -> dict[str, torch.Tensor]: - """Copy inputs before CUDA graph capture. 
- - Args: - input_buffers: Dictionary of input tensors - **kwargs: Additional arguments passed to cache manager - - Returns: - Updated input buffers - """ - return self.mamba_cache.copy_inputs_before_cuda_graphs( - input_buffers, **kwargs) - - def get_seqlen_agnostic_capture_inputs( - self, batch_size: int) -> dict[str, torch.Tensor]: - """Get inputs for sequence-length-agnostic graph capture. - - Args: - batch_size: Size of batch to capture - Returns: - Dictionary of capture inputs - """ - return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 5dadc52d0fb1c..06a87a4a3c8b2 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -12,6 +12,7 @@ from vllm.config import VllmConfig from vllm.v1.attention.backends.utils import (AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + compute_causal_conv1d_metadata, split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec @@ -52,7 +53,6 @@ class GDNAttentionMetadata: # The following attributes are for triton implementation of causal_conv1d nums_dict: Optional[dict] = None - cu_seqlen: Optional[int] = None batch_ptr: Optional[torch.Tensor] = None token_chunk_offset_ptr: Optional[torch.Tensor] = None @@ -134,6 +134,7 @@ class GDNAttentionMetadataBuilder( context_lens = m.num_computed_tokens_cpu context_lens_tensor = context_lens.to(query_start_loc.device) seq_lens_tensor = m.seq_lens + nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None if (not self.use_spec_decode or num_draft_tokens is None or num_draft_tokens.sum().item() == 0): @@ -210,6 +211,8 @@ class GDNAttentionMetadataBuilder( has_initial_state = context_lens_tensor > 0 if spec_sequence_masks is not None: has_initial_state = has_initial_state[~spec_sequence_masks] + nums_dict, batch_ptr, token_chunk_offset_ptr = \ + compute_causal_conv1d_metadata(non_spec_query_start_loc) else: has_initial_state = None num_actual_tokens = num_prefill_tokens + num_decode_tokens + \ @@ -297,6 +300,9 @@ class GDNAttentionMetadataBuilder( spec_sequence_masks=spec_sequence_masks, spec_token_masks=spec_token_masks, num_accepted_tokens=num_accepted_tokens, + nums_dict=nums_dict, + batch_ptr=batch_ptr, + token_chunk_offset_ptr=token_chunk_offset_ptr, ) return attn_metadata diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index 2fe1f14ca1db0..f45fc75334a21 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -7,11 +7,12 @@ from typing import Optional import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig from vllm.v1.attention.backends.mamba_attn import ( BaseMambaAttentionMetadataBuilder) -from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, +from vllm.v1.attention.backends.utils import (PAD_SLOT_ID, + CommonAttentionMetadata, + compute_causal_conv1d_metadata, split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec @@ -131,7 +132,6 @@ class Mamba2AttentionMetadata: # The following attributes are for triton implementation of causal_conv1d nums_dict: Optional[dict] = None - cu_seqlen: Optional[int] = None batch_ptr: Optional[torch.Tensor] = None token_chunk_offset_ptr: Optional[torch.Tensor] 
= None @@ -161,6 +161,9 @@ class Mamba2AttentionMetadataBuilder( has_initial_states_p = None prep_initial_states = False + # for causal_conv1d + nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( @@ -198,6 +201,9 @@ class Mamba2AttentionMetadataBuilder( query_start_loc_p, self.chunk_size, num_prefill_tokens)) + nums_dict, batch_ptr, token_chunk_offset_ptr = \ + compute_causal_conv1d_metadata(query_start_loc_p) + elif num_decodes <= self.decode_cudagraph_max_bs: # Pad state tensor for CUDA graph num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes) @@ -220,5 +226,8 @@ class Mamba2AttentionMetadataBuilder( chunk_indices_p=chunk_indices_p, chunk_offsets_p=chunk_offsets_p, state_indices_tensor=state_indices_tensor, + nums_dict=nums_dict, + batch_ptr=batch_ptr, + token_chunk_offset_ptr=token_chunk_offset_ptr, ) return attn_metadata diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index 717c40b37ecfb..428e409659798 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -9,6 +9,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, CommonAttentionMetadata, + compute_causal_conv1d_metadata, split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec @@ -33,7 +34,6 @@ class ShortConvAttentionMetadata: # For causal_conv1d nums_dict: Optional[dict] = None - cu_seqlen: Optional[int] = None batch_ptr: Optional[torch.Tensor] = None token_chunk_offset_ptr: Optional[torch.Tensor] = None @@ -57,6 +57,9 @@ class ShortConvAttentionMetadataBuilder( state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + # for causal_conv1d + nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( split_decodes_and_prefills( common_attn_metadata, @@ -70,6 +73,12 @@ class ShortConvAttentionMetadataBuilder( has_initial_states = has_initial_states_cpu.to( query_start_loc.device) + query_start_loc_p = common_attn_metadata.query_start_loc[ + -num_prefills - 1:] - num_decode_tokens + + nums_dict, batch_ptr, token_chunk_offset_ptr = \ + compute_causal_conv1d_metadata(query_start_loc_p) + attn_metadata = ShortConvAttentionMetadata( num_prefills=num_prefills, num_prefill_tokens=num_prefill_tokens, @@ -78,5 +87,8 @@ class ShortConvAttentionMetadataBuilder( query_start_loc=query_start_loc, has_initial_states=has_initial_states, state_indices_tensor=state_indices_tensor, + nums_dict=nums_dict, + batch_ptr=batch_ptr, + token_chunk_offset_ptr=token_chunk_offset_ptr, ) return attn_metadata diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 63326d19194f0..6ef489f5a7a28 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -34,6 +34,8 @@ logger = init_logger(__name__) KVCacheLayoutType = Literal["NHD", "HND"] _KV_CACHE_LAYOUT_OVERRIDE: Union[KVCacheLayoutType, None] = None +PAD_SLOT_ID = -1 + def is_valid_kv_cache_layout(value: str) -> bool: return value in get_args(KVCacheLayoutType) @@ -838,3 +840,52 @@ def create_fast_prefill_custom_backend( builder_cls=FastPrefillAttentionBuilder) return attn_backend + + +def 
compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor): + + # Needed for causal_conv1d + seqlens = query_start_loc_p.diff().to('cpu') + nums_dict = {} # type: ignore + batch_ptr = None + token_chunk_offset_ptr = None + for BLOCK_M in [8]: # cover all BLOCK_M values + nums = -(-seqlens // BLOCK_M) + nums_dict[BLOCK_M] = {} + nums_dict[BLOCK_M]['nums'] = nums + nums_dict[BLOCK_M]['tot'] = nums.sum().item() + mlist = torch.from_numpy(np.repeat(np.arange(len(nums)), nums)) + nums_dict[BLOCK_M]['mlist'] = mlist + mlist_len = len(nums_dict[BLOCK_M]['mlist']) + nums_dict[BLOCK_M]['mlist_len'] = mlist_len + MAX_NUM_PROGRAMS = max(1024, mlist_len) * 2 + offsetlist = [] # type: ignore + for idx, num in enumerate(nums): + offsetlist.extend(range(num)) + offsetlist = torch.tensor(offsetlist, dtype=torch.int32) + nums_dict[BLOCK_M]['offsetlist'] = offsetlist + + if batch_ptr is None: + # Update default value after class definition + batch_ptr = torch.full((MAX_NUM_PROGRAMS, ), + PAD_SLOT_ID, + dtype=torch.int32, + device='cuda') + token_chunk_offset_ptr = torch.full((MAX_NUM_PROGRAMS, ), + PAD_SLOT_ID, + dtype=torch.int32, + device='cuda') + else: + if batch_ptr.nelement() < MAX_NUM_PROGRAMS: + batch_ptr.resize_(MAX_NUM_PROGRAMS).fill_(PAD_SLOT_ID) + token_chunk_offset_ptr.resize_( # type: ignore + MAX_NUM_PROGRAMS).fill_(PAD_SLOT_ID) + + batch_ptr[0:mlist_len].copy_(mlist) + token_chunk_offset_ptr[ # type: ignore + 0:mlist_len].copy_(offsetlist) + nums_dict[BLOCK_M]['batch_ptr'] = batch_ptr + nums_dict[BLOCK_M]['token_chunk_offset_ptr'] = (token_chunk_offset_ptr + ) # type: ignore + + return nums_dict, batch_ptr, token_chunk_offset_ptr From cc1dc7ed6d2d901df46847724ad90147872a9d79 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 23 Sep 2025 12:02:10 -0400 Subject: [PATCH 141/185] [Core/DBO][2/N] Dual-Batch Overlap add DeepEP High Throughput support and Prefill support (#24845) Signed-off-by: Sage Moore Signed-off-by: Lucas Wilkinson Signed-off-by: yewentao256 Signed-off-by: Lucas Wilkinson Signed-off-by: Tyler Michael Smith Co-authored-by: Sage Moore Co-authored-by: yewentao256 Co-authored-by: Tyler Michael Smith --- .../v1/attention/test_attention_splitting.py | 83 ++++++++- tests/v1/spec_decode/test_eagle.py | 6 +- vllm/config/__init__.py | 12 +- vllm/config/parallel.py | 14 +- .../device_communicators/all2all.py | 28 ++- .../base_device_communicator.py | 6 + vllm/engine/arg_utils.py | 6 + vllm/envs.py | 11 ++ .../fused_moe/deepep_ht_prepare_finalize.py | 76 +++++--- .../fused_moe/deepep_ll_prepare_finalize.py | 10 +- .../layers/fused_moe/modular_kernel.py | 129 +++++++++---- vllm/v1/attention/backends/utils.py | 44 ++++- vllm/v1/spec_decode/eagle.py | 9 +- vllm/v1/worker/gpu_model_runner.py | 173 ++++++++---------- vllm/v1/worker/gpu_ubatch_wrapper.py | 87 ++++++++- vllm/v1/worker/ubatch_splitting.py | 76 ++++++-- vllm/v1/worker/ubatch_utils.py | 8 + vllm/v1/worker/ubatching.py | 35 ++-- vllm/v1/worker/utils.py | 25 ++- 19 files changed, 602 insertions(+), 236 deletions(-) diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py index c74dbb3ebb17e..7d7a46910be89 100644 --- a/tests/v1/attention/test_attention_splitting.py +++ b/tests/v1/attention/test_attention_splitting.py @@ -5,11 +5,12 @@ import pytest import torch from tests.v1.attention.test_attention_backends import BATCH_SPECS -from tests.v1.attention.utils import create_common_attn_metadata +from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from 
vllm.v1.attention.backends.utils import (UBatchSlice, _make_metadata_with_slice, slice_query_start_locs, split_attn_metadata) +from vllm.v1.worker.ubatch_utils import create_ubatch_slices @pytest.fixture @@ -155,3 +156,83 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata): assert results[1].num_reqs == mid_point assert results[1].num_actual_tokens == mid_point assert torch.equal(results[1].seq_lens, torch.tensor([2048] * mid_point)) + + +@pytest.mark.parametrize( + "seq_lens,query_lens,split_point,expected_first_reqs,expected_second_reqs", + [ + # Split in the middle of request 1 + ([32, 40], [8, 8], 12, 2, 1), + # Split inside the first request + ([32, 40], [8, 8], 4, 1, 2), + ], +) +def test_prefill_split_across_ubatches(seq_lens, query_lens, split_point, + expected_first_reqs, + expected_second_reqs): + """Test splitting a prefill across ubatches""" + import numpy as np + + device = torch.device("cpu") + batch_spec = BatchSpec(seq_lens=seq_lens, query_lens=query_lens) + common = create_common_attn_metadata(batch_spec, + block_size=16, + device=device) + + num_scheduled_tokens = np.array(query_lens, dtype=np.int32) + qsl_np = common.query_start_loc_cpu.numpy() + num_tokens = common.num_actual_tokens + + ubatch_slices = create_ubatch_slices(num_scheduled_tokens, split_point) + assert len(ubatch_slices) == 2 + + first_meta = _make_metadata_with_slice(ubatch_slices[0], common) + second_meta = _make_metadata_with_slice(ubatch_slices[1], common) + + # Token counts match the split + assert first_meta.num_actual_tokens == split_point + assert second_meta.num_actual_tokens == num_tokens - split_point + + # Number of requests per ubatch + assert first_meta.num_reqs == expected_first_reqs + assert second_meta.num_reqs == expected_second_reqs + + # Identify which request is split and how many tokens are in the first chunk + split_req_idx = int(np.searchsorted(qsl_np, split_point, side="right") - 1) + tokens_in_first_chunk = split_point - int(qsl_np[split_req_idx]) + orig_q_lens = (common.query_start_loc_cpu[1:] - + common.query_start_loc_cpu[:-1]) + + # Check query length continuity: first-chunk + second-chunk == original qlen + # First ubatch last request query length + qlen_first_last = int(first_meta.query_start_loc_cpu[-1] - + first_meta.query_start_loc_cpu[-2]) + # Second ubatch first request query length + qlen_second_first = int(second_meta.query_start_loc_cpu[1] - + second_meta.query_start_loc_cpu[0]) + assert qlen_first_last == tokens_in_first_chunk + assert qlen_first_last + qlen_second_first == int( + orig_q_lens[split_req_idx]) + + # Check seq_lens adjustments + # Context lengths per original request + context_lens = [s - q for s, q in zip(seq_lens, query_lens)] + + # First ubatch: last request's seq_len should be + # context + tokens_in_first_chunk + expected_seqlen = context_lens[split_req_idx] + tokens_in_first_chunk + assert int(first_meta.seq_lens[-1]) == expected_seqlen + + # For full preceding requests in first ubatch, seq_lens should match + # originals + for i in range(first_meta.num_reqs - 1): + assert int(first_meta.seq_lens[i]) == seq_lens[i] + + # Second ubatch: first request (continuation) seq_len should be full + # original + assert int(second_meta.seq_lens[0]) == seq_lens[split_req_idx] + # Any following full requests in second ubatch should match originals + for j in range(1, second_meta.num_reqs): + # Map to original request index + orig_idx = split_req_idx + j + assert int(second_meta.seq_lens[j]) == seq_lens[orig_idx] diff --git 
a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index e7f6b68fc3f77..23bfabfcf89b1 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -532,9 +532,8 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): # Mock runner for attention metadata building proposer.runner = mock.MagicMock() proposer.runner.attn_groups.append([mock.MagicMock()]) - proposer.runner.attn_groups[0][0].metadata_builders = [ + proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \ attn_metadata_builder - ] result = proposer.propose(target_token_ids=target_token_ids, target_positions=target_positions, @@ -659,9 +658,8 @@ def test_propose_tree(spec_token_tree): # Mock runner for attention metadata building. proposer.runner = mock.MagicMock() proposer.runner.attn_groups.append([mock.MagicMock()]) - proposer.runner.attn_groups[0][0].metadata_builders = [ + proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \ attn_metadata_builder - ] # Setup inputs for the proposer. target_token_ids = torch.randint(0, diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 92fc68f8927ca..a2562a10b45a6 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -638,11 +638,13 @@ class VllmConfig: if self.parallel_config.enable_dbo: a2a_backend = envs.VLLM_ALL2ALL_BACKEND - assert a2a_backend == "deepep_low_latency", \ - "Microbatching currently only supports the deepep_low_latency "\ - f"all2all backend. {a2a_backend} is not supported. To fix set "\ - "the VLLM_ALL2ALL_BACKEND environment variable to "\ - "deepep_low_latency and install the DeepEP kerenls." + assert a2a_backend in \ + ["deepep_low_latency", "deepep_high_throughput"], \ + "Microbatching currently only supports the deepep_low_latency and "\ + f"deepep_high_throughput all2all backend. {a2a_backend} is not "\ + "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\ + "variable to deepep_low_latency or deepep_high_throughput and "\ + "install the DeepEP kernels." if not self.instance_id: self.instance_id = random_uuid()[:5] diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index a84d882430166..f80eb1adc7fd3 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -139,12 +139,18 @@ class ParallelConfig: """Disable the custom all-reduce kernel and fall back to NCCL.""" enable_dbo: bool = False - """Enable microbatching for the model executor.""" + """Enable dual batch overlap for the model executor.""" dbo_decode_token_threshold: int = 32 - """The threshold for microbatching. If the number of tokens in the - request is greater than this threshold, microbatching will be used. - Otherwise, the request will be processed in a single batch.""" + """The threshold for dual batch overlap for batches only containing decodes. + If the number of tokens in the request is greater than this threshold, + microbatching will be used. Otherwise, the request will be processed in a + single batch.""" + dbo_prefill_token_threshold: int = 512 # TODO(lucas): tune + """The threshold for dual batch overlap for batches that contain one or more + prefills. If the number of tokens in the request is greater than this + threshold, microbatching will be used. 
Otherwise, the request will be + processed in a single batch.""" ray_workers_use_nsight: bool = False """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 149df73d8667b..ae18429f62518 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any +from typing import Any, Optional import torch import torch.distributed as dist +import vllm.envs as envs from vllm.distributed import get_dp_group from vllm.forward_context import get_forward_context from vllm.logger import init_logger @@ -200,12 +201,12 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): def _make_all2all_kwargs(self) -> dict[Any, Any]: # Defaults for internode and intranode are taken from DeepEP tests. - num_nvl_bytes = 1024 * 1024 * 1024 + num_nvl_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024 num_rdma_bytes = None num_qps_per_rank = None if self.internode: - num_rdma_bytes = 1024 * 1024 * 1024 + num_rdma_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024 num_qps_per_rank = self.num_sms // 2 else: num_rdma_bytes = 0 @@ -230,13 +231,18 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): logger.debug("DeepEP all2all args %s", buffer_kwargs) handle: deep_ep.Buffer = self.handle_cache.get_or_create( buffer_kwargs, deep_ep.Buffer) - # It is dangerous to set num sms outside this function. num_sms is not - # a part of the hash-key that identifies this object. If we are in a - # situation where we make objects with different num_sms, the hash key - # in get_or_create must be updated. - handle.set_num_sms(self.num_sms) return handle + def set_num_sms(self, num_sms: int): + import deep_ep + + # Right now the buffers are sized for only what the kernels were + # created with. So we can only reduce the number of SMS used + # but not increase it. + if num_sms > self.num_sms: + num_sms = self.num_sms + deep_ep.Buffer.set_num_sms(num_sms) + class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): """ @@ -265,7 +271,7 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): import deep_ep # Defaults for internode and intranode are taken from DeepEP tests. - num_nvl_bytes = 1024 * 1024 * 1024 + num_nvl_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024 num_qps_per_rank = num_local_experts num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank, @@ -291,3 +297,7 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): handle: deep_ep.Buffer = self.handle_cache.get_or_create( buffer_kwargs, deep_ep.Buffer) return handle + + # DeepEP LL uses RDMA so no SMs are used for communication + def max_sms_used(self) -> Optional[int]: + return 0 \ No newline at end of file diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 01f59b44a0e69..586441c917830 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -60,6 +60,12 @@ class All2AllManagerBase: # and reuse it for the same config. 
raise NotImplementedError + def set_num_sms(self, num_sms: int): + pass + + def max_sms_used(self) -> Optional[int]: + return None # None means it could use the whole GPU + def dispatch(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): raise NotImplementedError diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8c7a1b413cdb7..556a490ffa109 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -330,6 +330,8 @@ class EngineArgs: enable_dbo: bool = ParallelConfig.enable_dbo dbo_decode_token_threshold: int = \ ParallelConfig.dbo_decode_token_threshold + dbo_prefill_token_threshold: int = \ + ParallelConfig.dbo_prefill_token_threshold eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb expert_placement_strategy: ExpertPlacementStrategy = \ @@ -698,6 +700,9 @@ class EngineArgs: parallel_group.add_argument( "--dbo-decode-token-threshold", **parallel_kwargs["dbo_decode_token_threshold"]) + parallel_group.add_argument( + "--dbo-prefill-token-threshold", + **parallel_kwargs["dbo_prefill_token_threshold"]) parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"]) parallel_group.add_argument("--eplb-config", @@ -1316,6 +1321,7 @@ class EngineArgs: enable_expert_parallel=self.enable_expert_parallel, enable_dbo=self.enable_dbo, dbo_decode_token_threshold=self.dbo_decode_token_threshold, + dbo_prefill_token_threshold=self.dbo_prefill_token_threshold, enable_eplb=self.enable_eplb, eplb_config=self.eplb_config, expert_placement_strategy=self.expert_placement_strategy, diff --git a/vllm/envs.py b/vllm/envs.py index ee5efff8bcd92..f6eafe892ef2d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -189,6 +189,8 @@ if TYPE_CHECKING: VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" + VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024 + VLLM_DBO_COMM_SMS: int = 20 GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = [] VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None @@ -1392,6 +1394,15 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: os.getenv("VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME", "VLLM_OBJECT_STORAGE_SHM_BUFFER"), + # The size in MB of the buffers (NVL and RDMA) used by DeepEP + "VLLM_DEEPEP_BUFFER_SIZE_MB": + lambda: int(os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")), + + # The number of SMs to allocate for communication kernels when running DBO + # the rest of the SMs on the device will be allocated to compute + "VLLM_DBO_COMM_SMS": + lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")), + # Valid values are container,code_interpreter,web_search_preview # ex GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter "GPT_OSS_SYSTEM_TOOL_MCP_LABELS": diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index a250a6218715e..9e9a9afc18a03 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -12,6 +12,11 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) from vllm.utils import round_up +from vllm.v1.worker.ubatching import ( + dbo_current_ubatch_id, dbo_enabled, dbo_switch_to_comm, + dbo_switch_to_compute, dbo_switch_to_compute_sync, + 
dbo_yield_and_switch_from_comm_to_compute, + dbo_yield_and_switch_from_compute_to_comm) class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): @@ -46,9 +51,9 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self.async_prepare = True # The dispatch function returns a handle that the combine function - # requires. We store the handle here so it is available to the - # combine function. - self.handle = None + # requires. Under DBO microbatching we must track one handle per + # micro-batch to avoid races between threads. + self.handles = [None, None] # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164 self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160] @@ -89,6 +94,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): has_scales = token_scales is not None + # We yield before launching the dispatch kernel since the dispatch + # kernel will block the CPU so we want to queue up all the compute + # for the other ubatch before the dispatch kernel starts. + dbo_yield_and_switch_from_compute_to_comm() + (num_tokens_per_rank, num_tokens_per_rdma_rank, dispatch_expert_num_tokens, is_token_in_rank, event) = self.buffer.get_dispatch_layout( @@ -104,7 +114,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ( token_data, expert_topk_ids, expert_topk_weights, - expert_num_tokens_per_expert_list, self.handle, event + expert_num_tokens_per_expert_list, handle, event ) = self.buffer.dispatch( x=token_data, handle=None, @@ -119,9 +129,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_alignment=1, config=self._get_dispatch_config(), previous_event=None, - async_finish=self.async_prepare, + async_finish=self.async_prepare and not dbo_enabled(), allocate_on_comm_stream=False) + # record the handle for this ubatch + a2a_idx = dbo_current_ubatch_id() + self.handles[a2a_idx] = handle + + dbo_switch_to_compute_sync() + return lambda: self._receiver( event, has_scales, @@ -146,7 +162,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): a1_scale: Optional[torch.Tensor], quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - if self.async_prepare: + if event.event is not None: event.current_stream_wait() if has_scales: @@ -207,7 +223,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> tuple[Callable, mk.ReceiverType]: + ) -> mk.ReceiverType: if apply_router_weight_on_input: topk = topk_ids.size(1) @@ -233,14 +249,13 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): a1q_scale = None a1_post_scale = quant_config.a1_scale - return (lambda *args: None, - self._do_dispatch(tokens=a1q, - token_scales=a1q_scale, - rank_topk_ids=topk_ids, - rank_topk_weights=topk_weights, - num_experts=num_experts, - a1_scale=a1_post_scale, - quant_config=quant_config)) + return self._do_dispatch(tokens=a1q, + token_scales=a1q_scale, + rank_topk_ids=topk_ids, + rank_topk_weights=topk_weights, + num_experts=num_experts, + a1_scale=a1_post_scale, + quant_config=quant_config) def prepare( self, @@ -252,10 +267,9 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - (_, receiver) = self.prepare_async(a1, topk_weights, topk_ids, - num_experts, expert_map, - apply_router_weight_on_input, - 
quant_config) + receiver = self.prepare_async(a1, topk_weights, topk_ids, num_experts, + expert_map, apply_router_weight_on_input, + quant_config) return receiver() def _finalize( @@ -269,7 +283,9 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_async: bool, ) -> Optional[Callable]: - assert self.handle is not None + a2a_idx = dbo_current_ubatch_id() + handle = self.handles[a2a_idx] + assert handle is not None # fused_expert_output can have 0 tokens - This happens when none of the # tokens from the all2all reach this EP rank. @@ -283,25 +299,35 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids=topk_ids, apply_router_weight_on_input=apply_router_weight_on_input, ) - + dbo_yield_and_switch_from_compute_to_comm() combined_x, _, event = self.buffer.combine( x=fused_expert_output, - handle=self.handle, + handle=handle, topk_weights=None, config=self._get_combine_config(), previous_event=None, - async_finish=do_async, + async_finish=do_async and not dbo_enabled(), allocate_on_comm_stream=False) + dbo_switch_to_compute() + if do_async: def _receiver(): - event.current_stream_wait() + if event.event is not None: + event.current_stream_wait() + dbo_switch_to_comm() # Respect inplace outputs. output.copy_(combined_x, non_blocking=True) - return lambda: _receiver() + # TODO(lucas): refactor the modular kernel so this will be + # handled there + dbo_yield_and_switch_from_comm_to_compute() + + return _receiver else: + # TODO(lucas): support this case with the refactored modular kernel + assert not dbo_enabled() # Respect inplace outputs. output.copy_(combined_x, non_blocking=True) return None diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 101fc8798c427..a9554291db69c 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -206,7 +206,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, do_async: bool, - ) -> Optional[Callable]: + ) -> tuple[Callable, Callable]: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") @@ -233,7 +233,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return_recv_hook=do_recv_hook, out=output) - return recv_hook + return recv_hook, lambda: None def finalize_async( self, @@ -243,8 +243,8 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids: torch.Tensor, apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, - ) -> Callable: - recv_hook = self._finalize( + ) -> tuple[Callable, Callable]: + return self._finalize( output, fused_expert_output, topk_weights, @@ -253,8 +253,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): weight_and_reduce_impl, do_async=True, ) - assert recv_hook is not None - return recv_hook def finalize( self, diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 5fce24018e647..4ba14196682a5 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -13,7 +13,8 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from 
vllm.model_executor.layers.fused_moe.utils import (  # yapf: disable
    _resize_cache, count_expert_num_tokens)
 from vllm.utils import cdiv
-from vllm.v1.worker.ubatching import (dbo_enabled, dbo_maybe_run_recv_hook,
+from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled,
+                                      dbo_maybe_run_recv_hook,
                                       dbo_register_recv_hook, dbo_yield)
 
 #
@@ -223,7 +224,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[Callable, ReceiverType]:
+    ) -> Union[tuple[Callable, ReceiverType], ReceiverType]:
         """
         Perform any quantization (and/or) dispatching needed for this kernel
         but do not wait for results from other workers.
@@ -239,10 +240,21 @@
         - apply_router_weight_on_input: When True, apply the weights to the
           activations, before quantization + dispatching.
 
-        Returns a callback that when invoked waits for results from other
-        workers and has the same return signature as `prepare`, e.g.
+        Returns a callback or a hook callback pair that when invoked waits for
+        results from other workers and has the same return signature as
+        `prepare`. If a hook is returned, it is a more lightweight check that
+        the recv is complete without doing extra work (used by DBO, will be
+        refactored in the very near future).
+
+        e.g.
 
-        receiver = obj.prepare_async(...)
+        ret = obj.prepare_async(...)
+
+        if isinstance(ret, tuple):
+            hook, receiver = ret
+            hook()
+
+        if hook is not None:
         a, a_scales, expert_meta, topk_ids, topk_weights = receiver()
 
         is equivalent to:
@@ -284,7 +296,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: TopKWeightAndReduce,
-    ) -> Callable:
+    ) -> Union[tuple[Callable, Callable], Callable]:
         """
         Perform any combine plus apply weights and perform a reduction on the
         fused experts output but do not wait for results from other workers.
@@ -298,11 +310,17 @@
         - weight_and_reduce_impl: An optional TopKWeightAndReduce
           implementation.
 
-        Returns a callback that when invoked waits for results from other
-        workers and has the same return signature as `finalize`, e.g.
+        Returns a callback or a hook callback pair that when invoked waits for
+        results from other workers and has the same return signature as
+        `finalize`. If a hook is returned, it is a more lightweight check that
+        the recv is complete without doing extra work (used by DBO, will be
+        refactored in the very near future).
 
-        receiver = obj.finalize_async(output, ...)
+        ret = obj.finalize_async(output, ...)
         ... output not valid yet ...
+        if isinstance(ret, tuple):
+            hook, receiver = ret
+            hook()
         receiver()
         ... output valid here ...
 
@@ -600,9 +618,23 @@ class FusedMoEModularKernel(torch.nn.Module):
     layer due to any layer specific state that may be used by the component
     objects.
     """
-    fused_out_buffer = SharedResizableBuffer()
-    workspace13_buffer = SharedResizableBuffer()
-    workspace2_buffer = SharedResizableBuffer()
+
+    class SharedBuffers:
+
+        def __init__(self) -> None:
+            self.fused_out = SharedResizableBuffer()
+            self.workspace13 = SharedResizableBuffer()
+            self.workspace2 = SharedResizableBuffer()
+
+    # Persistent buffers that are shared across `FusedMoEModularKernel`
+    # instances (layers), to save memory and allocations.
+
+    # We have two sets of buffers to support dual batch overlap (DBO) where each
+    # microbatch (ubatch) should use its own set of buffers to avoid
+    # cross-ubatch contamination.
+    # NOTE that memory is lazily allocated for these buffers, meaning that if
+    # DBO isn't being used, the second SharedBuffers will be empty.
+    shared_buffers: list[SharedBuffers] = [SharedBuffers(), SharedBuffers()]
 
     def __init__(
         self,
@@ -647,14 +679,18 @@ class FusedMoEModularKernel(torch.nn.Module):
             a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts,
             expert_tokens_meta)
 
+        # select per-ubatch buffers to avoid cross-ubatch reuse under DBO
+        ubatch_idx = dbo_current_ubatch_id()
+        buffers = self.shared_buffers[ubatch_idx]
+
         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
-        workspace13 = self.workspace13_buffer.get(workspace13_shape,
-                                                  device=a1.device,
-                                                  dtype=workspace_dtype)
-        workspace2 = self.workspace2_buffer.get(workspace2_shape,
-                                                device=a1.device,
-                                                dtype=workspace_dtype)
+        workspace13 = buffers.workspace13.get(workspace13_shape,
+                                              device=a1.device,
+                                              dtype=workspace_dtype)
+        workspace2 = buffers.workspace2.get(workspace2_shape,
+                                            device=a1.device,
+                                            dtype=workspace_dtype)
 
         assert fused_out is None or fused_out.shape == fused_out_shape, (
             f"fused_out {fused_out.shape} but expected {fused_out_shape}")
@@ -733,9 +769,11 @@ class FusedMoEModularKernel(torch.nn.Module):
         (_, _, fused_out_shape, _) = self.fused_experts.workspace_shapes(
             a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts,
             expert_tokens_meta)
-        fused_out = self.fused_out_buffer.get(fused_out_shape,
-                                              device=a1q.device,
-                                              dtype=a1.dtype)
+        ubatch_idx = dbo_current_ubatch_id()
+        buffers = self.shared_buffers[ubatch_idx]
+        fused_out = buffers.fused_out.get(fused_out_shape,
+                                          device=a1q.device,
+                                          dtype=a1.dtype)
 
         def slice_input_tensors(
             chunk_idx: int
@@ -868,6 +906,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             if not self.prepare_finalize.supports_async():
                 # We shouldn't be running an a2a kernel that doesn't
                 # support async prepare/finalize
+                # TODO(lucas): enable in follow-up
                 assert not dbo_enabled()
                 (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
                  _expert_topk_weights) = self.prepare_finalize.prepare(
@@ -883,7 +922,7 @@
             else:
                 # Overlap shared expert compute with all2all dispatch.
                 dbo_maybe_run_recv_hook()
-                hook, receiver = self.prepare_finalize.prepare_async(
+                prepare_ret = self.prepare_finalize.prepare_async(
                     a1,
                     topk_weights,
                     topk_ids,
@@ -893,13 +932,21 @@
                     self.fused_experts.quant_config,
                 )
 
-                # If DBO is being used, register the hook with the ubatch context
-                # and call it in dbo_maybe_run_recv_hook instead of passing it to
-                # the receiver.
-                dbo_register_recv_hook(hook)
-                dbo_yield()
-                if not dbo_enabled():
-                    hook()
+                # TODO(lucas): refactor this in the alternative schedules followup
+                # currently unpack if we have hook + receiver pair or just
+                # receiver (see finalize_async docstring)
+                hook, receiver = prepare_ret \
+                    if isinstance(prepare_ret, tuple) else (None, prepare_ret)
+
+                if hook is not None:
+                    if dbo_enabled():
+                        # If DBO is being used, register the hook with the ubatch
+                        # context and call it in dbo_maybe_run_recv_hook instead of
+                        # passing it to the receiver.
+ dbo_register_recv_hook(hook) + dbo_yield() + else: + hook() (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, _expert_topk_weights) = receiver() @@ -952,7 +999,7 @@ class FusedMoEModularKernel(torch.nn.Module): if self.shared_experts is not None: shared_output = self.shared_experts(a1) else: - recv_hook = self.prepare_finalize.finalize_async( + finalize_ret = self.prepare_finalize.finalize_async( output, fused_out, topk_weights, @@ -964,11 +1011,23 @@ class FusedMoEModularKernel(torch.nn.Module): if self.shared_experts is not None: shared_output = self.shared_experts(a1) - assert recv_hook is not None - dbo_register_recv_hook(recv_hook) - dbo_yield() - if not dbo_enabled(): - recv_hook() + # TODO(lucas): refactor this in the alternative schedules followup + # currently unpack if we have hook + receiver pair or just + # receiver (see finalize_async docstring) + hook, receiver = finalize_ret \ + if isinstance(finalize_ret, tuple) else (None, finalize_ret) + + if hook is not None: + if dbo_enabled(): + # If DBO is being used, register the hook with the ubatch + # context and call it in dbo_maybe_run_recv_hook instead of + # passing it to the receiver. + dbo_register_recv_hook(hook) + dbo_yield() + else: + hook() + + receiver() if self.shared_experts is None: return output diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 6ef489f5a7a28..f837439f953e8 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -107,19 +107,57 @@ def _make_metadata_with_slice( the requests included in ubatch_slice """ + assert not ubatch_slice.is_empty(), ( + f"Ubatch slice {ubatch_slice} is empty") + request_slice = ubatch_slice.request_slice token_slice = ubatch_slice.token_slice + start_locs = attn_metadata.query_start_loc_cpu + first_req = request_slice.start + first_tok = token_slice.start + last_req = request_slice.stop - 1 + last_tok = token_slice.stop - 1 + + assert start_locs[first_req] <= first_tok < start_locs[first_req + 1], \ + "Token slice start outside of first request" + assert start_locs[last_req] <= last_tok < start_locs[last_req+1], \ + "Token slice end outside of last request" + + # If the "middle" request has tokens in both ubatches, we have to split it. + # If ubatch_slice is the first ubatch then we will be splitting the last + # request. 
If it's the second microbatch, then we will be splitting the + # first request + splits_first_request = first_tok > start_locs[first_req] + splits_last_request = last_tok < start_locs[last_req + 1] - 1 + + query_start_loc_cpu = slice_query_start_locs(start_locs, request_slice) query_start_loc = slice_query_start_locs(attn_metadata.query_start_loc, request_slice) + assert len(query_start_loc) >= 2, ( f"query_start_loc must have at least 2 elements, " f"got {len(query_start_loc)}") - query_start_loc_cpu = slice_query_start_locs( - attn_metadata.query_start_loc_cpu, request_slice) + if splits_first_request: + tokens_skipped = first_tok - start_locs[first_req] + query_start_loc[1:] -= tokens_skipped + query_start_loc_cpu[1:] -= tokens_skipped seq_lens = attn_metadata.seq_lens[request_slice] seq_lens_cpu = attn_metadata.seq_lens_cpu[request_slice] + + if splits_last_request: + tokens_skipped = query_start_loc_cpu[-1] - token_slice.stop + query_start_loc[-1] -= tokens_skipped + query_start_loc_cpu[-1] -= tokens_skipped + + # Make sure we don't modify the seq_lens tensors + # (not cudagraph compatible) + seq_lens = seq_lens.clone() + seq_lens_cpu = seq_lens_cpu.clone() + seq_lens[-1] -= tokens_skipped + seq_lens_cpu[-1] -= tokens_skipped + max_seq_len = int(seq_lens_cpu.max()) num_computed_tokens_cpu = attn_metadata.num_computed_tokens_cpu[ request_slice] @@ -167,6 +205,7 @@ def split_attn_metadata( for ubatch_slice in ubatch_slices: results.append( _make_metadata_with_slice(ubatch_slice, common_attn_metadata)) + return results @@ -696,7 +735,6 @@ def split_decodes_and_prefills( return num_reqs, 0, num_tokens, 0 first_prefill = is_prefill.int().argmax(dim=-1).item() - assert torch.all(query_lens[first_prefill:] > decode_threshold) assert torch.all(query_lens[:first_prefill] <= decode_threshold) num_decodes = first_prefill num_prefills = num_reqs - num_decodes diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index dc97d5c8f39d4..a0f40828d42f5 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -30,7 +30,6 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch -from vllm.v1.worker.ubatching import dbo_current_ubatch_id logger = init_logger(__name__) @@ -192,9 +191,8 @@ class EagleProposer: assert self.runner is not None # FIXME: need to consider multiple kv_cache_groups - ubatch_id = dbo_current_ubatch_id() attn_metadata_builder = \ - self.runner.attn_groups[0][0].metadata_builders[ubatch_id] + self.runner.attn_groups[0][0].get_metadata_builder() attn_metadata = attn_metadata_builder.build_for_drafting( common_attn_metadata=common_attn_metadata, draft_index=0) @@ -330,7 +328,7 @@ class EagleProposer: # Rebuild attention metadata attn_metadata_builder = \ - self.runner.attn_groups[0][0].metadata_builders[ubatch_id] + self.runner.attn_groups[0][0].get_metadata_builder() attn_metadata = attn_metadata_builder\ .build_for_drafting(common_attn_metadata=common_attn_metadata, draft_index=token_index + 1) @@ -538,9 +536,8 @@ class EagleProposer: hidden_states: torch.Tensor, common_attn_metadata: CommonAttentionMetadata, ) -> list[torch.Tensor]: - ubatch_id = dbo_current_ubatch_id() tree_attn_metadata_builder = \ - self.runner.attn_groups[0][0].metadata_builders[ubatch_id] + self.runner.attn_groups[0][0].get_metadata_builder() assert isinstance(tree_attn_metadata_builder, 
TreeAttentionMetadataBuilder) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 89b9a3c34f2ac..ed324138c6fe1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -96,7 +96,8 @@ from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from vllm.v1.worker.ubatch_splitting import get_dp_padding_ubatch, ubatch_split +from vllm.v1.worker.ubatch_splitting import (check_ubatch_thresholds, + ubatch_split) from vllm.v1.worker.ubatch_utils import UBatchSlice, UBatchSlices from vllm.v1.worker.utils import is_residual_scattered_for_sp @@ -1032,7 +1033,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_tokens_padded = num_tokens_unpadded + self.get_local_padding( num_tokens_unpadded) ubatch_slices, num_tokens_after_padding = \ - ubatch_split(max_num_scheduled_tokens, + ubatch_split(num_scheduled_tokens, num_tokens_unpadded, num_tokens_padded, self.vllm_config) @@ -1206,7 +1207,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ubatch_slices, common_attn_metadata) for ubid, common_attn_metadata in enumerate( common_attn_metadata_list): - assert common_attn_metadata.max_query_len == 1 attn_metadata_i = (attn_group.get_metadata_builder( ubatch_id=ubid).build( common_prefix_len=common_prefix_len, @@ -2182,9 +2182,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) = self._preprocess(scheduler_output, intermediate_tensors, ubatch_slices, num_tokens_after_padding) - if ubatch_slices is not None: - num_input_tokens = num_input_tokens // 2 - uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( num_scheduled_tokens @@ -2194,6 +2191,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cudagraph_runtime_mode, batch_descriptor = \ self.cudagraph_dispatcher.dispatch(batch_descriptor) + # This is currently to get around the assert in the DPMetadata + # where it wants `num_tokens_across_dp` to align with `num_tokens` + if ubatch_slices is not None: + num_input_tokens = ubatch_slices[0].num_tokens + # Run the model. # Use persistent buffers for CUDA graphs. with (set_forward_context( @@ -2821,7 +2823,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, force_attention: bool = False, uniform_decode: bool = False, - allow_microbatching: bool = False, + allow_microbatching: bool = True, skip_eplb: bool = False, is_profile: bool = False, create_mixed_batch: bool = False, @@ -2847,32 +2849,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): (1 token) and prefill (multiple tokens) requests. 
remove_lora: If False, dummy LoRAs are not destroyed after the run """ - ubatch_enabled = self.parallel_config.enable_dbo - num_tokens_across_dp = None - num_pad = 0 - should_ubatch = False - if ubatch_enabled: - should_ubatch = num_tokens >= \ - self.parallel_config.dbo_decode_token_threshold and \ - allow_microbatching - - (should_ubatch, num_tokens_across_dp) = get_dp_padding_ubatch( - num_tokens, num_tokens, should_ubatch, self.vllm_config) - - # Currently the dummy run should only be ubatching during - # cuda graph capture, meaning all DP ranks should already - # have the same batch size - if num_tokens_across_dp is not None: - assert int(num_tokens_across_dp[0]) == num_tokens // 2 - assert cudagraph_runtime_mode in { CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL } - if not should_ubatch: - num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) - num_tokens += num_pad - # If cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.separate_routine(). This means that we are using # different graphs and/or modes for mixed prefill-decode batches vs. @@ -2888,10 +2868,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # for GQA/MQA. max_query_len = self.uniform_decode_query_len if uniform_decode else \ num_tokens - if allow_microbatching: - assert self.uniform_decode_query_len == 1 - assert uniform_decode is True - assert max_query_len == 1 # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively @@ -2930,20 +2906,31 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(num_scheduled_tokens_list) == num_reqs num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) + total_num_scheduled_tokens = int(num_scheduled_tokens.sum()) ubatch_slices = None + num_tokens_after_padding = None + # We currently only microbatch if the number of tokens is # over a certain threshold. - if should_ubatch: - # We only support decode-only cudagraphs - assert num_reqs == num_tokens - assert num_tokens % 2 == 0 - ubatch_slices = [ - UBatchSlice(slice(0, num_reqs // 2), slice(0, - num_tokens // 2)), - UBatchSlice(slice(num_reqs // 2, num_reqs), - slice(num_tokens // 2, num_tokens)) - ] + if self.parallel_config.enable_dbo and allow_microbatching: + ubatch_slices, num_tokens_after_padding = ubatch_split( + num_scheduled_tokens, + total_num_scheduled_tokens, + total_num_scheduled_tokens, + self.vllm_config, + ) + + # If we failed to microbatch, currently need to resynchronize + # TODO(lucas,sage): we should be able to avoid this second sync by + # refactoring `get_dp_padding_ubatch` and `get_dp_padding` into + # a single `coordinate_batch_across_dp` function. 
+ if num_tokens_after_padding is None: + num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) + num_tokens_after_padding = num_tokens + num_pad + else: + num_tokens_across_dp = num_tokens_after_padding + num_tokens_after_padding = int(num_tokens_after_padding[0].item()) attn_metadata: Optional[PerLayerAttnMetadata] = None @@ -2966,6 +2953,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.seq_lens.np[num_reqs:] = 0 self.seq_lens.copy_to_gpu() + cum_num_tokens, _ = self._get_cumsum_and_arange( + num_scheduled_tokens) + self.query_start_loc.np[1:num_reqs + 1] = cum_num_tokens + self.query_start_loc.copy_to_gpu() + for kv_cache_group_id, kv_cache_group_spec in enumerate( self.kv_cache_config.kv_cache_groups): common_attn_metadata = CommonAttentionMetadata( @@ -3060,7 +3052,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, self.vllm_config, - num_tokens=num_tokens, + num_tokens=num_tokens_after_padding, num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, @@ -3395,56 +3387,51 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): desc="Capturing CUDA graphs ({}, {})".format( "decode" if uniform_decode else "mixed prefill-decode", cudagraph_runtime_mode.name)) - enable_dbo = self.parallel_config.enable_dbo - # DBO Only supports running Full cudagraphs with uniform - # decode lengths - if enable_dbo and uniform_decode: - for num_tokens in compilation_cases: - # If the number of tokens is greater than the microbatching - # threshold, don't generate a microbatched cudagraph - if (num_tokens - < self.parallel_config.dbo_decode_token_threshold): - continue - # Warmup + # We skip EPLB here since we don't want to record dummy metrics + for num_tokens in compilation_cases: + # We currently only capture ubatched graphs when its a FULL + # cudagraph and for uniform decode batches. + capture_ubatched_graph = self.parallel_config.enable_dbo \ + and cudagraph_runtime_mode == CUDAGraphMode.FULL \ + and uniform_decode \ + and check_ubatch_thresholds( + config=self.vllm_config.parallel_config, + num_tokens=num_tokens, + uniform_decode=uniform_decode, + ) + + # Currently we capture both microbatched and non-microbatched + # graphs when capture_ubatched_graph is True, this is because + # occasionally we will be forced out of microbatching due to other + # DP ranks not microbatching (usually caused by an empty second + # microbatch; once we resolve this, we can remove the + # non-microbatched graph capture). + allow_microbatching_options = [True, False] if \ + capture_ubatched_graph else [False] + for allow_microbatching in allow_microbatching_options: for _ in range( self.compilation_config.cudagraph_num_of_warmups): + # Use CUDAGraphRuntimeStyle.NONE (default) for warmup. + # But be careful, warm up with `NONE`is orthogonal to + # if we want to warm up attention or not. This is + # different from the case where `FULL` implies capture + # attention while `PIECEWISE` implies no attention. 
force_attention = ( cudagraph_runtime_mode == CUDAGraphMode.FULL) self._dummy_run(num_tokens, cudagraph_runtime_mode=CUDAGraphMode.NONE, force_attention=force_attention, - uniform_decode=True, - allow_microbatching=True, - skip_eplb=True) - - # Graph Capture + uniform_decode=uniform_decode, + allow_microbatching=allow_microbatching, + skip_eplb=True, + remove_lora=False) self._dummy_run(num_tokens, - cudagraph_runtime_mode=CUDAGraphMode.FULL, - uniform_decode=True, - allow_microbatching=True, - skip_eplb=True) - # We skip EPLB here since we don't want to record dummy metrics - for num_tokens in compilation_cases: - for _ in range(self.compilation_config.cudagraph_num_of_warmups): - # Use CUDAGraphRuntimeStyle.NONE (default) for warmup. - # But be careful, warm up with `NONE`is orthogonal to - # if we want to warm up attention or not. This is - # different from the case where `FULL` implies capture - # attention while `PIECEWISE` implies no attention. - force_attention = ( - cudagraph_runtime_mode == CUDAGraphMode.FULL) - self._dummy_run(num_tokens, - cudagraph_runtime_mode=CUDAGraphMode.NONE, - force_attention=force_attention, + cudagraph_runtime_mode=cudagraph_runtime_mode, uniform_decode=uniform_decode, + allow_microbatching=allow_microbatching, skip_eplb=True, remove_lora=False) - self._dummy_run(num_tokens, - cudagraph_runtime_mode=cudagraph_runtime_mode, - uniform_decode=uniform_decode, - skip_eplb=True, - remove_lora=False) self.maybe_remove_all_loras(self.lora_config) def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: @@ -3500,24 +3487,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_groups: list[AttentionGroup] = [] for (attn_backend, kv_cache_spec), layer_names in attn_backends_map.items(): - attn_metadata_builders = [] - attn_metadata_builders.append(attn_backend.get_builder_cls()( - kv_cache_spec, + attn_group = AttentionGroup.create_with_metadata_builders( + attn_backend, layer_names, + kv_cache_spec, self.vllm_config, self.device, - )) - if self.parallel_config.enable_dbo: - attn_metadata_builders.append( - attn_backend.get_builder_cls()( - kv_cache_spec, - layer_names, - self.vllm_config, - self.device, - )) - attn_group = AttentionGroup(attn_backend, - attn_metadata_builders, - layer_names, kv_cache_spec) + num_metadata_builders=1 + if not self.parallel_config.enable_dbo else 2, + ) + attn_groups.append(attn_group) return attn_groups diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index 5012ad0483c84..bfc3743ea4179 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -1,25 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses import threading +from dataclasses import dataclass from typing import Any, Callable, Optional import torch +import vllm.envs as envs from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.config import CUDAGraphMode, VllmConfig +from vllm.distributed import get_ep_group from vllm.forward_context import (create_forward_context, get_forward_context, override_forward_context) from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from vllm.utils import has_deep_gemm from vllm.v1.worker.ubatching import UBatchContext, make_ubatch_contexts logger = init_logger(__name__) -@dataclasses.dataclass +@dataclass class UbatchMetadata: context: UBatchContext 
input_ids: torch.Tensor @@ -29,13 +32,55 @@ class UbatchMetadata: num_tokens: int -@dataclasses.dataclass +@dataclass class CUDAGraphMetaData: cudagraph: torch.cuda.CUDAGraph ubatch_metadata: UbatchMetadata outputs: Optional[Any] = None +class SMControlContextManager: + + def __init__(self, comm_sms: int, set_comm_sms: Callable[[int], None], + set_compute_sms: Callable[[int], None]): + """ + Context manager for controlling SM (Streaming Multiprocessor) + allocation. Upon entering the context, it sets the number of SMs + allocated for communication and computation to comm_sms and + total_sms - comm_sms respectively. Upon exiting, it restores the + allocation to use all available SMs (i.e. total_sms). + + Args: + comm_sms (int): The number of SMs to allocate for communication. + (The remainder will be used for computation.) + set_comm_sms (Callable[[int], None]): + A function that sets the number of SMs for communication. + set_compute_sms (Callable[[int], None]): + A function that sets the number of SMs for computation. + """ + + assert current_platform.is_cuda(), \ + "SM control is currently only supported on CUDA" + + props = torch.cuda.get_device_properties(torch.cuda.current_device()) + total_sms = props.multi_processor_count + + assert comm_sms < total_sms + self.total_sms = total_sms + self.compute_sms = total_sms - comm_sms + self.comm_sms = comm_sms + self.set_comm_sms = set_comm_sms + self.set_compute_sms = set_compute_sms + + def __enter__(self): + self.set_comm_sms(self.comm_sms) + self.set_compute_sms(self.compute_sms) + + def __exit__(self, exc_type, exc_value, traceback): + self.set_comm_sms(self.total_sms) + self.set_compute_sms(self.total_sms) + + class UBatchWrapper: def __init__(self, runnable: Callable, vllm_config: VllmConfig, @@ -56,6 +101,35 @@ class UBatchWrapper: runnable, vllm_config, runtime_mode=runtime_mode) self.graph_pool = current_platform.get_global_graph_pool() + self.sm_control = self._create_sm_control_context(vllm_config) + + @staticmethod + def _create_sm_control_context(vllm_config: VllmConfig): + comm_sms = envs.VLLM_DBO_COMM_SMS + + set_comm_sms = lambda sms: None + if vllm_config.parallel_config.enable_expert_parallel: + # Currently only DeepEP highthroughput supports SM control so this + # only affects that case. + all2all_manager = get_ep_group( + ).device_communicator.all2all_manager + + if all2all_manager.max_sms_used() is not None: + comm_sms = min(comm_sms, all2all_manager.max_sms_used()) + + if comm_sms > 0: + set_comm_sms = lambda sms: all2all_manager.set_num_sms(sms) + + # TODO(lucas): support other kernels besides DeepGEMM + set_compute_sms = lambda sms: None + if has_deep_gemm() and comm_sms > 0: + import deep_gemm as dg + set_compute_sms = lambda sms: dg.set_num_sms(sms) + + return SMControlContextManager(comm_sms=comm_sms, + set_comm_sms=set_comm_sms, + set_compute_sms=set_compute_sms) + def __getattr__(self, key: str): # allow accessing the attributes of the runnable. 
if hasattr(self.runnable, key): @@ -282,8 +356,8 @@ class UBatchWrapper: dp_metadata=dp_metadata, batch_descriptor=batch_descriptor, cudagraph_runtime_mode=CUDAGraphMode.NONE) - - return self._capture_ubatches(ubatch_metadata, self.model) + with self.sm_control: + return self._capture_ubatches(ubatch_metadata, self.model) elif num_tokens in self.cudagraphs: cudagraph_metadata = self.cudagraphs[num_tokens] cudagraph_metadata.cudagraph.replay() @@ -300,4 +374,5 @@ class UBatchWrapper: dp_metadata=dp_metadata, batch_descriptor=batch_descriptor, cudagraph_runtime_mode=CUDAGraphMode.NONE) - return self._run_ubatches(ubatch_metadata, self.model) + with self.sm_control: + return self._run_ubatches(ubatch_metadata, self.model) diff --git a/vllm/v1/worker/ubatch_splitting.py b/vllm/v1/worker/ubatch_splitting.py index 650f0ec5138db..30acb14ff58a7 100644 --- a/vllm/v1/worker/ubatch_splitting.py +++ b/vllm/v1/worker/ubatch_splitting.py @@ -3,9 +3,10 @@ from typing import Optional +import numpy as np import torch -from vllm.config import VllmConfig +from vllm.config import ParallelConfig, VllmConfig from vllm.forward_context import DPMetadata from vllm.logger import init_logger from vllm.utils import round_up @@ -29,6 +30,16 @@ def should_ubatch_with_num_tokens( dp_size, dp_rank) +def check_ubatch_thresholds(config: ParallelConfig, num_tokens: int, + uniform_decode: bool) -> bool: + if not config.enable_dbo: + return False + if uniform_decode: + return num_tokens >= config.dbo_decode_token_threshold + else: + return num_tokens >= config.dbo_prefill_token_threshold + + def get_dp_padding_ubatch( num_tokens_unpadded: int, num_tokens_padded: int, should_attempt_ubatching: bool, @@ -95,9 +106,37 @@ def get_dp_padding_ubatch( dtype=torch.int32) return should_ubatch, num_tokens_after_padding +def create_ubatch_slices(num_scheduled_tokens: np.ndarray, split_point: int) \ + -> UBatchSlices: + # TODO(lucas): Refactor the gpu_model_runner.py so we can pass + # in cu_num_tokens directly (i.e. 
query_start_loc) + cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32) + np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:]) + + first_ubatch_token_slice = slice(0, split_point) + second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1]) + + # Determine request slices using exclusive stop semantics + # First ubatch includes requests whose tokens overlap [0, split_point) + first_ubatch_req_stop = int( + np.searchsorted(cu_num_tokens, split_point, side="left")) + first_ubatch_req_slice = slice(0, first_ubatch_req_stop) + + # Second ubatch starts at the request that contains the split_point + # or the request starting exactly at split_point (if on boundary) + second_ubatch_req_start = int( + np.searchsorted(cu_num_tokens, split_point, side="right") - 1) + second_ubatch_req_slice = slice(second_ubatch_req_start, + len(cu_num_tokens) - 1) + + return [ + UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice), + UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice) + ] + def ubatch_split( - max_num_scheduled_tokens: int, + num_scheduled_tokens_per_request: np.ndarray, num_tokens_unpadded: int, num_tokens_padded: int, vllm_config: VllmConfig, @@ -122,17 +161,20 @@ def ubatch_split( return (None, None) # Check preconditions for microbatching - should_attempt_ubatching = \ - parallel_config.enable_dbo and \ - num_tokens_unpadded >= \ - parallel_config.dbo_decode_token_threshold \ - and max_num_scheduled_tokens == 1 + should_attempt_ubatching = check_ubatch_thresholds( + parallel_config, + num_tokens_unpadded, + vllm_config, + ) # Don't microbatch unless every other DP worker is also microbatching - num_tokens_after_padding = None - (should_ubatch, num_tokens_after_padding) = get_dp_padding_ubatch( - num_tokens_unpadded, num_tokens_padded, should_attempt_ubatching, - vllm_config) + should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch( + num_tokens_unpadded, + num_tokens_padded, + should_attempt_ubatching, + vllm_config, + ) + if not should_ubatch: return (None, None) @@ -141,15 +183,9 @@ def ubatch_split( # to the second ubatch in pad_out_ubatch_slice after attention # metadata creation assert num_tokens_after_padding is not None - total_num_tokens_per_ubatch = int(num_tokens_after_padding[0].item()) - padded_first_ubatch_slice = slice(0, total_num_tokens_per_ubatch) - padded_second_ubatch_slice = slice(total_num_tokens_per_ubatch, - num_tokens_unpadded) + token_split_point = int(num_tokens_after_padding[0].item()) - # Note there's an assumption here that there's 1 token per request - ubatch_slices = [ - UBatchSlice(padded_first_ubatch_slice, padded_first_ubatch_slice), - UBatchSlice(padded_second_ubatch_slice, padded_second_ubatch_slice) - ] + ubatch_slices = create_ubatch_slices(num_scheduled_tokens_per_request, + token_split_point) return (ubatch_slices, num_tokens_after_padding) diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py index 6716d171cc701..33d58aa948434 100644 --- a/vllm/v1/worker/ubatch_utils.py +++ b/vllm/v1/worker/ubatch_utils.py @@ -10,6 +10,14 @@ class UBatchSlice: request_slice: slice token_slice: slice + def is_empty(self) -> bool: + return self.request_slice.start == self.request_slice.stop \ + or self.token_slice.start == self.token_slice.stop + + @property + def num_tokens(self) -> int: + return self.token_slice.stop - self.token_slice.start + UBatchSlices: TypeAlias = list[UBatchSlice] diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py index 
9aeaa9909dc81..c26cb07123a53 100644 --- a/vllm/v1/worker/ubatching.py +++ b/vllm/v1/worker/ubatching.py @@ -51,8 +51,8 @@ class UBatchContext: self.cpu_wait_event.wait() self.cpu_wait_event.clear() self._restore_context() - # Assume we start on the compute stream - assert current_stream() == self.compute_stream + # Assume we want to start on the compute stream + self.update_stream(self.compute_stream) return self def __exit__(self, exc_type, exc_val, exc_tb): @@ -62,17 +62,15 @@ class UBatchContext: self.maybe_run_recv_hook() self.cpu_signal_event.set() self.cpu_wait_event.clear() - self.current_stream = self.compute_stream - torch.cuda.set_stream(self.current_stream) return False def _restore_context(self): forward_context._forward_context = self.forward_context - torch.cuda.set_stream(self.current_stream) def update_stream(self, stream): self.current_stream = stream - torch.cuda.set_stream(self.current_stream) + if current_stream() != self.current_stream: + torch.cuda.set_stream(self.current_stream) def _signal_comm_done(self): self.gpu_comm_done_event.record(self.comm_stream) @@ -99,9 +97,20 @@ class UBatchContext: self.cpu_wait_event.clear() self._restore_context() + def switch_to_comm(self): + self.update_stream(self.comm_stream) + + def switch_to_compute(self): + self.update_stream(self.compute_stream) + def switch_to_comm_sync(self): self._signal_compute_done() self.update_stream(self.comm_stream) + self._wait_compute_done() + + def switch_to_compute_sync(self): + self._signal_comm_done() + self.update_stream(self.compute_stream) self._wait_comm_done() def maybe_run_recv_hook(self): @@ -112,8 +121,7 @@ class UBatchContext: def yield_(self): self.current_stream = current_stream() self._cpu_yield() - if self.current_stream != current_stream(): - self.update_stream(self.current_stream) + self.update_stream(self.current_stream) def yield_and_switch_from_compute_to_comm(self): assert current_stream() == self.compute_stream @@ -153,15 +161,20 @@ def _register_ubatch_function(func): return wrapper +dbo_maybe_run_recv_hook = _register_ubatch_function( + UBatchContext.maybe_run_recv_hook) +dbo_yield = _register_ubatch_function(UBatchContext.yield_) dbo_yield_and_switch_from_compute_to_comm = _register_ubatch_function( UBatchContext.yield_and_switch_from_compute_to_comm) dbo_yield_and_switch_from_comm_to_compute = _register_ubatch_function( UBatchContext.yield_and_switch_from_comm_to_compute) -dbo_yield = _register_ubatch_function(UBatchContext.yield_) -dbo_maybe_run_recv_hook = _register_ubatch_function( - UBatchContext.maybe_run_recv_hook) +dbo_switch_to_comm = _register_ubatch_function(UBatchContext.switch_to_comm) +dbo_switch_to_compute = _register_ubatch_function( + UBatchContext.switch_to_compute) dbo_switch_to_comm_sync = _register_ubatch_function( UBatchContext.switch_to_comm_sync) +dbo_switch_to_compute_sync = _register_ubatch_function( + UBatchContext.switch_to_compute_sync) def dbo_register_recv_hook(recv_hook): diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index af922f9979d1a..553d33e27203d 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -130,15 +130,32 @@ class MultiModalBudget: @dataclass class AttentionGroup: backend: type[AttentionBackend] + # When ubatching is enabled we will have a metadata builder for each ubatch + # so that if they use internal persistant buffers for cudagraphs, and they + # won't have to worry about conflicting with the other ubatches. 
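Returning to create_ubatch_slices above, a small worked example (token counts chosen purely for illustration) shows how a request that straddles the token split point ends up in both microbatches:

import numpy as np

num_scheduled_tokens = np.array([4, 4, 4], dtype=np.int32)
split_point = 6  # ubatch 0 gets tokens [0, 6), ubatch 1 gets tokens [6, 12)

cu = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu[1:])  # [0, 4, 8, 12]

first_req_stop = int(np.searchsorted(cu, split_point, side="left"))         # 2
second_req_start = int(np.searchsorted(cu, split_point, side="right")) - 1  # 1

# Request 1 spans tokens [4, 8) and therefore appears in both ubatches:
assert (first_req_stop, second_req_start) == (2, 1)
# ubatch 0 -> requests slice(0, 2), tokens slice(0, 6)
# ubatch 1 -> requests slice(1, 3), tokens slice(6, 12)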
metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] kv_cache_spec: KVCacheSpec + @staticmethod + def create_with_metadata_builders( + backend: type[AttentionBackend], + layer_names: list[str], + kv_cache_spec: KVCacheSpec, + vllm_config: VllmConfig, + device: torch.device, + num_metadata_builders: int = 1, + ) -> 'AttentionGroup': + metadata_builders = [ + backend.get_builder_cls()(kv_cache_spec, layer_names, vllm_config, + device) + for _ in range(num_metadata_builders) + ] + return AttentionGroup(backend, metadata_builders, layer_names, + kv_cache_spec) + def get_metadata_builder(self, - ubatch_id: Optional[int] = None - ) -> AttentionMetadataBuilder: - if ubatch_id is None: - return self.metadata_builders[0] + ubatch_id: int = 0) -> AttentionMetadataBuilder: assert len(self.metadata_builders) > ubatch_id return self.metadata_builders[ubatch_id] From 875d6def90f737f5f39eb484a29a940b117a9c79 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:07:30 +0100 Subject: [PATCH 142/185] Add backward compatibility for `GuidedDecodingParams` (#25422) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../llm/test_struct_output_generate.py | 24 ++++++++++++- vllm/sampling_params.py | 36 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 4b0f3b2d9967e..e2c686928cea1 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -5,6 +5,7 @@ from __future__ import annotations import json +from dataclasses import fields from enum import Enum from typing import TYPE_CHECKING, Any @@ -21,7 +22,8 @@ from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager -from vllm.sampling_params import SamplingParams, StructuredOutputsParams +from vllm.sampling_params import (GuidedDecodingParams, SamplingParams, + StructuredOutputsParams) if TYPE_CHECKING: from vllm.config import TokenizerMode @@ -89,6 +91,26 @@ def _load_json(s: str, backend: str) -> str: return json.loads(s) +def test_guided_decoding_deprecated(): + with pytest.warns(DeprecationWarning, + match="GuidedDecodingParams is deprecated.*"): + guided_decoding = GuidedDecodingParams(json_object=True) + + structured_outputs = StructuredOutputsParams(json_object=True) + assert fields(guided_decoding) == fields(structured_outputs) + + with pytest.warns(DeprecationWarning, + match="guided_decoding is deprecated.*"): + sp1 = SamplingParams(guided_decoding=guided_decoding) + + with pytest.warns(DeprecationWarning, + match="guided_decoding is deprecated.*"): + sp2 = SamplingParams.from_optional(guided_decoding=guided_decoding) + + assert sp1 == sp2 + assert sp1.structured_outputs == guided_decoding + + @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, speculative_config", diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index efe70d019ccc6..f424682f9dfa0 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampling parameters for text generation.""" import copy +import warnings from dataclasses import field from enum import Enum, IntEnum from 
functools import cached_property @@ -59,6 +60,19 @@ class StructuredOutputsParams: f"but multiple are specified: {self.__dict__}") +@dataclass +class GuidedDecodingParams(StructuredOutputsParams): + + def __post_init__(self): + warnings.warn( + "GuidedDecodingParams is deprecated. This will be removed in " + "v0.12.0 or v1.0.0, which ever is soonest. Please use " + "StructuredOutputsParams instead.", + DeprecationWarning, + stacklevel=2) + return super().__post_init__() + + class RequestOutputKind(Enum): # Return entire output so far in every RequestOutput CUMULATIVE = 0 @@ -179,6 +193,8 @@ class SamplingParams( # Fields used to construct logits processors structured_outputs: Optional[StructuredOutputsParams] = None """Parameters for configuring structured outputs.""" + guided_decoding: Optional[GuidedDecodingParams] = None + """Deprecated alias for structured_outputs.""" logit_bias: Optional[dict[int, float]] = None """If provided, the engine will construct a logits processor that applies these logit biases.""" @@ -227,6 +243,7 @@ class SamplingParams( ge=-1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, structured_outputs: Optional[StructuredOutputsParams] = None, + guided_decoding: Optional[GuidedDecodingParams] = None, logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, allowed_token_ids: Optional[list[int]] = None, extra_args: Optional[dict[str, Any]] = None, @@ -238,6 +255,15 @@ class SamplingParams( int(token): min(100.0, max(-100.0, bias)) for token, bias in logit_bias.items() } + if guided_decoding is not None: + warnings.warn( + "guided_decoding is deprecated. This will be removed in " + "v0.12.0 or v1.0.0, which ever is soonest. Please use " + "structured_outputs instead.", + DeprecationWarning, + stacklevel=2) + structured_outputs = guided_decoding + guided_decoding = None return SamplingParams( n=1 if n is None else n, @@ -334,6 +360,16 @@ class SamplingParams( # eos_token_id is added to this by the engine self._all_stop_token_ids.update(self.stop_token_ids) + if self.guided_decoding is not None: + warnings.warn( + "guided_decoding is deprecated. This will be removed in " + "v0.12.0 or v1.0.0, which ever is soonest. 
Please use " + "structured_outputs instead.", + DeprecationWarning, + stacklevel=2) + self.structured_outputs = self.guided_decoding + self.guided_decoding = None + def _verify_args(self) -> None: if not isinstance(self.n, int): raise ValueError(f"n must be an int, but is of " From f11e3c516be3d88733ea4b0c79f47e33cc319197 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Tue, 23 Sep 2025 12:11:34 -0400 Subject: [PATCH 143/185] [Kernels] Support blocked fp8 quantization for compressed tensors MoE (#25219) Signed-off-by: Bill Nell Co-authored-by: Michael Goin --- .../compressed_tensors_moe.py | 109 +++++++++++++++--- .../model_executor/layers/quantization/fp8.py | 22 ++-- .../layers/quantization/utils/fp8_utils.py | 6 + .../model_executor/warmup/deep_gemm_warmup.py | 4 +- 4 files changed, 112 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 85adae32f4cdc..10f9085be4d12 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -13,6 +13,7 @@ from compressed_tensors.quantization import (ActivationOrdering, import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops +from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, @@ -31,6 +32,9 @@ from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, select_nvfp4_gemm_impl) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + expert_weight_is_col_major, get_col_major_tma_aligned_tensor, + requant_weight_ue8m0_inplace) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_moe_marlin_supports_layer, marlin_make_workspace_new, marlin_moe_permute_scales) @@ -45,6 +49,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used logger = init_logger(__name__) @@ -505,10 +510,12 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): self.weight_quant.strategy == QuantizationStrategy.CHANNEL and self.input_quant.strategy == QuantizationStrategy.TOKEN) if not (per_tensor or per_channel): - raise ValueError( - "For FP8 Fused MoE layers, we require per tensor " - "or channelwise, dynamic per token quantization. 
Found " - f"{self.weight_quant}, {self.input_quant}") + assert self.weight_quant.strategy == QuantizationStrategy.BLOCK + self.weight_block_size = self.weight_quant.block_structure + assert self.weight_quant.dynamic is not None + else: + self.weight_block_size = None + self.block_quant = self.weight_block_size is not None self.static_input_scales = not self.input_quant.dynamic if self.static_input_scales and per_channel: @@ -519,7 +526,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): # For GPUs that lack FP8 hardware support, we can leverage the Marlin # kernel for fast weight-only FP8 quantization self.use_marlin = (not current_platform.has_device_capability(89) - or envs.VLLM_TEST_FORCE_FP8_MARLIN) + or envs.VLLM_TEST_FORCE_FP8_MARLIN + and not self.block_quant) # Disable marlin for rocm if current_platform.is_rocm(): self.use_marlin = False @@ -531,8 +539,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): # cutlass path self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100( self.weight_quant, self.input_quant) - self.use_cutlass = (quant_config._is_fp8_w8a8_sm90( - self.weight_quant, self.input_quant) or self.is_fp8_w8a8_sm100) + self.use_cutlass = not self.block_quant and ( + quant_config._is_fp8_w8a8_sm90(self.weight_quant, self.input_quant) + or self.is_fp8_w8a8_sm100) self.disable_expert_map = False def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -547,6 +556,31 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): params_dtype = torch.float8_e4m3fn + if self.block_quant: + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + tp_size = get_tensor_model_parallel_world_size() + block_n, block_k = ( + self.weight_block_size[0], + self.weight_block_size[1], + ) + # NOTE: To ensure proper alignment of the block-wise quantization + # scales, the output_size of the weights for both the gate and up + # layers must be divisible by block_n. + # Required by column parallel or enabling merged weights + if intermediate_size_per_partition % block_n != 0: + raise ValueError( + f"The output_size of gate's and up's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_n = {block_n}.") + if (tp_size > 1 + and intermediate_size_per_partition % block_k != 0): + # Required by row parallel + raise ValueError( + f"The input_size of down's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_k = {block_k}.") + # WEIGHTS w13_weight = torch.nn.Parameter(torch.empty( num_experts, @@ -602,6 +636,27 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): set_weight_attrs(w13_weight_scale, extra_weight_attrs) set_weight_attrs(w2_weight_scale, extra_weight_attrs) + elif self.weight_quant.strategy == QuantizationStrategy.BLOCK: + w13_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, + 2 * + ((intermediate_size_per_partition + block_n - 1) // block_n), + (hidden_size + block_k - 1) // block_k, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, (hidden_size + block_n - 1) // block_n, + (intermediate_size_per_partition + block_k - 1) // block_k, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. 
+ extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + # INPUT_SCALES if self.static_input_scales: w13_input_scale = torch.nn.Parameter(torch.ones( @@ -706,6 +761,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): del layer.w2_input_scale if self.use_cutlass: + assert self.weight_quant.strategy != QuantizationStrategy.BLOCK device = layer.w13_weight.device # ab_strides1 and c_strides2 are the same self.ab_strides1_c_strides2 = torch.full( @@ -724,6 +780,29 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): device=device, dtype=torch.int64) + if is_deep_gemm_e8m0_used() and self.block_quant: + assert layer.weight_block_size is not None + # Re-quantise the expert weights so their scales are UE8M0. + block_sz = tuple(layer.weight_block_size) + requant_weight_ue8m0_inplace( + layer.w13_weight.data, + layer.w13_weight_scale.data, + block_sz, + ) + requant_weight_ue8m0_inplace( + layer.w2_weight.data, + layer.w2_weight_scale.data, + block_sz, + ) + + # Ensure column-major TMA alignment expected by DeepGEMM. + if expert_weight_is_col_major(layer.w13_weight_scale): + layer.w13_weight_scale = get_col_major_tma_aligned_tensor( + layer.w13_weight_scale) + if expert_weight_is_col_major(layer.w2_weight_scale): + layer.w2_weight_scale = get_col_major_tma_aligned_tensor( + layer.w2_weight_scale) + def maybe_make_prepare_finalize( self) -> Optional[mk.FusedMoEPrepareAndFinalize]: if self.use_marlin or self.rocm_aiter_moe_enabled: @@ -777,9 +856,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): return experts # triton path - from vllm.model_executor.layers.fused_moe import TritonExperts - from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts) + from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 + BatchedTritonOrDeepGemmExperts) + from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts) assert not self.rocm_aiter_moe_enabled and not self.use_marlin @@ -790,14 +870,16 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): assert max_num_tokens_per_rank is not None logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__) - return BatchedTritonExperts( + return BatchedTritonOrDeepGemmExperts( max_num_tokens=max_num_tokens_per_rank, num_dispatchers=prepare_finalize.num_dispatchers(), quant_config=self.moe_quant_config, ) else: - logger.debug("TritonExperts(%s)", self.__class__.__name__) - return TritonExperts(self.moe_quant_config) + logger.debug("TritonOrDeepGemmExperts(%s)", + self.__class__.__name__) + return TritonOrDeepGemmExperts(self.moe_quant_config, + allow_deep_gemm=True) def get_fused_moe_quant_config( self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: @@ -816,6 +898,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): a2_scale=layer.w2_input_scale, per_act_token_quant=per_act_token, per_out_ch_quant=per_channel_quant, + block_shape=layer.weight_block_size, ) def apply( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index aec9c79f1ea82..2b24e052053c0 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -33,10 +33,10 @@ from 
vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import ( apply_fp8_block_linear, check_aiter_fp8_linear_support, create_fp8_input_scale, create_fp8_scale_parameter, - create_fp8_weight_parameter, get_col_major_tma_aligned_tensor, - maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy, - process_fp8_weight_tensor_strategy, requant_weight_ue8m0_inplace, - validate_fp8_block_shape) + create_fp8_weight_parameter, expert_weight_is_col_major, + get_col_major_tma_aligned_tensor, maybe_post_process_fp8_weight_block, + process_fp8_weight_block_strategy, process_fp8_weight_tensor_strategy, + requant_weight_ue8m0_inplace, validate_fp8_block_shape) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin, prepare_moe_fp8_layer_for_marlin) @@ -64,12 +64,6 @@ ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) -def _is_col_major(x: torch.Tensor) -> bool: - assert x.dim() == 3 - b, m, n = x.shape - return x.stride(0) == m * n and x.stride(1) == 1 and x.stride(2) == m - - class Fp8Config(QuantizationConfig): """Config class for FP8.""" @@ -660,10 +654,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): - if _is_col_major(layer.w13_weight_scale_inv): + if expert_weight_is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv) - if _is_col_major(layer.w2_weight_scale_inv): + if expert_weight_is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = \ get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv) @@ -811,10 +805,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) # Ensure column-major TMA alignment expected by DeepGEMM. 
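For reference, the column-major check used here (renamed to expert_weight_is_col_major in this patch) is purely stride-based; a toy tensor makes the condition concrete:

import torch

def is_col_major(x: torch.Tensor) -> bool:
    # Same stride test as expert_weight_is_col_major.
    b, m, n = x.shape
    return x.stride(0) == m * n and x.stride(1) == 1 and x.stride(2) == m

scales = torch.ones(4, 6, 8)                 # [num_experts, m, n], row-major
assert not is_col_major(scales)
col_major = scales.transpose(1, 2).contiguous().transpose(1, 2)
assert is_col_major(col_major)               # same shape, column-major strides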
- if _is_col_major(layer.w13_weight_scale_inv): + if expert_weight_is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = get_col_major_tma_aligned_tensor( layer.w13_weight_scale_inv) - if _is_col_major(layer.w2_weight_scale_inv): + if expert_weight_is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = get_col_major_tma_aligned_tensor( layer.w2_weight_scale_inv) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index fc12483de0c0e..d1d87b7ba12e2 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1014,3 +1014,9 @@ def apply_fp8_block_linear(layer: torch.nn.Module, input: torch.Tensor, cutlass_block_fp8_supported=cutlass_block_fp8_supported, use_aiter_and_is_supported=use_aiter_and_is_supported, ) + + +def expert_weight_is_col_major(x: torch.Tensor) -> bool: + assert x.dim() == 3 + b, m, n = x.shape + return x.stride(0) == m * n and x.stride(1) == 1 and x.stride(2) == m diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 4d1829cd228cd..f6df85a50238c 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -53,9 +53,9 @@ def _extract_data_from_fused_moe_module( """ assert isinstance(m, FusedMoE) w13 = m.w13_weight - w13_s = m.w13_weight_scale_inv + w13_s = getattr(m, "w13_weight_scale_inv", m.w13_weight_scale) w2 = m.w2_weight - w2_s = m.w2_weight_scale_inv + w2_s = getattr(m, "w2_weight_scale_inv", m.w2_weight_scale) num_topk = m.top_k assert isinstance(w13, torch.Tensor) From 2357480b1abefb2b7177046133080917e1449324 Mon Sep 17 00:00:00 2001 From: rivos-shreeasish Date: Tue, 23 Sep 2025 09:14:22 -0700 Subject: [PATCH 144/185] [BugFix] Fix UB in per_token_group_quant.cu (#24913) Signed-off-by: Shreeasish Kumar --- csrc/quantization/fp8/per_token_group_quant.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu index f5b40e35b6e5a..91d489fdef862 100644 --- a/csrc/quantization/fp8/per_token_group_quant.cu +++ b/csrc/quantization/fp8/per_token_group_quant.cu @@ -12,8 +12,8 @@ #include "../vectorization_utils.cuh" #include "../../dispatch_utils.h" -__device__ __forceinline__ float GroupReduceMax(float val, const int tid) { - unsigned mask = 0xffff; +__device__ __forceinline__ float GroupReduceMax(float val) { + unsigned mask = threadIdx.x % 32 >= 16 ? 
0xffff0000 : 0x0000ffff; val = fmaxf(val, __shfl_xor_sync(mask, val, 8)); val = fmaxf(val, __shfl_xor_sync(mask, val, 4)); @@ -86,7 +86,7 @@ __global__ void per_token_group_quant_8bit_kernel( threads_per_group, // stride in group scalar_op_cache); // scalar handler - local_absmax = GroupReduceMax(local_absmax, lane_id); + local_absmax = GroupReduceMax(local_absmax); float y_s = local_absmax / max_8bit; if constexpr (SCALE_UE8M0) { From 846197f505fb7b7861a9688e2a09b514b9d3133d Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 23 Sep 2025 12:44:37 -0400 Subject: [PATCH 145/185] [Log] Optimize kv cache memory log from Bytes to GiB (#25204) Signed-off-by: yewentao256 --- vllm/v1/worker/gpu_worker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index ffea9bb35513a..f59dacf13d85a 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -383,11 +383,13 @@ class Worker(WorkerBase): f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} " f"GiB for CUDAGraph memory. Replace gpu_memory_utilization " f"config with `--kv-cache-memory=" - f"{kv_cache_memory_bytes_to_requested_limit}` to fit into " - f"requested memory, or `--kv-cache-memory=" - f"{kv_cache_memory_bytes_to_gpu_limit}` to fully " + f"{kv_cache_memory_bytes_to_requested_limit}` " + f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit " + f"into requested memory, or `--kv-cache-memory=" + f"{kv_cache_memory_bytes_to_gpu_limit}` " + f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully " f"utilize gpu memory. Current kv cache memory in use is " - f"{int(self.available_kv_cache_memory_bytes)} bytes.") + f"{GiB(self.available_kv_cache_memory_bytes)} GiB.") logger.debug(msg) From 527821d191e05e0414695fa8aea24ecaeb3d609c Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Tue, 23 Sep 2025 09:45:39 -0700 Subject: [PATCH 146/185] Use macro guard CUDA functions for back compatibility in grouped_topk_kernel.cu (#25346) Signed-off-by: Ming Yang Signed-off-by: Rahul Tuli Co-authored-by: Rahul Tuli Co-authored-by: Claude Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> Co-authored-by: Ye (Charlotte) Qi --- csrc/moe/grouped_topk_kernels.cu | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index b5321f748e6be..c93f9d54d780c 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -418,6 +418,15 @@ __device__ inline T neg_inf() { return cuda_cast(-cuda::std::numeric_limits::infinity()); } +template +__device__ inline bool is_finite(const T val) { +#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800) + return cuda::std::isfinite(val); +#else + return isfinite(cuda_cast(val)); +#endif +} + template __device__ void topk_with_k2(T* output, T const* input, cg::thread_block_tile<32> const& tile, @@ -533,7 +542,7 @@ __global__ void group_idx_and_topk_idx_kernel( // calculate group_idx int32_t target_num_min = WARP_SIZE - n_group + topk_group; // The check is necessary to avoid abnormal input - if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) { + if (lane_id < n_group && is_finite(group_scores[lane_id])) { value = group_scores[lane_id]; } @@ -568,11 +577,10 @@ __global__ void group_idx_and_topk_idx_kernel( int32_t 
offset = i_group * num_experts_per_group; for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) { - T candidates = - (i < num_experts_per_group) && - cuda::std::isfinite(scores_with_bias[offset + i]) - ? scores_with_bias[offset + i] - : neg_inf(); + T candidates = (i < num_experts_per_group) && + is_finite(scores_with_bias[offset + i]) + ? scores_with_bias[offset + i] + : neg_inf(); queue.add(candidates, offset + i); } if (group_scores[i_group] == topk_group_value) { From 100b630a604aff9586bb5493c22b571ad1a19570 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 23 Sep 2025 18:52:40 +0200 Subject: [PATCH 147/185] [V1][Kernel] Add triton implementation for `reshape_and_cache_flash` (#24503) Signed-off-by: Burkhard Ringlein Co-authored-by: Chih-Chieh Yang Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .../benchmark_reshape_and_cache_flash.py | 78 ++++++-- tests/kernels/attention/test_cache.py | 27 ++- .../ops/triton_reshape_and_cache_flash.py | 176 ++++++++++++++++++ vllm/v1/attention/backends/triton_attn.py | 15 +- 4 files changed, 276 insertions(+), 20 deletions(-) create mode 100644 vllm/attention/ops/triton_reshape_and_cache_flash.py diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index d4648c18f31d5..0aace571064a0 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -9,6 +9,9 @@ import torch from tabulate import tabulate from vllm import _custom_ops as ops +from vllm.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash, +) from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import ( @@ -31,6 +34,8 @@ def run_benchmark( kv_cache_dtype: str, kv_cache_layout: str, num_iters: int, + implementation: str, + benchmark_mode: str, device: str = "cuda", ) -> float: """Return latency (seconds) for given num_tokens.""" @@ -38,6 +43,14 @@ def run_benchmark( if kv_cache_dtype == "fp8" and head_size % 16: raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") + if implementation not in ("cuda", "triton"): + raise ValueError( + f"Unsupported implementation: {implementation}. " + "Only 'cuda' and 'triton' are supported." + ) + if implementation == "triton" and kv_cache_layout == "HND": + return float("nan") # Triton does not support HND layout yet. + current_platform.seed_everything(42) torch.set_default_device(device) @@ -65,27 +78,49 @@ def run_benchmark( cache_layout=kv_cache_layout, ) key_cache, value_cache = key_caches[0], value_caches[0] + # to free unused memory + del key_caches, value_caches # compute per-kernel scaling factors for fp8 conversion (if used). 
k_scale = (key.amax() / 64.0).to(torch.float32) v_scale = (value.amax() / 64.0).to(torch.float32) + if implementation == "cuda": + function_under_test = lambda: ops.reshape_and_cache_flash( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + function_under_test = lambda: triton_reshape_and_cache_flash( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + if benchmark_mode == "cudagraph": + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + function_under_test() + torch.cuda.synchronize() + function_under_test = lambda: g.replay() + def run_cuda_benchmark(n_iters: int) -> float: nonlocal key, value, key_cache, value_cache, slot_mapping torch.cuda.synchronize() start = time.perf_counter() for _ in range(n_iters): - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - slot_mapping, - kv_cache_dtype, - k_scale, - v_scale, - ) - torch.cuda.synchronize() + function_under_test() + torch.cuda.synchronize() end = time.perf_counter() return (end - start) / n_iters @@ -116,10 +151,16 @@ def main(args): kv_cache_dtype=args.kv_cache_dtype, kv_cache_layout=layout, num_iters=args.iters, + implementation=args.implementation, + benchmark_mode=args.mode, device="cuda", ) rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) + print( + f"Benchmark results for implementation {args.implementation}" + f" (measuring with {args.mode}):" + ) print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) @@ -151,6 +192,21 @@ if __name__ == "__main__": ) parser.add_argument("--iters", type=int, default=100) + + parser.add_argument( + "--implementation", + type=str, + choices=["cuda", "triton"], + default="cuda", + ) + + parser.add_argument( + "--mode", + type=str, + choices=["cudagraph", "no_graph"], + default="cudagraph", + ) + args = parser.parse_args() main(args) diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 69e96dfd2cb13..1325e6883132a 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -39,6 +39,8 @@ CUDA_DEVICES = [ # We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] +RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"] + @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @pytest.mark.parametrize("num_layers", NUM_LAYERS) @@ -223,6 +225,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("kv_cache_layout", CACHE_LAYOUTS) +@pytest.mark.parametrize("implementation", RESHAPE_FLASH_IMPLEMENTATIONS) @torch.inference_mode() def test_reshape_and_cache_flash( kv_cache_factory_flashinfer, @@ -236,9 +239,13 @@ def test_reshape_and_cache_flash( device: str, kv_cache_dtype: str, kv_cache_layout: str, + implementation: str, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) + assert implementation in ["cuda", "triton"] + if implementation == "triton" and kv_cache_layout == "HND": + pytest.skip("Triton implementation only supports NHD layout.") # fp8 conversion requires continugous memory buffer. Reduce the number of # blocks and tokens to consume less memory. 
@@ -298,12 +305,20 @@ def test_reshape_and_cache_flash( cloned_key_cache = key_cache_compact.clone() cloned_value_cache = value_cache_compact.clone() # Call the reshape_and_cache kernel. - opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, - (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, - k_scale, v_scale), - cond=(head_size == HEAD_SIZES[0])) - ops.reshape_and_cache_flash(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype, k_scale, v_scale) + if implementation == "cuda": + opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, + (key, value, key_cache, value_cache, slot_mapping, + kv_cache_dtype, k_scale, v_scale), + cond=(head_size == HEAD_SIZES[0])) + ops.reshape_and_cache_flash(key, value, key_cache, value_cache, + slot_mapping, kv_cache_dtype, k_scale, + v_scale) + elif implementation == "triton": + from vllm.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash) + triton_reshape_and_cache_flash(key, value, key_cache, value_cache, + slot_mapping, kv_cache_dtype, k_scale, + v_scale) key_cache_compact = permute_and_compact(key_cache) value_cache_compact = permute_and_compact(value_cache) diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/attention/ops/triton_reshape_and_cache_flash.py new file mode 100644 index 0000000000000..0b0c706626af3 --- /dev/null +++ b/vllm/attention/ops/triton_reshape_and_cache_flash.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import triton +import triton.language as tl + +from vllm.platforms import current_platform + + +@triton.jit +def reshape_and_cache_kernel_flash( + key_ptr, # [num_tokens, num_heads, head_size] + value_ptr, # [num_tokens, num_heads, head_size] + key_cache_ptr, # [num_blocks, block_size, num_heads, head_size] + value_cache_ptr, # [num_blocks, block_size, num_heads, head_size] + slot_mapping_ptr, # [num_tokens] + k_scale, # float32 + v_scale, # float32 + # strides + key_stride: tl.int64, + value_stride: tl.int64, + block_stride: tl.int64, + page_stride: tl.int64, + num_heads: tl.constexpr, + head_size: tl.constexpr, + block_size: tl.constexpr, + # FP8 flags + FP8_KV_CACHE: tl.constexpr, + # tune parameters + TILE_SIZE: tl.constexpr, +): + + token_idx = tl.program_id(axis=0) + slot_idx = tl.load(slot_mapping_ptr + token_idx).to(tl.int64) + if slot_idx < 0: + # Padding token that should be ignored. 
+ return + + tile_i = tl.program_id(axis=1) + tile_offs = tl.arange(0, TILE_SIZE) + tile_pos = tile_i * TILE_SIZE + tile_offs + + block_idx = slot_idx // block_size + block_offset = slot_idx % block_size + + src_key_idx = token_idx * key_stride + src_value_idx = token_idx * value_stride + + tgt_idx = block_idx * block_stride + block_offset * page_stride + + # [TILE_SIZE] + key_load = tl.load(key_ptr + src_key_idx + tile_pos, + mask=tile_pos < (num_heads * head_size)) + if FP8_KV_CACHE: + if key_load.dtype.is_fp8(): + key_tile = key_load + else: + # tl.store will do the correct implicit cast to fp8, + # based on the key_cache_ptr.dtype.element_ty + key_tile = key_load / tl.load(k_scale) + else: + key_tile = key_load + + # [TILE_SIZE] + value_load = tl.load(value_ptr + src_value_idx + tile_pos, + mask=tile_pos < (num_heads * head_size)) + if FP8_KV_CACHE: + if value_load.dtype.is_fp8(): + value_tile = value_load + else: + # tl.store will do the correct implicit cast to fp8, + # based on the value_cache_ptr.dtype.element_ty + value_tile = value_load / tl.load(v_scale) + else: + value_tile = value_load + + tl.store( + key_cache_ptr + tgt_idx + tile_pos, + key_tile, + mask=tile_pos < (num_heads * head_size), + ) + tl.store( + value_cache_ptr + tgt_idx + tile_pos, + value_tile, + mask=tile_pos < (num_heads * head_size), + ) + return + + +def triton_reshape_and_cache_flash( + key: torch.Tensor, # [num_tokens, num_heads, head_size] + value: torch.Tensor, # [num_tokens, num_heads, head_size] + # [num_blocks, block_size, num_heads, head_size] + key_cache: torch.Tensor, + # [num_blocks, block_size, num_heads, head_size] + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, # [num_tokens] + kv_cache_dtype: str, # "auto", "fp8" + k_scale: torch.Tensor, # float32 + v_scale: torch.Tensor, # float32 +): + num_tokens = key.shape[0] + num_heads = key.shape[1] + head_size = key.shape[2] + block_size = key_cache.shape[1] + n = num_heads * head_size + + key_stride = key.stride()[0] + value_stride = value.stride()[0] + block_stride = key_cache.stride()[0] + page_stride = key_cache.stride()[1] + + head_stride = key_cache.stride()[2] + assert head_stride == head_size, "only continous heads are supported" + + assert kv_cache_dtype == "auto" or kv_cache_dtype.startswith("fp8"), \ + f"unsupported kv_cache_dtype (str), got {kv_cache_dtype}." + kv_cache_torch_dtype = current_platform.fp8_dtype() if \ + kv_cache_dtype.startswith("fp8") else key_cache.dtype + + if key_cache.dtype != kv_cache_torch_dtype and kv_cache_dtype.startswith( + "fp8"): + # to avoid erounous implicit cast in triton kernel (tl.store to uint8) + # (e.g. explicit cast to fp8e4m3fnuz is not supported in triton 3.4) + key_cache = key_cache.view(kv_cache_torch_dtype) + value_cache = value_cache.view(kv_cache_torch_dtype) + assert kv_cache_dtype != torch.uint8, "explicit fp8 cast and store to "\ + "uint8 is not supported by triton reshape_and_cache_flash" + + FP8_KV_CACHE = kv_cache_dtype.startswith("fp8") + assert (not FP8_KV_CACHE) or kv_cache_torch_dtype in [ + torch.float8_e4m3fn, torch.float8_e5m2, torch.uint8, + torch.float8_e4m3fnuz], \ + "unsupported dtype of KV cache tensor, got "\ + "{kv_cache_torch_dtype}. Supported kv cache dtypes: fp8e4m3fn, " \ + "fp8e5m2, uint8, bfloat16, float16, float32, fp8e4m3fnuz." 
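In plain PyTorch, the scatter that both the CUDA and Triton paths implement for the NHD layout can be sketched as follows (illustrative shapes, "auto" cache dtype so no fp8 scaling; a reference for understanding, not the production path):

import torch

num_tokens, num_heads, head_size = 6, 2, 4
num_blocks, block_size = 3, 4

key = torch.randn(num_tokens, num_heads, head_size)
value = torch.randn(num_tokens, num_heads, head_size)
key_cache = torch.zeros(num_blocks, block_size, num_heads, head_size)
value_cache = torch.zeros_like(key_cache)
slot_mapping = torch.tensor([0, 1, 5, 6, -1, 9])  # -1 marks a padding token

for i, slot in enumerate(slot_mapping.tolist()):
    if slot < 0:
        continue  # padding tokens are skipped, as in the kernels above
    block_idx, block_offset = divmod(slot, block_size)
    key_cache[block_idx, block_offset] = key[i]
    value_cache[block_idx, block_offset] = value[i]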
+ + # heuristics instead of autotuning + TILE_SIZE = min(2048, triton.next_power_of_2(n)) + if torch.version.hip: + num_stages = 4 + num_warps = 8 + else: # cuda + num_stages = 10 + num_warps = 16 + if torch.cuda.get_device_capability(key.device)[0] < 9: + TILE_SIZE = min(512, TILE_SIZE) + + # TODO(ngl): maybe replace with static launch grid to avoid overhead if + # using cudagraphs + grid = lambda meta: (int(num_tokens), triton.cdiv(n, meta["TILE_SIZE"])) + + reshape_and_cache_kernel_flash[grid]( + key_ptr=key, + value_ptr=value, + key_cache_ptr=key_cache, + value_cache_ptr=value_cache, + slot_mapping_ptr=slot_mapping, + k_scale=k_scale, + v_scale=v_scale, + # strides + key_stride=key_stride, + value_stride=value_stride, + block_stride=block_stride, + page_stride=page_stride, + num_heads=num_heads, + head_size=head_size, + block_size=block_size, + # FP8 flags + FP8_KV_CACHE=FP8_KV_CACHE, + # autotune parameters + TILE_SIZE=TILE_SIZE, + num_warps=num_warps, + num_stages=num_stages, + ) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 722c23f150cd3..f9fbd05efc67d 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -8,6 +8,8 @@ import torch from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) +from vllm.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash) from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.config import VllmConfig from vllm.logger import init_logger @@ -291,7 +293,13 @@ class TritonAttentionImpl(AttentionImpl): if self.kv_sharing_target_layer_name is None: # Reshape the input keys and values and store them in the cache. # Skip this if sharing KV cache with an earlier attention layer. - ops.reshape_and_cache_flash( + if self.kv_cache_dtype.startswith("fp8"): + key_cache = key_cache.view(self.fp8_dtype) + value_cache = value_cache.view(self.fp8_dtype) + # triton kernel does not support uint8 kv_cache + # (because some explicit casts (e.g. float8_e4m3fnuz) + # are not supported) + triton_reshape_and_cache_flash( key, value, key_cache, @@ -303,8 +311,9 @@ class TritonAttentionImpl(AttentionImpl): ) if self.kv_cache_dtype.startswith("fp8"): - key_cache = key_cache.view(self.fp8_dtype) - value_cache = value_cache.view(self.fp8_dtype) + if key_cache.dtype != self.fp8_dtype: + key_cache = key_cache.view(self.fp8_dtype) + value_cache = value_cache.view(self.fp8_dtype) num_tokens, num_heads, head_size = query.shape assert layer._q_scale_float == 1.0, \ "A non 1.0 q_scale is not currently supported." From 24e822274562790a8e7cd7fde2972be719eb38cb Mon Sep 17 00:00:00 2001 From: Weida Hong Date: Wed, 24 Sep 2025 01:34:58 +0800 Subject: [PATCH 148/185] [Misc] Reduce initialization time of auto_tune (#23682) Signed-off-by: Weida Hong --- benchmarks/auto_tune/auto_tune.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index ed3679b66f805..b333ba9cd8e99 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -103,10 +103,15 @@ start_server() { VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & fi + local server_pid=$! # wait for 10 minutes... 
server_started=0 for i in {1..60}; do + # This line checks whether the server is still alive or not, + # since that we should always have permission to send signal to the server process. + kill -0 $server_pid 2> /dev/null || break + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then @@ -118,7 +123,7 @@ start_server() { done if (( ! server_started )); then - echo "server did not start within 10 minutes. Please check server log at $vllm_log". + echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log". return 1 else return 0 From 867ecdd1c87ba17cecf8a97b262e1a8117795e94 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 23 Sep 2025 13:46:40 -0400 Subject: [PATCH 149/185] [Spec Decode][CI] Add e2e test for `examples/spec_decode.py` and prevent breaking Acceptance Length (#24531) Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Co-authored-by: Roger Wang --- .buildkite/test-pipeline.yaml | 2 ++ examples/offline_inference/spec_decode.py | 42 ++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c4ea4b675649c..49316eb4f6073 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -329,6 +329,8 @@ steps: - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 004e75b204642..ce078bce0b753 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -49,6 +49,7 @@ def get_custom_mm_prompts(num_prompts): def parse_args(): parser = FlexibleArgumentParser() add_dataset_parser(parser) + parser.add_argument("--test", action="store_true") parser.add_argument( "--method", type=str, @@ -60,6 +61,7 @@ def parse_args(): parser.add_argument("--tp", type=int, default=1) parser.add_argument("--enforce-eager", action="store_true") parser.add_argument("--enable-chunked-prefill", action="store_true") + parser.add_argument("--max-model-len", type=int, default=16384) parser.add_argument("--temp", type=float, default=0) parser.add_argument("--top-p", type=float, default=1.0) parser.add_argument("--top-k", type=int, default=-1) @@ -71,8 +73,7 @@ def parse_args(): return parser.parse_args() -def main(): - args = parse_args() +def main(args): args.endpoint_type = "openai-chat" model_dir = args.model_dir @@ -134,7 +135,7 @@ def main(): gpu_memory_utilization=0.8, speculative_config=speculative_config, disable_log_stats=False, - max_model_len=16384, + max_model_len=args.max_model_len, limit_mm_per_prompt={"image": 5}, disable_chunked_mm_input=True, ) @@ -198,6 +199,39 @@ def main(): acceptance_rate = 
acceptance_counts[i] / num_drafts if num_drafts > 0 else 0 print(f"acceptance at token {i}: {acceptance_rate:.2f}") + return acceptance_length + if __name__ == "__main__": - main() + args = parse_args() + acceptance_length = main(args) + + if args.test: + # takes ~30s to run on 1xH100 + assert args.method in ["eagle", "eagle3"] + assert args.tp == 1 + assert args.num_spec_tokens == 3 + assert args.dataset_name == "hf" + assert args.dataset_path == "philschmid/mt-bench" + assert args.num_prompts == 80 + assert args.temp == 0 + assert args.top_p == 1.0 + assert args.top_k == -1 + assert args.enable_chunked_prefill + + # check acceptance length is within 2% of expected value + rtol = 0.02 + expected_acceptance_length = 2.296 if args.method == "eagle" else 2.811 + + assert ( + acceptance_length <= (1 + rtol) * expected_acceptance_length + and acceptance_length >= (1 - rtol) * expected_acceptance_length + ), ( + f"acceptance_length {acceptance_length} is not " + f"within {rtol * 100}% of {expected_acceptance_length}" + ) + + print( + f"Test passed! Expected AL: " + f"{expected_acceptance_length}, got {acceptance_length}" + ) From 5abb1179012268a587043dbc1e5afccb4e41c3a5 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 24 Sep 2025 02:19:25 +0800 Subject: [PATCH 150/185] [Core] Ensure LoRA linear respect the base_layer's tp_size and tp_rank (#25487) Signed-off-by: Jee Jee Li --- vllm/lora/layers/base_linear.py | 5 +-- vllm/lora/layers/column_parallel_linear.py | 40 ++++++++-------------- vllm/lora/layers/replicated_linear.py | 1 - vllm/lora/layers/row_parallel_linear.py | 15 +++----- vllm/lora/lora_weights.py | 4 +-- 5 files changed, 24 insertions(+), 41 deletions(-) diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index 6cf5815ef12da..ed294b0aedaf4 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -24,11 +24,12 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): super().__init__() self.base_layer = base_layer self.input_size = self.base_layer.input_size + # Ensure tp_size and tp_rank consistency with the base_layer. + self.tp_size = self.base_layer.tp_size + self.tp_rank = self.base_layer.tp_rank self.device = _get_lora_device(self.base_layer) self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None - self.output_slices: tuple[int, ...] - self.tp_size: int self.output_size: int self.n_slices: int diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index fa4eb272a69fe..6284576446c8f 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -8,9 +8,7 @@ import torch.nn as nn from transformers import PretrainedConfig from vllm.config.lora import LoRAConfig -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather) +from vllm.distributed import tensor_model_parallel_all_gather from vllm.distributed.utils import divide from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -85,7 +83,6 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): # inconsistent when TP is greater than 1. 
self.is_merged_col_linear = type( base_layer) is MergedColumnParallelLinear - self.tp_size = get_tensor_model_parallel_world_size() self.output_size = self.base_layer.output_size_per_partition # There is only one LoRA layer self.n_slices = 1 @@ -97,22 +94,20 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): # Applicable to cases where the base_layer is # MergedColumnParallelLinear. if self.is_merged_col_linear: - tp_rank = get_tensor_model_parallel_rank() shard_size = self.output_size // 2 offset = lora_b.shape[0] // 2 - left_weight = lora_b[tp_rank * shard_size:(tp_rank + 1) * + left_weight = lora_b[self.tp_rank * shard_size:(self.tp_rank + 1) * shard_size, :] - right_weight = lora_b[offset + tp_rank * shard_size:offset + - (tp_rank + 1) * shard_size, :] + right_weight = lora_b[offset + self.tp_rank * shard_size:offset + + (self.tp_rank + 1) * shard_size, :] lora_b = torch.cat([left_weight, right_weight], dim=0) # Applicable to cases where the base_layer is # ColumnParallelLinear. else: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() shard_size = self.output_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size lora_b = lora_b[start_idx:end_idx, :] return lora_b @@ -120,10 +115,9 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): # TODO: Fix the slicing logic of bias. if bias is None: return bias - tensor_model_parallel_rank = get_tensor_model_parallel_rank() shard_size = self.output_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size bias = bias[start_idx:end_idx] return bias @@ -144,7 +138,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): # Matrix multiply. output_parallel = self.apply(input_, bias) - if self.base_layer.gather_output: + if self.base_layer.gather_output and self.tp_size > 1: # All-gather across the partitions. output = tensor_model_parallel_all_gather(output_parallel) else: @@ -185,8 +179,6 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): QKVParallelLinear]) -> None: super().__init__(base_layer) # There are two LoRA layers - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() # the output_sizes in MergedColumnParallelLinear is not sharded by tp # we need to divide it by the tp_size to get correct slices size output_sizes = self.base_layer.output_sizes @@ -341,9 +333,9 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): self.n_slices = 1 def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - tp_rank = get_tensor_model_parallel_rank() - self.q_shard_id = tp_rank - self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas + + self.q_shard_id = self.tp_rank + self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas lora_b_q = lora_b[self.q_proj_shard_size * self.q_shard_id:self.q_proj_shard_size * (self.q_shard_id + 1), :] @@ -397,8 +389,6 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): super().__init__(base_layer) # There are three LoRA layer. 
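The shard bookkeeping in QKVParallelLinearWithLoRA.slice_lora_b above reduces to simple index arithmetic; for example, with a hypothetical tp_size of 8 and num_kv_head_replicas of 2:

num_kv_head_replicas = 2   # each KV head group is shared by 2 TP ranks

shards = [(tp_rank, tp_rank // num_kv_head_replicas) for tp_rank in range(8)]
# (q_shard_id, kv_shard_id) per rank:
assert shards == [(0, 0), (1, 0), (2, 1), (3, 1),
                  (4, 2), (5, 2), (6, 3), (7, 3)]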
self.n_slices = len(self.base_layer.output_sizes) - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() self.q_proj_shard_size = (self.base_layer.num_heads * self.base_layer.head_size) @@ -461,9 +451,8 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): # Therefore, the sharding of `lora_a` only needs to correspond with the # gather operation. def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tp_rank = get_tensor_model_parallel_rank() shard_size = self.lora_a_stacked[0].shape[2] - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size lora_a = lora_a[start_idx:start_idx + shard_size, :] return lora_a @@ -547,9 +536,8 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): """ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tp_rank = get_tensor_model_parallel_rank() shard_size = self.lora_a_stacked[0].shape[2] - start_idx = tp_rank * shard_size + start_idx = self.tp_rank * shard_size lora_a = lora_a[start_idx:start_idx + shard_size, :] return lora_a diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py index 3356297c1537a..18a8f13ed9427 100644 --- a/vllm/lora/layers/replicated_linear.py +++ b/vllm/lora/layers/replicated_linear.py @@ -18,7 +18,6 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): def __init__(self, base_layer: ReplicatedLinear) -> None: super().__init__(base_layer, ) # To ensure interface compatibility, set to 1 always. - self.tp_size = 1 self.output_size = self.base_layer.output_size self.n_slices = 1 diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index cac2c92136dca..d468655e629ae 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -8,9 +8,7 @@ import torch.nn as nn from transformers import PretrainedConfig from vllm.config.lora import LoRAConfig -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, +from vllm.distributed import (split_tensor_along_last_dim, tensor_model_parallel_all_reduce) # yapf: disable from vllm.model_executor.layers.linear import RowParallelLinear @@ -25,12 +23,9 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): def __init__(self, base_layer: RowParallelLinear) -> None: super().__init__(base_layer) - self.tp_size = get_tensor_model_parallel_world_size() # reset input_size self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size - - self.tp_rank = get_tensor_model_parallel_rank() # There is only one LoRA layer. self.n_slices = 1 @@ -68,12 +63,12 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): else: # TODO: simplify code below splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.base_layer.tp_size) + input_, num_partitions=self.tp_size) input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. 
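The input_parallel computed a few lines above is just this rank's slice of the last dimension; roughly, with illustrative shapes and plain torch.chunk standing in for split_tensor_along_last_dim:

import torch

tp_size, tp_rank = 2, 0
x = torch.randn(3, 8)                      # [num_tokens, input_size]
chunks = torch.chunk(x, tp_size, dim=-1)   # approximates split_tensor_along_last_dim
input_parallel = chunks[tp_rank].contiguous()
assert input_parallel.shape == (3, 4)      # this rank's input_size_per_partition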
output_parallel = self.apply(input_parallel) - if self.base_layer.reduce_results and self.base_layer.tp_size > 1: + if self.base_layer.reduce_results and self.tp_size > 1: output_ = tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel @@ -154,8 +149,8 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): buffer, x, self.lora_a_stacked, 1.0) if not current_platform.can_update_inplace(): buffer = shrunk_buffer - - buffer = tensor_model_parallel_all_reduce(buffer) + if self.tp_size>1: + buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce # by adding the column partitioned lora output to a slice of output diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index e3198fb3d3ae4..90e18217d28be 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -48,11 +48,11 @@ class LoRALayerWeights: @property def input_dim(self) -> int: - return self.lora_a.shape[0] + return self.lora_a.shape[1] @property def output_dim(self) -> int: - return self.lora_b.shape[1] + return self.lora_b.shape[0] @property def is_packed(self) -> bool: From a3a7828010f4ab8e608fa257d09ab31846e83a9a Mon Sep 17 00:00:00 2001 From: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:31:45 -0700 Subject: [PATCH 151/185] [ROCm] Add skinny gemm bias support for dtypes fp16,bf16,fp8 (#24988) Signed-off-by: Hashem Hashemi Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> --- csrc/rocm/ops.h | 9 +- csrc/rocm/skinny_gemms.cu | 181 ++++++++++++++---- csrc/rocm/torch_bindings.cpp | 5 +- .../quantization/test_rocm_skinny_gemms.py | 80 ++++++-- vllm/_custom_ops.py | 19 +- .../layers/quantization/utils/w8a8_utils.py | 8 +- vllm/model_executor/layers/utils.py | 6 +- 7 files changed, 231 insertions(+), 77 deletions(-) diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index b6ee2656746c1..edf7aff1abaac 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -5,11 +5,14 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, const int64_t rows_per_block); -torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, +torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, + const c10::optional& in_bias, const int64_t CuCount); -void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, - at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount); +void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b, + const c10::optional& in_bias, at::Tensor& out_c, + const at::Tensor& scale_a, const at::Tensor& scale_b, + const int64_t CuCount); void paged_attention( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index eb47139208c91..52119d52f6d1e 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -292,8 +292,9 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, template __global__ void __launch_bounds__(WvPrGrp* THRDS) - wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, - const scalar_t* __restrict__ A, scalar_t* C, + wvSplitK_hf_sml_(const int K, const int M, const int Bx, const int By, + const scalar_t* B, const scalar_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const int _WvPrGrp, const int CuCount) { constexpr int max_lds_len = LDS_SIZE / 2; #if defined(__HIP__MI3XX__) @@ -484,7 +485,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) if (threadIdx.x == 
63) { for (int n = 0; n < N; n++) { for (int i = 0; i < YTILE; i++) { - // if (commitColumn[i]) C[m + i + n * M] = __float2half(sum[n][i]); + if constexpr (std::is_same_v) { + if (BIAS) + sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]); + } else if constexpr (std::is_same_v) { + if (BIAS) + sum[n][i] += + __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]); + } C[m + i + n * M] = __float2s(sum[n][i]); } } @@ -529,7 +537,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) if (threadIdx.x == 63) { for (int n = 0; n < N; n++) { for (int i = 0; i < YTILE; i++) { - // if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]); + if (BIAS) + sum4[n][i][0] += + __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]); C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]); } } @@ -541,8 +551,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #else // !defined(__HIP__GFX9__) TODO: Add NAVI support template -__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, - const scalar_t* __restrict__ A, scalar_t* C, +__global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx, + const int By, const scalar_t* B, + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } @@ -553,8 +565,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, template __global__ void __launch_bounds__(WvPrGrp* THRDS) - wvSplitK_hf_(const int K, const int M, const scalar_t* B, - const scalar_t* __restrict__ A, scalar_t* C, + wvSplitK_hf_(const int K, const int M, const int Bx, const int By, + const scalar_t* B, const scalar_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const int _WvPrGrp, const int CuCount) { constexpr int max_lds_len = LDS_SIZE / 2; #if defined(__HIP__MI3XX__) @@ -772,8 +785,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) if (threadIdx.x == 63) { for (int n = 0; n < N; n++) { for (int i = 0; i < YTILE; i++) { - if (commitColumn[i]) + if (commitColumn[i]) { + if constexpr (std::is_same_v) { + if (BIAS) + sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]); + } else if constexpr (std::is_same_v) { + if (BIAS) + sum[n][i] += + __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]); + } C[m + i + n * M] = __float2s(sum[n][i]); + } } } } @@ -818,8 +840,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) if (threadIdx.x == 63) { for (int n = 0; n < N; n++) { for (int i = 0; i < YTILE; i++) { - // if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]); - C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]); + if (commitColumn[i]) { + if (BIAS) + sum4[n][i][0] += + __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]); + C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]); + } } } } @@ -842,8 +868,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #else // !defined(__HIP__GFX9__) TODO: Add NAVI support template -__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, - const scalar_t* __restrict__ A, scalar_t* C, +__global__ void wvSplitK_hf_(const int K, const int M, const int Bx, + const int By, const scalar_t* B, + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } @@ -854,8 +882,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, template __global__ void __launch_bounds__(WvPrGrp* THRDS) - wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, - const scalar_t* 
__restrict__ A, scalar_t* C, + wvSplitK_hf_big_(const int K, const int M, const int Bx, const int By, + const scalar_t* B, const scalar_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const int _WvPrGrp, const int CuCount) { constexpr int max_lds_len = LDS_SIZE / 2; #if defined(__HIP__MI3XX__) @@ -1124,8 +1153,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) if (threadIdx.x == 63) { for (int n = 0; n < N; n++) { for (int i = 0; i < YTILE; i++) { - if (commitColumn[i]) + if (commitColumn[i]) { + if constexpr (std::is_same_v) { + if (BIAS) + sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]); + } else if constexpr (std::is_same_v) { + if (BIAS) + sum[n][i] += + __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]); + } C[m + i + n * M] = __float2s(sum[n][i]); + } } } } @@ -1166,8 +1204,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) if (threadIdx.x == 63) { for (int n = 0; n < N; n++) { for (int i = 0; i < YTILE; i++) { - // if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]); - C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]); + if (commitColumn[i]) { + if (BIAS) + sum4[n][i][0] += + __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]); + C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]); + } } } } @@ -1190,8 +1232,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #else // !defined(__HIP__GFX9__) TODO: Add NAVI support template -__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, - const scalar_t* __restrict__ A, scalar_t* C, +__global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx, + const int By, const scalar_t* B, + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } @@ -1226,11 +1270,20 @@ int mindiv(int N, int div1, int div2) { return rtn; } -torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, +torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, + const c10::optional& in_bias, const int64_t CuCount) { auto M_in = in_a.size(0); auto K_in = in_a.size(1); auto N_in = in_b.size(0); + auto Bx_in = + (in_bias.has_value() && in_bias->numel() > 0) + ? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0) + : 1; + auto By_in = (in_bias.has_value() && in_bias->numel() > 0 && + in_bias->sizes().size() == 2) + ? 
in_bias->size(0) + : 1; TORCH_CHECK(in_a.dtype() == in_b.dtype()); TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0"); @@ -1254,18 +1307,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ wvSplitK_hf_sml_ \ - <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ - CuCount); \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ } else if (K_in * N_in <= max_lds_len * 1.2) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ wvSplitK_hf_ \ - <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ - CuCount); \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ } else { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \ wvSplitK_hf_big_ \ - <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ - CuCount); \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ } \ } @@ -1273,6 +1326,10 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, using fptype = typename scalar::type; fptype* af4 = reinterpret_cast(in_a.data_ptr()); const fptype* bf4 = reinterpret_cast(in_b.data_ptr()); + const fptype* biasf4 = + (in_bias.has_value() && in_bias->numel() > 0) + ? reinterpret_cast(in_bias->data_ptr()) + : nullptr; fptype* c = reinterpret_cast(out_c.data_ptr()); switch (N_in) { case 1: @@ -1300,8 +1357,9 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, template __global__ void __launch_bounds__(WvPrGrp* THRDS) - wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const fp8_t* B, - const fp8_t* __restrict__ A, scalar_t* C, + wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int Bx, + const int By, const fp8_t* B, const fp8_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, const int CuCount) { @@ -1453,7 +1511,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) if (threadIdx.x == 0) { for (int n = 0; n < N; n++) { for (int y = 0; y < YTILE; y++) { - C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + if (y + m >= M) break; // To avoid mem access fault. 
+ sum[n][y][0] *= sA * sB; + if constexpr (std::is_same_v) { + if (BIAS) + sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]); + } else if constexpr (std::is_same_v) { + if (BIAS) + sum[n][y][0] += + __bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]); + } + C[m + y + n * M] = __float2s(sum[n][y][0]); // * sA * sB); } } } @@ -1465,7 +1533,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) template __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, - const fp8_t* B, const fp8_t* __restrict__ A, + const int Bx, const int By, const fp8_t* B, + const fp8_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, const int CuCount) { @@ -1477,8 +1547,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, template __global__ void __launch_bounds__(WvPrGrp* THRDS) - wvSplitKQ_hf_(const int K, const int Kp, const int M, const fp8_t* B, - const fp8_t* __restrict__ A, scalar_t* C, + wvSplitKQ_hf_(const int K, const int Kp, const int M, const int Bx, + const int By, const fp8_t* B, const fp8_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, const int CuCount) { constexpr int max_lds_len = LDS_SIZE; @@ -1626,7 +1697,16 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) for (int n = 0; n < N; n++) { for (int y = 0; y < YTILE; y++) { if (y + m >= M) break; // To avoid mem access fault. - C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + sum[n][y][0] *= sA * sB; + if constexpr (std::is_same_v) { + if (BIAS) + sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]); + } else if constexpr (std::is_same_v) { + if (BIAS) + sum[n][y][0] += + __bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]); + } + C[m + y + n * M] = __float2s(sum[n][y][0]); } } } @@ -1638,16 +1718,19 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) template __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, - const fp8_t* B, const fp8_t* __restrict__ A, - scalar_t* C, const float* __restrict__ s_A, + const int Bx, const int By, const fp8_t* B, + const fp8_t* __restrict__ A, + const scalar_t* __restrict__ BIAS, scalar_t* C, + const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } #endif // defined(__HIP__MI3XX__) TODO: Add NAVI support -void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, - at::Tensor& scale_a, at::Tensor& scale_b, +void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b, + const c10::optional& in_bias, at::Tensor& out_c, + const at::Tensor& scale_a, const at::Tensor& scale_b, const int64_t CuCount) { static c10::ScalarType kFp8Type = is_fp8_ocp() ? c10::ScalarType::Float8_e4m3fn @@ -1656,6 +1739,15 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, auto K_in = in_a.size(1); auto N_in = in_b.size(0); auto Kp_in = in_a.stride(0); + auto Bx_in = + (in_bias.has_value() && in_bias->numel() > 0) + ? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0) + : 1; + auto By_in = (in_bias.has_value() && in_bias->numel() > 0 && + in_bias->sizes().size() == 2) + ? 
in_bias->size(0) + : 1; + TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0"); TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type); TORCH_CHECK(out_c.dtype() == torch::kFloat16 || @@ -1673,13 +1765,15 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ wvSplitKQ_hf_sml_ \ - <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ - s_a, s_b, __wvPrGrp, CuCount); \ + <<>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \ + b_ptr, bias_ptr, c_ptr, s_a, s_b, \ + __wvPrGrp, CuCount); \ } else { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ wvSplitKQ_hf_ \ - <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ - s_a, s_b, __wvPrGrp, CuCount); \ + <<>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \ + b_ptr, bias_ptr, c_ptr, s_a, s_b, \ + __wvPrGrp, CuCount); \ } \ } @@ -1691,6 +1785,9 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] { auto a_ptr = in_a.data_ptr(); auto b_ptr = in_b.data_ptr(); + auto bias_ptr = (in_bias.has_value() && in_bias->numel() > 0) + ? reinterpret_cast(in_bias->data_ptr()) + : nullptr; switch (N_in) { case 1: WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1) diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index c0c4daef64f05..518486b1ca5de 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -22,13 +22,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { // Custom gemm op for skinny matrix-matrix multiplication rocm_ops.def( - "wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> " + "wvSplitK(Tensor in_a, Tensor in_b, Tensor? in_bias, int CuCount) -> " "Tensor"); rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK); // wvSplitK for fp8 rocm_ops.def( - "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! out_c, Tensor scale_a, " + "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor? in_bias, Tensor! out_c, " + "Tensor scale_a, " " Tensor scale_b, int CuCount) -> ()"); rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ); diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index a9b1c71ef0718..6de5fc9c56010 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math + import pytest import torch import vllm._custom_ops as ops from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - rocm_per_tensor_w8a8_scaled_mm_impl) from vllm.platforms import current_platform DTYPES = [torch.bfloat16, torch.float16] @@ -49,6 +49,7 @@ NKM_FACTORS_WVSPLITK_FP8 = [ (2, 512, 512), (3, 2048, 2048), (4, 4096, 4096), + (4, 16400, 2048), # Extended FP8 dimensions not covered by WVSPLITK (1, 14336, 1024), (2, 24576, 2048), @@ -67,6 +68,9 @@ SEEDS = [0] @torch.inference_mode() def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed): torch.manual_seed(seed) + #TODO: Zero-centering the inputs causes errors for LLMM1! + # Without that the numbers quickly saturate, and may + # be giving false matches. 
A = torch.rand(n, k, dtype=dtype, device="cuda") B = torch.rand(m, k, dtype=dtype, device="cuda") @@ -85,11 +89,51 @@ def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): torch.manual_seed(seed) cu_count = current_platform.get_cu_count() - A = torch.rand(n, k, dtype=dtype, device="cuda") - B = torch.rand(m, k, dtype=dtype, device="cuda") + A = torch.rand(n, k, dtype=dtype, device="cuda") - .5 + B = torch.rand(m, k, dtype=dtype, device="cuda") - .5 - ref_out = torch.matmul(A, B.t()) - out = ops.wvSplitK(B, A, cu_count) + ref_out = torch.nn.functional.linear(A, B) + out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_bias1D_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + cu_count = current_platform.get_cu_count() + + xavier = math.sqrt(2 / k) # normalize to avoid large output-bias deltas + A = (torch.rand(n, k, dtype=dtype, device="cuda") - .5) * xavier + B = (torch.rand(m, k, dtype=dtype, device="cuda") - .5) * xavier + BIAS = torch.rand(m, dtype=dtype, device="cuda") - .5 + + ref_out = torch.nn.functional.linear(A, B, BIAS) + out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + cu_count = current_platform.get_cu_count() + + xavier = math.sqrt(2 / k) # normalize to avoid large output-bias deltas + A = (torch.rand(n, k, dtype=dtype, device="cuda") - .5) * xavier + B = (torch.rand(m, k, dtype=dtype, device="cuda") - .5) * xavier + BIAS = torch.rand(n, m, dtype=dtype, device="cuda") - .5 + + ref_out = torch.nn.functional.linear(A, B, BIAS) + out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS) assert torch.allclose(out, ref_out, rtol=0.01) @@ -103,8 +147,8 @@ def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): torch.manual_seed(seed) - A = torch.rand(n, k, device="cuda") - B = torch.rand(m, k, device="cuda") + A = torch.rand(n, k, device="cuda") - 0.5 + B = torch.rand(m, k, device="cuda") - 0.5 A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) @@ -123,27 +167,27 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.skipif( not (current_platform.is_rocm() and current_platform.supports_fp8()), reason="only test for rocm fp8") -def test_rocm_per_tensor_w8a8_scaled_mm_impl(n, k, m, dtype, seed, use_bias): +def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed): torch.manual_seed(seed) - A = torch.rand(n, k, device="cuda") - B = torch.rand(m, k, device="cuda") + xavier = math.sqrt(2 / k) # normalize to avoid large output-bias deltas + A = (torch.rand(n, k, device="cuda") - .5) * xavier + B = (torch.rand(m, k, device="cuda") - .5) * xavier + BIAS = torch.rand(m, dtype=dtype, 
device="cuda") - .5 A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) - bias = torch.rand(1, m, dtype=dtype, device="cuda") if use_bias else None - - output = rocm_per_tensor_w8a8_scaled_mm_impl(A, B.t(), dtype, scale_a, - scale_b, bias) ref_out = torch._scaled_mm(A, B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b, - bias=bias) - assert torch.allclose(output, ref_out, rtol=0.01) + bias=BIAS) + out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b, + current_platform.get_cu_count(), BIAS) + + assert torch.allclose(out, ref_out, rtol=0.01) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 712295aa92886..a108542e14368 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1447,17 +1447,24 @@ def LLMM1(a: torch.Tensor, b: torch.Tensor, return torch.ops._rocm_C.LLMM1(a, b, rows_per_block) -def wvSplitK(a: torch.Tensor, b: torch.Tensor, cu_count: int) -> torch.Tensor: - return torch.ops._rocm_C.wvSplitK(a, b, cu_count) +def wvSplitK(a: torch.Tensor, + b: torch.Tensor, + cu_count: int, + bias: torch.Tensor = None) -> torch.Tensor: + return torch.ops._rocm_C.wvSplitK(a, b, bias, cu_count) -def wvSplitKQ(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype, - scale_a: torch.Tensor, scale_b: torch.Tensor, - cu_count: int) -> torch.Tensor: +def wvSplitKQ(a: torch.Tensor, + b: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + cu_count: int, + bias: torch.Tensor = None) -> torch.Tensor: out = torch.empty((b.shape[0], a.shape[0]), dtype=out_dtype, device=b.device) - torch.ops._rocm_C.wvSplitKQ(a, b, out, scale_a, scale_b, cu_count) + torch.ops._rocm_C.wvSplitKQ(a, b, bias, out, scale_a, scale_b, cu_count) return out diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 8cda1789e6c97..6ed482db4700e 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -178,10 +178,12 @@ def rocm_per_tensor_w8a8_scaled_mm_impl(qinput: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor) -> torch.Tensor: from vllm.platforms.rocm import on_mi3xx - if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx( - ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0 and bias is None: + if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx() and \ + qinput.shape[0] == 1 and \ + qinput.shape[1] % 16 == 0 and \ + ((bias is None) or (bias.dtype == out_dtype)) : output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b, - current_platform.get_cu_count()) + current_platform.get_cu_count(), bias) else: output = torch._scaled_mm(qinput, weight, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index a1675ffbaa950..d7a65d43c2107 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -100,7 +100,7 @@ def rocm_unquantized_gemm_impl( k = weight.shape[1] use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_gfx9() and \ x.dtype in [torch.float16, torch.bfloat16] \ - and k % 8 == 0 and bias is None) + and k % 8 == 0) if use_skinny is not True: return torch.nn.functional.linear(x, weight, bias) @@ -111,9 +111,9 @@ def rocm_unquantized_gemm_impl( cu_count = current_platform.get_cu_count() if m > 8 and 0 < n <= 4: - out = ops.wvSplitK(weight, x_view, cu_count) + out = ops.wvSplitK(weight, x_view, cu_count, bias) return out.view(*x.shape[:-1], weight.shape[0]) - elif m % 4 == 0 and n 
== 1 and k <= 8192: + elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None: out = ops.LLMM1(weight, x_view, 4) return out.view(*x.shape[:-1], weight.shape[0]) return torch.nn.functional.linear(x, weight, bias) From 8c1c81a3de344fba340cb2efc82f29d2c1563a2d Mon Sep 17 00:00:00 2001 From: Amir Samani Date: Tue, 23 Sep 2025 11:33:06 -0700 Subject: [PATCH 152/185] [core] add nccl symmetric memory for all reduce (#24532) Signed-off-by: Amir Samani Signed-off-by: Michael Goin Co-authored-by: Michael Goin --- .buildkite/test-pipeline.yaml | 1 + .../kernels/benchmark_device_communicators.py | 26 ++- .../test_nccl_symm_mem_allreduce.py | 94 +++++++++ vllm/compilation/cuda_graph.py | 6 + .../device_communicators/all_reduce_utils.py | 27 ++- .../device_communicators/cuda_communicator.py | 15 ++ .../device_communicators/pynccl.py | 53 ++++- .../device_communicators/pynccl_allocator.py | 186 ++++++++++++++++++ .../device_communicators/pynccl_wrapper.py | 38 +++- vllm/envs.py | 11 ++ vllm/utils/__init__.py | 32 +++ vllm/v1/worker/gpu_ubatch_wrapper.py | 6 + 12 files changed, 489 insertions(+), 6 deletions(-) create mode 100644 tests/distributed/test_nccl_symm_mem_allreduce.py create mode 100644 vllm/distributed/device_communicators/pynccl_allocator.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 49316eb4f6073..cf32087ed3b98 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1039,3 +1039,4 @@ steps: num_gpus: 2 commands: - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index a61c17edc1e28..4cbdde5a5b2ca 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -7,6 +7,10 @@ Benchmark script for device communicators: CustomAllreduce (oneshot, twoshot), PyNcclCommunicator, and SymmMemCommunicator (multimem, two-shot). +for NCCL symmetric memory you need to set the environment variables +NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does +not use fast NVLS implementation for all reduce. 
+ Usage: torchrun --nproc_per_node= benchmark_device_communicators.py [options] @@ -26,7 +30,13 @@ import torch.distributed as dist from torch.distributed import ProcessGroup from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce -from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.device_communicators.pynccl import ( + PyNcclCommunicator, + register_nccl_symmetric_ops, +) +from vllm.distributed.device_communicators.pynccl_allocator import ( + set_graph_pool_id, +) from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator from vllm.logger import init_logger from vllm.utils import FlexibleArgumentParser @@ -98,6 +108,7 @@ class CommunicatorBenchmark: ) if not self.pynccl_comm.disabled: logger.info("Rank %s: PyNcclCommunicator initialized", self.rank) + register_nccl_symmetric_ops(self.pynccl_comm) else: logger.info("Rank %s: PyNcclCommunicator disabled", self.rank) self.pynccl_comm = None @@ -194,6 +205,15 @@ class CommunicatorBenchmark: None, # no env variable needed ) ) + communicators.append( + ( + "pynccl-symm", + lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t), + lambda t: True, # Always available if initialized + nullcontext(), + None, # no env variable needed + ) + ) if self.symm_mem_comm_multimem is not None: comm = self.symm_mem_comm_multimem @@ -271,7 +291,9 @@ class CommunicatorBenchmark: # Capture the graph using context manager with context: graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph): + graph_pool = torch.cuda.graph_pool_handle() + set_graph_pool_id(graph_pool) + with torch.cuda.graph(graph, pool=graph_pool): for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): allreduce_fn(graph_input) diff --git a/tests/distributed/test_nccl_symm_mem_allreduce.py b/tests/distributed/test_nccl_symm_mem_allreduce.py new file mode 100644 index 0000000000000..ffc913742620d --- /dev/null +++ b/tests/distributed/test_nccl_symm_mem_allreduce.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import random +import typing + +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +import vllm.envs as envs +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.distributed.device_communicators.cuda_communicator import ( + CudaCommunicator) +from vllm.distributed.device_communicators.pynccl import ( + register_nccl_symmetric_ops) +from vllm.distributed.device_communicators.pynccl_allocator import ( + get_nccl_mem_pool, is_symmetric_memory_enabled) +from vllm.distributed.parallel_state import (get_tp_group, + init_distributed_environment, + initialize_model_parallel) +from vllm.platforms import current_platform +from vllm.utils import update_environment_variables + +torch.manual_seed(42) +random.seed(44) + +test_size_elements = 4 * 1024 * 1024 + + +def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int): + monkeypatch = pytest.MonkeyPatch() + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + dtype = torch.bfloat16 + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + update_environment_variables({ + "RANK": str(local_rank), + "LOCAL_RANK": str(local_rank), + "WORLD_SIZE": str(world_size), + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + }) + + init_distributed_environment() + 
initialize_model_parallel(tensor_model_parallel_size=world_size) + + cuda_communicator = typing.cast(CudaCommunicator, + get_tp_group().device_communicator) + pynccl_comm = cuda_communicator.pynccl_comm + if get_nccl_mem_pool() is None: + pytest.skip("NCCL allocator compilation failed " + "(probably missing NCCL headers).") + if not is_symmetric_memory_enabled(): + pytest.skip("NCCL symmetric memory allreduce is disabled.") + + register_nccl_symmetric_ops(pynccl_comm) + input = torch.randint(1, + 23, (test_size_elements, ), + dtype=dtype, + device=device) + input_clone = input.clone() + output = torch.ops.vllm.all_reduce_symmetric_with_copy(input) + assert output is not None + + group = get_tp_group().device_group + dist.all_reduce(input_clone, group=group) + torch.testing.assert_close(output, input_clone, atol=2.5, rtol=0.1) + + +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="NCCLSymmMemAllreduce is only available for CUDA platforms.", +) +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], + reason="Only test on CUDA") +def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size): + if world_size > torch.cuda.device_count(): + pytest.skip("Not enough GPUs to run the test.") + + # Enable SymmMemCommunicator + monkeypatch.setenv("VLLM_USE_NCCL_SYMM_MEM", "1") + monkeypatch.setenv("NCCL_NVLS_ENABLE", "1") + monkeypatch.setenv("NCCL_CUMEM_ENABLE", "1") + + mp.spawn(nccl_symm_mem_allreduce_worker, + args=(world_size, ), + nprocs=world_size) + cleanup_dist_env_and_memory() diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py index e233f959c0a4a..befb7736d75af 100644 --- a/vllm/compilation/cuda_graph.py +++ b/vllm/compilation/cuda_graph.py @@ -12,6 +12,8 @@ import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.compilation.monitor import validate_cudagraph_capturing_enabled from vllm.config import CUDAGraphMode, VllmConfig +from vllm.distributed.device_communicators.pynccl_allocator import ( + set_graph_pool_id) from vllm.forward_context import BatchDescriptor, get_forward_context from vllm.logger import init_logger from vllm.platforms import current_platform @@ -154,6 +156,10 @@ class CUDAGraphWrapper: stack.enter_context( patch("torch.cuda.empty_cache", lambda: None)) + if self.graph_pool is not None: + set_graph_pool_id(self.graph_pool) + else: + set_graph_pool_id(current_platform.graph_pool_handle()) # mind-exploding: carefully manage the reference and memory. 
with torch.cuda.graph(cudagraph, pool=self.graph_pool): # `output` is managed by pytorch's cudagraph pool diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py index 805a88854b77c..87e0f8e1a9677 100644 --- a/vllm/distributed/device_communicators/all_reduce_utils.py +++ b/vllm/distributed/device_communicators/all_reduce_utils.py @@ -10,8 +10,9 @@ import sys import tempfile from collections.abc import Sequence from itertools import product -from typing import Optional +from typing import Any, Optional +import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -56,6 +57,30 @@ SYMM_MEM_ALL_REDUCE_MAX_SIZES = { } } +NCCL_SYMM_MEM_ALL_REDUCE_CONFIG: dict[str, Any] = { + "min_world_size": 4, + "thresholds": { + 4: 2 * MiB, # 2 MB + 8: 1 * MiB, # 1 MB + }, + "always_use_above_world_size": 8 # Always use symm mem for world_size > 8 +} + + +def should_nccl_symm_mem_allreduce(world_size: int, + input_tensor: torch.Tensor) -> bool: + from vllm.distributed.device_communicators.pynccl_allocator import ( + is_symmetric_memory_enabled) + if not is_symmetric_memory_enabled(): + return False + if world_size < NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["min_world_size"]: + return False + threshold = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["thresholds"].get(world_size) + if threshold is not None and input_tensor.nbytes >= threshold: + return True + return (world_size + > NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["always_use_above_world_size"]) + def producer(batch_src: Sequence[int], producer_queue, diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index b2bf3bc3cc2ed..6c25bf3cd95c3 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -7,6 +7,12 @@ import torch from torch.distributed import ProcessGroup import vllm.envs as envs +from vllm.distributed.device_communicators.all_reduce_utils import ( + should_nccl_symm_mem_allreduce) +from vllm.distributed.device_communicators.pynccl import ( + register_nccl_symmetric_ops) +from vllm.distributed.device_communicators.pynccl_allocator import ( + is_symmetric_memory_enabled) from vllm.logger import init_logger from vllm.platforms import current_platform @@ -53,6 +59,8 @@ class CudaCommunicator(DeviceCommunicatorBase): group=self.cpu_group, device=self.device, ) + if is_symmetric_memory_enabled(): + register_nccl_symmetric_ops(self.pynccl_comm) self.ca_comm: Optional[CustomAllreduce] = None self.qr_comm: Optional[QuickAllReduce] = None @@ -107,6 +115,13 @@ class CudaCommunicator(DeviceCommunicatorBase): raise ValueError(f"Unknown all2all backend: {all2all_backend}") def all_reduce(self, input_): + # since currently we perform copy input -> symm_input -> out-of-place AR + # return symm_output, we don't need to check if input is symmetric + if self.pynccl_comm is not None and \ + should_nccl_symm_mem_allreduce(self.pynccl_comm.world_size,input_): + out = torch.ops.vllm.all_reduce_symmetric_with_copy(input_) + if out is not None: + return out # always try quick reduce first, then custom allreduce, # and then pynccl. 
(quick reduce just for ROCM MI3*) qr_comm = self.qr_comm diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 3e4d0d250af94..75de85e1b0aba 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -17,6 +17,39 @@ from vllm.utils import current_stream logger = init_logger(__name__) +_NCCL_SYMM_OPS_REGISTERED = False + + +def register_nccl_symmetric_ops(pynccl_comm): + from vllm.distributed.device_communicators.pynccl_allocator import ( + nccl_symm_mem_context) + from vllm.utils import direct_register_custom_op + + global _NCCL_SYMM_OPS_REGISTERED + if _NCCL_SYMM_OPS_REGISTERED: + return + _NCCL_SYMM_OPS_REGISTERED = True + + def all_reduce_symmetric_with_copy_impl( + input_tensor: torch.Tensor) -> torch.Tensor: + with nccl_symm_mem_context(pynccl_comm): + symm_input = torch.empty_like(input_tensor) + symm_output = torch.empty_like(input_tensor) + symm_input.copy_(input_tensor) + symm_output = pynccl_comm.all_reduce(symm_input, symm_output) + return symm_output + + def all_reduce_symmetric_with_copy_fake( + input_tensor: torch.Tensor) -> torch.Tensor: + return torch.empty_like(input_tensor) + + direct_register_custom_op( + op_name="all_reduce_symmetric_with_copy", + op_func=all_reduce_symmetric_with_copy_impl, + mutates_args=[], + fake_impl=all_reduce_symmetric_with_copy_fake, + ) + class PyNcclCommunicator: @@ -67,6 +100,7 @@ class PyNcclCommunicator: self.available = True self.disabled = False + self.nccl_version = self.nccl.ncclGetRawVersion() logger.info("vLLM is using nccl==%s", self.nccl.ncclGetVersion()) if self.rank == 0: @@ -109,6 +143,7 @@ class PyNcclCommunicator: def all_reduce(self, in_tensor: torch.Tensor, + out_tensor: torch.Tensor = None, op: ReduceOp = ReduceOp.SUM, stream=None) -> torch.Tensor: if self.disabled: @@ -120,7 +155,8 @@ class PyNcclCommunicator: f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {in_tensor.device}") - out_tensor = torch.empty_like(in_tensor) + if out_tensor is None: + out_tensor = torch.empty_like(in_tensor) if stream is None: stream = current_stream() @@ -288,3 +324,18 @@ class PyNcclCommunicator: def group_end(self): self.nccl.ncclGroupEnd() + + def register_comm_window(self, tensor: torch.Tensor): + return self.nccl.ncclCommWindowRegister( + self.comm, + buffer_type(tensor.data_ptr()), + tensor.numel() * tensor.element_size(), + 1, + ) + + def register_comm_window_raw(self, ptr: int, size: int): + return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), + size, 1) + + def deregister_comm_window(self, window): + return self.nccl.ncclCommWindowDeregister(self.comm, window) diff --git a/vllm/distributed/device_communicators/pynccl_allocator.py b/vllm/distributed/device_communicators/pynccl_allocator.py new file mode 100644 index 0000000000000..bc874c1e197e7 --- /dev/null +++ b/vllm/distributed/device_communicators/pynccl_allocator.py @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import atexit +import contextlib +import tempfile +from typing import Any, Optional + +import torch +from packaging import version +from torch.cuda.memory import CUDAPluggableAllocator +from torch.utils.cpp_extension import load_inline + +from vllm import envs +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.logger import init_logger +from vllm.platforms import current_platform 
+from vllm.utils import find_nccl_include_paths + +logger = init_logger(__name__) + +nccl_allocator_source = """ +#include +extern "C" { + +void* nccl_alloc_plug(size_t size, int device, void* stream) { + void* ptr; + ncclResult_t err = ncclMemAlloc(&ptr, size); + return ptr; + +} + +void nccl_free_plug(void* ptr, size_t size, int device, void* stream) { + ncclResult_t err = ncclMemFree(ptr); +} + +} +""" + +_allocator = None +_allocator_wrapper = None +_mem_pool = None +_registered_base_addrs = set() +_graph_pool_id = None +_nccl_allocator_failed_to_compile = False +_cached_pool_snapshot = None + + +def is_symmetric_memory_enabled(): + global _nccl_allocator_failed_to_compile + return envs.VLLM_USE_NCCL_SYMM_MEM and not _nccl_allocator_failed_to_compile + + +def is_symmetric_memory_tensor(tensor: torch.Tensor): + if not is_symmetric_memory_enabled() or _cached_pool_snapshot is None: + return False + for segment in _cached_pool_snapshot: + for block in segment["blocks"]: + if block["address"] == tensor.untyped_storage().data_ptr(): + return True + return False + + +def set_graph_pool_id(graph_pool_id): + global _graph_pool_id + _graph_pool_id = graph_pool_id + + +def compile_nccl_allocator(): + global _allocator, _allocator_wrapper, _nccl_allocator_failed_to_compile + if not current_platform.is_cuda(): + _nccl_allocator_failed_to_compile = True + return + try: + out_dir = tempfile.gettempdir() + nccl_allocator_libname = "nccl_allocator" + nccl_include_paths = find_nccl_include_paths() + load_inline( + name=nccl_allocator_libname, + cpp_sources=nccl_allocator_source, + with_cuda=True, + extra_ldflags=["-lnccl"], + verbose=envs.VLLM_LOGGING_LEVEL == "DEBUG", + is_python_module=False, + build_directory=out_dir, + extra_include_paths=nccl_include_paths, + ) + _allocator_wrapper = CUDAPluggableAllocator( + f"{out_dir}/{nccl_allocator_libname}.so", + "nccl_alloc_plug", + "nccl_free_plug", + ) + _allocator = _allocator_wrapper.allocator() + except Exception as e: + _nccl_allocator_failed_to_compile = True + logger.warning( + "Failed to compile NCCL memory allocator. " + "Symmetric memory will be disabled. " + "This is expected if NCCL headers are not available. " + "optionally set VLLM_NCCL_INCLUDE_PATH to point to a directory " + "containing the NCCL header. 
" + "Error: %s", str(e)) + + +def get_nccl_mem_pool(): + global _mem_pool, _nccl_allocator_failed_to_compile + if _mem_pool is None and not _nccl_allocator_failed_to_compile: + compile_nccl_allocator() + if _allocator is not None: + _mem_pool = torch.cuda.MemPool(_allocator) + return _mem_pool + + +def _cleanup_nccl_mem_pool(): + global _mem_pool + _mem_pool = None + + +def _cleanup_nccl_allocator_wrapper(): + global _allocator_wrapper + _allocator_wrapper = None + + +atexit.register(_cleanup_nccl_mem_pool) +atexit.register(_cleanup_nccl_allocator_wrapper) + + +class nccl_symm_mem_context: + + def __init__( + self, + pynccl_comm: PyNcclCommunicator, + disabled: bool = False, + ): + self.disabled = (disabled or not is_symmetric_memory_enabled() + or pynccl_comm.world_size == 1 + or not current_platform.is_cuda() + or get_nccl_mem_pool() is None or version.parse( + torch.__version__) < version.parse("2.8.0.a0")) + if self.disabled: + self.pynccl_comm: Optional[PyNcclCommunicator] = None + self._mem_pool_ctx: contextlib.AbstractContextManager[ + Any] = contextlib.nullcontext() + self.is_graph_capture = None + self.device = None + else: + self.pynccl_comm = pynccl_comm + self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool()) + self.is_graph_capture = torch.cuda.is_current_stream_capturing() + self.device = torch.cuda.current_device() + + def __enter__(self): + if self.disabled: + return self + assert ( + self.pynccl_comm + is not None), "Symmetric memory requires pynccl to be initalized" + assert ( + self.pynccl_comm.nccl_version >= 22703 + ), "NCCL version 2.27.3 or higher is required for NCCL symmetric memory" + if self.is_graph_capture: + assert ( + _graph_pool_id + is not None), "graph_pool_id is not set under graph capture" + # Pause graph memory pool to use symmetric memory with cuda graph + torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id) + self._mem_pool_ctx.__enter__() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.disabled: + return + global _cached_pool_snapshot + global _registered_base_addrs + self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb) + _pool = get_nccl_mem_pool() + assert _pool is not None + _cached_pool_snapshot = _pool.snapshot() + assert self.pynccl_comm is not None + for segment in _cached_pool_snapshot: + if segment["address"] not in _registered_base_addrs: + self.pynccl_comm.register_comm_window_raw( + segment["address"], segment["total_size"]) + _registered_base_addrs.add(segment["address"]) + if self.is_graph_capture: + torch._C._cuda_beginAllocateCurrentThreadToPool( + self.device, _graph_pool_id) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index a930b63bc26ff..c3e99e177e2d5 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -41,6 +41,7 @@ logger = init_logger(__name__) ncclResult_t = ctypes.c_int ncclComm_t = ctypes.c_void_p +ncclWindow_t = ctypes.c_void_p class ncclUniqueId(ctypes.Structure): @@ -222,6 +223,24 @@ class NCCLLibrary: Function("ncclGroupStart", ncclResult_t, []), # ncclResult_t ncclGroupEnd(); Function("ncclGroupEnd", ncclResult_t, []), + # ncclResult_t ncclCommWindowRegister( + # ncclComm_t comm, void* buff, size_t size, + # ncclWindow_t* win, int winFlags); + Function( + "ncclCommWindowRegister", + ncclResult_t, + [ + ncclComm_t, + buffer_type, + ctypes.c_size_t, + ctypes.POINTER(ncclWindow_t), + ctypes.c_int, + ], + ), + # 
ncclResult_t ncclCommWindowDeregister( + # ncclComm_t comm, ncclWindow_t win); + Function("ncclCommWindowDeregister", ncclResult_t, + [ncclComm_t, ncclWindow_t]), ] # class attribute to store the mapping from the path to the library @@ -271,10 +290,14 @@ class NCCLLibrary: error_str = self.ncclGetErrorString(result) raise RuntimeError(f"NCCL error: {error_str}") - def ncclGetVersion(self) -> str: + def ncclGetRawVersion(self) -> int: version = ctypes.c_int() self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version))) - version_str = str(version.value) + # something like 21903 + return version.value + + def ncclGetVersion(self) -> str: + version_str = str(self.ncclGetRawVersion()) # something like 21903 --> "2.19.3" major = version_str[0].lstrip("0") minor = version_str[1:3].lstrip("0") @@ -375,6 +398,17 @@ class NCCLLibrary: def ncclGroupEnd(self) -> None: self.NCCL_CHECK(self._funcs["ncclGroupEnd"]()) + def ncclCommWindowRegister(self, comm: ncclComm_t, buff: buffer_type, + size: int, win_flags: int) -> ncclWindow_t: + window = ncclWindow_t() + self.NCCL_CHECK(self._funcs["ncclCommWindowRegister"]( + comm, buff, size, ctypes.byref(window), win_flags)) + return window + + def ncclCommWindowDeregister(self, comm: ncclComm_t, + window: ncclWindow_t) -> None: + self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window)) + __all__ = [ "NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId", diff --git a/vllm/envs.py b/vllm/envs.py index f6eafe892ef2d..fa6f14d6b0371 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -193,6 +193,8 @@ if TYPE_CHECKING: VLLM_DBO_COMM_SMS: int = 20 GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = [] VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None + VLLM_USE_NCCL_SYMM_MEM: bool = False + VLLM_NCCL_INCLUDE_PATH: Optional[str] = None def get_default_cache_root(): @@ -1410,6 +1412,15 @@ environment_variables: dict[str, Callable[[], Any]] = { ["container", "code_interpreter", "web_search_preview"]), + + # Flag to enable NCCL symmetric memory allocation and registration + "VLLM_USE_NCCL_SYMM_MEM": + lambda: bool(int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))), + + # NCCL header path + "VLLM_NCCL_INCLUDE_PATH": + lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None), + } # --8<-- [end:env-vars-definition] diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 3399d00fbabbd..5d165f1662383 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1383,6 +1383,38 @@ def find_nccl_library() -> str: return so_file +def find_nccl_include_paths() -> Optional[list[str]]: + """ + We either use the nccl.h specified by the `VLLM_NCCL_INCLUDE_PATH` + environment variable, or we find the library file brought by + nvidia-nccl-cuXX. 
load_inline by default uses + torch.utils.cpp_extension.include_paths + """ + paths: list[str] = [] + inc = envs.VLLM_NCCL_INCLUDE_PATH + if inc and os.path.isdir(inc): + paths.append(inc) + + try: + import importlib.util + spec = importlib.util.find_spec("nvidia.nccl") + if spec and getattr(spec, "submodule_search_locations", None): + for loc in spec.submodule_search_locations: + inc_dir = os.path.join(loc, "include") + if os.path.exists(os.path.join(inc_dir, "nccl.h")): + paths.append(inc_dir) + except Exception: + pass + + seen = set() + out: list[str] = [] + for p in paths: + if p and p not in seen: + out.append(p) + seen.add(p) + return out or None + + prev_set_stream = torch.cuda.set_stream _current_stream_tls = threading.local() diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index bfc3743ea4179..d636e7af72ea1 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -11,6 +11,8 @@ import vllm.envs as envs from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.config import CUDAGraphMode, VllmConfig from vllm.distributed import get_ep_group +from vllm.distributed.device_communicators.pynccl_allocator import ( + set_graph_pool_id) from vllm.forward_context import (create_forward_context, get_forward_context, override_forward_context) from vllm.logger import init_logger @@ -206,6 +208,10 @@ class UBatchWrapper: cudagraph=torch.cuda.CUDAGraph(), ubatch_metadata=ubatch_metadata, ) + if self.graph_pool is not None: + set_graph_pool_id(self.graph_pool) + else: + set_graph_pool_id(current_platform.graph_pool_handle()) with torch.cuda.graph(cudagraph_metadata.cudagraph, stream=compute_stream, pool=self.graph_pool): From 63400259d05485330fb23635af892b3fce160dea Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 23 Sep 2025 21:03:10 +0200 Subject: [PATCH 153/185] [Performance] Move apply_w8a8_block_fp8_linear to an op class (#24666) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ElizaWszola Signed-off-by: ElizaWszola Signed-off-by: Luka Govedič Signed-off-by: Luka Govedič Co-authored-by: Luka Govedič Co-authored-by: Michael Goin Co-authored-by: Luka Govedič --- .../cutlass_benchmarks/w8a8_benchmarks.py | 4 +- .../benchmark_fp8_block_dense_gemm.py | 4 +- tests/kernels/quantization/test_block_fp8.py | 5 +- .../quantization/test_fp8_quant_group.py | 26 +- .../model_executor/test_enabled_custom_ops.py | 30 -- tests/quantization/test_compressed_tensors.py | 35 +++ vllm/config/__init__.py | 17 + .../compressed_tensors/compressed_tensors.py | 8 + .../schemes/compressed_tensors_w8a8_fp8.py | 37 ++- .../layers/quantization/deepgemm.py | 10 +- .../model_executor/layers/quantization/fp8.py | 40 ++- .../layers/quantization/input_quant_fp8.py | 24 +- .../layers/quantization/utils/fp8_utils.py | 290 +++++++++++------- vllm/utils/deep_gemm.py | 12 +- 14 files changed, 341 insertions(+), 201 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index a5a5b52f60397..02f8c593392c4 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - w8a8_block_fp8_matmul, + w8a8_triton_block_scaled_mm, ) from vllm.utils import FlexibleArgumentParser, cdiv @@ -158,7 +158,7 @@ def 
bench_fp8( "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) ), - "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( + "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm( a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) ), "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index b99c2099f2c38..b3c3742825de7 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -10,7 +10,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( get_col_major_tma_aligned_tensor, per_token_group_quant_fp8, - w8a8_block_fp8_matmul, + w8a8_triton_block_scaled_mm, ) from vllm.triton_utils import triton from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8 @@ -59,7 +59,7 @@ def benchmark_shape(m: int, # === vLLM Triton Implementation === def vllm_triton_gemm(): - return w8a8_block_fp8_matmul(A_vllm, + return w8a8_triton_block_scaled_mm(A_vllm, B_vllm, A_scale_vllm, B_scale_vllm, diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index c440747316b80..c0b934fc55ae6 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -12,7 +12,7 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8, from vllm.config import VllmConfig from vllm.model_executor.layers.quantization.utils.fp8_utils import ( cutlass_scaled_mm, get_col_major_tma_aligned_tensor, - per_token_group_quant_fp8, w8a8_block_fp8_matmul) + per_token_group_quant_fp8, w8a8_triton_block_scaled_mm) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8 @@ -90,7 +90,8 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) - out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) + out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) rel_diff = (torch.mean( torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / diff --git a/tests/kernels/quantization/test_fp8_quant_group.py b/tests/kernels/quantization/test_fp8_quant_group.py index 720eee62760db..3d4c851a9b889 100644 --- a/tests/kernels/quantization/test_fp8_quant_group.py +++ b/tests/kernels/quantization/test_fp8_quant_group.py @@ -20,9 +20,11 @@ from vllm.platforms import current_platform (8, 513, 64), # Non-divisible (native only) ]) @pytest.mark.parametrize("seed", [42]) +@pytest.mark.parametrize("use_ue8m0", [True, False]) @torch.inference_mode() def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int, - group_size: int, seed: int) -> None: + group_size: int, seed: int, + use_ue8m0: bool) -> None: """Test QuantFP8 group quantization with various configurations. 
Tests both CUDA and native implementations, column-major scales, @@ -38,7 +40,8 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int, group_shape = GroupShape(1, group_size) quant_op = QuantFP8(static=False, group_shape=group_shape, - column_major_scales=False) + column_major_scales=False, + use_ue8m0=use_ue8m0) # 1. Test native implementation (always available) x_quant_native, scales_native = quant_op.forward_native(x.clone()) @@ -48,9 +51,15 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int, # 2. Test column-major scales configuration quant_op_col = QuantFP8(static=False, group_shape=group_shape, - column_major_scales=True) + column_major_scales=True, + use_ue8m0=use_ue8m0) _, scales_col = quant_op_col.forward_native(x.clone()) - assert scales_col.shape == (expected_num_groups, batch_size) + assert scales_col.shape == (batch_size, expected_num_groups) + assert scales_col.stride(0) == 1 + assert scales_col.stride(1) == batch_size + + # Test column-major scales consistency + assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8) # 3. Test CUDA implementation (only for divisible dimensions) if is_divisible: @@ -68,8 +77,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int, @pytest.mark.parametrize("seed", [42]) +@pytest.mark.parametrize("use_ue8m0", [True, False]) @torch.inference_mode() -def test_quantfp8_group_multidimensional(seed: int) -> None: +def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: current_platform.seed_everything(seed) group_size = 64 @@ -82,7 +92,8 @@ def test_quantfp8_group_multidimensional(seed: int) -> None: group_shape = GroupShape(1, group_size) quant_op = QuantFP8(static=False, group_shape=group_shape, - column_major_scales=False) + column_major_scales=False, + use_ue8m0=use_ue8m0) x_quant, scales = quant_op.forward_native(x_3d.clone()) assert x_quant.shape == x_3d.shape @@ -91,7 +102,8 @@ def test_quantfp8_group_multidimensional(seed: int) -> None: # Test column_major_scales with multi-dim quant_op_col = QuantFP8(static=False, group_shape=group_shape, - column_major_scales=True) + column_major_scales=True, + use_ue8m0=use_ue8m0) _, scales_col = quant_op_col.forward_native(x_3d.clone()) assert scales_col.shape == (batch1, hidden_dim // group_size, batch2) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 92ce10a9efc0b..200b6ecd58528 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -17,8 +17,6 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( from vllm.model_executor.layers.layernorm import (RMSNorm, dispatch_rocm_rmsnorm_func, fused_add_rms_norm, rms_norm) -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul) from vllm.platforms import current_platform RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16] @@ -111,34 +109,6 @@ def test_enabled_ops_invalid(env: str): RMSNorm(1024).enabled() -@pytest.mark.skipif( - not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(), - reason="AITER is a feature exclusive for ROCm and FP8_FNUZ") -@pytest.mark.parametrize("use_cutlass", [True, False]) -@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) -@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"]) -def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str, 
- use_rocm_aiter_gemm_w8a8_blockscale: str, - monkeypatch): - - monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) - monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", - use_rocm_aiter_gemm_w8a8_blockscale) - - use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool( - int(use_rocm_aiter_gemm_w8a8_blockscale))) - block_scale_func = dispatch_w8a8_blockscale_func( - use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported) - if use_cutlass: - assert block_scale_func == cutlass_scaled_mm - elif current_platform.is_rocm() and int(use_rocm_aiter) and int( - use_rocm_aiter_gemm_w8a8_blockscale): - assert block_scale_func == ( - torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale) - else: - assert block_scale_func == w8a8_block_fp8_matmul - - @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index c0ab3fbb10622..af8c7ec3b4822 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -18,6 +18,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + W8A8BlockFp8LinearOp) from vllm.model_executor.layers.quantization.utils.quant_utils import ( cutlass_fp4_supported) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( @@ -742,3 +745,35 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt, perplexity = llm.generate_prompt_perplexity([prompt])[0] print(perplexity) assert perplexity <= exp_perplexity + + +def test_compressed_tensors_fp8_block_enabled(vllm_runner): + model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK" + with vllm_runner(model_path) as llm: + + fp8_dtype = current_platform.fp8_dtype() + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8) + assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear, + W8A8BlockFp8LinearOp) + + assert qkv_proj.weight.dtype is fp8_dtype + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight.shape) == 2 + assert len(qkv_proj.weight_scale.shape) == 2 + + input_quant_op = \ + qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op + assert isinstance(input_quant_op, QuantFP8) + assert input_quant_op._forward_method == input_quant_op.forward_cuda + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index a2562a10b45a6..50e8cad236176 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -687,6 +687,23 @@ class VllmConfig: # local attention. 
self.scheduler_config.disable_hybrid_kv_cache_manager = True + def has_blocked_weights(): + if self.quant_config is not None: + if hasattr(self.quant_config, "weight_block_size"): + return self.quant_config.weight_block_size is not None + elif hasattr(self.quant_config, "has_blocked_weights"): + return self.quant_config.has_blocked_weights() + return False + + # Enable quant_fp8 CUDA ops (TODO disable in follow up) + # On H100 the CUDA kernel is faster than + # native implementation + # https://github.com/vllm-project/vllm/issues/25094 + if has_blocked_weights(): + custom_ops = self.compilation_config.custom_ops + if "none" not in custom_ops and "-quant_fp8" not in custom_ops: + custom_ops.append("+quant_fp8") + def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list: # remove the sizes that not multiple of tp_size when diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index d6550dd16892f..3f771ea2abd1a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -644,6 +644,14 @@ class CompressedTensorsConfig(QuantizationConfig): # If no matches, return None return None + def has_blocked_weights(self) -> bool: + for scheme in self.target_scheme_map.values(): + weight_quant = scheme.get("weights") + if (weight_quant is not None + and weight_quant.strategy == QuantizationStrategy.BLOCK): + return True + return False + @staticmethod def supports_cutlass_24( weight_quant: Optional[QuantizationArgs], diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index d42ae22c51393..fa0816959fcda 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -11,7 +11,7 @@ from torch.nn import Parameter from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_fp8_block_linear, check_aiter_fp8_linear_support, + W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support, create_fp8_input_scale, create_fp8_scale_parameter, create_fp8_weight_parameter, maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy, @@ -41,16 +41,30 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): self.strategy = weight_quant.strategy self.out_dtype = torch.get_default_dtype() self.is_static_input_scheme = is_static_input_scheme - self.act_q_group_shape = GroupShape.PER_TENSOR \ - if is_static_input_scheme else GroupShape.PER_TOKEN - self.fp8_linear = Fp8LinearOp( - act_quant_static=self.is_static_input_scheme, - act_quant_group_shape=self.act_q_group_shape) self.weight_block_size = self.weight_quant.block_structure + if self.weight_block_size is not None: + self.act_q_group_shape = GroupShape(1, self.weight_block_size[0]) + else: + self.act_q_group_shape = GroupShape.PER_TENSOR \ + if is_static_input_scheme else GroupShape.PER_TOKEN + self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() self.use_aiter_and_is_supported = check_aiter_fp8_linear_support() + if 
self.weight_block_size is not None: + assert not self.is_static_input_scheme + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(*self.weight_block_size), + act_quant_group_shape=self.act_q_group_shape, + cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, + use_aiter_and_is_supported=self.use_aiter_and_is_supported, + ) + else: + self.fp8_linear = Fp8LinearOp( + act_quant_static=self.is_static_input_scheme, + act_quant_group_shape=self.act_q_group_shape) + @classmethod def get_min_capability(cls) -> int: # lovelace and up @@ -141,13 +155,14 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if layer.weight_block_size is not None: - return apply_fp8_block_linear( - layer, + if self.weight_block_size is not None: + return self.w8a8_block_fp8_linear.apply( input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, bias=bias, - cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, - use_aiter_and_is_supported=self.use_aiter_and_is_supported) + ) return self.fp8_linear.apply(input=x, weight=layer.weight, diff --git a/vllm/model_executor/layers/quantization/deepgemm.py b/vllm/model_executor/layers/quantization/deepgemm.py index d26a932eddb2c..c2b3ccf19fca8 100644 --- a/vllm/model_executor/layers/quantization/deepgemm.py +++ b/vllm/model_executor/layers/quantization/deepgemm.py @@ -43,7 +43,7 @@ def prepare_block_fp8_matmul_inputs( return M, N, K, C -def w8a8_block_fp8_matmul_deepgemm( +def w8a8_deepgemm_block_scaled_mm( A: torch.Tensor, B: torch.Tensor, As: torch.Tensor, @@ -59,7 +59,7 @@ def w8a8_block_fp8_matmul_deepgemm( return C -def w8a8_block_fp8_matmul_deepgemm_fake( +def w8a8_deepgemm_block_scaled_mm_fake( A: torch.Tensor, B: torch.Tensor, As: torch.Tensor, @@ -73,9 +73,9 @@ def w8a8_block_fp8_matmul_deepgemm_fake( direct_register_custom_op( - op_name="w8a8_block_fp8_matmul_deepgemm", - op_func=w8a8_block_fp8_matmul_deepgemm, + op_name="w8a8_deepgemm_block_scaled_mm", + op_func=w8a8_deepgemm_block_scaled_mm, mutates_args=[], - fake_impl=w8a8_block_fp8_matmul_deepgemm_fake, + fake_impl=w8a8_deepgemm_block_scaled_mm_fake, dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 2b24e052053c0..c4951712baa78 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -31,7 +31,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, select_cutlass_fp8_gemm_impl, swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_fp8_block_linear, check_aiter_fp8_linear_support, + W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support, create_fp8_input_scale, create_fp8_scale_parameter, create_fp8_weight_parameter, expert_weight_is_col_major, get_col_major_tma_aligned_tensor, maybe_post_process_fp8_weight_block, @@ -234,15 +234,28 @@ class Fp8LinearMethod(LinearMethodBase): self.weight_block_size = self.quant_config.weight_block_size self.block_quant = self.weight_block_size is not None self.act_q_static = self.quant_config.activation_scheme == "static" - # Use per-token quantization for better perf if dynamic and cutlass - if not self.act_q_static and cutlass_fp8_supported(): - self.act_q_group_shape = GroupShape.PER_TOKEN + if 
self.weight_block_size: + self.act_q_group_shape = GroupShape(1, self.weight_block_size[0]) else: - self.act_q_group_shape = GroupShape.PER_TENSOR + # Use per-token quantization for better perf if dynamic and cutlass + if not self.act_q_static and cutlass_fp8_supported(): + self.act_q_group_shape = GroupShape.PER_TOKEN + else: + self.act_q_group_shape = GroupShape.PER_TENSOR - self.fp8_linear = Fp8LinearOp( - act_quant_static=self.act_q_static, - act_quant_group_shape=self.act_q_group_shape) + if self.block_quant: + assert not self.act_q_static + assert self.weight_block_size is not None + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(*self.weight_block_size), + act_quant_group_shape=self.act_q_group_shape, + cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, + use_aiter_and_is_supported=self.use_aiter_and_is_supported, + ) + else: + self.fp8_linear = Fp8LinearOp( + act_quant_static=self.act_q_static, + act_quant_group_shape=self.act_q_group_shape) def create_weights( self, @@ -391,12 +404,15 @@ class Fp8LinearMethod(LinearMethodBase): bias=bias) if self.block_quant: - return apply_fp8_block_linear( - layer, + assert self.weight_block_size is not None + + return self.w8a8_block_fp8_linear.apply( input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, bias=bias, - cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, - use_aiter_and_is_supported=self.use_aiter_and_is_supported) + ) return self.fp8_linear.apply(input=x, weight=layer.weight, diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index 31182f40b48f6..ece3e5817116f 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -27,11 +27,14 @@ class QuantFP8(CustomOp): This CustomOp supports both static and dynamic quantization. 
""" - def __init__(self, - static: bool, - group_shape: GroupShape, - num_token_padding: Optional[int] = None, - column_major_scales: bool = False): + def __init__( + self, + static: bool, + group_shape: GroupShape, + num_token_padding: Optional[int] = None, + column_major_scales: bool = False, + use_ue8m0: Optional[bool] = None, # for Torch compile + ): """ :param static: static or dynamic quantization :param group_shape: quantization group shape (PER_TOKEN, PER_TENSOR, @@ -46,6 +49,7 @@ class QuantFP8(CustomOp): self.group_shape = group_shape self.num_token_padding = num_token_padding self.column_major_scales = column_major_scales + self.use_ue8m0 = use_ue8m0 self.is_group_quant = group_shape.is_per_group() if self.is_group_quant: @@ -70,7 +74,8 @@ class QuantFP8(CustomOp): x, group_size=self.group_size, column_major_scales=self.column_major_scales, - dtype=_FP8_DTYPE) + dtype=_FP8_DTYPE, + use_ue8m0=self.use_ue8m0) assert (scale is not None) == self.static assert scale_ub is None or (not self.static and self.group_shape @@ -137,7 +142,10 @@ class QuantFP8(CustomOp): x_grouped = x.view(-1, num_groups, self.group_size) absmax = x_grouped.abs().max(dim=-1, keepdim=True)[0].float() - scales = (absmax / _FP8_MAX).clamp(min=_FP8_MIN_SCALING_FACTOR) + scales_raw = absmax / _FP8_MAX + if self.use_ue8m0: + scales_raw = torch.exp2(torch.ceil(torch.log2(scales_raw))) + scales = (scales_raw).clamp(min=_FP8_MIN_SCALING_FACTOR) x_scaled = x_grouped / scales x_quant = x_scaled.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE) @@ -151,6 +159,6 @@ class QuantFP8(CustomOp): scales = scales.reshape(orig_shape[:-1] + (num_groups, )) if self.column_major_scales: - scales = scales.transpose(-2, -1).contiguous() + scales = scales.transpose(-2, -1).contiguous().transpose(-1, -2) return x_quant, scales diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index d1d87b7ba12e2..2098086bf2401 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -13,8 +13,9 @@ import torch import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import ( - group_broadcast) + GroupShape, group_broadcast) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED) from vllm.model_executor.parameter import (BlockQuantScaleParameter, @@ -24,6 +25,7 @@ from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used, + is_deep_gemm_supported, should_use_deepgemm_for_fp8_linear) logger = init_logger(__name__) @@ -35,6 +37,8 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz +# We need to pass in the is_hopper flag as argument because the function +# current_platform.is_device_capability() is not supported by Torch compiler. 
def cutlass_scaled_mm( A: torch.Tensor, B: torch.Tensor, @@ -42,15 +46,17 @@ def cutlass_scaled_mm( Bs: torch.Tensor, block_size: list[int], output_dtype: torch.dtype = torch.float16, + is_hopper: Optional[bool] = None, ) -> torch.Tensor: + if is_hopper is None: + is_hopper = current_platform.is_device_capability(90) return ops.cutlass_scaled_mm( A, B.T, out_dtype=output_dtype, scale_a=As, # SM90 block FP8 requires row-major scale_b, which we do ahead of time - scale_b=Bs if block_size is not None - and current_platform.is_device_capability(90) else Bs.T) + scale_b=Bs if block_size is not None and is_hopper else Bs.T) def rocm_aiter_gemm_w8a8_blockscale_impl( @@ -98,122 +104,189 @@ if current_platform.is_rocm(): aiter_per1x128_quant = get_hip_quant(rocm_aiter.QuantType.per_1x128) -def dispatch_w8a8_blockscale_func( - use_cutlass: bool, use_aiter_and_is_supported: bool -) -> Callable[[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - list[int], - torch.dtype, -], torch.Tensor]: - if use_cutlass: - return cutlass_scaled_mm - if (use_aiter_and_is_supported): - return torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale - return w8a8_block_fp8_matmul +# TODO we should be able to change the type of block_size to GroupShape +# after we resolve GroupShape compilation issue +# https://github.com/vllm-project/vllm/issues/25270 +def _w8a8_triton_block_scaled_mm_func( + qx: torch.Tensor, + weight: torch.Tensor, + x_scale: torch.Tensor, + weight_scale: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, +) -> torch.Tensor: + return w8a8_triton_block_scaled_mm(qx, weight, x_scale, weight_scale, + block_size, output_dtype) + + +def _w8a8_triton_block_scaled_mm_fake( + qx: torch.Tensor, + weight: torch.Tensor, + x_scale: torch.Tensor, + weight_scale: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, +) -> torch.Tensor: + return torch.empty((qx.size(0), weight.size(0)), + dtype=output_dtype, + device=qx.device) + + +direct_register_custom_op( + "w8a8_triton_block_scaled_mm_func", + _w8a8_triton_block_scaled_mm_func, + mutates_args=[], + fake_impl=_w8a8_triton_block_scaled_mm_fake, + dispatch_key="CUDA", +) # TODO fix ROCm->Triton custom path: # https://github.com/vllm-project/vllm/issues/14397 -def apply_w8a8_block_fp8_linear( - input: torch.Tensor, - weight: torch.Tensor, - block_size: list[int], - weight_scale: torch.Tensor, - input_scale: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None, - cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, - use_aiter_and_is_supported: bool = False, -) -> torch.Tensor: - assert input_scale is None - # View input as 2D matrix for fp8 methods - input_2d = input.view(-1, input.shape[-1]) - output_shape = [*input.shape[:-1], weight.shape[0]] - output_dtype = input.dtype +class W8A8BlockFp8LinearOp: + """ + This class executes a Blocked FP8 linear layer using cutlass if supported + and torch.scaled_mm otherwise. + """ - if should_use_deepgemm_for_fp8_linear(output_dtype, weight): + def __init__( + self, + weight_group_shape: GroupShape, + act_quant_group_shape: GroupShape, + cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, + use_aiter_and_is_supported: bool = False, + ): + self.weight_group_shape = weight_group_shape + self.act_quant_group_shape = act_quant_group_shape + self.is_deep_gemm_supported = is_deep_gemm_supported() + self.is_hopper = current_platform.is_device_capability(90) + # Get the correct blockscale mul and input quant operations. 
+ # We can't use _dispatch_w8a8_blockscale_op to figure out if we want + # to use deepgemm because we don't know the shape of weights (and + # whether deepgemm supports it) at the init time. + self.w8a8_blockscale_op, self.input_quant_op = \ + self._dispatch_w8a8_blockscale_op( + cutlass_block_fp8_supported, use_aiter_and_is_supported) + self.deepgemm_input_quant_op = (QuantFP8( + False, + self.act_quant_group_shape, + column_major_scales=True, + use_ue8m0=is_deep_gemm_e8m0_used()) if self.is_deep_gemm_supported + else None) + + def apply( + self, + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + assert input_scale is None + # View input as 2D matrix for fp8 methods input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[0]] + output_dtype = input.dtype - q_input, x_scale = per_token_group_quant_fp8( - input_2d, - block_size[1], - column_major_scales=True, - ) + if should_use_deepgemm_for_fp8_linear(output_dtype, weight, + self.is_deep_gemm_supported): + output = self._run_deepgemm(input, weight, weight_scale) + if bias is not None: + output = output + bias + return output.to(dtype=input.dtype).view(*output_shape) + output = self.w8a8_blockscale_op(input_2d, weight, weight_scale) + if bias is not None: + output = output + bias + return output.to(dtype=input.dtype).view(*output_shape) + + def _run_deepgemm( + self, + input_2d: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + ) -> torch.Tensor: # ensure DeepGEMM-backed custom op is registered before use import vllm.model_executor.layers.quantization.deepgemm # noqa: F401 - output = torch.ops.vllm.w8a8_block_fp8_matmul_deepgemm( + assert self.deepgemm_input_quant_op is not None + q_input, x_scale = self.deepgemm_input_quant_op(input_2d) + return torch.ops.vllm.w8a8_deepgemm_block_scaled_mm( q_input, weight, x_scale, weight_scale, - block_size, - output_dtype=output_dtype) - if bias is not None: - output += bias - return output.to(dtype=output_dtype).view(*output_shape) + self.weight_group_shape, + output_dtype=input_2d.dtype) - w8a8_blockscale_func = dispatch_w8a8_blockscale_func( - cutlass_block_fp8_supported, use_aiter_and_is_supported) - if cutlass_block_fp8_supported: - num_pad = 0 - if current_platform.is_device_capability(90): - # pad first dimension to be divisible by 4 due to - # cutlass blockwise gemm limitation for hopper - num_pad = 4 - (input_2d.shape[0] % 4) - if num_pad > 0: - input_2d = torch.nn.functional.pad(input_2d, - (0, 0, 0, num_pad), - "constant", 0) - q_input, x_scale = per_token_group_quant_fp8(input_2d, - block_size[1], - column_major_scales=True) - output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale, - block_size, input.dtype) - if num_pad > 0: - output = output[:-num_pad] - else: - if use_aiter_and_is_supported: - q_input, x_scale = aiter_per1x128_quant( - input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8) + def _run_cutlass( + self, + input_2d: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + ) -> torch.Tensor: + assert self.input_quant_op is not None + if self.is_hopper: + # We pad unconditionally (even if shape is already divisible by 4) + # to support dynamic shape for input_2d.shape[0] in torch.compile + x = torch.nn.functional.pad(input_2d, + (0, 0, 0, -input_2d.shape[0] % 4)) else: - q_input, x_scale = per_token_group_quant_fp8( - input_2d, block_size[1], 
column_major_scales=False) + x = input_2d - output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale, - block_size, input.dtype) + q_input, x_scale = self.input_quant_op(x) + output = cutlass_scaled_mm(q_input, weight, x_scale, weight_scale, + list(self.weight_group_shape), + input_2d.dtype, self.is_hopper) + output = output[0:input_2d.shape[0], ...] + return output - if bias is not None: - output = output + bias - return output.to(dtype=input.dtype).view(*output_shape) + def _run_aiter( + self, + input_2d: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + ) -> torch.Tensor: + assert self.act_quant_group_shape == GroupShape(1, 128) + q_input, x_scale = aiter_per1x128_quant( + input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8) + return torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale( + q_input, weight, x_scale, weight_scale, self.weight_group_shape, + input_2d.dtype) + def _run_triton( + self, + input_2d: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + ) -> torch.Tensor: + assert self.input_quant_op is not None + q_input, x_scale = self.input_quant_op(input_2d) + return torch.ops.vllm.w8a8_triton_block_scaled_mm_func( + q_input, weight, x_scale, weight_scale, self.weight_group_shape, + input_2d.dtype) -def apply_w8a8_block_fp8_linear_fake( - input: torch.Tensor, - weight: torch.Tensor, - block_size: list[int], - weight_scale: torch.Tensor, - input_scale: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None, - cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, - use_aiter_and_is_supported: bool = False, -) -> torch.Tensor: - output_shape = [*input.shape[:-1], weight.shape[0]] - return torch.empty(output_shape, dtype=input.dtype, device=input.device) - - -if not current_platform.is_cpu(): - direct_register_custom_op( - op_name="apply_w8a8_block_fp8_linear", - op_func=apply_w8a8_block_fp8_linear, - mutates_args=[], - fake_impl=apply_w8a8_block_fp8_linear_fake, - ) + def _dispatch_w8a8_blockscale_op( + self, + use_cutlass: bool, + use_aiter_and_is_supported: bool, + ) -> tuple[Callable[[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + ], torch.Tensor], Optional[QuantFP8]]: + if use_cutlass: + return self._run_cutlass, (QuantFP8(False, + self.act_quant_group_shape, + column_major_scales=True, + use_ue8m0=False)) + if use_aiter_and_is_supported: + return self._run_aiter, None + return self._run_triton, (QuantFP8(False, + self.act_quant_group_shape, + column_major_scales=False, + use_ue8m0=False)) def input_to_float8( @@ -465,7 +538,7 @@ def per_token_group_quant_fp8( @triton.jit -def _w8a8_block_fp8_matmul( +def _w8a8_triton_block_scaled_mm( # Pointers to inputs and output A, B, @@ -590,7 +663,7 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, return None -def w8a8_block_fp8_matmul( +def w8a8_triton_block_scaled_mm( A: torch.Tensor, B: torch.Tensor, As: torch.Tensor, @@ -650,7 +723,7 @@ def w8a8_block_fp8_matmul( return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), ) - _w8a8_block_fp8_matmul[grid]( + _w8a8_triton_block_scaled_mm[grid]( A, B, C, @@ -997,25 +1070,6 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module, layer.weight_scale.data.T.contiguous(), requires_grad=False) -def apply_fp8_block_linear(layer: torch.nn.Module, input: torch.Tensor, - bias: Optional[torch.Tensor], - cutlass_block_fp8_supported: bool, - use_aiter_and_is_supported: bool) -> torch.Tensor: - """Apply block-wise FP8 linear operation.""" - assert layer.weight_block_size 
is not None - - return torch.ops.vllm.apply_w8a8_block_fp8_linear( - input=input, - weight=layer.weight, - block_size=layer.weight_block_size, - weight_scale=layer.weight_scale, - input_scale=layer.input_scale, - bias=bias, - cutlass_block_fp8_supported=cutlass_block_fp8_supported, - use_aiter_and_is_supported=use_aiter_and_is_supported, - ) - - def expert_weight_is_col_major(x: torch.Tensor) -> bool: assert x.dim() == 3 b, m, n = x.shape diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 4083193d7650e..2f533ca0639fc 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -9,7 +9,7 @@ from __future__ import annotations import functools import importlib import os -from typing import Any, Callable, NoReturn +from typing import Any, Callable, NoReturn, Optional import torch @@ -172,9 +172,13 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor): return 1 - sim -def should_use_deepgemm_for_fp8_linear(output_dtype: torch.dtype, - weight: torch.Tensor): - return (is_deep_gemm_supported() and output_dtype == torch.bfloat16 +def should_use_deepgemm_for_fp8_linear( + output_dtype: torch.dtype, + weight: torch.Tensor, + supports_deep_gemm: Optional[bool] = None): + if supports_deep_gemm is None: + supports_deep_gemm = is_deep_gemm_supported() + return (supports_deep_gemm and output_dtype == torch.bfloat16 and weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0) From 24fab45d96a91b491db338ee02cd24e55b7fbb5f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Sep 2025 15:29:26 -0400 Subject: [PATCH 154/185] [Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444) Signed-off-by: mgoin --- vllm/config/__init__.py | 9 ++++++++- vllm/config/compilation.py | 7 +++---- vllm/v1/worker/gpu_model_runner.py | 23 +++++++++++++++++++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 50e8cad236176..d786d3e289b33 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -509,8 +509,15 @@ class VllmConfig: if self.compilation_config.cudagraph_mode is None: if envs.VLLM_USE_V1 and self.compilation_config.level \ == CompilationLevel.PIECEWISE: + # default to full and piecewise for most models self.compilation_config.cudagraph_mode = \ - CUDAGraphMode.PIECEWISE + CUDAGraphMode.FULL_AND_PIECEWISE + + # pooling model does not support full cudagraphs + if self.model_config is not None and \ + self.model_config.pooler_config is not None: + self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE else: self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 34fa7fcfe7e87..0441745e8b36e 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -228,15 +228,14 @@ class CompilationConfig: The mode of the cudagraph: - NONE, no cudagraph capture. - - PIECEWISE. (v1 default) + - PIECEWISE. - FULL. - FULL_DECODE_ONLY. - - FULL_AND_PIECEWISE. + - FULL_AND_PIECEWISE. (v1 default) PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph incompatible ops (i.e. some attention ops) outside the cudagraph for general flexibility. - This is the default mode. FULL mode: Capture full cudagraph for all batches. Can be good for small models or workloads with small prompts; not supported by many backends. 
@@ -249,7 +248,7 @@ class CompilationConfig: FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and piecewise cudagraph for prefill and mixed prefill-decode batches. - This is like the most performant mode for most models. + This is the most performant mode for most models and is the default. Currently, the cudagraph mode is only used for the v1 engine. Note that the cudagraph logic is generally orthogonal to the diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ed324138c6fe1..df4f769585864 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2947,8 +2947,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # TODO(luka) better system for describing dummy batches seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] else: - # Make sure max_model_len is used at the graph capture time. - seq_lens = self.max_model_len + seq_lens = max_query_len self.seq_lens.np[:num_reqs] = seq_lens self.seq_lens.np[num_reqs:] = 0 self.seq_lens.copy_to_gpu() @@ -3541,6 +3540,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): CUDAGraphMode.FULL_DECODE_ONLY logger.warning(msg) + # check that if we are doing decode full-cudagraphs it is supported + if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL + and min_cg_support == AttentionCGSupport.NEVER): + msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported " + f"with {min_cg_builder_name} backend (support: " + f"{min_cg_support})") + if (self.compilation_config.level == CompilationLevel.PIECEWISE and + (self.compilation_config.splitting_ops_contain_attention() + or self.compilation_config.use_inductor_graph_partition)): + msg += "; setting cudagraph_mode=PIECEWISE because "\ + "attention is compiled piecewise" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + msg += "; setting cudagraph_mode=NONE because "\ + "attention is not compiled piecewise" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.NONE + logger.warning(msg) + # check that if we are doing spec-decode + decode full-cudagraphs it is # supported if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL From d5944d5146d9a6847badc7018b328de079cc7e80 Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Tue, 23 Sep 2025 12:44:35 -0700 Subject: [PATCH 155/185] [Speculators][Speculative Decoding] Fix gpt-oss eagle3 accuracy issue (#25406) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> --- tests/v1/spec_decode/test_eagle.py | 4 ++ vllm/config/model.py | 1 + vllm/model_executor/models/llama_eagle.py | 5 ++ vllm/model_executor/models/llama_eagle3.py | 5 ++ vllm/v1/spec_decode/eagle.py | 70 +++++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 11 +++- 6 files changed, 79 insertions(+), 17 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 23bfabfcf89b1..5096f9fd647bd 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -534,6 +534,8 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): proposer.runner.attn_groups.append([mock.MagicMock()]) proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \ attn_metadata_builder + proposer._get_attention_metadata_builder = mock.MagicMock( + return_value=attn_metadata_builder) result = proposer.propose(target_token_ids=target_token_ids, 
target_positions=target_positions, @@ -660,6 +662,8 @@ def test_propose_tree(spec_token_tree): proposer.runner.attn_groups.append([mock.MagicMock()]) proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \ attn_metadata_builder + proposer._get_attention_metadata_builder = mock.MagicMock( + return_value=attn_metadata_builder) # Setup inputs for the proposer. target_token_ids = torch.randint(0, diff --git a/vllm/config/model.py b/vllm/config/model.py index 33e5d3ea04a48..d8a8fe20fd030 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1003,6 +1003,7 @@ class ModelConfig: self.quantization = quantization_override break + quant_method = quant_method if quant_method != "" else None # Verify quantization configurations. if self.quantization is None: self.quantization = quant_method diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index dfae3c3ea5437..2ff2d54a83aa8 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -134,6 +134,11 @@ class EagleLlamaForCausalLM(LlamaForCausalLM): nn.Module.__init__(self) self.config = vllm_config. \ speculative_config.draft_model_config.hf_config + # Ensure draft_vocab_size is set + # default to the base vocab size when absent + if getattr(self.config, "draft_vocab_size", None) is None: + base_vocab_size = getattr(self.config, "vocab_size", None) + self.config.draft_vocab_size = base_vocab_size target_layer_num = vllm_config.model_config.get_num_layers( vllm_config.parallel_config) self.model = LlamaModel(vllm_config=vllm_config, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index fb10af6c53c90..b99a1547918ee 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -203,6 +203,11 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): nn.Module.__init__(self) self.config = vllm_config. \ speculative_config.draft_model_config.hf_config + # Ensure draft_vocab_size is set + # default to the base vocab size when absent + if getattr(self.config, "draft_vocab_size", None) is None: + base_vocab_size = getattr(self.config, "vocab_size", None) + self.config.draft_vocab_size = base_vocab_size target_layer_num = vllm_config.model_config.get_num_layers( vllm_config.parallel_config) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index a0f40828d42f5..a9e0a38fe3417 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -9,6 +9,7 @@ import numpy as np import torch import torch.nn as nn +from vllm.attention.backends.abstract import AttentionMetadataBuilder from vllm.attention.layer import Attention from vllm.config import (CompilationLevel, VllmConfig, get_layers_from_vllm_config) @@ -77,6 +78,8 @@ class EagleProposer: self.is_multimodal_model = vllm_config.model_config \ .is_multimodal_model + self.attn_metadata_builder: Optional[AttentionMetadataBuilder] = None + self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not self.vllm_config.model_config.enforce_eager) @@ -117,7 +120,7 @@ class EagleProposer: with_numpy=True) # Determine allowed attention backends once during initialization. - self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...] + self.allowed_attn_types: tuple[type, ...] 
if current_platform.is_rocm(): rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend @@ -190,10 +193,12 @@ class EagleProposer: assert self.runner is not None - # FIXME: need to consider multiple kv_cache_groups - attn_metadata_builder = \ - self.runner.attn_groups[0][0].get_metadata_builder() - attn_metadata = attn_metadata_builder.build_for_drafting( + # Select the correct attention metadata builders for EAGLE layers. + # Get the attention metadata builders once and reuse for later. + builder = (self._get_attention_metadata_builder() + if self.attn_metadata_builder is None else + self.attn_metadata_builder) + attn_metadata = builder.build_for_drafting( common_attn_metadata=common_attn_metadata, draft_index=0) # At this moment, we assume all eagle layers belong to the same KV @@ -327,11 +332,9 @@ class EagleProposer: exceeds_max_model_len, PADDING_SLOT_ID) # Rebuild attention metadata - attn_metadata_builder = \ - self.runner.attn_groups[0][0].get_metadata_builder() - attn_metadata = attn_metadata_builder\ - .build_for_drafting(common_attn_metadata=common_attn_metadata, - draft_index=token_index + 1) + attn_metadata = builder.build_for_drafting( + common_attn_metadata=common_attn_metadata, + draft_index=token_index + 1) for layer_name in self.attn_layer_names: per_layer_attn_metadata[layer_name] = attn_metadata @@ -851,10 +854,24 @@ class EagleProposer: # share lm_head with the target model if needed # some model definition do not define lm_head explicitly # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM - if self.vllm_config.speculative_config.method != "eagle3" and \ - hasattr(target_language_model, "lm_head"): - logger.info("Loading EAGLE LM head weights from the target model.") - self.model.lm_head = target_language_model.lm_head + if self.vllm_config.speculative_config.method != "eagle3": + if hasattr(target_language_model, "lm_head"): + logger.info( + "Loading EAGLE LM head weights from the target model.") + self.model.lm_head = target_language_model.lm_head + else: + if (hasattr(self.model, "lm_head") + and hasattr(target_language_model, "lm_head") + and self.model.lm_head.weight.shape + == target_language_model.lm_head.weight.shape): + logger.info("Assuming the EAGLE head shares the same lm_head" + " with the target model.") + del self.model.lm_head + self.model.lm_head = target_language_model.lm_head + else: + logger.info( + "The EAGLE head's lm_head will be loaded separately" + " from the target model.") @torch.inference_mode() def dummy_run( @@ -877,6 +894,31 @@ class EagleProposer: inputs_embeds=inputs_embeds, ) + def _get_attention_metadata_builder( + self) -> list[AttentionMetadataBuilder]: + """Find and return the attention metadata builders for EAGLE layers. + + Returns: + The metadata builders for EAGLE layers. + + Raises: + AssertionError: If no metadata builders are found for EAGLE layers. 
+ """ + builder = None + chosen_layer = self.attn_layer_names[0] + + for kv_cache_group in self.runner.attn_groups: + for attn_group in kv_cache_group: + if chosen_layer in attn_group.layer_names: + builder = attn_group.get_metadata_builder() + break + if builder is not None: + break + + assert builder is not None, ( + "Failed to find attention metadata builder for EAGLE layers.") + return builder + def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None: """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index df4f769585864..d0fe5e25d6abe 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1177,9 +1177,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): encoder_seq_lens=encoder_seq_lens, ) - if self.speculative_config and \ - spec_decode_common_attn_metadata is None: - spec_decode_common_attn_metadata = common_attn_metadata + if (self.speculative_config + and spec_decode_common_attn_metadata is None): + if isinstance(self.drafter, EagleProposer): + if (self.drafter.attn_layer_names[0] + in kv_cache_group_spec.layer_names): + spec_decode_common_attn_metadata = common_attn_metadata + else: + spec_decode_common_attn_metadata = common_attn_metadata for attn_group in self.attn_groups[kv_cache_group_id]: # Prepare for cascade attention if enabled & beneficial. From a8ffc4f0f2d02aa4e505dcc3c974d5ec6e00737c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Sep 2025 15:49:55 -0400 Subject: [PATCH 156/185] [Bugfix] Lower gpt-oss max cudagraph size to 992 to be compatible with FA3 (#25508) Signed-off-by: mgoin --- vllm/model_executor/models/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index ce3d23763ed64..aa7bcf5b65ada 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -266,24 +266,24 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): if structured_outputs_config.reasoning_parser == "": structured_outputs_config.reasoning_parser = "openai_gptoss" - # Increase the max capture size from 512 to 1024 for performance. + # Increase the max capture size from 512 to 992 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs - # from 67 to 83. + # from 67 to 81. scheduler_config = vllm_config.scheduler_config if len(scheduler_config.cuda_graph_sizes) == 1: max_capture_size = scheduler_config.cuda_graph_sizes[0] # FIXME(woosuk): When using full cuda graph with FA3, the max # supported size is 992. 
- if max_capture_size < 1024: + if max_capture_size < 992: cuda_graph_sizes = [1, 2, 4] # Step size 8 for small batch sizes cuda_graph_sizes += [i for i in range(8, 256, 8)] # Step size 16 for larger batch sizes - cuda_graph_sizes += [i for i in range(256, 1025, 16)] + cuda_graph_sizes += [i for i in range(256, 993, 16)] scheduler_config.cuda_graph_sizes = cuda_graph_sizes logger.info( "Overriding max cuda graph capture size to " - "%d for performance.", 1024) + "%d for performance.", 992) class MambaModelConfig(VerifyAndUpdateConfig): From 8bdd8b5c5155114f22af6d99f97176a15b6f8bf5 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 23 Sep 2025 21:53:00 +0200 Subject: [PATCH 157/185] Enable symmetric memory all reduce by default only enabling for TP (#25070) Signed-off-by: ilmarkov Co-authored-by: Michael Goin --- .buildkite/test-pipeline.yaml | 2 + tests/distributed/test_symm_mem_allreduce.py | 59 +++++++++++++++---- .../device_communicators/cuda_communicator.py | 7 ++- vllm/envs.py | 4 +- 4 files changed, 56 insertions(+), 16 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index cf32087ed3b98..aef6d709722fa 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -164,6 +164,7 @@ steps: - tests/v1/test_internal_lb_dp.py - tests/v1/test_hybrid_lb_dp.py - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py commands: # test with torchrun tp=2 and external_dp=2 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -188,6 +189,7 @@ steps: - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - pushd ../examples/offline_inference diff --git a/tests/distributed/test_symm_mem_allreduce.py b/tests/distributed/test_symm_mem_allreduce.py index 5a804a389123b..83e1fe47aeec0 100644 --- a/tests/distributed/test_symm_mem_allreduce.py +++ b/tests/distributed/test_symm_mem_allreduce.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import queue import random import typing @@ -10,26 +11,31 @@ import torch.distributed as dist import torch.multiprocessing as mp import vllm.envs as envs +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed.communication_op import tensor_model_parallel_all_reduce from vllm.distributed.device_communicators.cuda_communicator import ( CudaCommunicator) -from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_group, +from vllm.distributed.parallel_state import (get_tp_group, init_distributed_environment, initialize_model_parallel) +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.llm_engine import LLMEngine from vllm.platforms import current_platform from vllm.utils import update_environment_variables torch.manual_seed(42) random.seed(44) -test_size_elements = 4 * 1024 * 1024 +test_size_elements = 1024 * 1024 -def symm_mem_allreduce_worker(local_rank: int, world_size: int): +def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue): monkeypatch = pytest.MonkeyPatch() - with monkeypatch.context() as m: + config = VllmConfig(parallel_config=ParallelConfig( + 
tensor_parallel_size=world_size)) + + with monkeypatch.context() as m, set_current_vllm_config(config): m.delenv("CUDA_VISIBLE_DEVICES", raising=False) dtype = torch.bfloat16 device = torch.device(f"cuda:{local_rank}") @@ -51,22 +57,26 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int): get_tp_group().device_communicator) symm_mem_comm = cuda_communicator.symm_mem_comm if symm_mem_comm is None or symm_mem_comm.disabled: - pytest.skip("SymmMemCommunicator is not available or disabled.") + # can't use skip under multiprocessing + q.put("SymmMemCommunicator is not available or disabled.") + return inp_direct_symm_mem = torch.randint(1, 23, (test_size_elements, ), dtype=dtype, device=device) if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem): - pytest.skip( + # can't use skip under multiprocessing + q.put( "SymmMemCommunicator isn't used for this world and input size." ) + return original_inp_direct_symm_mem = inp_direct_symm_mem.clone() out_direct_symm_mem = symm_mem_comm.all_reduce(inp_direct_symm_mem) assert out_direct_symm_mem is not None - group = get_tensor_model_parallel_group().device_group + group = get_tp_group().device_group dist.all_reduce(original_inp_direct_symm_mem, group=group) torch.testing.assert_close(out_direct_symm_mem, original_inp_direct_symm_mem, @@ -100,9 +110,34 @@ def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size, world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") + q = mp.get_context('spawn').Queue() + mp.spawn(symm_mem_allreduce_worker, + args=(world_size, q), + nprocs=world_size) + try: + val = q.get(timeout=1) + except queue.Empty: + val = None + finally: + cleanup_dist_env_and_memory() + if val is not None: + pytest.skip(val) - # Enable SymmMemCommunicator - monkeypatch.setenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1") - mp.spawn(symm_mem_allreduce_worker, args=(world_size, ), nprocs=world_size) - cleanup_dist_env_and_memory() +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="SymmMemAllreduce is only available for CUDA platforms.") +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], + reason="Only test on CUDA") +def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch): + world_size = 4 + if world_size > torch.cuda.device_count(): + pytest.skip("Not enough GPUs to run the test.") + # Verify that the DataParallel runs without error + engine_args = EngineArgs(model="distilbert/distilgpt2", + enforce_eager=True, + enable_prefix_caching=True, + data_parallel_size=2, + tensor_parallel_size=2, + data_parallel_backend="mp") + LLMEngine.from_engine_args(engine_args) diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 6c25bf3cd95c3..b20e79f577c35 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -30,18 +30,21 @@ class CudaCommunicator(DeviceCommunicatorBase): unique_name: str = ""): super().__init__(cpu_group, device, device_group, unique_name) if "tp" not in unique_name: - # only tp uses custom allreduce + # custom allreduce or torch symm mem can be used only by tp use_custom_allreduce = False + use_torch_symm_mem = False else: from vllm.distributed.parallel_state import ( _ENABLE_CUSTOM_ALL_REDUCE) use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE + use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM # ep does not use pynccl 
use_pynccl = "ep" not in unique_name self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce + self.use_torch_symm_mem = use_torch_symm_mem # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -65,7 +68,7 @@ class CudaCommunicator(DeviceCommunicatorBase): self.ca_comm: Optional[CustomAllreduce] = None self.qr_comm: Optional[QuickAllReduce] = None self.symm_mem_comm: Optional[SymmMemCommunicator] = None - if envs.VLLM_ALLREDUCE_USE_SYMM_MEM and current_platform.is_cuda(): + if use_torch_symm_mem and current_platform.is_cuda(): self.symm_mem_comm = SymmMemCommunicator( group=self.cpu_group, device=self.device, diff --git a/vllm/envs.py b/vllm/envs.py index fa6f14d6b0371..50d58c5468f97 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -182,7 +182,7 @@ if TYPE_CHECKING: VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False - VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False + VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False @@ -1370,7 +1370,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to use pytorch symmetric memory for allreduce "VLLM_ALLREDUCE_USE_SYMM_MEM": - lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))), + lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))), # Allows vllm to find tuned config under customized folder "VLLM_TUNED_CONFIG_FOLDER": From 8b8a8afc895ab200cf9aa27ea47c5fc6dbef443b Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 23 Sep 2025 16:09:37 -0400 Subject: [PATCH 158/185] [CI] Fix Pre-commit Issue (#25497) Signed-off-by: yewentao256 --- vllm/v1/worker/gpu_model_runner.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0fe5e25d6abe..f785824958147 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2367,7 +2367,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, - aux_hidden_states: Optional[torch.Tensor], + aux_hidden_states: Optional[list[torch.Tensor]], spec_decode_metadata: Optional[SpecDecodeMetadata], common_attn_metadata: CommonAttentionMetadata, ) -> Union[list[list[int]], torch.Tensor]: @@ -2387,6 +2387,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: indices = [] offset = 0 + assert spec_decode_metadata is not None for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids): @@ -2437,6 +2438,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # TODO(woosuk): Support M-RoPE. target_positions = self.positions.gpu[:num_scheduled_tokens] if self.use_aux_hidden_state_outputs: + assert aux_hidden_states is not None target_hidden_states = torch.cat( [h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1) @@ -2462,6 +2464,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # TODO(woosuk): Support M-RoPE. 
target_positions = self.positions.gpu[token_indices] if self.use_aux_hidden_state_outputs: + assert aux_hidden_states is not None target_hidden_states = torch.cat( [h[token_indices] for h in aux_hidden_states], dim=-1) else: @@ -2897,7 +2900,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert not create_mixed_batch num_reqs = cdiv(num_tokens, max_query_len) assert num_reqs <= max_num_reqs, \ - "Do not capture num_reqs > max_num_reqs for uniform batch" + f"Do not capture num_reqs {num_reqs} > max_num_reqs " \ + f"{max_num_reqs} for uniform batch. Num tokens: " \ + f"{num_tokens}, max_query_len: {max_query_len}" num_scheduled_tokens_list = [max_query_len] * num_reqs if num_tokens % max_query_len != 0: num_scheduled_tokens_list[-1] = num_tokens % max_query_len From c828d1bf987bbfe412c1fcd46736b1598d647053 Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Tue, 23 Sep 2025 16:43:45 -0400 Subject: [PATCH 159/185] [Bugfix] gpt-oss container tool output bug (#25485) Signed-off-by: Alec Solder Co-authored-by: Alec Solder --- vllm/entrypoints/harmony_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 57e4bb1e1da52..0c1c9c3192fc0 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -317,7 +317,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: ) output_items.append(response_item) elif recipient is not None and (recipient.startswith("python") - or recipient.startswith("browser")): + or recipient.startswith("browser") + or recipient.startswith("container")): for content in message.content: reasoning_item = ResponseReasoningItem( id=f"rs_{random_uuid()}", From 08275ec0a2ba48fbc1054bdbdda2f1e0dfcb20b3 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 24 Sep 2025 05:25:46 +0800 Subject: [PATCH 160/185] [Build] Update Xgrammar to 0.1.25 (#25467) Signed-off-by: chaunceyjiang --- requirements/common.txt | 2 +- vllm/v1/structured_output/backend_xgrammar.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 7973da080c37d..a52745f698703 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -24,7 +24,7 @@ outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.24; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" +xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 55b4792fe010d..a853e6540719e 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -108,7 +108,9 @@ class XgrammarBackend(StructuredOutputBackend): end=s["end"], ) for s in s_tag["structures"] ] - ctx = self.compiler.compile_structural_tag(tags, s_tag["triggers"]) + structural_tag = xgr.StructuralTag.from_legacy_structural_tag( + tags, s_tag["triggers"]) + ctx = self.compiler.compile_structural_tag(structural_tag) else: logger.error( "Validation should have already occurred. Please file an issue." 
@@ -318,6 +320,8 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: end=s["end"], ) for s in s_tag["structures"] ] - xgr.Grammar.from_structural_tag(tags, s_tag["triggers"]) + structural_tag = xgr.StructuralTag.from_legacy_structural_tag( + tags, s_tag["triggers"]) + xgr.Grammar.from_structural_tag(structural_tag) except Exception as e: raise ValueError("Invalid structural tag specification.") from e From 690f948e4a440baa85a7327c7e24f297d25b9bb2 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:31:08 -0400 Subject: [PATCH 161/185] [Bugfix] Fix for the import error from #24588 (#25481) Signed-off-by: Gregory Shtrasberg --- .../layers/fused_moe/gpt_oss_triton_kernels_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 0e84a9241e905..18de758519346 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -23,7 +23,7 @@ if has_triton_kernels(): from triton_kernels.routing import (RoutingData, routing, routing_from_bitmatrix) from triton_kernels.tensor import Bitmatrix - except (ModuleNotFoundError, AttributeError) as e: + except (AttributeError, ImportError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " "version is compatible. Error: %s", e) From ae002924e96bd17cfc690c266623c340ff28a70f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 24 Sep 2025 05:58:25 +0800 Subject: [PATCH 162/185] [CI/Build] Fix and re-enable v1 PP test on CI (#25496) Signed-off-by: Isotr0py --- tests/distributed/test_pipeline_parallel.py | 3 --- vllm/model_executor/models/granite.py | 7 ------- vllm/model_executor/models/granitemoe.py | 7 ------- vllm/model_executor/models/granitemoeshared.py | 7 ------- 4 files changed, 24 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 073b362b64749..aa28ed9ce25e5 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -382,7 +382,6 @@ def test_tp_language_generation( test_options: PPTestOptions, num_gpus_available, ): - pytest.skip("Skipping the test until V1 passes it.") _compare_tp(model_id, parallel_setup, distributed_backend, @@ -410,7 +409,6 @@ def test_tp_language_embedding( test_options: PPTestOptions, num_gpus_available, ): - pytest.skip("Skipping the test until V1 passes it.") _compare_tp(model_id, parallel_setup, distributed_backend, @@ -438,7 +436,6 @@ def test_tp_multimodal_generation( test_options: PPTestOptions, num_gpus_available, ): - pytest.skip("Skipping the test until V1 passes it.") _compare_tp(model_id, parallel_setup, distributed_backend, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 795b38e724eab..2c619396e6c0c 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -308,13 +308,11 @@ class GraniteModel(nn.Module): hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) - residual = None hidden_states *= self.config.embedding_multiplier else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] for layer in islice(self.layers, 
self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) @@ -322,7 +320,6 @@ class GraniteModel(nn.Module): if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, - "residual": residual }) hidden_states = self.norm(hidden_states) @@ -475,10 +472,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): torch.zeros((batch_size, self.config.hidden_size), dtype=dtype, device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), }) def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 07200fef4799d..47ac22c4aeaa5 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -298,17 +298,14 @@ class GraniteMoeModel(nn.Module): else: hidden_states = self.get_input_embeddings(input_ids) hidden_states *= self.embedding_multiplier - residual = None else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, - "residual": residual }) hidden_states = self.norm(hidden_states) return hidden_states @@ -523,10 +520,6 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): torch.zeros((batch_size, self.config.hidden_size), dtype=dtype, device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), }) def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index a5d118f084e6c..b434822bff0a9 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -195,17 +195,14 @@ class GraniteMoeSharedModel(nn.Module): else: hidden_states = self.get_input_embeddings(input_ids) hidden_states *= self.embedding_multiplier - residual = None else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, - "residual": residual }) hidden_states = self.norm(hidden_states) return hidden_states @@ -323,10 +320,6 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP): torch.zeros((batch_size, self.config.hidden_size), dtype=dtype, device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), }) def load_weights(self, weights: Iterable[tuple[str, From 4f8c4b890ac1ca57b2f80278641fbe3f9a6abfba Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 23 Sep 2025 15:11:14 -0700 Subject: [PATCH 163/185] [Core] Use KVCacheBlock as much as possible instead of dict[block_id, KVCacheBlock] (#24830) Signed-off-by: Jialin Ouyang --- tests/v1/core/test_prefix_caching.py | 178 +++++++++++++----- .../core/test_single_type_kv_cache_manager.py | 18 +- vllm/v1/core/block_pool.py | 138 +++++++++++--- 3 files changed, 247 insertions(+), 87 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py 
b/tests/v1/core/test_prefix_caching.py index 3cf9d93696767..37b4f9a08e40d 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -14,10 +14,11 @@ from vllm.multimodal.inputs import (MultiModalFeatureSpec, MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams from vllm.utils import sha256, sha256_cbor -from vllm.v1.core.block_pool import BlockPool +from vllm.v1.core.block_pool import BlockHashToBlockMap, BlockPool from vllm.v1.core.kv_cache_manager import KVCacheManager, Request -from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, - get_block_hash, get_group_id, +from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, + KVCacheBlock, get_block_hash, + get_group_id, get_request_block_hasher, hash_block_tokens, init_none_hash, make_block_hash_with_group_id) @@ -138,7 +139,7 @@ def test_prefill(hash_fn): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([1, 2, 3, 4], ) + assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], ) # Check full block metadata parent_block_hash = None @@ -171,7 +172,7 @@ def test_prefill(hash_fn): blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([5], ) + assert blocks is not None and blocks.get_block_ids() == ([5], ) for block in computed_blocks.blocks[0]: assert block.ref_cnt == 2 @@ -207,7 +208,7 @@ def test_prefill(hash_fn): blocks = manager.allocate_slots(req2, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([6], ) + assert blocks is not None and blocks.get_block_ids() == ([6], ) # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. @@ -227,7 +228,9 @@ def test_prefill(hash_fn): len(computed_blocks.blocks[0]) * 16, computed_blocks) # This block ID order also checks the eviction order. 
- assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], ) + assert blocks is not None and blocks.get_block_ids() == ([ + 7, 8, 9, 10, 4, 5, 6, 3, 2, 1 + ], ) assert free_block_queue.num_free_blocks == 0 assert (free_block_queue.fake_free_list_head.next_free_block @@ -261,8 +264,9 @@ def test_prefill_hybrid_model(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7, - 8], [9, 10, 11, 12]) + assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], [ + 5, 6, 7, 8 + ], [9, 10, 11, 12]) # Check full block metadata parent_block_hash = None @@ -298,7 +302,7 @@ def test_prefill_hybrid_model(): blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([13], [14], [15]) + assert blocks is not None and blocks.get_block_ids() == ([13], [14], [15]) for block_per_group in computed_blocks.blocks: for block in block_per_group: if block != manager.block_pool.null_block: @@ -309,14 +313,15 @@ def test_prefill_hybrid_model(): manager.free(req1) cached_block_hash_to_block_bak = copy.copy( - manager.block_pool.cached_block_hash_to_block) + manager.block_pool.cached_block_hash_to_block._cache) - def test_partial_request_hit(request_id: str, hash_to_evict: list[bytes], + def test_partial_request_hit(request_id: str, + hash_to_evict: list[BlockHashWithGroupId], expect_hit_length: int): req = make_request(request_id, common_token_ids + unique_token_ids, block_size, sha256) for hash_with_group_id in hash_to_evict: - manager.block_pool.cached_block_hash_to_block.pop( + manager.block_pool.cached_block_hash_to_block._cache.pop( hash_with_group_id) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert len(req.block_hashes) == 3 @@ -324,7 +329,7 @@ def test_prefill_hybrid_model(): for block_per_group in computed_blocks.blocks: assert len(block_per_group) == num_computed_tokens // block_size for hash_with_group_id in hash_to_evict: - manager.block_pool.cached_block_hash_to_block[ + manager.block_pool.cached_block_hash_to_block._cache[ hash_with_group_id] = cached_block_hash_to_block_bak[ hash_with_group_id] manager.free(req) @@ -362,7 +367,8 @@ def test_prefill_hybrid_model(): # total cache miss. # The cache hit length of full attention is 1 * block_size. # The cache hit length of sliding window is 2 * block_size. - # Then it is cache miss as the two type of layers have different hit length. + # Then it is cache miss as the two type of layers + # have different hit length. 
test_partial_request_hit("8", [ make_block_hash_with_group_id(block_hashes[2], 0), make_block_hash_with_group_id(block_hashes[0], 1), @@ -406,7 +412,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([1, 2, 3, 4], ) + assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], ) req0_block_hashes = [b.block_hash for b in blocks.blocks[0]] # Check full block metadata @@ -441,7 +447,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([5], ) + assert blocks is not None and blocks.get_block_ids() == ([5], ) for block in computed_blocks.blocks[0]: assert block.ref_cnt == 2 @@ -478,6 +484,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req2, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) + assert blocks is not None block_ids = blocks.get_block_ids() # Duplicate cached blocks have different ids but same hashes vs request #0 assert [b.block_hash for b in blocks.blocks[0]] == req0_block_hashes @@ -513,7 +520,7 @@ def test_decode(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([1, 2, 3, 4], ) + assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], ) # Append slots without allocating a new block. req0.num_computed_tokens = 55 @@ -558,7 +565,8 @@ def test_evict(): blocks = manager.allocate_slots(req0, 5 * 16 + 7, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 6 # 5 full + 1 partial + # 5 full + 1 partial + assert blocks is not None and len(blocks.blocks[0]) == 6 # 3 blocks. req1 = make_request("1", list(range(last_token_id, @@ -570,7 +578,7 @@ def test_evict(): blocks = manager.allocate_slots(req1, 3 * 16, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 3 # 3 full blocks + assert blocks is not None and len(blocks.blocks[0]) == 3 # 3 full blocks last_token_id += 3 * 16 # 10 - (6 + 3) == 1 @@ -592,7 +600,7 @@ def test_evict(): blocks = manager.allocate_slots(req2, 3, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([10], ) + assert blocks is not None and blocks.get_block_ids() == ([10], ) assert manager.block_pool.free_block_queue.num_free_blocks == 7 @@ -617,7 +625,7 @@ def test_hash_block_correct_reuse(): blocks = manager.allocate_slots(req, num_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 1 + assert blocks is not None and len(blocks.blocks[0]) == 1 # Deallocate the block. manager.free(req) @@ -631,7 +639,7 @@ def test_hash_block_correct_reuse(): blocks = manager.allocate_slots(req, num_tokens - 1, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 1 + assert blocks is not None and len(blocks.blocks[0]) == 1 assert manager.block_pool.blocks[blocks.blocks[0] [0].block_id].block_hash is None @@ -658,7 +666,7 @@ def test_computed_blocks_not_evicted(): blocks = manager.allocate_slots(req0, num_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 1 + assert blocks is not None and len(blocks.blocks[0]) == 1 assert blocks.blocks[0][0].block_id == 1 # Allocate another block. 
@@ -670,7 +678,7 @@ def test_computed_blocks_not_evicted(): blocks = manager.allocate_slots(req1, num_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 1 + assert blocks is not None and len(blocks.blocks[0]) == 1 assert blocks.blocks[0][0].block_id == 2 # Free the blocks. @@ -688,7 +696,7 @@ def test_computed_blocks_not_evicted(): blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 1 + assert blocks is not None and len(blocks.blocks[0]) == 1 assert blocks.blocks[0][0].block_id == 2 @@ -712,7 +720,7 @@ def test_basic_prefix_caching_disabled(): blocks = manager.allocate_slots(req1, 10, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 3 + assert blocks is not None and len(blocks.blocks[0]) == 3 # Free the blocks. manager.free(req1) @@ -726,7 +734,7 @@ def test_basic_prefix_caching_disabled(): blocks = manager.allocate_slots(req2, 16, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert len(blocks.blocks[0]) == 4 + assert blocks is not None and len(blocks.blocks[0]) == 4 # New requests should not have any blocks. req3 = make_request("3", list(range(4)), block_size, sha256) @@ -773,7 +781,8 @@ def test_cache_blocks(hash_fn): assert len(block_pool.cached_block_hash_to_block) == 2 assert all([block.block_hash is not None for block in blocks]) - # Test that blocks that don't start from the beginning are cached correctly. + # Test that blocks that don't start from the beginning are cached + # correctly. blocks += [KVCacheBlock(block_id=2)] block_pool.cache_full_blocks( request=req, @@ -1101,7 +1110,7 @@ def test_reset_prefix_cache(): all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids, block_size, sha256) blocks = manager.allocate_slots(req0, 55) - assert blocks.get_block_ids() == ([1, 2, 3, 4], ) + assert blocks is not None and blocks.get_block_ids() == ([1, 2, 3, 4], ) unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids @@ -1112,7 +1121,7 @@ def test_reset_prefix_cache(): blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == ([5], ) + assert blocks is not None and blocks.get_block_ids() == ([5], ) # Failed to reset prefix cache because some blocks are not freed yet. 
assert not manager.reset_prefix_cache() @@ -1168,49 +1177,41 @@ def test_maybe_evict_cached_block(): # Manually add all blocks to cached_blocks for block, block_hash in zip(pool.blocks, block_hashes): block.block_hash = block_hash - pool.cached_block_hash_to_block[block_hash][block.block_id] = block + pool.cached_block_hash_to_block.insert(block_hash, block) block0, block1, block2, block3 = pool.blocks - assert pool.cached_block_hash_to_block == { + assert pool.cached_block_hash_to_block._cache == { block_hash0: { block0.block_id: block0, - block3.block_id: block3 + block3.block_id: block3, }, - block_hash1: { - block1.block_id: block1 - }, - block_hash2: { - block2.block_id: block2 - } + block_hash1: block1, + block_hash2: block2, } # Evict block1 pool._maybe_evict_cached_block(block1) - assert pool.cached_block_hash_to_block == { + assert pool.cached_block_hash_to_block._cache == { block_hash0: { block0.block_id: block0, block3.block_id: block3 }, - block_hash2: { - block2.block_id: block2 - } + block_hash2: block2, } # Evict block0: block_hash0 entry should NOT be removed, as block3 # also use the same hash pool._maybe_evict_cached_block(block0) - assert pool.cached_block_hash_to_block == { + assert pool.cached_block_hash_to_block._cache == { block_hash0: { block3.block_id: block3 }, - block_hash2: { - block2.block_id: block2 - } + block_hash2: block2, } # Evict block2 pool._maybe_evict_cached_block(block2) - assert pool.cached_block_hash_to_block == {block_hash0: {3: block3}} + assert pool.cached_block_hash_to_block._cache == {block_hash0: {3: block3}} # Evict block3 pool._maybe_evict_cached_block(block3) - assert pool.cached_block_hash_to_block == {} + assert pool.cached_block_hash_to_block._cache == {} @pytest.mark.parametrize("blocks_to_cache", [2, 3, 10]) @@ -1374,7 +1375,7 @@ def test_eagle_with_sliding_window(): # Evict the first block in the request assert manager.block_pool.get_cached_block( block_hash_first_block, kv_cache_group_ids=[0]) is not None - manager.block_pool.cached_block_hash_to_block.pop( + manager.block_pool.cached_block_hash_to_block._cache.pop( make_block_hash_with_group_id(block_hash_first_block, 0)) # New request @@ -1386,3 +1387,78 @@ def test_eagle_with_sliding_window(): # there will be no matched prefix. 
assert len(computed_blocks.blocks[0]) == 0 assert num_tokens == 0 + + +def test_block_lookup_cache_single_block_per_key(): + cache = BlockHashToBlockMap() + key0 = BlockHashWithGroupId(b"hash0") + key1 = BlockHashWithGroupId(b"hash1") + key2 = BlockHashWithGroupId(b"hash2") + block0 = KVCacheBlock(0) + block1 = KVCacheBlock(1) + + assert cache.get_one_block(key0) is None + assert cache.get_one_block(key1) is None + assert cache.get_one_block(key2) is None + # key0 inserted + cache.insert(key0, block0) + assert cache.get_one_block(key0) is block0 + assert cache.get_one_block(key1) is None + assert cache.get_one_block(key2) is None + # key1 inserted + cache.insert(key1, block1) + assert cache.get_one_block(key0) is block0 + assert cache.get_one_block(key1) is block1 + assert cache.get_one_block(key2) is None + # No block poped due to block_id mismatch + assert cache.pop(key0, 100) is None + assert cache.get_one_block(key0) is block0 + assert cache.get_one_block(key1) is block1 + assert cache.get_one_block(key2) is None + # block poped with (key0, block ID 0) + assert cache.pop(key0, 0) is block0 + assert cache.get_one_block(key0) is None + assert cache.get_one_block(key1) is block1 + assert cache.get_one_block(key2) is None + # No block poped due to block_id mismatch + assert cache.pop(key0, 1) is None + assert cache.get_one_block(key0) is None + assert cache.get_one_block(key1) is block1 + assert cache.get_one_block(key2) is None + # block poped with (key1, block ID 1) + assert cache.pop(key1, 1) is block1 + assert cache.get_one_block(key0) is None + assert cache.get_one_block(key1) is None + assert cache.get_one_block(key2) is None + + +def test_block_lookup_cache_multi_blocks_per_key(): + cache = BlockHashToBlockMap() + key0 = BlockHashWithGroupId(b"hash0") + key1 = BlockHashWithGroupId(b"hash1") + block00 = KVCacheBlock(0) + block01 = KVCacheBlock(1) + block10 = KVCacheBlock(10) + block11 = KVCacheBlock(11) + + assert cache.get_one_block(key0) is None + assert cache.get_one_block(key1) is None + + cache.insert(key0, block00) + cache.insert(key0, block01) + cache.insert(key1, block10) + cache.insert(key1, block11) + + assert cache.get_one_block(key0) is block00 + assert cache.pop(key0, 0) is block00 + assert cache.get_one_block(key0) is block01 + assert cache.pop(key0, 1) is block01 + assert cache.get_one_block(key0) is None + assert cache.pop(key0, 2) is None + + assert cache.get_one_block(key1) is block10 + assert cache.pop(key1, 10) is block10 + assert cache.get_one_block(key1) is block11 + assert cache.pop(key1, 11) is block11 + assert cache.get_one_block(key1) is None + assert cache.pop(key1, 12) is None diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py index b70850a9bcff9..01b54ae56e90a 100644 --- a/tests/v1/core/test_single_type_kv_cache_manager.py +++ b/tests/v1/core/test_single_type_kv_cache_manager.py @@ -47,16 +47,15 @@ def test_chunked_local_attention_possible_cached_prefix(): BlockHash(str(i).encode()) for i in range(len(block_is_cached)) ] - block_pool.cached_block_hash_to_block.clear() + block_pool.cached_block_hash_to_block._cache.clear() # Mock the block pool with the cached blocks for i, (block_hash, is_cached) in enumerate(zip(block_hash_list, block_is_cached)): if is_cached: - block_pool.cached_block_hash_to_block[ - make_block_hash_with_group_id(block_hash, 0)] = { - i: block_pool.blocks[i + 10], - } + block_pool.cached_block_hash_to_block.insert( + make_block_hash_with_group_id(block_hash, 0), + 
block_pool.blocks[i + 10]) computed_blocks = manager.find_longest_cache_hit( block_hashes=block_hash_list, @@ -112,16 +111,15 @@ def test_sliding_window_possible_cached_prefix(): BlockHash(str(i).encode()) for i in range(len(block_is_cached)) ] - block_pool.cached_block_hash_to_block.clear() + block_pool.cached_block_hash_to_block._cache.clear() # Mock the block pool with the cached blocks for i, (block_hash, is_cached) in enumerate(zip(block_hash_list, block_is_cached)): if is_cached: - block_pool.cached_block_hash_to_block[ - make_block_hash_with_group_id(block_hash, 0)] = { - i: block_pool.blocks[i + 10], - } + block_pool.cached_block_hash_to_block.insert( + make_block_hash_with_group_id(block_hash, 0), + block_pool.blocks[i + 10]) computed_blocks = manager.find_longest_cache_hit( block_hashes=block_hash_list, diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index d1e1c1c8d0382..3cc738304821b 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections import defaultdict from collections.abc import Iterable -from typing import Optional +from typing import Any, Optional, Union from vllm.distributed.kv_events import (MEDIUM_GPU, AllBlocksCleared, BlockRemoved, BlockStored, @@ -19,6 +18,103 @@ from vllm.v1.request import Request logger = init_logger(__name__) +class BlockHashToBlockMap: + """ + Cache of blocks that are used for prefix caching. It caches blocks + from hash directly to a block or multiple blocks + (i.e. {block_hash: KVCacheBlocks}) + - Mostly block_hash maps to a single KVCacheBlock, and KVCacheBlocks + would simply be a KVCacheBlock. + - Otherwise, KVCacheBlocks is a dict from {block_id: KVCacheBlock} + + A cached block is a full block with a block hash that can be used + for prefix caching. + The cached block may be used by running requests or in the + free_block_queue that could potentially be evicted. + + NOTE #1: We currently don't de-duplicate the blocks in the cache, + meaning that if a block becomes full and is cached, we don't check + if there is already an identical block in the cache. This is because + we want to make sure the allocated block IDs won't change so that + block tables are append-only. + NOTE #2: The union type is introduced in order to reduce GC costs + from the inner dict. + """ + + def __init__(self): + self._cache: dict[BlockHashWithGroupId, + Union[KVCacheBlock, dict[int, KVCacheBlock]]] = {} + + def get_one_block(self, + key: BlockHashWithGroupId) -> Optional[KVCacheBlock]: + """ + Gets any block with the given block hash key. 
+ """ + blocks = self._cache.get(key) + if blocks is not None: + if isinstance(blocks, KVCacheBlock): + return blocks + if isinstance(blocks, dict): + return next(iter(blocks.values())) + self._unexpected_blocks_type(blocks) + return None + + def insert(self, key: BlockHashWithGroupId, block: KVCacheBlock) -> None: + """ + Inserts the KVCacheBlock to the cache + """ + blocks = self._cache.get(key) + if blocks is None: + # When key is not found, attach a single block to the key + self._cache[key] = block + elif isinstance(blocks, KVCacheBlock): + # If there's a block with the same key, merge the original block + # and the new block into a dict + self._cache[key] = {blocks.block_id: blocks, block.block_id: block} + elif isinstance(blocks, dict): + # If it's already a dict, simply insert the block + blocks[block.block_id] = block + else: + self._unexpected_blocks_type(blocks) + + def pop(self, key: BlockHashWithGroupId, + block_id: int) -> Optional[KVCacheBlock]: + """ + Checks if block_hash exists and pop block_id from the cache + """ + blocks = self._cache.pop(key, None) + if blocks is None: + # block_hash not found in the cache + return None + # TODO(Jialin): If key is found, block_id should always present + # in blocks. We currently keep the original behaviour for safety. + # + # Will add block_id == blocks.block_id assertion and + # use del blocks[block_id] instead as followup. + if isinstance(blocks, KVCacheBlock): + if blocks.block_id == block_id: + return blocks + # If the single block ID doesn't match, we should put the + # block back (it should happen rarely) + self._cache[key] = blocks + return None + if isinstance(blocks, dict): + # Try to pop block_id from the block dict, and if dict still + # contain blocks, put back to the cache. + block = blocks.pop(block_id, None) + if len(blocks) > 0: + self._cache[key] = blocks + return block + self._unexpected_blocks_type(blocks) + return None + + def __len__(self) -> int: + return len(self._cache) + + def _unexpected_blocks_type(self, blocks: Any) -> None: + raise AssertionError(f"Invalid KV cache block type {type(blocks)}") + + class BlockPool: """BlockPool that manages KVCacheBlocks. It provides methods to allocate, free and cache the kv cache blocks. The @@ -51,17 +147,9 @@ class BlockPool: # enabled). self.free_block_queue = FreeKVCacheBlockQueue(self.blocks) - # {block_hash: {block ID: block}}. A cached block is - # a full block with a block hash that can be used for prefix caching. - # The cached block may be used by running requests or in the - # free_block_queue that could potentially be evicted. - # NOTE: We currently don't de-duplicate the blocks in the cache, - # meaning that if a block becomes full and is cached, we don't check - # if there is already an identical block in the cache. This is because - # we want to make sure the allocated block IDs won't change so that - # block tables are append-only. - self.cached_block_hash_to_block: dict[BlockHashWithGroupId, dict[ - int, KVCacheBlock]] = defaultdict(dict) + # Cache for block lookup + self.cached_block_hash_to_block: BlockHashToBlockMap = \ + BlockHashToBlockMap() # To represent a placeholder block with block_id=0. 
# The ref_cnt of null_block is not maintained, needs special care to @@ -90,12 +178,11 @@ class BlockPool: for group_id in kv_cache_group_ids: block_hash_with_group_id = make_block_hash_with_group_id( block_hash, group_id) - cached_blocks_one_group = self.cached_block_hash_to_block.get( + block = self.cached_block_hash_to_block.get_one_block( block_hash_with_group_id) - if not cached_blocks_one_group: + if not block: return None - first_block = next(iter(cached_blocks_one_group.values())) - cached_blocks.append(first_block) + cached_blocks.append(block) return cached_blocks def cache_full_blocks( @@ -140,8 +227,8 @@ class BlockPool: block_hash_with_group_id = make_block_hash_with_group_id( block_hash, kv_cache_group_id) blk.block_hash = block_hash_with_group_id - self.cached_block_hash_to_block[block_hash_with_group_id][ - blk.block_id] = blk + self.cached_block_hash_to_block.insert(block_hash_with_group_id, + blk) if new_hashes is not None: new_hashes.append(maybe_convert_block_hash(block_hash)) @@ -211,15 +298,14 @@ class BlockPool: if block_hash is None: # The block doesn't have hash, eviction is not needed return False - blocks_by_id = self.cached_block_hash_to_block.get(block_hash) - if blocks_by_id is None: - # block_hash not found in cached_block_hash_to_block, + + if self.cached_block_hash_to_block.pop(block_hash, + block.block_id) is None: + # block not found in cached_block_hash_to_block, # eviction is not needed return False + block.reset_hash() - blocks_by_id.pop(block.block_id, None) - if len(blocks_by_id) == 0: - del self.cached_block_hash_to_block[block_hash] if self.enable_kv_cache_events: # FIXME (Chen): Not sure whether we should return `hash_value` @@ -283,7 +369,7 @@ class BlockPool: return False # Remove all hashes so that no new blocks will hit. - self.cached_block_hash_to_block = defaultdict(dict) + self.cached_block_hash_to_block = BlockHashToBlockMap() # Remove all hashes from all blocks. 
for block in self.blocks: From 969b4da3a6ab737f72cb33db502b4c0bb70d4139 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 24 Sep 2025 00:12:14 +0200 Subject: [PATCH 164/185] [V0 Deprecation] Remove placeholder attn (#25510) Signed-off-by: Thomas Parnell --- .../attention/test_attention_selector.py | 37 +-- vllm/attention/backends/placeholder_attn.py | 314 ------------------ vllm/attention/layer.py | 3 - vllm/attention/selector.py | 9 - .../kv_connector/v1/nixl_connector.py | 1 - 5 files changed, 10 insertions(+), 354 deletions(-) delete mode 100644 vllm/attention/backends/placeholder_attn.py diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index a4e200775c09d..730514eb5a568 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -85,8 +85,7 @@ def test_env( if device == "cpu": with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float16, None, block_size, - False) + backend = get_attn_backend(16, torch.float16, None, block_size) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "hip": @@ -106,7 +105,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) assert f"The selected backend, {name}" in str( exc_info.value) @@ -117,7 +115,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) assert f"The selected backend, {name}" in str( exc_info.value) @@ -127,7 +124,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = f"{name}_VLLM_V1" assert backend.get_name() == expected @@ -136,7 +132,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = "TRITON_ATTN_VLLM_V1" assert backend.get_name() == expected @@ -164,7 +159,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = "CUTLASS_MLA_VLLM_V1" assert backend.get_name() == expected @@ -179,7 +173,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = "FLASHINFER_MLA" assert backend.get_name() == expected @@ -199,7 +192,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = f"{name}_VLLM_V1" assert backend.get_name() == expected @@ -208,7 +200,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = "FLASH_ATTN_MLA" assert backend.get_name() == expected @@ -218,7 +209,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = "TRITON_MLA_VLLM_V1" assert backend.get_name() == expected @@ -227,7 +217,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = "FLASHINFER_VLLM_V1" assert backend.get_name() == expected @@ -236,7 +225,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) expected = "FLASH_ATTN_VLLM_V1" assert backend.get_name() == expected @@ -245,7 +233,6 @@ def test_env( torch.float16, None, block_size, - False, use_mla=use_mla) assert backend.get_name() == "FLEX_ATTENTION", ( "Should fallback to FlexAttention if head size is " @@ -264,13 +251,13 @@ def test_fp32_fallback( if device == "cpu": with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16, False) + backend = get_attn_backend(16, torch.float32, None, 16) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "cuda": with 
patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16, False) + backend = get_attn_backend(16, torch.float32, None, 16) assert backend.get_name() == "FLEX_ATTENTION" @@ -286,29 +273,29 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5)) - backend = get_attn_backend(16, torch.float16, None, 16, False) + backend = get_attn_backend(16, torch.float16, None, 16) assert backend.get_name() != STR_FLASH_ATTN_VAL # Reset the monkeypatch for subsequent tests monkeypatch.undo() # Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16) assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + backend = get_attn_backend(16, torch.float16, "fp8", 16) assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) + backend = get_attn_backend(16, torch.float16, None, 8) assert backend.get_name() != STR_FLASH_ATTN_VAL # flash-attn is not installed import sys original_module = sys.modules.get('vllm_flash_attn') monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) - backend = get_attn_backend(16, torch.float16, None, 16, False) + backend = get_attn_backend(16, torch.float16, None, 16) assert backend.get_name() != STR_FLASH_ATTN_VAL # Restore the original module if it existed @@ -319,11 +306,7 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) # Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, None, 16, True) + backend = get_attn_backend(17, torch.float16, None, 16) assert backend.get_name() != STR_FLASH_ATTN_VAL @@ -336,5 +319,5 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch): # Should raise ValueError for invalid backend with pytest.raises(ValueError) as exc_info: - get_attn_backend(32, torch.float16, None, 16, False) + get_attn_backend(32, torch.float16, None, 16) assert "Invalid value 'INVALID'" in str(exc_info.value) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py deleted file mode 100644 index cddeb2cf39bf0..0000000000000 --- a/vllm/attention/backends/placeholder_attn.py +++ /dev/null @@ -1,314 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from itertools import accumulate -from typing import List, Optional, Tuple, Type - -import torch - -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, - AttentionMetadataBuilder) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.utils import async_tensor_h2d - -# Placeholder attention backend for models like Mamba and pooling models that -# lack attention. 
- - -class PlaceholderAttentionBackend(AttentionBackend): - """Placeholder backend for when no attention is needed.""" - - @staticmethod - def get_name() -> str: - return "NO_ATTENTION" - - @staticmethod - def get_impl_cls() -> Type["PlaceholderAttentionImpl"]: - return PlaceholderAttentionImpl - - @staticmethod - def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]: - return PlaceholderAttentionMetadataBuilder - - @staticmethod - def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]: - return PlaceholderAttentionMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (1, 1, 1, 1, 1) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - return - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - return - - -@dataclass -class PlaceholderAttentionMetadata(AttentionMetadata): - """Attention metadata for prefill and decode batched together.""" - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # Maximum query length in the batch. - max_query_len: Optional[int] - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - # Placeholder. 
- block_tables: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional["PlaceholderAttentionMetadata"] = None - _cached_decode_metadata: Optional["PlaceholderAttentionMetadata"] = None - - @property - def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - - # Placeholders - slot_mapping = torch.empty(0) - block_tables = torch.empty(0) - - self._cached_prefill_metadata = PlaceholderAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=0, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - ) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert self.seq_lens_tensor is not None - - # Placeholders - slot_mapping = torch.empty(0) - block_tables = torch.empty(0) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - - self._cached_decode_metadata = PlaceholderAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=None, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - use_cuda_graph=self.use_cuda_graph, - ) - return self._cached_decode_metadata - - -class PlaceholderAttentionMetadataBuilder( - AttentionMetadataBuilder[PlaceholderAttentionMetadata]): - - def __init__(self, input_builder): - - self.input_builder = input_builder - self.runner = input_builder.runner - - def prepare(self): - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.curr_seq_lens: List[int] = [] - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool): - """Add a sequence group to 
the metadata. Specifically update/append - 1. context length. - """ - is_prompt = inter_data.is_prompt - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - - if is_prompt: - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. - """ - - # Some input builders such as ModelInputForCPUBuilder do not have the - # "inter_data_list" attribute. - # Let's check inter_data_list exists before we reference it. - if hasattr(self.input_builder, "inter_data_list"): - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - if use_captured_graph: - num_decode_tokens = batch_size - self.num_prefill_tokens - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, self.runner.pin_memory) - - # Placeholders - slot_mapping_tensor = torch.empty(0) - block_tables = torch.empty(0) - - return PlaceholderAttentionMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - enable_kv_scales_calculation=True, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=use_captured_graph, - ) - - -class PlaceholderAttentionImpl(AttentionImpl): - - def __init__(self, *args, **kwargs) -> None: - return - - def forward(self, *args, **kwargs) -> torch.Tensor: - raise NotImplementedError 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 544a720524429..0ed20b3b71519 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -115,12 +115,10 @@ class Attention(nn.Module, AttentionLayerBase): if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype block_size = cache_config.block_size - is_attention_free = cache_config.is_attention_free calculate_kv_scales = cache_config.calculate_kv_scales else: kv_cache_dtype = "auto" block_size = 16 - is_attention_free = False calculate_kv_scales = False if num_kv_heads is None: num_kv_heads = num_heads @@ -185,7 +183,6 @@ class Attention(nn.Module, AttentionLayerBase): dtype, kv_cache_dtype, block_size, - is_attention_free, use_mla=use_mla, has_sink=self.has_sink) else: diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 3a235ba6e0b42..b651fc3eaee36 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -142,7 +142,6 @@ def get_attn_backend( dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, - is_attention_free: bool = False, use_mla: bool = False, has_sink: bool = False, ) -> type[AttentionBackend]: @@ -156,7 +155,6 @@ def get_attn_backend( dtype=dtype, kv_cache_dtype=kv_cache_dtype, block_size=block_size, - is_attention_free=is_attention_free, use_v1=envs.VLLM_USE_V1, use_mla=use_mla, has_sink=has_sink, @@ -169,17 +167,10 @@ def _cached_get_attn_backend( dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, - is_attention_free: bool, use_v1: bool = False, use_mla: bool = False, has_sink: bool = False, ) -> type[AttentionBackend]: - # If there are no attention layers (e.g. we are running Mamba), - # use the placeholder NO_ATTENTION - if is_attention_free: - from vllm.attention.backends.placeholder_attn import ( - PlaceholderAttentionBackend) - return PlaceholderAttentionBackend # Check whether a particular choice of backend was # previously forced. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 64feddb591c27..528d4022bd17a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -574,7 +574,6 @@ class NixlConnectorWorker: self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, - self.model_config.is_attention_free, use_mla=self.use_mla) self.backend_name = backend.get_name() attn_backend = backend_name_to_enum(self.backend_name) From eca7be9077aa22e70da5c2ef04ff056e3c7bdc58 Mon Sep 17 00:00:00 2001 From: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:17:49 -0700 Subject: [PATCH 165/185] =?UTF-8?q?Add=20VLLM=5FENABLE=5FINDUCTOR=5FMAX=5F?= =?UTF-8?q?AUTOTUNE=20&=20VLLM=5FENABLE=5FINDUCTOR=5FCOORDINA=E2=80=A6=20(?= =?UTF-8?q?#25493)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: rouchenzi Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> --- vllm/compilation/compiler_interface.py | 5 +++-- vllm/envs.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 7158fd685964f..eeca14d1296f3 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -551,8 +551,9 @@ def set_inductor_config(config, runtime_shape): if isinstance(runtime_shape, int): # for a specific batchsize, tuning triton kernel parameters # can be beneficial - config["max_autotune"] = True - config["coordinate_descent_tuning"] = True + config["max_autotune"] = envs.VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE + config["coordinate_descent_tuning"] = ( + envs.VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING) class EagerAdaptor(CompilerInterface): diff --git a/vllm/envs.py b/vllm/envs.py index 50d58c5468f97..1c6c1e78ac9b1 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -193,6 +193,8 @@ if TYPE_CHECKING: VLLM_DBO_COMM_SMS: int = 20 GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = [] VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None + VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True + VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True VLLM_USE_NCCL_SYMM_MEM: bool = False VLLM_NCCL_INCLUDE_PATH: Optional[str] = None @@ -1413,6 +1415,17 @@ environment_variables: dict[str, Callable[[], Any]] = { "code_interpreter", "web_search_preview"]), + # Enable max_autotune & coordinate_descent_tuning in inductor_config + # to compile static shapes passed from compile_sizes in compilation_config + # If set to 1, enable max_autotune; By default, this is enabled (1) + "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE": + lambda: bool(int(os.getenv("VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", "1"))), + # If set to 1, enable coordinate_descent_tuning; + # By default, this is enabled (1) + "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": + lambda: bool(int(os.getenv("VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", + "1"))), + # Flag to enable NCCL symmetric memory allocation and registration "VLLM_USE_NCCL_SYMM_MEM": lambda: bool(int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))), @@ -1513,6 +1526,8 @@ def compute_hash() -> str: "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", "VLLM_ROCM_FP8_MFMA_PAGE_ATTN", + "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", + "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", ] for key in 
environment_variables_to_hash: # if this goes out of sync with environment_variables, From 4f2954f7240b2b8f14a680e55c3caae163a404a9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Sep 2025 18:26:10 -0400 Subject: [PATCH 166/185] Fix triton_reshape_and_cache_flash.py triton import (#25522) Signed-off-by: mgoin --- vllm/attention/ops/triton_reshape_and_cache_flash.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/attention/ops/triton_reshape_and_cache_flash.py index 0b0c706626af3..883052cb46aab 100644 --- a/vllm/attention/ops/triton_reshape_and_cache_flash.py +++ b/vllm/attention/ops/triton_reshape_and_cache_flash.py @@ -2,10 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch -import triton -import triton.language as tl from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton @triton.jit From 95bc60e4cb42029dbe7f1b7c568d5147f0ea1f2d Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Tue, 23 Sep 2025 15:46:46 -0700 Subject: [PATCH 167/185] [gpt-oss][bugfix] remove logic to require resp_ in ResponseAPI (#25428) Signed-off-by: Andrew Xia --- .../openai/test_response_api_with_harmony.py | 1 + vllm/entrypoints/openai/serving_responses.py | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 23d8373d97809..c28970afc731f 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -523,6 +523,7 @@ async def test_function_calling(client: OpenAI, model_name: str): input="What's the weather like in Paris today?", tools=tools, temperature=0.0, + extra_body={"request_id": "test_function_calling_non_resp"}, ) assert response is not None assert response.status == "completed" diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 99bb464db1d13..c70baba88d433 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -235,8 +235,6 @@ class OpenAIServingResponses(OpenAIServing): # Handle the previous response ID. 
prev_response_id = request.previous_response_id if prev_response_id is not None: - if not prev_response_id.startswith("resp_"): - return self._make_invalid_id_error(prev_response_id) async with self.response_store_lock: prev_response = self.response_store.get(prev_response_id) if prev_response is None: @@ -924,9 +922,6 @@ class OpenAIServingResponses(OpenAIServing): stream: Optional[bool], ) -> Union[ErrorResponse, ResponsesResponse, AsyncGenerator[ StreamingResponsesResponse, None]]: - if not response_id.startswith("resp_"): - return self._make_invalid_id_error(response_id) - async with self.response_store_lock: response = self.response_store.get(response_id) @@ -944,9 +939,6 @@ class OpenAIServingResponses(OpenAIServing): self, response_id: str, ) -> Union[ErrorResponse, ResponsesResponse]: - if not response_id.startswith("resp_"): - return self._make_invalid_id_error(response_id) - async with self.response_store_lock: response = self.response_store.get(response_id) if response is None: @@ -972,13 +964,6 @@ class OpenAIServingResponses(OpenAIServing): response_id) return response - def _make_invalid_id_error(self, response_id: str) -> ErrorResponse: - return self.create_error_response( - err_type="invalid_request_error", - message=(f"Invalid 'response_id': '{response_id}'. " - "Expected an ID that begins with 'resp'."), - ) - def _make_not_found_error(self, response_id: str) -> ErrorResponse: return self.create_error_response( err_type="invalid_request_error", From 7361ab379f817727173691224e41e0175423b795 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Sep 2025 18:48:40 -0400 Subject: [PATCH 168/185] Remove redundant mutates_args and dispatch_key for direct_register_custom_op (#25512) Signed-off-by: mgoin --- vllm/attention/layer.py | 3 --- vllm/compilation/collective_fusion.py | 1 - vllm/distributed/device_communicators/pynccl.py | 1 - vllm/distributed/parallel_state.py | 7 ------- vllm/lora/ops/triton_ops/lora_expand_op.py | 2 -- vllm/lora/ops/triton_ops/lora_shrink_op.py | 2 -- .../layers/fused_moe/flashinfer_trtllm_moe.py | 1 - .../layers/fused_moe/fused_marlin_moe.py | 1 - vllm/model_executor/layers/fused_moe/fused_moe.py | 1 - vllm/model_executor/layers/fused_moe/layer.py | 2 -- .../layers/fused_moe/rocm_aiter_fused_moe.py | 7 ------- vllm/model_executor/layers/layernorm.py | 4 ---- vllm/model_executor/layers/mamba/linear_attn.py | 2 -- vllm/model_executor/layers/mamba/mamba_mixer.py | 2 -- vllm/model_executor/layers/mamba/mamba_mixer2.py | 2 -- vllm/model_executor/layers/mamba/short_conv.py | 2 -- vllm/model_executor/layers/quantization/deepgemm.py | 3 --- vllm/model_executor/layers/quantization/gguf.py | 3 --- .../layers/quantization/kernels/scaled_mm/aiter.py | 2 -- .../layers/quantization/utils/fp8_utils.py | 3 --- .../layers/quantization/utils/mxfp4_utils.py | 2 -- .../layers/quantization/utils/w8a8_utils.py | 2 -- vllm/model_executor/layers/rotary_embedding/common.py | 1 - vllm/model_executor/layers/utils.py | 2 -- vllm/model_executor/models/deepseek_v2.py | 3 --- vllm/model_executor/models/plamo2.py | 2 -- vllm/model_executor/models/qwen3_next.py | 1 - vllm/utils/__init__.py | 11 +++++++++-- 28 files changed, 9 insertions(+), 66 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 0ed20b3b71519..baa83e29bdd05 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -575,9 +575,7 @@ def unified_attention_fake( direct_register_custom_op( op_name="unified_attention", op_func=unified_attention, - mutates_args=[], 
fake_impl=unified_attention_fake, - dispatch_key=current_platform.dispatch_key, tags=tag_cudagraph_unsafe, ) @@ -628,6 +626,5 @@ direct_register_custom_op( op_func=unified_attention_with_output, mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, - dispatch_key=current_platform.dispatch_key, tags=tag_cudagraph_unsafe, ) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 331cd8a873929..04b76a9c2d228 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -547,7 +547,6 @@ if flashinfer_comm is not None: "scale_out", ], fake_impl=call_trtllm_fused_allreduce_norm_fake, - dispatch_key=current_platform.dispatch_key, ) flashinfer_trtllm_fused_allreduce_norm = ( torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 75de85e1b0aba..76fe9a93259fa 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -46,7 +46,6 @@ def register_nccl_symmetric_ops(pynccl_comm): direct_register_custom_op( op_name="all_reduce_symmetric_with_copy", op_func=all_reduce_symmetric_with_copy_impl, - mutates_args=[], fake_impl=all_reduce_symmetric_with_copy_fake, ) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 895971893a661..69f98eb54f36c 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -149,29 +149,22 @@ def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int, if supports_custom_op(): - from vllm.platforms import current_platform direct_register_custom_op( op_name="all_reduce", op_func=all_reduce, - mutates_args=[], fake_impl=all_reduce_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="reduce_scatter", op_func=reduce_scatter, - mutates_args=[], fake_impl=reduce_scatter_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="all_gather", op_func=all_gather, - mutates_args=[], fake_impl=all_gather_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index b1ab84e08ba76..467cbaa8af48f 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -11,7 +11,6 @@ import torch from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr -from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op @@ -283,7 +282,6 @@ try: op_func=_lora_expand, mutates_args=["output_tensor"], fake_impl=_lora_expand_fake, - dispatch_key=current_platform.dispatch_key, ) lora_expand = torch.ops.vllm.lora_expand diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py index 1e7075ab07151..57da93c226d25 100644 --- a/vllm/lora/ops/triton_ops/lora_shrink_op.py +++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py @@ -11,7 +11,6 @@ import torch from vllm.lora.ops.triton_ops.kernel_utils import do_shrink_kernel from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr -from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op @@ -237,7 +236,6 @@ try: op_func=_lora_shrink, 
mutates_args=["output_tensor"], fake_impl=_lora_shrink_fake, - dispatch_key=current_platform.dispatch_key, ) lora_shrink = torch.ops.vllm.lora_shrink diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index e358143fac7c7..fe586a22e2506 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -92,7 +92,6 @@ def flashinfer_fused_moe_blockscale_fp8_fake( direct_register_custom_op( op_name="flashinfer_fused_moe_blockscale_fp8", op_func=flashinfer_fused_moe_blockscale_fp8, - mutates_args=[], fake_impl=flashinfer_fused_moe_blockscale_fp8_fake, tags=(torch.Tag.needs_fixed_stride_order, ), ) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 1e3ac6cd79f68..eb12a9b0a233f 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -235,6 +235,5 @@ def fused_marlin_moe_fake(hidden_states: torch.Tensor, direct_register_custom_op( op_name="fused_marlin_moe", op_func=fused_marlin_moe, - mutates_args=[], fake_impl=fused_marlin_moe_fake, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 0e334fdf24045..611df357265bf 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1256,7 +1256,6 @@ def outplace_fused_experts_fake( direct_register_custom_op( op_name="outplace_fused_experts", op_func=outplace_fused_experts, - mutates_args=[], fake_impl=outplace_fused_experts_fake, tags=(() if is_torch_equal_or_newer("2.7.0") else (torch.Tag.needs_fixed_stride_order, )), diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 71cc2bcf174dd..2bf3bf96baf10 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -2040,7 +2040,6 @@ direct_register_custom_op( op_func=moe_forward, mutates_args=["hidden_states"], fake_impl=moe_forward_fake, - dispatch_key=current_platform.dispatch_key, tags=(torch.Tag.needs_fixed_stride_order, ), ) @@ -2071,7 +2070,6 @@ direct_register_custom_op( op_func=moe_forward_shared, mutates_args=["hidden_states"], fake_impl=moe_forward_shared_fake, - dispatch_key=current_platform.dispatch_key, tags=(torch.Tag.needs_fixed_stride_order, ), ) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index f4972ff5f9cb0..2764af5fc5323 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -223,17 +223,13 @@ if current_platform.is_rocm(): direct_register_custom_op( op_name="rocm_aiter_asm_moe_tkw1", op_func=rocm_aiter_asm_moe_tkw1_impl, - mutates_args=[], fake_impl=rocm_aiter_asm_moe_tkw1_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="rocm_aiter_fused_moe", op_func=rocm_aiter_fused_moe_impl, - mutates_args=[], fake_impl=rocm_aiter_fused_moe_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( @@ -241,7 +237,6 @@ if current_platform.is_rocm(): op_func=rocm_aiter_topk_softmax_impl, mutates_args=["topk_weights", "topk_indices", "token_expert_indices"], fake_impl=rocm_aiter_topk_softmax_fake, - 
dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( @@ -249,7 +244,6 @@ if current_platform.is_rocm(): op_func=rocm_aiter_biased_grouped_topk_impl, mutates_args=["topk_weights", "topk_ids"], fake_impl=rocm_aiter_biased_grouped_topk_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( @@ -257,7 +251,6 @@ if current_platform.is_rocm(): op_func=rocm_aiter_grouped_topk_impl, mutates_args=["topk_weights", "topk_ids"], fake_impl=rocm_aiter_grouped_topk_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index f875f712ba9c9..8123259d037ba 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -103,17 +103,13 @@ if current_platform.is_rocm(): direct_register_custom_op( op_name="rocm_aiter_rms_norm", op_func=rocm_aiter_rms_norm_impl, - mutates_args=[], fake_impl=rocm_aiter_rms_norm_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="rocm_aiter_rmsnorm2d_fwd_with_add", op_func=rocm_aiter_rmsnorm2d_fwd_with_add_impl, - mutates_args=[], fake_impl=rocm_aiter_rmsnorm2d_fwd_with_add_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py index 6a901b47b8b63..410cbef4f6bc0 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -31,7 +31,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata @@ -401,5 +400,4 @@ direct_register_custom_op( op_func=linear_attention, mutates_args=["output"], fake_impl=linear_attention_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index a56ee13a63804..d64854cdb3818 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -27,7 +27,6 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata @@ -464,5 +463,4 @@ direct_register_custom_op( op_func=mamba_mixer, mutates_args=["output"], fake_impl=mamba_mixer_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 047ce4c4c43d0..908ea6e0025f1 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import ( LoaderFunction, composed_weight_loader, sharded_weight_loader) from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform from vllm.utils import 
direct_register_custom_op from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata @@ -765,5 +764,4 @@ direct_register_custom_op( op_func=mamba_mixer2, mutates_args=["output"], fake_impl=mamba_mixer2_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index ffdcd702aab40..cc424760e229f 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) -from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.short_conv_attn import ( ShortConvAttentionMetadata) @@ -251,5 +250,4 @@ direct_register_custom_op( op_func=short_conv, mutates_args=["output"], fake_impl=short_conv_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/quantization/deepgemm.py b/vllm/model_executor/layers/quantization/deepgemm.py index c2b3ccf19fca8..8452f686b3acc 100644 --- a/vllm/model_executor/layers/quantization/deepgemm.py +++ b/vllm/model_executor/layers/quantization/deepgemm.py @@ -4,7 +4,6 @@ import logging import torch -from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils import direct_register_custom_op from vllm.utils.deep_gemm import fp8_gemm_nt @@ -75,7 +74,5 @@ def w8a8_deepgemm_block_scaled_mm_fake( direct_register_custom_op( op_name="w8a8_deepgemm_block_scaled_mm", op_func=w8a8_deepgemm_block_scaled_mm, - mutates_args=[], fake_impl=w8a8_deepgemm_block_scaled_mm_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index a631dfdab6544..de25ee84d081e 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -161,7 +161,6 @@ try: direct_register_custom_op( op_name="_fused_mul_mat_gguf", op_func=_fused_mul_mat_gguf, - mutates_args=[], fake_impl=_fused_mul_mat_gguf_fake, ) fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf @@ -273,7 +272,6 @@ try: direct_register_custom_op( op_name="_fused_moe_gguf", op_func=_fused_moe_gguf, - mutates_args=[], fake_impl=_fused_moe_gguf_fake, ) fused_moe_gguf = torch.ops.vllm._fused_moe_gguf @@ -319,7 +317,6 @@ try: direct_register_custom_op( op_name="_apply_gguf_embedding", op_func=_apply_gguf_embedding, - mutates_args=[], fake_impl=_apply_gguf_embedding_fake, ) apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 7f808fa92a9a8..e8e950a4bb7b6 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -51,9 +51,7 @@ if current_platform.is_rocm(): direct_register_custom_op( op_name="rocm_aiter_gemm_w8a8", op_func=rocm_aiter_gemm_w8a8_impl, - mutates_args=[], fake_impl=rocm_aiter_gemm_w8a8_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 
2098086bf2401..0bc69fe7f9304 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -91,9 +91,7 @@ if current_platform.is_rocm(): direct_register_custom_op( op_name="rocm_aiter_gemm_w8a8_blockscale", op_func=rocm_aiter_gemm_w8a8_blockscale_impl, - mutates_args=[], fake_impl=rocm_aiter_gemm_w8a8_blockscale_fake, - dispatch_key=current_platform.dispatch_key, ) if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_LINEAR and current_platform.is_fp8_fnuz()): @@ -135,7 +133,6 @@ def _w8a8_triton_block_scaled_mm_fake( direct_register_custom_op( "w8a8_triton_block_scaled_mm_func", _w8a8_triton_block_scaled_mm_func, - mutates_args=[], fake_impl=_w8a8_triton_block_scaled_mm_fake, dispatch_key="CUDA", ) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 3de928fea7202..d61ca7ad5dc4e 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -113,7 +113,6 @@ try: direct_register_custom_op( op_name="dequant_mxfp4", op_func=_dequant_mxfp4, - mutates_args=[], fake_impl=_dequant_mxfp4_fake, ) dequant_mxfp4 = torch.ops.vllm.dequant_mxfp4 @@ -124,7 +123,6 @@ try: direct_register_custom_op( op_name="quant_dequant_mxfp4", op_func=_quant_dequant_mxfp4, - mutates_args=[], fake_impl=_quant_dequant_mxfp4_fake, ) quant_dequant_mxfp4 = torch.ops.vllm.quant_dequant_mxfp4 diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 6ed482db4700e..b434b7acfea83 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -218,9 +218,7 @@ def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, direct_register_custom_op( op_name="rocm_per_tensor_w8a8_scaled_mm_impl", op_func=rocm_per_tensor_w8a8_scaled_mm_impl, - mutates_args=[], fake_impl=rocm_per_tensor_w8a8_scaled_mm_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index e3cd0a8e788eb..8619651067746 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -147,5 +147,4 @@ direct_register_custom_op( op_func=_flashinfer_rotary_embedding, mutates_args=["query", "key"], # These tensors are modified in-place fake_impl=_flashinfer_rotary_embedding_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index d7a65d43c2107..96dd58c0e4d23 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -136,9 +136,7 @@ def rocm_unquantized_gemm(layer: torch.nn.Module, direct_register_custom_op( op_name="rocm_unquantized_gemm_impl", op_func=rocm_unquantized_gemm_impl, - mutates_args=[], fake_impl=rocm_unquantized_gemm_impl_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 415d36c681d8c..9895ebbcdefee 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -56,7 +56,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from 
vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import cdiv, direct_register_custom_op @@ -141,9 +140,7 @@ def sequence_parallel_chunk_fake(x: torch.Tensor) -> torch.Tensor: direct_register_custom_op( op_name="sequence_parallel_chunk", op_func=sequence_parallel_chunk, - mutates_args=[], fake_impl=sequence_parallel_chunk_fake, - dispatch_key=current_platform.dispatch_key, tags=(torch.Tag.needs_fixed_stride_order, ), ) diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 0292f3bf8317d..a7acf64f302bc 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -48,7 +48,6 @@ from vllm.model_executor.models.utils import ( is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata @@ -490,7 +489,6 @@ direct_register_custom_op( op_func=plamo2_mamba_mixer, mutates_args=["output"], fake_impl=plamo2_mamba_mixer_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ab23b494e561e..356b5001a7dc8 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -1225,7 +1225,6 @@ direct_register_custom_op( op_func=gdn_attention, mutates_args=["output"], fake_impl=gdn_attention_fake, - dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 5d165f1662383..0a7af79f7a177 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2546,10 +2546,10 @@ vllm_lib = Library("vllm", "FRAGMENT") # noqa def direct_register_custom_op( op_name: str, op_func: Callable, - mutates_args: list[str], + mutates_args: Optional[list[str]] = None, fake_impl: Optional[Callable] = None, target_lib: Optional[Library] = None, - dispatch_key: str = "CUDA", + dispatch_key: Optional[str] = None, tags: tuple[torch.Tag, ...] 
= (), ): """ @@ -2577,6 +2577,13 @@ def direct_register_custom_op( "the required dependencies.") return + if mutates_args is None: + mutates_args = [] + + if dispatch_key is None: + from vllm.platforms import current_platform + dispatch_key = current_platform.dispatch_key + import torch.library if hasattr(torch.library, "infer_schema"): schema_str = torch.library.infer_schema(op_func, From abad204be6eaaae3630db4287ebdca004197f819 Mon Sep 17 00:00:00 2001 From: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:49:09 -0700 Subject: [PATCH 169/185] [BugFix] Fix OOM in vLLM replicas by ensuring consistent NCCL memory accounting (#25359) Signed-off-by: Kourosh Hakhamaneshi --- .buildkite/test-pipeline.yaml | 3 + .../v1/worker/test_worker_memory_snapshot.py | 174 ++++++++++++++++++ vllm/v1/worker/gpu_worker.py | 21 ++- 3 files changed, 191 insertions(+), 7 deletions(-) create mode 100644 tests/v1/worker/test_worker_memory_snapshot.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index aef6d709722fa..245d21b778c05 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -887,6 +887,8 @@ steps: - tests/v1/test_external_lb_dp.py - tests/v1/entrypoints/openai/test_multi_api_servers.py - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/v1/worker/test_worker_memory_snapshot.py commands: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py @@ -908,6 +910,7 @@ steps: - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s models/multimodal/generation/test_maverick.py + - pytest -v -s v1/worker/test_worker_memory_snapshot.py - label: Plugin Tests (2 GPUs) # 40min timeout_in_minutes: 60 diff --git a/tests/v1/worker/test_worker_memory_snapshot.py b/tests/v1/worker/test_worker_memory_snapshot.py new file mode 100644 index 0000000000000..6faa6bcc591cb --- /dev/null +++ b/tests/v1/worker/test_worker_memory_snapshot.py @@ -0,0 +1,174 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import multiprocessing as mp +import os +import tempfile +from multiprocessing import Queue +from typing import Optional +from unittest.mock import patch + +import pytest +import torch + +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import MemorySnapshot +from vllm.v1.worker.gpu_worker import (Worker, + init_worker_distributed_environment) + +# Global queue to track operation order across processes +_QUEUE: Optional[Queue] = None + + +def track_operation(operation: str, rank: int): + """Track when an operation happens and its rank.""" + if _QUEUE is not None: + _QUEUE.put((operation, rank)) + + +def make_operation_tracker(operation_name: str, original_func): + """Create a mock function that tracks when an operation is called. 
+ + Args: + operation_name: Name to use when tracking this operation + original_func: The original function to wrap + + Returns: + A wrapper function that tracks the operation and calls the original + """ + + def wrapper(*args, **kwargs): + rank = int(os.environ.get("RANK", "-1")) + track_operation(operation_name, rank) + return original_func(*args, **kwargs) + + return wrapper + + +def worker_process(rank: int, world_size: int, distributed_init_method: str, + queue: Queue, error_queue: Queue): + """Worker process that initializes a GPU worker with proper tracking.""" + global _QUEUE + _QUEUE = queue + + try: + # Set environment variables + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + # Create vLLM config with small model + vllm_config = EngineArgs(model="facebook/opt-125m", + tensor_parallel_size=2, + load_format="dummy").create_engine_config() + + # Create worker + worker = Worker( + vllm_config=vllm_config, + local_rank=rank, + rank=rank, + distributed_init_method=distributed_init_method, + ) + + # Get original functions before patching + original_init_worker = init_worker_distributed_environment + original_memory_snapshot_init = MemorySnapshot.__init__ + original_all_reduce = torch.distributed.all_reduce + + # Apply minimal patches to track operation order + init_patch = patch( + 'vllm.v1.worker.gpu_worker.init_worker_distributed_environment', + side_effect=make_operation_tracker("init_distributed", + original_init_worker)) + memory_patch = patch.object( + MemorySnapshot, '__init__', + make_operation_tracker("memory_snapshot", + original_memory_snapshot_init)) + all_reduce_patch = patch('torch.distributed.all_reduce', + side_effect=make_operation_tracker( + "nccl_all_reduce", original_all_reduce)) + + with init_patch, memory_patch, all_reduce_patch: + + # Initialize device (this is where we test the order) + worker.init_device() + + # Load model to ensure everything works + worker.load_model() + + # Signal success + queue.put(("success", rank)) + + except Exception as e: + error_queue.put((rank, str(e), type(e).__name__)) + raise + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs for tensor parallelism") +def test_init_distributed_is_called_before_memory_snapshot(): + """Test that distributed env is setup before memory snapshot. + + This test makes sure during worker initialization, the initial memory + snapshot is taken after distributed env is setup to include all the buffers + allocated by distributed env. 
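    Concretely, the per-rank ordering asserted at the end of the test is

        init_distributed  ->  nccl_all_reduce  ->  memory_snapshot

    i.e. by the time MemorySnapshot() records the free-memory baseline, the
    distributed backend is up and NCCL has already allocated its buffers
    (the tracked all_reduce), so the baseline cannot over-report memory that
    NCCL would otherwise claim only after the measurement.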
+ """ + world_size = 2 + + # Create a temporary file for distributed init + with tempfile.NamedTemporaryFile(delete=False) as f: + distributed_init_method = f"file://{f.name}" + + # Create queues for inter-process communication + ctx = mp.get_context("spawn") + operation_queue = ctx.Queue() + error_queue = ctx.Queue() + + # Start worker processes + processes = [] + for rank in range(world_size): + p = ctx.Process(target=worker_process, + args=(rank, world_size, distributed_init_method, + operation_queue, error_queue)) + p.start() + processes.append(p) + + # Wait for all processes to complete + for p in processes: + p.join(timeout=60) # 60 second timeout + + # Check for errors + errors = [] + while not error_queue.empty(): + rank, error_msg, error_type = error_queue.get() + errors.append(f"Rank {rank}: {error_type}: {error_msg}") + + if errors: + pytest.fail("Worker processes failed:\n" + "\n".join(errors)) + + # Collect all operations from the queue + operations = [] + while not operation_queue.empty(): + operations.append(operation_queue.get()) + + # Verify we got operations from both ranks + print(f"Collected operations: {operations}") + + # Check operations for each rank + for rank in range(world_size): + rank_ops = [op for op, r in operations if r == rank] + print(f"\nRank {rank} operations: {rank_ops}") + + # Raises ValueError if the operation is not found + init_distributed = rank_ops.index("init_distributed") + nccl_all_reduce = rank_ops.index("nccl_all_reduce") + memory_snapshot = rank_ops.index("memory_snapshot") + + # Verify order: init_distributed should happen before memory_snapshot + assert init_distributed < nccl_all_reduce < memory_snapshot, ( + f"Rank {rank}: init_distributed (index {init_distributed}) " + f"must happen before nccl_all_reduce (index {nccl_all_reduce}) " + f"and memory_snapshot (index {memory_snapshot})") + + # Clean up + os.unlink(distributed_init_method.replace("file://", "")) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f59dacf13d85a..9082bbfd8f8e6 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -169,6 +169,20 @@ class Worker(WorkerBase): current_platform.set_device(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) + + # Initialize the distributed environment BEFORE taking + # memory snapshot + # This ensures NCCL buffers are allocated before we measure + # available memory + init_worker_distributed_environment(self.vllm_config, self.rank, + self.distributed_init_method, + self.local_rank, + current_platform.dist_backend) + + # Set random seed. + set_random_seed(self.model_config.seed) + + # Now take memory snapshot after NCCL is initialized gc.collect() torch.cuda.empty_cache() @@ -190,13 +204,6 @@ class Worker(WorkerBase): else: raise RuntimeError( f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - init_worker_distributed_environment(self.vllm_config, self.rank, - self.distributed_init_method, - self.local_rank, - current_platform.dist_backend) - # Set random seed. 
- set_random_seed(self.model_config.seed) # Construct the model runner self.model_runner: GPUModelRunner = GPUModelRunner( From c85d75cf08330f16f6c076d9cb1a586fa6c6fbb8 Mon Sep 17 00:00:00 2001 From: Corey Lowman Date: Tue, 23 Sep 2025 18:50:09 -0400 Subject: [PATCH 170/185] Add `VLLM_NVTX_SCOPES_FOR_PROFILING=1` to enable `nvtx.annotate` scopes (#25501) Signed-off-by: Corey Lowman --- vllm/envs.py | 5 +++++ vllm/v1/utils.py | 20 +++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 1c6c1e78ac9b1..33dae0be05f8d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -187,6 +187,7 @@ if TYPE_CHECKING: VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False + VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024 @@ -1387,6 +1388,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))), + # Add optional nvtx scopes for profiling, disable to avoid overheads + "VLLM_NVTX_SCOPES_FOR_PROFILING": + lambda: bool(int(os.getenv("VLLM_NVTX_SCOPES_FOR_PROFILING", "0"))), + # Represent block hashes in KV cache events as 64-bit integers instead of # raw bytes. Defaults to True for backward compatibility. "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index fd84b4a111f58..ec4417290f611 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -375,8 +375,22 @@ def report_usage_stats( }) +_PROFILER_FUNC = None + + def record_function_or_nullcontext(name: str) -> AbstractContextManager: + global _PROFILER_FUNC + + # fast path assume it is set + if _PROFILER_FUNC is not None: + return _PROFILER_FUNC(name) + + func = contextlib.nullcontext if envs.VLLM_CUSTOM_SCOPES_FOR_PROFILING: - return record_function(name) - else: - return contextlib.nullcontext() + func = record_function + elif envs.VLLM_NVTX_SCOPES_FOR_PROFILING: + import nvtx + func = nvtx.annotate + + _PROFILER_FUNC = func + return func(name) From 5e25b122366a639f6da9fa568b532d6932c45aff Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 24 Sep 2025 01:23:30 +0200 Subject: [PATCH 171/185] [Kernel] [Mamba] Remove BLOCK_H=1 from list of tuneable configurations for `_chunk_cumsum_fwd_kernel` (#25197) Signed-off-by: Thomas Parnell Co-authored-by: Chih-Chieh-Yang --- vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py index a7b3c814859ce..2e657426143b1 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -17,7 +17,6 @@ from .mamba_ssm import softplus @triton.autotune( configs=[ - triton.Config({'BLOCK_SIZE_H': 1}), triton.Config({'BLOCK_SIZE_H': 2}), triton.Config({'BLOCK_SIZE_H': 4}), triton.Config({'BLOCK_SIZE_H': 8}), From bde2a1a8a442cd594eec35f5e356bddb5982de45 Mon Sep 17 00:00:00 2001 From: Juan Villamizar <100237675+jpvillam-amd@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:39:50 -0500 Subject: [PATCH 172/185] [ROCm] Small functional changes for gptoss (#25201) Signed-off-by: jpvillam Co-authored-by: jpvillam --- 
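The mxfp4.py hunk below widens the ROCm padding of the MoE shapes from
multiples of 128 to multiples of 256. A minimal sketch of the arithmetic,
assuming round_up has the usual round-to-next-multiple semantics (the size
used here is a made-up value for illustration, not a gpt-oss shape):

    def round_up(x: int, multiple: int) -> int:
        return ((x + multiple - 1) // multiple) * multiple

    intermediate_size_per_partition = 2880           # hypothetical
    round_up(intermediate_size_per_partition, 128)   # 2944, old ROCm padding
    round_up(intermediate_size_per_partition, 256)   # 3072, new ROCm padding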
.../model_executor/layers/quantization/mxfp4.py | 9 ++++++--- .../layers/quantization/utils/mxfp4_utils.py | 17 ++++++++++++++--- vllm/platforms/rocm.py | 6 ++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index a71c8d32a22c7..b710f6ee249b1 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -212,12 +212,15 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 256) hidden_size = round_up(hidden_size, 256) - elif current_platform.is_rocm() or ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS - or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16): + elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16): intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 128) hidden_size = round_up(hidden_size, 128) + elif current_platform.is_rocm(): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 256) + hidden_size = round_up(hidden_size, 256) else: intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 64) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index d61ca7ad5dc4e..fb1d041f34499 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable, Optional +from typing import Any, Callable, Optional import torch @@ -21,6 +21,10 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor from triton_kernels.tensor_details import layout from triton_kernels.tensor_details.layout import StridedLayout + + value_layout_opts: dict[str, Any] = {} + scale_layout_opts: dict[str, Any] = {} + if (current_platform.is_cuda() and current_platform.is_device_capability(90) and not is_torch_equal_or_newer("2.8.1")): @@ -28,8 +32,15 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): "Mxfp4 on hopper is running on torch < 2.8.1, " "this cause swizling to be disabled, which may " "cause performance degradation. 
Please upgrade to torch nightly") - value_layout, value_layout_opts = StridedLayout, dict() - scale_layout, scale_layout_opts = StridedLayout, dict() + value_layout = StridedLayout + scale_layout = StridedLayout + elif current_platform.is_rocm(): + from triton_kernels.tensor_details.layout import (GFX950MXScaleLayout, + StridedLayout) + + from vllm.platforms.rocm import on_gfx950 + value_layout = StridedLayout + scale_layout = GFX950MXScaleLayout if on_gfx950() else StridedLayout else: value_layout, value_layout_opts = \ layout.make_default_matmul_mxfp4_w_layout(mx_axis=1) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 878718489fa88..942fd1973f4f3 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -118,6 +118,12 @@ def on_gfx9() -> bool: return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) +@cache +def on_gfx950() -> bool: + GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName + return any(arch in GPU_ARCH for arch in ["gfx950"]) + + @cache def use_rocm_custom_paged_attention( qtype: torch.dtype, From e0b24ea0305e0ead2c1cb1e0488744b5a91d524e Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 23 Sep 2025 19:53:34 -0400 Subject: [PATCH 173/185] [Perf] Increase default max splits for FA3 full cudagraphs (#25495) Signed-off-by: Lucas Wilkinson --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 33dae0be05f8d..689428ec59109 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -119,7 +119,7 @@ if TYPE_CHECKING: VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False - VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16 + VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32 VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_CUDART_SO_PATH: Optional[str] = None @@ -1017,7 +1017,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # max number splits for cuda graph decode "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", - "16")), + "32")), # Number of GPUs per worker in Ray, if it is set to be a fraction, # it allows ray to schedule multiple actors on a single GPU, From 1210e4d95b515507cd151ee76ef947999fdcbe59 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:57:55 -0400 Subject: [PATCH 174/185] [Bugfix] [B200] cutlass_mla - ensure kv_split == 1 for batch size > 1 (#25509) Signed-off-by: Alexander Matveev --- csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp index fbbc2e588c326..297d94dcc0631 100644 --- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp +++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp @@ -135,10 +135,10 @@ public: max_splits = min(16, max_splits); // TODO: This avoids a hang when the batch size larger than 1 and - // there is more than 4 kv_splits. + // there is more than 1 kv_splits. // Discuss with NVIDIA how this can be fixed. 
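  // Note on the clamp below: assuming max_splits >= 1 at this point,
  // min(1, max_splits) always evaluates to 1, so every launch with B > 1
  // now runs with a single KV split (e.g. B = 8, max_splits = 16 -> 1),
  // which is the "ensure kv_split == 1 for batch size > 1" behaviour this
  // patch is after.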
if (B > 1) { - max_splits = min(2, max_splits); + max_splits = min(1, max_splits); } // printf(" max_splits = %d\n", max_splits); From dc464a3d3937e30267514e1fc5b988a35dd9dbdf Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 23 Sep 2025 20:00:29 -0400 Subject: [PATCH 175/185] [BugFix] AssertionError: Do not capture num_reqs > max_num_reqs for uniform batch (#25505) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f785824958147..eebdbcc621c69 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2828,7 +2828,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _dummy_run( self, num_tokens: int, - cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, + cudagraph_runtime_mode: Optional[CUDAGraphMode] = None, force_attention: bool = False, uniform_decode: bool = False, allow_microbatching: bool = True, @@ -2844,6 +2844,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: num_tokens: Number of tokens to run the dummy forward pass. cudagraph_runtime_mode: used to control the behavior. + - if not set will determine the cudagraph mode based on using + the self.cudagraph_dispatcher. - CUDAGraphMode.NONE: No cudagraph, for warm up and profile run - CUDAGraphMode.PIECEWISE: Piecewise cudagraph. - CUDAGraphMode.FULL: Full cudagraph, attention metadata is @@ -2857,7 +2859,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): (1 token) and prefill (multiple tokens) requests. remove_lora: If False, dummy LoRAs are not destroyed after the run """ - assert cudagraph_runtime_mode in { + assert cudagraph_runtime_mode is None or cudagraph_runtime_mode in { CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL } @@ -2899,10 +2901,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): elif uniform_decode: assert not create_mixed_batch num_reqs = cdiv(num_tokens, max_query_len) - assert num_reqs <= max_num_reqs, \ - f"Do not capture num_reqs {num_reqs} > max_num_reqs " \ - f"{max_num_reqs} for uniform batch. Num tokens: " \ - f"{num_tokens}, max_query_len: {max_query_len}" num_scheduled_tokens_list = [max_query_len] * num_reqs if num_tokens % max_query_len != 0: num_scheduled_tokens_list[-1] = num_tokens % max_query_len @@ -3043,18 +3041,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) - if cudagraph_runtime_mode == CUDAGraphMode.NONE: - batch_descriptor = None - else: - # filter out the valid batch descriptor - _cg_mode, batch_descriptor = \ - self.cudagraph_dispatcher.dispatch( - BatchDescriptor(num_tokens=num_tokens, - uniform_decode=uniform_decode)) - # sanity check - assert cudagraph_runtime_mode == _cg_mode, ( + + # filter out the valid batch descriptor + _cg_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( + BatchDescriptor(num_tokens=num_tokens, + uniform_decode=uniform_decode)) + if cudagraph_runtime_mode is not None: + # we allow forcing NONE when the dispatcher disagrees to support + # warm ups for cudagraph capture + assert cudagraph_runtime_mode == CUDAGraphMode.NONE or \ + cudagraph_runtime_mode == _cg_mode, ( f"Cudagraph runtime mode mismatch at dummy_run. 
" f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}.") + else: + cudagraph_runtime_mode = _cg_mode if ubatch_slices is not None: num_tokens = num_tokens // 2 From 7ad5e50adf96826acca509747952452524e9422f Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Tue, 23 Sep 2025 20:03:31 -0400 Subject: [PATCH 176/185] Improve output when failing json.loads() on structured output test (#25483) Signed-off-by: dougbtv --- .../llm/test_struct_output_generate.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index e2c686928cea1..5017c83025ba1 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -81,16 +81,6 @@ class CarDescription(BaseModel): car_type: CarType -def _load_json(s: str, backend: str) -> str: - if backend != "xgrammar": - return json.loads(s) - - # xgrammar specific workarounds - # https://github.com/mlc-ai/xgrammar/issues/286 - s = re.sub(r'[\x00-\x1F\x7F-\xFF]', '', s) - return json.loads(s) - - def test_guided_decoding_deprecated(): with pytest.warns(DeprecationWarning, match="GuidedDecodingParams is deprecated.*"): @@ -177,7 +167,12 @@ def test_structured_output( if backend != 'lm-format-enforcer': assert "\n" not in generated_text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) + try: + output_json = json.loads(generated_text) + except json.JSONDecodeError as e: + pytest.fail( + f"Invalid JSON from backend={backend}: {generated_text!r}\n" + f"Schema: {sample_json_schema}\nError: {e}") jsonschema.validate(instance=output_json, schema=sample_json_schema) # @@ -425,7 +420,12 @@ def test_structured_output( generated_text = output.outputs[0].text assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) + try: + output_json = json.loads(generated_text) + except json.JSONDecodeError as e: + pytest.fail( + f"Invalid JSON from backend={backend}: {generated_text!r}\n" + f"Schema: {json_schema}\nError: {e}") jsonschema.validate(instance=output_json, schema=json_schema) # @@ -468,7 +468,12 @@ def test_structured_output( generated_text = output.outputs[0].text assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) + try: + output_json = json.loads(generated_text) + except json.JSONDecodeError as e: + pytest.fail( + f"Invalid JSON from backend={backend}: {generated_text!r}\n" + f"Schema: {json_schema}\nError: {e}") jsonschema.validate(instance=output_json, schema=json_schema) if backend not in ["outlines", "lm-format-enforcer"]: From 0d235b874aae47b3214b3354bb70fcefb91c4fce Mon Sep 17 00:00:00 2001 From: Chenxi Yang Date: Tue, 23 Sep 2025 17:07:42 -0700 Subject: [PATCH 177/185] Add CUTLASS FP8 MOE benchmark scripts and kernel config (#25302) Signed-off-by: Chenxi Yang Co-authored-by: Chenxi Yang --- .../kernels/benchmark_cutlass_moe_fp8.py | 406 ++++++++++++++++++ ...evice_name=NVIDIA_H100,dtype=fp8_w8a8.json | 123 ++++++ 2 files changed, 529 insertions(+) create mode 100644 benchmarks/kernels/benchmark_cutlass_moe_fp8.py create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py 
b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py new file mode 100644 index 0000000000000..b419b2fa0e3eb --- /dev/null +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -0,0 +1,406 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe +kernel. Both kernels take in fp8 quantized weights and 16-bit activations, +but use different quantization strategies and backends. +""" + +import nvtx +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +# Weight shapes for different models: [num_experts, topk, hidden_size, +# intermediate_size] +WEIGHT_SHAPES_MOE = { + "mixtral-8x7b": [ + [8, 2, 4096, 14336], + ], + "deepseek-v2": [ + [160, 6, 5120, 12288], + ], + "custom-small": [ + [8, 2, 2048, 7168], + ], + "glm45-fp8": [ + [128, 8, 4096, 1408], + ], + "Llama-4-Maverick-17B-128E-Instruct-FP8": [ + [128, 1, 5120, 8192], + ], +} + +DEFAULT_MODELS = [ + "mixtral-8x7b", +] + +DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False, True] +PER_OUT_CH_OPTS = [False, True] + +FP8_DTYPE = current_platform.fp8_dtype() + + +def bench_run( + results: list, + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): + (m, k, n) = mkn + + dtype = torch.half + device = "cuda" + + # Create input activations + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + + # Create weights + w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10 + + # Create FP8 quantized weights and scales for both kernels + w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE) + w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE) + + # Create scales based on quantization strategy + if per_out_ch: + # Per-channel quantization + w1_scale = torch.empty( + (num_experts, 2 * n, 1), device=device, dtype=torch.float32 + ) + w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32) + else: + # Per-tensor quantization + w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + + # Quantize weights + for expert in range(num_experts): + if per_out_ch: + # Per-channel quantization - not yet implemented properly + # For now, fall back to per-tensor quantization + w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) + # Expand scalar scales to the expected per-channel shape + w1_scale[expert] = w1_scale_temp.expand(2 * n, 1) + w2_scale[expert] = w2_scale_temp.expand(k, 1) + else: + # Per-tensor quantization + w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) + # Store scalar scales in [1, 1] tensors + w1_scale[expert, 0, 0] = w1_scale_temp + w2_scale[expert, 0, 0] = w2_scale_temp + + # Prepare weights for CUTLASS (no 
transpose needed) + w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K] + w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N] + + # Create router scores and get topk + score = torch.randn((m, num_experts), device=device, dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) + + # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization + # Force per-tensor quantization for all cases to match working e2e setup + a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32) + a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32) + + # Force per-tensor quantization for all cases + per_act_token = False + + # Create stride tensors for CUTLASS + ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device) + ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device) + c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device) + c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + a2_scale: torch.Tensor, + num_repeats: int, + ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + for _ in range(num_repeats): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + + def run_cutlass_moe_fp8( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + a2_scale: torch.Tensor, + num_repeats: int, + ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + for _ in range(num_repeats): + with nvtx.annotate("cutlass_moe_fp8", color="blue"): + cutlass_moe_fp8( + a=a, + w1_q=w1, + w2_q=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + quant_config=quant_config, + activation="silu", + global_num_experts=num_experts, + ) + + # Pre-create quantization config to avoid creating it inside CUDA graph + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly) + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + # Capture 10 invocations like benchmark_moe.py + for _ in range(10): + cutlass_moe_fp8( + a=a, + w1_q=w1_fp8q_cutlass, + w2_q=w2_fp8q_cutlass, + topk_weights=topk_weights, + topk_ids=topk_ids, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + quant_config=quant_config, + activation="silu", + global_num_experts=num_experts, + ) + torch.cuda.synchronize() + + # Create CUDA graphs for Triton (match 
benchmark_moe.py pattern exactly) + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + # Capture 10 invocations like benchmark_moe.py + for _ in range(10): + fused_experts( + a, + w1_fp8q, + w2_fp8q, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + torch.cuda.synchronize() + + def bench_cuda_graph(graph, num_warmup=5, num_iters=100): + """Benchmark CUDA graph using events like benchmark_moe.py""" + # Warmup + for _ in range(num_warmup): + graph.replay() + torch.cuda.synchronize() + + # Timing + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies = [] + for _ in range(num_iters): + torch.cuda.synchronize() + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + + # Divide by 10 since graph contains 10 calls + return sum(latencies) / (num_iters * 10) + + # Benchmark parameters + num_warmup = 5 + num_iters = 100 + + # Benchmark only CUDA graphs (more reliable and faster) + # Benchmark Triton MoE with CUDA graphs + triton_graph_time = bench_cuda_graph( + triton_graph, num_warmup=num_warmup, num_iters=num_iters + ) + + # Benchmark CUTLASS MoE with CUDA graphs + cutlass_graph_time = bench_cuda_graph( + cutlass_graph, num_warmup=num_warmup, num_iters=num_iters + ) + + # Convert ms to us and return results + triton_time_us = triton_graph_time * 1000 + cutlass_time_us = cutlass_graph_time * 1000 + + return { + "batch_size": m, + "triton_time_us": triton_time_us, + "cutlass_time_us": cutlass_time_us, + } + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + all_results = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in args.per_act_token_opts: + for per_out_ch in args.per_out_ch_opts: + print( + f"\n=== {model}, experts={num_experts}, topk={topk}," + f"per_act={per_act_token}, per_out_ch={per_out_ch} ===" + ) + + config_results = [] + for size_m in args.batch_sizes: + mkn = (size_m, size_k, size_n) + result = bench_run( + [], # Not used anymore + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) + if result: + config_results.append(result) + + # Print results table for this configuration + if config_results: + print( + f"\n{'Batch Size':<12}" + f"{'Triton (us)':<15}" + f"{'CUTLASS (us)':<15}" + ) + print("-" * 45) + for result in config_results: + print( + f"{result['batch_size']:<12}" + f"{result['triton_time_us']:<15.2f}" + f"{result['cutlass_time_us']:<15.2f}" + ) + + all_results.extend(config_results) + + print(f"\nTotal benchmarks completed: {len(all_results)}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE + across specified models/shapes/batches + + Example usage: + python benchmark_cutlass_moe_fp8.py \ + --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \ + --tp-sizes 8 \ + --batch-size 2 4 8 \ + --per-act-token-opts false \ + --per-out-ch-opts false + + """ + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + 
choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument( + "--per-act-token-opts", + nargs="+", + type=lambda x: x.lower() == "true", + default=[False, True], + help="Per-activation token quantization options (true/false)", + ) + parser.add_argument( + "--per-out-ch-opts", + nargs="+", + type=lambda x: x.lower() == "true", + default=[False, True], + help="Per-output channel quantization options (true/false)", + ) + + args = parser.parse_args() + main(args) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..600bd4444535a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json @@ -0,0 +1,123 @@ +{ + "triton_version": "3.4.0", + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} From 88d7bdbd2337917fbdd65bdfe33e6af7ecdb45cd Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:07:51 -0400 Subject: [PATCH 178/185] [Bug] Fix 
AttributeError: 'FusedMoE' object has no attribute 'w13_weight_scale'. Did you mean: 'w13_weight_scale_inv' (#25519) Signed-off-by: yewentao256 Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/warmup/deep_gemm_warmup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index f6df85a50238c..e495f9ee44724 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -53,9 +53,11 @@ def _extract_data_from_fused_moe_module( """ assert isinstance(m, FusedMoE) w13 = m.w13_weight - w13_s = getattr(m, "w13_weight_scale_inv", m.w13_weight_scale) + w13_s = m.w13_weight_scale_inv if hasattr( + m, "w13_weight_scale_inv") else m.w13_weight_scale w2 = m.w2_weight - w2_s = getattr(m, "w2_weight_scale_inv", m.w2_weight_scale) + w2_s = m.w2_weight_scale_inv if hasattr( + m, "w2_weight_scale_inv") else m.w2_weight_scale num_topk = m.top_k assert isinstance(w13, torch.Tensor) From c8bde93367fb252eca1e9a6ae78650caa4a9a951 Mon Sep 17 00:00:00 2001 From: ahao-anyscale Date: Tue, 23 Sep 2025 17:13:32 -0700 Subject: [PATCH 179/185] [BUG] Allows for RunAI Streamer and Torch.compile cache to be used together (#24922) Signed-off-by: ahao-anyscale --- tests/test_config.py | 107 +++++++++++++++++++++++++ vllm/config/model.py | 5 +- vllm/transformers_utils/runai_utils.py | 11 ++- 3 files changed, 119 insertions(+), 4 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 0796447c079b6..9e2bfb9e1b0ec 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os from dataclasses import MISSING, Field, asdict, dataclass, field +from unittest.mock import patch import pytest @@ -388,3 +390,108 @@ def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len, else: actual_max_len = model_config.get_and_verify_max_len(max_model_len) assert actual_max_len == expected_max_len + + +class MockConfig: + """Simple mock object for testing maybe_pull_model_tokenizer_for_runai""" + + def __init__(self, model: str, tokenizer: str): + self.model = model + self.tokenizer = tokenizer + self.model_weights = None + + +@pytest.mark.parametrize("s3_url", [ + "s3://example-bucket-1/model/", + "s3://example-bucket-2/model/", +]) +@patch('vllm.transformers_utils.runai_utils.ObjectStorageModel.pull_files') +def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url): + """Test that S3 URLs create deterministic local directories for model and + tokenizer.""" + # Mock pull_files to avoid actually downloading files during tests + mock_pull_files.return_value = None + + # Create first mock and run the method + config1 = MockConfig(model=s3_url, tokenizer=s3_url) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url) + + # Check that model and tokenizer point to existing directories + assert os.path.exists( + config1.model), f"Model directory does not exist: {config1.model}" + assert os.path.isdir( + config1.model), f"Model path is not a directory: {config1.model}" + assert os.path.exists( + config1.tokenizer + ), f"Tokenizer directory does not exist: {config1.tokenizer}" + assert os.path.isdir( + config1.tokenizer + ), f"Tokenizer path is not a 
directory: {config1.tokenizer}" + + # Verify that the paths are different from the original S3 URL + assert config1.model != s3_url, ( + "Model path should be converted to local directory") + assert config1.tokenizer != s3_url, ( + "Tokenizer path should be converted to local directory") + + # Store the original paths + created_model_dir = config1.model + create_tokenizer_dir = config1.tokenizer + + # Create a new mock and run the method with the same S3 URL + config2 = MockConfig(model=s3_url, tokenizer=s3_url) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url) + + # Check that the new directories exist + assert os.path.exists( + config2.model), f"Model directory does not exist: {config2.model}" + assert os.path.isdir( + config2.model), f"Model path is not a directory: {config2.model}" + assert os.path.exists( + config2.tokenizer + ), f"Tokenizer directory does not exist: {config2.tokenizer}" + assert os.path.isdir( + config2.tokenizer + ), f"Tokenizer path is not a directory: {config2.tokenizer}" + + # Verify that the paths are deterministic (same as before) + assert config2.model == created_model_dir, ( + f"Model paths are not deterministic. " + f"Original: {created_model_dir}, New: {config2.model}") + assert config2.tokenizer == create_tokenizer_dir, ( + f"Tokenizer paths are not deterministic. " + f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}") + + +@patch('vllm.transformers_utils.runai_utils.ObjectStorageModel.pull_files') +def test_s3_url_different_models_create_different_directories(mock_pull_files): + """Test that different S3 URLs create different local directories.""" + # Mock pull_files to avoid actually downloading files during tests + mock_pull_files.return_value = None + + s3_url1 = "s3://example-bucket-1/model/" + s3_url2 = "s3://example-bucket-2/model/" + + # Create mocks with different S3 URLs and run the method + config1 = MockConfig(model=s3_url1, tokenizer=s3_url1) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1) + + config2 = MockConfig(model=s3_url2, tokenizer=s3_url2) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2) + + # Verify that different URLs produce different directories + assert config1.model != config2.model, ( + f"Different S3 URLs should create different model directories. " + f"URL1 model: {config1.model}, URL2 model: {config2.model}") + assert config1.tokenizer != config2.tokenizer, ( + f"Different S3 URLs should create different tokenizer directories. 
" + f"URL1 tokenizer: {config1.tokenizer}, " + f"URL2 tokenizer: {config2.tokenizer}") + + # Verify that both sets of directories exist + assert os.path.exists(config1.model) and os.path.isdir(config1.model) + assert os.path.exists(config1.tokenizer) and os.path.isdir( + config1.tokenizer) + assert os.path.exists(config2.model) and os.path.isdir(config2.model) + assert os.path.exists(config2.tokenizer) and os.path.isdir( + config2.tokenizer) diff --git a/vllm/config/model.py b/vllm/config/model.py index d8a8fe20fd030..d75bd5fa47b38 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -699,11 +699,12 @@ class ModelConfig: model: Model name or path tokenizer: Tokenizer name or path """ + if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): return if is_runai_obj_uri(model): - object_storage_model = ObjectStorageModel() + object_storage_model = ObjectStorageModel(url=model) object_storage_model.pull_files( model, allow_pattern=["*.model", "*.py", "*.json"]) self.model_weights = model @@ -722,7 +723,7 @@ class ModelConfig: # Only download tokenizer if needed and not already handled if is_runai_obj_uri(tokenizer): - object_storage_tokenizer = ObjectStorageModel() + object_storage_tokenizer = ObjectStorageModel(url=tokenizer) object_storage_tokenizer.pull_files(model, ignore_pattern=[ "*.pt", "*.safetensors", diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py index b7bee1974de5b..08466ca19b8a4 100644 --- a/vllm/transformers_utils/runai_utils.py +++ b/vllm/transformers_utils/runai_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import hashlib import os import shutil import signal @@ -56,12 +57,18 @@ class ObjectStorageModel: pull_files(): Pull model from object storage to the temporary directory. 
""" - def __init__(self) -> None: + def __init__(self, url: str) -> None: for sig in (signal.SIGINT, signal.SIGTERM): existing_handler = signal.getsignal(sig) signal.signal(sig, self._close_by_signal(existing_handler)) - self.dir = tempfile.mkdtemp() + dir_name = os.path.join( + tempfile.gettempdir(), + hashlib.sha256(str(url).encode()).hexdigest()[:8]) + if os.path.exists(dir_name): + shutil.rmtree(dir_name) + os.makedirs(dir_name) + self.dir = dir_name def __del__(self): self._close() From be0bb568c92069a1ef3250307c818a46702255de Mon Sep 17 00:00:00 2001 From: 0xNullPath Date: Wed, 24 Sep 2025 08:15:51 +0800 Subject: [PATCH 180/185] [Model] Support SeedOss Reason Parser (#24263) Signed-off-by: Yan Lu Co-authored-by: Michael Goin --- .../test_base_thinking_reasoning_parser.py | 392 ++++++++++++++++++ .../test_seedoss_reasoning_parser.py | 237 +++++++++++ vllm/reasoning/__init__.py | 4 + vllm/reasoning/abs_reasoning_parsers.py | 8 +- vllm/reasoning/basic_parsers.py | 156 +++++++ .../reasoning/deepseek_r1_reasoning_parser.py | 160 ++----- vllm/reasoning/mistral_reasoning_parser.py | 19 +- vllm/reasoning/qwen3_reasoning_parser.py | 129 ++---- vllm/reasoning/seedoss_reasoning_parser.py | 28 ++ 9 files changed, 887 insertions(+), 246 deletions(-) create mode 100644 tests/reasoning/test_base_thinking_reasoning_parser.py create mode 100644 tests/reasoning/test_seedoss_reasoning_parser.py create mode 100644 vllm/reasoning/basic_parsers.py create mode 100644 vllm/reasoning/seedoss_reasoning_parser.py diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py new file mode 100644 index 0000000000000..6a939dcfc2c9c --- /dev/null +++ b/tests/reasoning/test_base_thinking_reasoning_parser.py @@ -0,0 +1,392 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser + + +# Create a concrete test implementation of BaseThinkingReasoningParser +class TestThinkingReasoningParser(BaseThinkingReasoningParser): + """Test implementation of BaseThinkingReasoningParser.""" + + @property + def start_token(self) -> str: + return "" + + @property + def end_token(self) -> str: + return "" + + +class TestThinkingReasoningParserAlt(BaseThinkingReasoningParser): + """Alternative test implementation with different tokens.""" + + @property + def start_token(self) -> str: + return "" + + @property + def end_token(self) -> str: + return "" + + +# Use a test model +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + +@pytest.fixture(scope="module") +def test_tokenizer(): + tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + # Add custom test tokens + test_tokens = ["", "", "", ""] + existing_tokens = set(tokenizer.get_vocab().keys()) + new_tokens = [ + token for token in test_tokens if token not in existing_tokens + ] + if new_tokens: + tokenizer.add_tokens(new_tokens) + return tokenizer + + +class TestBaseThinkingReasoningParserInit: + """ + Test initialization and basic properties of + BaseThinkingReasoningParser. 
+ """ + + def test_successful_initialization(self, test_tokenizer): + """Test successful initialization with valid tokens.""" + parser = TestThinkingReasoningParser(test_tokenizer) + assert parser.start_token == "" + assert parser.end_token == "" + assert parser.start_token_id is not None + assert parser.end_token_id is not None + + def test_initialization_with_missing_tokenizer(self): + """Test that initialization fails without tokenizer.""" + with pytest.raises(ValueError, match="model tokenizer must be passed"): + TestThinkingReasoningParser(None) + + def test_initialization_with_missing_tokens(self, test_tokenizer): + """Test that initialization fails when tokens are not in vocabulary.""" + + # Create a parser with tokens not in vocabulary + class MissingTokenParser(BaseThinkingReasoningParser): + + @property + def start_token(self) -> str: + return "" + + @property + def end_token(self) -> str: + return "" + + with pytest.raises(RuntimeError, + match="could not locate think start/end tokens"): + MissingTokenParser(test_tokenizer) + + def test_initialization_with_empty_tokens(self, test_tokenizer): + """Test that initialization fails with empty token strings.""" + + class EmptyTokenParser(BaseThinkingReasoningParser): + + @property + def start_token(self) -> str: + return "" + + @property + def end_token(self) -> str: + return "" + + with pytest.raises(ValueError, + match="start_token and end_token must be defined"): + EmptyTokenParser(test_tokenizer) + + +class TestBaseThinkingReasoningParserMethods: + """Test the methods of BaseThinkingReasoningParser.""" + + def test_is_reasoning_end(self, test_tokenizer): + """Test the is_reasoning_end method.""" + parser = TestThinkingReasoningParser(test_tokenizer) + end_token_id = parser.end_token_id + + # Test with end token present + assert parser.is_reasoning_end([1, 2, end_token_id, 4]) is True + + # Test without end token + assert parser.is_reasoning_end([1, 2, 3, 4]) is False + + # Test with empty list + assert parser.is_reasoning_end([]) is False + + def test_extract_content_ids(self, test_tokenizer): + """Test the extract_content_ids method.""" + parser = TestThinkingReasoningParser(test_tokenizer) + end_token_id = parser.end_token_id + + # Test with end token in the middle + input_ids = [1, 2, end_token_id, 4, 5] + content_ids = parser.extract_content_ids(input_ids) + assert content_ids == [4, 5] + + # Test with end token at the end + input_ids = [1, 2, 3, end_token_id] + content_ids = parser.extract_content_ids(input_ids) + assert content_ids == [] + + # Test without end token + input_ids = [1, 2, 3, 4] + content_ids = parser.extract_content_ids(input_ids) + assert content_ids == [] + + # Test with end token as last element (should not extract) + input_ids = [1, 2, 3, end_token_id] + content_ids = parser.extract_content_ids(input_ids) + assert content_ids == [] + + +class TestBaseThinkingReasoningParserExtraction: + """Test reasoning content extraction methods.""" + + def test_extract_reasoning_content_with_both_tokens(self, test_tokenizer): + """Test extraction when both start and end tokens are present.""" + parser = TestThinkingReasoningParser(test_tokenizer) + request = ChatCompletionRequest(messages=[], model="test-model") + + model_output = ("This is reasoning" + "This is content") + reasoning, content = parser.extract_reasoning_content( + model_output, request) + + assert reasoning == "This is reasoning" + assert content == "This is content" + + def test_extract_reasoning_content_only_end_token(self, test_tokenizer): + """Test 
extraction when only end token is present.""" + parser = TestThinkingReasoningParser(test_tokenizer) + request = ChatCompletionRequest(messages=[], model="test-model") + + model_output = ("This is reasoningThis is content") + reasoning, content = parser.extract_reasoning_content( + model_output, request) + + assert reasoning == "This is reasoning" + assert content == "This is content" + + def test_extract_reasoning_content_no_end_token(self, test_tokenizer): + """Test extraction when no end token is present.""" + parser = TestThinkingReasoningParser(test_tokenizer) + request = ChatCompletionRequest(messages=[], model="test-model") + + model_output = "This is just content" + reasoning, content = parser.extract_reasoning_content( + model_output, request) + + assert reasoning == "This is just content" + assert content is None + + def test_extract_reasoning_content_empty_output(self, test_tokenizer): + """Test extraction with empty output.""" + parser = TestThinkingReasoningParser(test_tokenizer) + request = ChatCompletionRequest(messages=[], model="test-model") + + model_output = "" + reasoning, content = parser.extract_reasoning_content( + model_output, request) + + assert reasoning == "" + assert content is None + + def test_extract_reasoning_content_only_tokens(self, test_tokenizer): + """Test extraction with only tokens and no content.""" + parser = TestThinkingReasoningParser(test_tokenizer) + request = ChatCompletionRequest(messages=[], model="test-model") + + model_output = ("") + reasoning, content = parser.extract_reasoning_content( + model_output, request) + + assert reasoning == "" + assert content is None + + +class TestBaseThinkingReasoningParserStreaming: + """Test streaming functionality of BaseThinkingReasoningParser.""" + + @pytest.mark.parametrize("streaming", [True, False]) + def test_simple_reasoning_extraction(self, test_tokenizer, streaming): + """ + Test basic reasoning extraction in both + streaming and non-streaming modes. 
+ """ + parser = TestThinkingReasoningParser(test_tokenizer) + + model_output = [ + "", "Some ", "reasoning ", "content", "", + "Final ", "answer" + ] + + reasoning, content = run_reasoning_extraction(parser, + model_output, + streaming=streaming) + + assert reasoning == "Some reasoning content" + assert content == "Final answer" + + def test_streaming_with_incremental_deltas(self, test_tokenizer): + """Test streaming processing with small incremental deltas.""" + parser = TestThinkingReasoningParser(test_tokenizer) + + deltas = [ + "", + "Some ", + "reasoning ", + "content", + "", + "Final ", + "answer", + ] + + reasoning, content = run_reasoning_extraction(parser, + deltas, + streaming=True) + + assert reasoning == "Some reasoning content" + assert content == "Final answer" + + def test_streaming_with_start_token(self, test_tokenizer): + """Test streaming with start token included.""" + parser = TestThinkingReasoningParser(test_tokenizer) + + deltas = [ + "", + "Some ", + "reasoning", + "", + "Answer", + ] + + reasoning, content = run_reasoning_extraction(parser, + deltas, + streaming=True) + + assert reasoning == "Some reasoning" + assert content == "Answer" + + def test_streaming_no_end_token(self, test_tokenizer): + """Test streaming when no end token is encountered.""" + parser = TestThinkingReasoningParser(test_tokenizer) + + deltas = [ + "", + "Some ", + "reasoning ", + "without ", + "end", + ] + + reasoning, content = run_reasoning_extraction(parser, + deltas, + streaming=True) + + assert reasoning == "Some reasoning without end" + assert content is None + + def test_streaming_only_end_token(self, test_tokenizer): + """Test streaming when only end token appears.""" + parser = TestThinkingReasoningParser(test_tokenizer) + + deltas = [ + "", + "Reasoning ", + "content", + "", + "Final", + ] + + reasoning, content = run_reasoning_extraction(parser, + deltas, + streaming=True) + + assert reasoning == "Reasoning content" + assert content == "Final" + + +class TestBaseThinkingReasoningParserMultipleImplementations: + """ + Test that multiple implementations of + BaseThinkingReasoningParser work correctly. + """ + + def test_different_token_implementations(self, test_tokenizer): + """ + Test that different implementations + with different tokens work independently. 
+ """ + parser1 = TestThinkingReasoningParser(test_tokenizer) + parser2 = TestThinkingReasoningParserAlt(test_tokenizer) + + # Test parser1 + model_output1 = ("Reasoning1Content1") + reasoning1, content1 = run_reasoning_extraction( + parser1, [model_output1]) + assert reasoning1 == "Reasoning1" + assert content1 == "Content1" + + # Test parser2 + model_output2 = "Reasoning2Content2" + reasoning2, content2 = run_reasoning_extraction( + parser2, [model_output2]) + assert reasoning2 == "Reasoning2" + assert content2 == "Content2" + + # Verify tokens are different + assert parser1.start_token != parser2.start_token + assert parser1.end_token != parser2.end_token + assert parser1.start_token_id != parser2.start_token_id + assert parser1.end_token_id != parser2.end_token_id + + +class TestBaseThinkingReasoningParserEdgeCases: + """Test edge cases and error conditions.""" + + def test_multiple_end_tokens(self, test_tokenizer): + """Test behavior with multiple end tokens.""" + parser = TestThinkingReasoningParser(test_tokenizer) + + model_output = ("FirstMiddleLast") + reasoning, content = run_reasoning_extraction(parser, [model_output]) + + # Should stop at first end token + assert reasoning == "First" + assert content == "MiddleLast" + + def test_nested_tokens(self, test_tokenizer): + """Test behavior with nested-like token patterns.""" + parser = TestThinkingReasoningParser(test_tokenizer) + + model_output = ("Outer" + "InnerContent") + reasoning, content = run_reasoning_extraction(parser, [model_output]) + + # Should process normally, start from first start token + assert reasoning == "OuterInner" + assert content == "Content" + + def test_malformed_tokens(self, test_tokenizer): + """Test behavior with malformed token-like strings.""" + parser = TestThinkingReasoningParser(test_tokenizer) + + model_output = ("Not a real token" + "Content") + reasoning, content = run_reasoning_extraction(parser, [model_output]) + + # Should treat as regular content since tokens don't match exactly + assert reasoning == ("Not a real token" + "Content") + assert content is None diff --git a/tests/reasoning/test_seedoss_reasoning_parser.py b/tests/reasoning/test_seedoss_reasoning_parser.py new file mode 100644 index 0000000000000..bb5dc0f4ffe4d --- /dev/null +++ b/tests/reasoning/test_seedoss_reasoning_parser.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, cast + +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +parser_name = "seed_oss" +start_token = "" +end_token = "" + +# Use a test model that contains our custom tokens +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + +@pytest.fixture(scope="module") +def seedoss_tokenizer(): + tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + # Add custom SeedOSS tokens if they don't exist + if start_token not in tokenizer.get_vocab(): + tokenizer.add_tokens([start_token, end_token]) + return tokenizer + + +SIMPLE_REASONING: dict[str, Any] = { + "output": "This is a reasoning sectionThis is the rest", + "reasoning_content": "This is a reasoning section", + "content": "This is the rest", + "is_reasoning_end": True, +} +COMPLETE_REASONING: dict[str, Any] = { + "output": "This is a reasoning section", + "reasoning_content": "This is a reasoning section", + "content": None, + "is_reasoning_end": 
True, +} +NO_CONTENT: dict[str, Any] = { + "output": "This is content", + "reasoning_content": "This is content", + "content": None, + "is_reasoning_end": False, +} +NO_REASONING_STREAMING: dict[str, Any] = { + "output": "This is a reasoning section", + "reasoning_content": "This is a reasoning section", + "content": None, + "is_reasoning_end": False, +} +MULTIPLE_LINES: dict[str, Any] = { + "output": "This\nThatThis is the rest\nThat", + "reasoning_content": "This\nThat", + "content": "This is the rest\nThat", + "is_reasoning_end": True, +} +WITH_START_TOKEN: dict[str, Any] = { + "output": ("This is a reasoning section" + "This is the rest"), + "reasoning_content": + "This is a reasoning section", + "content": + "This is the rest", + "is_reasoning_end": + True, +} +ONLY_END_TOKEN: dict[str, Any] = { + "output": "Some reasoningThis is the rest", + "reasoning_content": "Some reasoning", + "content": "This is the rest", + "is_reasoning_end": True, +} +NO_TOKENS: dict[str, Any] = { + "output": "This is just content without any reasoning tokens", + "reasoning_content": "This is just content without any reasoning tokens", + "content": None, + "is_reasoning_end": False, +} + + +def test_seedoss_reasoning_parser_creation(seedoss_tokenizer): + """Test that the SeedOSS reasoning parser can be created and registered.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + assert isinstance(parser, ReasoningParser) + assert parser.start_token == start_token + assert parser.end_token == end_token + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_simple_reasoning(seedoss_tokenizer, streaming): + """Test basic reasoning extraction with both tokens.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + reasoning, content = run_reasoning_extraction( + parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming) + + assert reasoning == SIMPLE_REASONING["reasoning_content"] + assert content == SIMPLE_REASONING["content"] + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_complete_reasoning(seedoss_tokenizer, streaming): + """Test reasoning extraction when there's no content after reasoning.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + reasoning, content = run_reasoning_extraction( + parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming) + + assert reasoning == COMPLETE_REASONING["reasoning_content"] + assert content == COMPLETE_REASONING["content"] + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_content(seedoss_tokenizer, streaming): + """Test when there's no end token - everything is reasoning content.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + reasoning, content = run_reasoning_extraction( + parser, [cast(str, NO_CONTENT["output"])], streaming=streaming) + + assert reasoning == NO_CONTENT["reasoning_content"] + assert content == NO_CONTENT["content"] + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_multiple_lines(seedoss_tokenizer, streaming): + """Test reasoning extraction with multiline content.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + reasoning, content = run_reasoning_extraction( + parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming) 
+ + assert reasoning == MULTIPLE_LINES["reasoning_content"] + assert content == MULTIPLE_LINES["content"] + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_with_start_token(seedoss_tokenizer, streaming): + """Test reasoning extraction with both start and end tokens.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + reasoning, content = run_reasoning_extraction( + parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming) + + assert reasoning == WITH_START_TOKEN["reasoning_content"] + assert content == WITH_START_TOKEN["content"] + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_only_end_token(seedoss_tokenizer, streaming): + """ + Test reasoning extraction with only end token + (SeedOSS typical behavior). + """ + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + reasoning, content = run_reasoning_extraction( + parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming) + + assert reasoning == ONLY_END_TOKEN["reasoning_content"] + assert content == ONLY_END_TOKEN["content"] + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tokens(seedoss_tokenizer, streaming): + """Test when there are no reasoning tokens at all.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + reasoning, content = run_reasoning_extraction( + parser, [cast(str, NO_TOKENS["output"])], streaming=streaming) + + assert reasoning == NO_TOKENS["reasoning_content"] + assert content == NO_TOKENS["content"] + + +def test_is_reasoning_end(seedoss_tokenizer): + """Test the is_reasoning_end method.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + # Test with end token present + end_token_id = parser.end_token_id + assert parser.is_reasoning_end([1, 2, end_token_id, 4]) is True + + # Test without end token + assert parser.is_reasoning_end([1, 2, 3, 4]) is False + + +def test_extract_content_ids(seedoss_tokenizer): + """Test the extract_content_ids method.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + end_token_id = parser.end_token_id + + # Test with end token in the middle + input_ids = [1, 2, end_token_id, 4, 5] + content_ids = parser.extract_content_ids(input_ids) + assert content_ids == [4, 5] + + # Test with end token at the end + input_ids = [1, 2, 3, end_token_id] + content_ids = parser.extract_content_ids(input_ids) + assert content_ids == [] + + # Test without end token + input_ids = [1, 2, 3, 4] + content_ids = parser.extract_content_ids(input_ids) + assert content_ids == [] + + +def test_streaming_delta_processing(seedoss_tokenizer): + """Test streaming processing with small deltas.""" + parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name) + parser = parser_cls(seedoss_tokenizer) + + # Test streaming with incremental tokens + deltas = [ + "Some ", "reasoning ", "content", "", "Final ", "answer" + ] + + reasoning, content = run_reasoning_extraction(parser, + deltas, + streaming=True) + + assert reasoning == "Some reasoning content" + assert content == "Final answer" diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index b987adeb6428f..3c8a9c6ae0d33 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager +from .basic_parsers import BaseThinkingReasoningParser from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser from .gptoss_reasoning_parser import GptOssReasoningParser @@ -9,10 +10,12 @@ from .granite_reasoning_parser import GraniteReasoningParser from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser from .mistral_reasoning_parser import MistralReasoningParser from .qwen3_reasoning_parser import Qwen3ReasoningParser +from .seedoss_reasoning_parser import SeedOSSReasoningParser from .step3_reasoning_parser import Step3ReasoningParser __all__ = [ "ReasoningParser", + "BaseThinkingReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser", "GraniteReasoningParser", @@ -22,4 +25,5 @@ __all__ = [ "MistralReasoningParser", "Step3ReasoningParser", "GptOssReasoningParser", + "SeedOSSReasoningParser", ] diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index df9e84163f16c..39b08ec111073 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -7,7 +7,7 @@ import os from abc import abstractmethod from collections.abc import Sequence from functools import cached_property -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Union from vllm.logger import init_logger from vllm.utils import import_from_path, is_list_of @@ -77,7 +77,7 @@ class ReasoningParser: self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest], - ) -> tuple[Optional[str], Optional[str]]: + ) -> tuple[str | None, str | None]: """ Extract reasoning content from a complete model-generated string. @@ -135,7 +135,7 @@ class ReasoningParserManager: def _register_module( cls, module: type, - module_name: Optional[Union[str, list[str]]] = None, + module_name: Union[str, list[str]] | None = None, force: bool = True, ) -> None: if not issubclass(module, ReasoningParser): @@ -155,7 +155,7 @@ class ReasoningParserManager: @classmethod def register_module( cls, - name: Optional[Union[str, list[str]]] = None, + name: Union[str, list[str]] | None = None, force: bool = True, module: Union[type, None] = None, ) -> Union[type, Callable]: diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py new file mode 100644 index 0000000000000..03cb882c26939 --- /dev/null +++ b/vllm/reasoning/basic_parsers.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import abstractmethod +from collections.abc import Sequence +from typing import Optional, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage, ResponsesRequest) +from vllm.reasoning.abs_reasoning_parsers import ReasoningParser +from vllm.transformers_utils.tokenizer import AnyTokenizer + + +class BaseThinkingReasoningParser(ReasoningParser): + """ + Base class for reasoning parsers that use thinking tokens. + + This class provides common functionality for parsers that use start and end + tokens to delimit reasoning content ( + e.g., ..., ...). + + Subclasses must implement the start and end tokens via abstract + properties. 
+ """ + + @property + @abstractmethod + def start_token(self) -> str: + """The token that starts reasoning content.""" + raise NotImplementedError + + @property + @abstractmethod + def end_token(self) -> str: + """The token that ends reasoning content.""" + raise NotImplementedError + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ReasoningParser " + "constructor during construction.") + + if not self.start_token or not self.end_token: + raise ValueError( + "start_token and end_token must be defined in subclasses") + + self.start_token_id = self.vocab.get(self.start_token) + self.end_token_id = self.vocab.get(self.end_token) + if self.start_token_id is None or self.end_token_id is None: + raise RuntimeError( + f"{self.__class__.__name__} reasoning parser could not locate " + "think start/end tokens in the tokenizer!") + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + return self.end_token_id in input_ids + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + """ + Extract the content after the end tokens + """ + if self.end_token_id not in input_ids[:-1]: + return [] + else: + return input_ids[input_ids.index(self.end_token_id) + 1:] + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Extract reasoning content from a delta message. + Handles streaming output where previous + delta = current. + Uses token IDs for faster processing. + """ + # Skip single special tokens + if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ + self.start_token_id, self.end_token_id + ]): + return None + + # Check if start token is present in previous or delta. + # Keep compatibility with models that don't generate start tokens. 
+ if self.start_token_id in previous_token_ids: + if self.end_token_id in delta_token_ids: + # start token in previous, end token in delta, + # extract reasoning content + end_index = delta_text.find(self.end_token) + reasoning_content = delta_text[:end_index] + content = delta_text[end_index + len(self.end_token):] + return DeltaMessage( + reasoning_content=reasoning_content, + content=content if content else None, + ) + elif self.end_token_id in previous_token_ids: + # start token in previous, end token in previous, + # reasoning content continues + return DeltaMessage(content=delta_text) + else: + # start token in previous, no end token in previous or delta, + # reasoning content continues + return DeltaMessage(reasoning_content=delta_text) + elif self.start_token_id in delta_token_ids: + if self.end_token_id in delta_token_ids: + # start token in delta, end token in delta, + # extract reasoning content + start_index = delta_text.find(self.start_token) + end_index = delta_text.find(self.end_token) + reasoning_content = delta_text[start_index + + len(self.start_token):end_index] + content = delta_text[end_index + len(self.end_token):] + return DeltaMessage( + reasoning_content=reasoning_content, + content=content if content else None, + ) + else: + # start token in delta, no end token in delta, + # reasoning content continues + return DeltaMessage(reasoning_content=delta_text) + else: + # not find thinking start token + return DeltaMessage(content=delta_text) + + def extract_reasoning_content( + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from the model output. + + This is the base implementation that works for most models. + Subclasses can override this method for specific behavior. + """ + # Check if the start token is present in the model output, remove it + # if it is present. + model_output_parts = model_output.partition(self.start_token) + model_output = model_output_parts[2] if model_output_parts[ + 1] else model_output_parts[0] + + # For models that may not generate start token, + # assume the reasoning content is always at the start. 
+ if self.end_token not in model_output: + return model_output, None + else: + reasoning_content, _, content = model_output.partition( + self.end_token) + # If generation stops right after end-of-think, return null content + final_content = content or None + return reasoning_content, final_content diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py index 1a5ca46a60f1d..76d2959e1c9a4 100644 --- a/vllm/reasoning/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -2,20 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Optional, Union +from typing import Union -from transformers import PreTrainedTokenizerBase - -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) -from vllm.logger import init_logger -from vllm.reasoning import ReasoningParser, ReasoningParserManager - -logger = init_logger(__name__) +from vllm.entrypoints.openai.protocol import DeltaMessage +from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager +from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser @ReasoningParserManager.register_module("deepseek_r1") -class DeepSeekR1ReasoningParser(ReasoningParser): +class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser): """ Reasoning parser for DeepSeek R1 model. @@ -23,38 +18,15 @@ class DeepSeekR1ReasoningParser(ReasoningParser): text. This parser extracts the reasoning content from the model output. """ - start_token_id: int - end_token_id: int + @property + def start_token(self) -> str: + """The token that starts reasoning content.""" + return "" - start_token: str = "" - end_token: str = "" - - def __init__(self, tokenizer: PreTrainedTokenizerBase): - super().__init__(tokenizer) - - if not self.model_tokenizer: - raise ValueError( - "The model tokenizer must be passed to the ReasoningParser " - "constructor during construction.") - - self.start_token_id = self.vocab.get(self.start_token) - self.end_token_id = self.vocab.get(self.end_token) - if self.start_token_id is None or self.end_token_id is None: - raise RuntimeError( - "DeepSeek R1 reasoning parser could not locate think start/end " - "tokens in the tokenizer!") - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - return self.end_token_id in input_ids - - def extract_content_ids(self, input_ids: list[int]) -> list[int]: - """ - Extract the content after the end tokens - """ - if self.end_token_id not in input_ids[:-1]: - return [] - else: - return input_ids[input_ids.index(self.end_token_id) + 1:] + @property + def end_token(self) -> str: + """The token that ends reasoning content.""" + return "" def extract_reasoning_content_streaming( self, @@ -65,63 +37,18 @@ class DeepSeekR1ReasoningParser(ReasoningParser): current_token_ids: Sequence[int], delta_token_ids: Sequence[int], ) -> Union[DeltaMessage, None]: - """ - Extract reasoning content from a delta message. - Handles streaming output where previous + delta = current. - Uses token IDs for faster processing. - For text abcxyz: - - 'abc' goes to reasoning_content - - 'xyz' goes to content - """ - # Skip single special tokens - if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ - self.start_token_id, self.end_token_id - ]): - return None - - # Check if is present in previous or delta. - # Keep compatibility with models that don't generate tokens. 
- if self.start_token_id in previous_token_ids: + ret = super().extract_reasoning_content_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + ) + if (ret is not None and self.start_token_id not in previous_token_ids + and self.start_token_id not in delta_token_ids): if self.end_token_id in delta_token_ids: - # in previous, in delta, - # extract reasoning content - end_index = delta_text.find(self.end_token) - reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.end_token):] - return DeltaMessage( - reasoning_content=reasoning_content, - content=content if content else None, - ) - elif self.end_token_id in previous_token_ids: - # in previous, in previous, - # reasoning content continues - return DeltaMessage(content=delta_text) - else: - # in previous, no in previous or delta, - # reasoning content continues - return DeltaMessage(reasoning_content=delta_text) - elif self.start_token_id in delta_token_ids: - if self.end_token_id in delta_token_ids: - # in delta, in delta, extract reasoning content - start_index = delta_text.find(self.start_token) - end_index = delta_text.find(self.end_token) - reasoning_content = delta_text[start_index + - len(self.start_token):end_index] - content = delta_text[end_index + len(self.end_token):] - return DeltaMessage( - reasoning_content=reasoning_content, - content=content if content else None, - ) - else: - # in delta, no in delta, - # reasoning content continues - return DeltaMessage(reasoning_content=delta_text) - else: - # No in previous or delta, also need to check for . - # Because the model may have generated without - # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f - if self.end_token_id in delta_token_ids: - # in delta with more tokens, + # end token in delta with more tokens, # extract reasoning content and content end_index = delta_text.find(self.end_token) reasoning_content = delta_text[:end_index] @@ -131,43 +58,10 @@ class DeepSeekR1ReasoningParser(ReasoningParser): content=content if content else None, ) elif self.end_token_id in previous_token_ids: - # in previous, thinking content ends + # end token in previous, thinking content ends return DeltaMessage(content=delta_text) else: - # no in previous or delta, reasoning content continues + # no end token in previous or delta, reasoning content continues return DeltaMessage(reasoning_content=delta_text) - def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest - ) -> tuple[Optional[str], Optional[str]]: - """ - Extract reasoning content from the model output. - - For text abcxyz: - - 'abc' goes to reasoning_content - - 'xyz' goes to content - - Returns: - tuple[Optional[str], Optional[str]]: reasoning content and content - """ - - # Check if the start token is present in the model output, remove it - # if it is present. - model_output_parts = model_output.partition(self.start_token) - model_output = model_output_parts[2] if model_output_parts[ - 1] else model_output_parts[0] - - # DeepSeek R1 doesn't generate now. - # Thus we assume the reasoning content is always at the start. - # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f - if self.end_token not in model_output: - return model_output, None - else: - reasoning_content, _, content = model_output.partition( - self.end_token) - # If the end token is not found, return the model output as is. 
- # It should not happen since we already checked for the presence - # of the end token. - # If generation stops right after end-of-think, return null content - final_content = content or None - return reasoning_content, final_content + return ret diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index 6c707a4079fa0..5cb54e6acbb31 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from functools import cached_property + from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning.deepseek_r1_reasoning_parser import ( @@ -31,11 +33,6 @@ class MistralReasoningParser(DeepSeekR1ReasoningParser): "The model tokenizer must be passed to the ReasoningParser " "constructor during construction.") - from mistral_common.tokens.tokenizers.base import SpecialTokens - - self.start_token = SpecialTokens.begin_think - self.end_token = SpecialTokens.end_think - self.start_token_id = tokenizer.tokenizer.get_control_token( self.start_token) self.end_token_id = tokenizer.tokenizer.get_control_token( @@ -45,3 +42,15 @@ class MistralReasoningParser(DeepSeekR1ReasoningParser): raise RuntimeError( "Mistral reasoning parser could not locate think start/end " "tokens in the tokenizer!") + + @cached_property + def start_token(self) -> str: + """The token that starts reasoning content.""" + from mistral_common.tokens.tokenizers.base import SpecialTokens + return SpecialTokens.begin_think + + @cached_property + def end_token(self) -> str: + """The token that ends reasoning content.""" + from mistral_common.tokens.tokenizers.base import SpecialTokens + return SpecialTokens.end_think diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py index 61bafc724c17f..3e3c7f32796bd 100644 --- a/vllm/reasoning/qwen3_reasoning_parser.py +++ b/vllm/reasoning/qwen3_reasoning_parser.py @@ -1,21 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Sequence from typing import Optional, Union -from transformers import PreTrainedTokenizerBase - from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) -from vllm.logger import init_logger -from vllm.reasoning import ReasoningParser, ReasoningParserManager - -logger = init_logger(__name__) + ResponsesRequest) +from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager +from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser @ReasoningParserManager.register_module("qwen3") -class Qwen3ReasoningParser(ReasoningParser): +class Qwen3ReasoningParser(BaseThinkingReasoningParser): """ Reasoning parser for the Qwen3 model. @@ -26,100 +21,25 @@ class Qwen3ReasoningParser(ReasoningParser): output. 
""" - def __init__(self, tokenizer: PreTrainedTokenizerBase): - super().__init__(tokenizer) - self.think_start_token = "" - self.think_end_token = "" + @property + def start_token(self) -> str: + """The token that starts reasoning content.""" + return "" - if not self.model_tokenizer: - raise ValueError( - "The model tokenizer must be passed to the ReasoningParser " - "constructor during construction.") - - self.think_start_token_id = self.vocab.get(self.think_start_token) - self.think_end_token_id = self.vocab.get(self.think_end_token) - if (self.think_start_token_id is None - or self.think_end_token_id is None): - raise RuntimeError( - "Qwen3 reasoning parser could not locate think start/end " - "tokens in the tokenizer!") - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - return self.think_end_token_id in input_ids - - def extract_content_ids(self, input_ids: list[int]) -> list[int]: - """ - Extract the content after the end tokens - """ - if self.think_end_token_id not in input_ids[:-1]: - return [] - else: - return input_ids[input_ids.index(self.think_end_token_id) + 1:] - - def extract_reasoning_content_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - ) -> Union[DeltaMessage, None]: - """ - Extract reasoning content from a delta message. - Handles streaming output where previous + delta = current. - Uses token IDs for faster processing. - For text abcxyz: - - 'abc' goes to reasoning_content - - 'xyz' goes to content - """ - # Skip single special tokens - if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ - self.think_start_token_id, self.think_end_token_id - ]): - return None - - if self.think_start_token_id in previous_token_ids: - if self.think_end_token_id in delta_token_ids: - # in previous, in delta, - # extract reasoning content - end_index = delta_text.find(self.think_end_token) - reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.think_end_token):] - return DeltaMessage(reasoning_content=reasoning_content, - content=content if content else None) - elif self.think_end_token_id in previous_token_ids: - # in previous, in previous, - # reasoning content continues - return DeltaMessage(content=delta_text) - else: - # in previous, no in previous or delta, - # reasoning content continues - return DeltaMessage(reasoning_content=delta_text) - elif self.think_start_token_id in delta_token_ids: - if self.think_end_token_id in delta_token_ids: - # in delta, in delta, extract reasoning content - start_index = delta_text.find(self.think_start_token) - end_index = delta_text.find(self.think_end_token) - reasoning_content = delta_text[start_index + - len(self.think_start_token - ):end_index] - content = delta_text[end_index + len(self.think_end_token):] - return DeltaMessage(reasoning_content=reasoning_content, - content=content if content else None) - else: - # in delta, no in delta, - # reasoning content continues - return DeltaMessage(reasoning_content=delta_text) - else: - # thinking is disabled, just content - return DeltaMessage(content=delta_text) + @property + def end_token(self) -> str: + """The token that ends reasoning content.""" + return "" def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content 
from the model output. + + Qwen3 has stricter requirements - it needs both start and end tokens + to be present, unlike other models that work with just the end token. For text <think>abc</think>xyz: - 'abc' goes to reasoning_content @@ -129,23 +49,24 @@ class Qwen3ReasoningParser(ReasoningParser): tuple[Optional[str], Optional[str]]: reasoning content and content """ - # Check if the model output contains the <think> and </think> tokens. - if (self.think_start_token not in model_output - or self.think_end_token not in model_output): + # Check if the model output contains both <think> and </think> tokens. + if (self.start_token not in model_output + or self.end_token not in model_output): return None, model_output + # Check if the <think> is present in the model output, remove it # if it is present. - model_output_parts = model_output.partition(self.think_start_token) + model_output_parts = model_output.partition(self.start_token) model_output = model_output_parts[2] if model_output_parts[ 1] else model_output_parts[0] + # Check if the model output contains the </think> tokens. # If the end token is not found, return the model output as is. - if self.think_end_token not in model_output: + if self.end_token not in model_output: return None, model_output # Extract reasoning content from the model output. - reasoning_content, _, content = model_output.partition( - self.think_end_token) + reasoning_content, _, content = model_output.partition(self.end_token) final_content = content or None return reasoning_content, final_content diff --git a/vllm/reasoning/seedoss_reasoning_parser.py b/vllm/reasoning/seedoss_reasoning_parser.py new file mode 100644 index 0000000000000..5f4bbbf1557eb --- /dev/null +++ b/vllm/reasoning/seedoss_reasoning_parser.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager +from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser + + +@ReasoningParserManager.register_module("seed_oss") +class SeedOSSReasoningParser(BaseThinkingReasoningParser): + """ + Reasoning parser for SeedOSS model. + + The SeedOSS model uses <seed:think>...</seed:think> tokens to + denote reasoning content text. This parser extracts + the reasoning content from the model output. + Similar to DeepSeek R1, it supports cases + where the model doesn't generate the start token.
+ """ + + @property + def start_token(self) -> str: + """The token that starts reasoning content.""" + return "" + + @property + def end_token(self) -> str: + """The token that ends reasoning content.""" + return "" From d06b5a95cbd5114478ca87dabac3156a33bb6bca Mon Sep 17 00:00:00 2001 From: baxingpiaochong <771405853@qq.com> Date: Wed, 24 Sep 2025 08:19:04 +0800 Subject: [PATCH 181/185] [V1][Metrics] Add per-request TPOT histogram (#24015) Signed-off-by: baxingpiaochong <771405853@qq.com> --- vllm/v1/metrics/loggers.py | 15 +++++++++++++++ vllm/v1/metrics/stats.py | 10 +++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f0076b2d81dbf..52264e41e7a18 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -411,6 +411,19 @@ class PrometheusStatLogger(StatLoggerBase): self.histogram_inter_token_latency = make_per_engine( histogram_inter_token_latency, engine_indexes, model_name) + histogram_request_time_per_output_token = self._histogram_cls( + name="vllm:request_time_per_output_token_seconds", + documentation= + "Histogram of time_per_output_token_seconds per request.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ], + labelnames=labelnames) + self.histogram_request_time_per_output_token = make_per_engine( + histogram_request_time_per_output_token, engine_indexes, + model_name) + request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 @@ -583,6 +596,8 @@ class PrometheusStatLogger(StatLoggerBase): finished_request.num_prompt_tokens) self.histogram_num_generation_tokens_request[engine_idx].observe( finished_request.num_generation_tokens) + self.histogram_request_time_per_output_token[engine_idx].observe( + finished_request.mean_time_per_output_token) if finished_request.max_tokens_param: self.histogram_max_tokens_request[engine_idx].observe( finished_request.max_tokens_param) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 0eff557336bc0..296c39e8cdb5c 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -86,6 +86,7 @@ class FinishedRequestStats: prefill_time: float = 0.0 inference_time: float = 0.0 decode_time: float = 0.0 + mean_time_per_output_token: float = 0.0 class IterationStats: @@ -177,6 +178,12 @@ class IterationStats: # Any preemptions during prefill or decode are included inference_time = req_stats.last_token_ts - req_stats.scheduled_ts + # Do not count the token generated by the prefill phase + mean_time_per_output_token = (decode_time / + (req_stats.num_generation_tokens - 1) + if req_stats.num_generation_tokens - + 1 > 0 else 0) + finished_req = \ FinishedRequestStats(finish_reason=finish_reason, e2e_latency=e2e_latency, @@ -186,7 +193,8 @@ class IterationStats: queued_time=queued_time, prefill_time=prefill_time, inference_time=inference_time, - decode_time=decode_time) + decode_time=decode_time, + mean_time_per_output_token=mean_time_per_output_token) self.finished_requests.append(finished_req) From 1983609239caaab24ab1ed2bfa2aa92e8c76c1b1 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Tue, 23 Sep 2025 20:19:56 -0400 Subject: [PATCH 182/185] [Bugfix] Use a separate FlashInfer workspace buffer for trtllm-gen (#25520) --- vllm/v1/attention/backends/flashinfer.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git 
a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index cb092aa74e7f1..1a5c171430bc6 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -48,6 +48,16 @@ FP4_DTYPE = torch.uint8 logger = init_logger(__name__) +trtllm_gen_workspace_buffer = None + + +def _get_trtllm_gen_workspace_buffer(): + global trtllm_gen_workspace_buffer + if trtllm_gen_workspace_buffer is None: + trtllm_gen_workspace_buffer = torch.zeros( + FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device='cuda') + return trtllm_gen_workspace_buffer + @triton.jit def _trtllm_prefill_attn_kvfp8_dequant( @@ -862,7 +872,7 @@ class FlashInferImpl(AttentionImpl): else: # prefill_query may be non-contiguous prefill_query = prefill_query.contiguous() - workspace_buffer = prefill_wrapper._float_workspace_buffer + workspace_buffer = _get_trtllm_gen_workspace_buffer() block_tables_prefill = attn_metadata.block_table_tensor[ num_decode_tokens:] seq_lens_prefill = attn_metadata.seq_lens[num_decode_tokens:] @@ -943,7 +953,7 @@ class FlashInferImpl(AttentionImpl): else: # decode_query may be non-contiguous decode_query = decode_query.contiguous() - workspace_buffer = decode_wrapper._float_workspace_buffer + workspace_buffer = _get_trtllm_gen_workspace_buffer() block_tables_decode = attn_metadata.\ block_table_tensor[:num_decode_tokens] seq_lens_decode = attn_metadata.seq_lens[:num_decode_tokens] From de94289a98d7ec52a5ef02719e01a1db8b505170 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 24 Sep 2025 01:30:26 +0100 Subject: [PATCH 183/185] [Core] Support weight_loader_v2 for `UnquantizedLinearMethod` (#23036) Signed-off-by: Kyle Sayers --- vllm/compilation/decorators.py | 43 ++++++++++++++++++++++++---- vllm/model_executor/layers/linear.py | 16 +++++++---- vllm/model_executor/parameter.py | 23 ++++++++++++++- 3 files changed, 70 insertions(+), 12 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index b7a6e23c1aa79..6e9a36a2b0b99 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -8,6 +8,7 @@ from unittest.mock import patch import torch import torch.nn as nn +from packaging import version from torch._dynamo.symbolic_convert import InliningInstructionTranslator from vllm.compilation.counter import compilation_counter @@ -300,13 +301,13 @@ def _support_torch_compile( logger.debug( "enable_cpp_symbolic_shape_guards config not available") - with patch.object(InliningInstructionTranslator, 'inline_call', - patched_inline_call), torch._dynamo.config.patch( - **dynamo_config_patches - ), maybe_use_cudagraph_partition_wrapper( - self.vllm_config): + with patch.object( + InliningInstructionTranslator, "inline_call", + patched_inline_call), torch._dynamo.config.patch( + **dynamo_config_patches + ), maybe_use_cudagraph_partition_wrapper( + self.vllm_config), _torch27_patch_tensor_subclasses(): output = self.compiled_callable(*args, **kwargs) - return output # usually, capturing the model once is enough, and then we can @@ -367,3 +368,33 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE and compilation_config.use_inductor_graph_partition): torch._inductor.utils.set_customized_partition_wrappers(None) + + +@contextlib.contextmanager +def _torch27_patch_tensor_subclasses(): + """ + Add support for using tensor subclasses (ie `BasevLLMParameter`, ect) when + using torch 2.7.0. 
This enables using weight_loader_v2 and the use of
+    `BasevLLMParameters` without having to replace them with regular tensors
+    before `torch.compile`-time.
+    """
+    from vllm.model_executor.parameter import (BasevLLMParameter,
+                                               ModelWeightParameter,
+                                               RowvLLMParameter,
+                                               _ColumnvLLMParameter)
+
+    def return_false(*args, **kwargs):
+        return False
+
+    if not (version.parse("2.7") <= version.parse(
+            torch.__version__) < version.parse("2.8")):
+        yield
+        return
+
+    with (torch._dynamo.config.patch("traceable_tensor_subclasses", [
+            BasevLLMParameter, ModelWeightParameter, _ColumnvLLMParameter,
+            RowvLLMParameter
+    ]),
+          patch("torch._dynamo.variables.torch.can_dispatch_torch_function",
+                return_false)):
+        yield
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 5bf96398bc710..df5bced6b2288 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
 # yapf: disable
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            BlockQuantScaleParameter,
+                                           ModelWeightParameter,
                                            PackedColumnParameter,
                                            PackedvLLMParameter,
                                            PerTensorScaleParameter,
@@ -34,6 +35,7 @@ from vllm.utils import GiB_bytes
 logger = init_logger(__name__)
 
 WEIGHT_LOADER_V2_SUPPORTED = [
+    "UnquantizedLinearMethod",
     "CompressedTensorsLinearMethod",
     "CompressedTensorsLinearTransformMethod",
     "BitBLASLinearMethod",
@@ -196,10 +198,14 @@ class UnquantizedLinearMethod(LinearMethodBase):
         # The amount of memory allocated for the weights is
         # sum(output_partition_sizes) * input_size_per_partition.
         try:
-            weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                           input_size_per_partition,
-                                           dtype=params_dtype),
-                               requires_grad=False)
+            weight_loader = extra_weight_attrs.pop("weight_loader")
+            weight = ModelWeightParameter(data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype),
+                                          input_dim=1,
+                                          output_dim=0,
+                                          weight_loader=weight_loader)
         except torch.cuda.OutOfMemoryError as e:
             logger.error("Failed to create unquantized linear weights: %s", e)
             if torch.cuda.is_available():
@@ -212,7 +218,7 @@ class UnquantizedLinearMethod(LinearMethodBase):
                 "Failed to create unquantized linear weights. "
                 "This may be caused by insufficient memory to allocate "
                 "the weight.") from e
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
 
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 03e5e5809b678..66add98dab443 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -61,9 +61,24 @@ class BasevLLMParameter(Parameter):
         self.tp_size = get_tensor_model_parallel_world_size()
 
     @property
-    def weight_loader(self):
+    def weight_loader(self) -> Callable:
+        # NOTE(@ksayers) some models such as mamba_mixer2 override the
+        # weight loader to support custom loading. In the future, model-specific
+        # weight loading should be implemented via Model.load_weights.
In the + # meantime, support deleting and overriding `weight_loader`` attribute + if self._weight_loader is None: + raise AttributeError(f"{self.__class__.__name__} weight_loader " + "attribute has been deleted") return self._weight_loader + @weight_loader.setter + def weight_loader(self, value: Callable): + self._weight_loader = value + + @weight_loader.deleter + def weight_loader(self): + self._weight_loader = None # type: ignore[assignment] + def _is_1d_and_scalar(self, loaded_weight: torch.Tensor): cond1 = self.data.ndim == 1 and self.data.numel() == 1 cond2 = loaded_weight.ndim == 0 and loaded_weight.numel() == 1 @@ -97,6 +112,12 @@ class BasevLLMParameter(Parameter): assert shard_id in qkv_idxs return qkv_idxs[shard_id] + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + return super().__torch_function__(func, types, args, kwargs) + class _ColumnvLLMParameter(BasevLLMParameter): """ From bf68fd76a91bd00f1693a453368688540bf22305 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:42:48 -0400 Subject: [PATCH 184/185] [Compile] Fix AMD Compile Error (#25518) Signed-off-by: yewentao256 --- csrc/quantization/activation_kernels.cu | 7 ++++++- csrc/rocm/attention.cu | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 9aa1411b4a25c..b94cc9ce5086c 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -23,9 +23,14 @@ typedef __hip_bfloat162 __nv_bfloat162; typedef __hip_bfloat16 __nv_bfloat16; typedef __hip_bfloat16_raw __nv_bfloat16_raw; - + #if defined(HIP_FP8_TYPE_OCP) typedef __hip_fp8_e4m3 __nv_fp8_e4m3; typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3; + #else +// ROCm 6.2 fallback: only *_fnuz types exist +typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3; +typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3; + #endif #endif #include "core/registration.h" diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index dac9df6048f2a..133a545045b12 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -25,6 +25,12 @@ #include "../attention/dtype_fp8.cuh" #include "../quantization/fp8/amd/quant_utils.cuh" +// ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent +#if !defined(HIP_FP8_TYPE_OCP) +using __hip_fp8_e4m3 = __hip_fp8_e4m3_fnuz; +using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz; +#endif + #if defined(__HIPCC__) && \ (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) #define __HIP__GFX9__ From 9df8da548e920f83d4f7710e481f106c2ac92d31 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 23 Sep 2025 21:09:43 -0400 Subject: [PATCH 185/185] [BugFix] Fix MLA assert with CUTLASS MLA (#25478) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/mla/common.py | 64 +++++++++++++++++------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index a177117a50bd1..e84f2d89943e7 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -204,7 +204,7 @@ from vllm.attention.backends.utils import get_mla_dims from vllm.attention.ops.common import cp_lse_ag_out_rs from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import get_flash_attn_version -from vllm.config import VllmConfig +from 
vllm.config import VllmConfig, get_current_vllm_config from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -436,6 +436,34 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): """ reorder_batch_threshold: ClassVar[int] = 1 + @staticmethod + def determine_chunked_prefill_workspace_size( + vllm_config: VllmConfig) -> int: + scheduler_config = vllm_config.scheduler_config + cache_config = vllm_config.cache_config + model_config = vllm_config.model_config + + chunked_prefill_workspace_size = min( + # Try for 8 full length request or at least 4 pages per-request + max(8 * model_config.max_model_len, + 4 * scheduler_config.max_num_seqs * cache_config.block_size), + # For long-context models try not to over-allocate limiting + # kv-cache space, limiting it to 64k tokens, + # which would result in the workspace being: + # 2*(576)*(64*1024) = 144mb + # (assuming 576 MLA head dim, and fp16) + # which would result in up-projected context being + # 2*(192*128)*(64*1024) = 3gb + # (assuming 192 QK head dim, 128 heads, and fp16) + 64 * 1024) + + # Enforce that we enough for at least 1 page per request + chunked_prefill_workspace_size = max( + chunked_prefill_workspace_size, + scheduler_config.max_num_seqs * cache_config.block_size) + + return chunked_prefill_workspace_size + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], @@ -448,7 +476,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): scheduler_config = vllm_config.scheduler_config self.model_config = vllm_config.model_config parallel_config = vllm_config.parallel_config - cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config self.device = device @@ -468,22 +495,9 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): if self.aot_schedule: self.page_size = self.kv_cache_spec.block_size - self.chunked_prefill_workspace_size = min( - # Max sure there is enough for 8 full length request or at least - # 4 pages of cache per request - max(8 * self.model_config.max_model_len, - 4 * scheduler_config.max_num_seqs * cache_config.block_size), - # For long-context models try not to over-allocate limiting - # kv-cache space, limiting it to 64k tokens, - # which would result in the workspace being: - # 2*(576)*(64*1024) = 144mb - # (assuming 576 MLA head dim, and fp16) - # which would result in up-projected context being - # 2*(192*128)*(64*1024) = 3gb - # (assuming 192 QK head dim, 128 heads, and fp16) - 64 * 1024) - assert self.chunked_prefill_workspace_size >= \ - scheduler_config.max_num_seqs * cache_config.block_size + self.chunked_prefill_workspace_size = \ + self.determine_chunked_prefill_workspace_size(vllm_config) + if self.dcp_world_size > 1: # Note(hc): The local kvcache is incomplete when DCP is triggered, # an additional kvcache allgather across the DCP group is therefore @@ -999,6 +1013,10 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): self.dcp_world_size: Optional[int] = None + self.chunked_prefill_workspace_size = \ + MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size( + get_current_vllm_config()) + def _flash_attn_varlen_diff_headdims(self, q, k, @@ -1513,6 +1531,16 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): " for MLACommonImpl") if attn_metadata is None: + # During the profile run try to simulate to worse case output size + # for `self.kv_b_proj(kv_c_normed)` in 
`_compute_prefill_context` + # since this can be large + _ = torch.empty( + (self.chunked_prefill_workspace_size, self.num_heads, + self.qk_nope_head_dim + self.v_head_dim), + device=k_c_normed.device, + dtype=k_c_normed.dtype, + ) + # The zero fill is required when used with DP + EP # to ensure all ranks within a DP group compute the # same expert outputs.
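Note on patch 181/185 above: the per-request TPOT value recorded in `FinishedRequestStats` is plain arithmetic over numbers the stats object already tracks. A minimal standalone sketch of that calculation follows; the function and argument names here are illustrative only, not the vLLM API.

def mean_time_per_output_token(decode_time: float,
                               num_generation_tokens: int) -> float:
    # The first generated token is produced by the prefill phase, so it is
    # excluded from the average; a request with <= 1 generated token
    # reports 0.0.
    if num_generation_tokens > 1:
        return decode_time / (num_generation_tokens - 1)
    return 0.0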
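Note on patch 185/185 above: `determine_chunked_prefill_workspace_size` reduces to a clamp between a per-request minimum and a 64k-token cap. A sketch with the config fields passed in as plain integers (a simplification for illustration; the real method reads them from `VllmConfig`):

def chunked_prefill_workspace_size(max_model_len: int, max_num_seqs: int,
                                   block_size: int) -> int:
    # Aim for 8 full-length requests, or at least 4 cache pages per request,
    # but cap the workspace at 64k tokens to bound memory for
    # long-context models.
    size = min(max(8 * max_model_len, 4 * max_num_seqs * block_size),
               64 * 1024)
    # Never drop below one cache page per request.
    return max(size, max_num_seqs * block_size)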