From f48a9af8924ea617a964b1158acc142b64843edb Mon Sep 17 00:00:00 2001 From: Alex <30671301+killershrimp@users.noreply.github.com> Date: Wed, 27 Aug 2025 23:27:36 -0500 Subject: [PATCH 01/19] [CI] make all multi-gpu weight loading tests run nightly (#23792) Signed-off-by: Alex Yun --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0d3b7a294d963..cf90505257e90 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -798,6 +798,7 @@ steps: mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 + optional: true source_file_dependencies: - vllm/ - tests/weight_loading From c8851a47235f5dfd3da3abf6c89453b3bdb41ad1 Mon Sep 17 00:00:00 2001 From: Jinheng Date: Thu, 28 Aug 2025 13:34:29 +0800 Subject: [PATCH 02/19] Add deprecation warning for lora_extra_vocab_size (#23635) Signed-off-by: Jinheng Li --- vllm/config/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 351833d3f02d0..cfc5e07d83299 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2439,8 +2439,8 @@ class LoRAConfig: lora_dtype: Union[torch.dtype, LoRADType] = "auto" """Data type for LoRA. If auto, will default to base model dtype.""" lora_extra_vocab_size: int = 256 - """Maximum size of extra vocabulary that can be present in a LoRA adapter - (added to the base model vocabulary).""" + """(Deprecated) Maximum size of extra vocabulary that can be present in a + LoRA adapter. Will be removed in v0.12.0.""" lora_vocab_padding_size: ClassVar[int] = current_platform\ .get_lora_vocab_padding_size() @@ -2482,6 +2482,12 @@ class LoRAConfig: return hash_str def __post_init__(self): + # Deprecation warning for lora_extra_vocab_size + logger.warning( + "`lora_extra_vocab_size` is deprecated and will be removed " + "in v0.12.0. 
Additional vocabulary support for " + "LoRA adapters is being phased out.") + # Setting the maximum rank to 512 should be able to satisfy the vast # majority of applications. possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512) From 22feac8e957a2f9787eb721c685269afc15bb3b1 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 28 Aug 2025 02:43:48 -0400 Subject: [PATCH 03/19] [Transform] [Quantization] Add transforms to compressed tensors (#22486) --- tests/conftest.py | 43 +++- tests/quantization/test_compressed_tensors.py | 22 ++ vllm/model_executor/layers/linear.py | 16 +- .../compressed_tensors/compressed_tensors.py | 52 ++-- .../compressed_tensors/transform/linear.py | 227 ++++++++++++++++++ .../compressed_tensors/transform/module.py | 135 +++++++++++ .../transform/schemes/linear_qutlass_nvfp4.py | 21 ++ .../compressed_tensors/transform/utils.py | 13 + vllm/model_executor/parameter.py | 168 +++++++++++-- 9 files changed, 661 insertions(+), 36 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py diff --git a/tests/conftest.py b/tests/conftest.py index f8bfdfc8e6259..6052ada1c5fd7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json +import math import os import tempfile from enum import Enum -from typing import Any, Callable, Optional, TypedDict, TypeVar, Union +from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast import numpy as np import pytest @@ -33,6 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, 
from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams +from vllm.sequence import Logprob from vllm.transformers_utils.utils import maybe_model_redirect logger = init_logger(__name__) @@ -602,7 +604,7 @@ class HfRunner: def _hidden_states_to_logprobs( self, hidden_states: tuple[tuple[torch.Tensor, ...], ...], - num_logprobs: int, + num_logprobs: Optional[int], ) -> tuple[list[dict[int, float]], int]: seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) output_len = len(hidden_states) @@ -630,7 +632,7 @@ class HfRunner: self, prompts: list[str], max_tokens: int, - num_logprobs: int, + num_logprobs: Optional[int], images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, @@ -677,7 +679,7 @@ class HfRunner: self, encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, - num_logprobs: int, + num_logprobs: Optional[int], images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> list[TokensTextLogprobs]: @@ -966,7 +968,7 @@ class VllmRunner: self, prompts: list[str], max_tokens: int, - num_logprobs: int, + num_logprobs: Optional[int], num_prompt_logprobs: Optional[int] = None, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, @@ -991,11 +993,40 @@ class VllmRunner: videos=videos, **kwargs) + def generate_prompt_perplexity(self, prompts: list[str]) -> list[float]: + """ + Return the perplexity score associated with generating the prompts + + :param prompts: list of prompts to score + :return: perplexity score of each prompt + """ + outputs = self.generate_greedy_logprobs(prompts, + max_tokens=1, + num_logprobs=None, + num_prompt_logprobs=0) + + perplexities = [] + for output in outputs: + output = cast(TokensTextLogprobsPromptLogprobs, output) + token_datas = cast(list[Optional[dict[int, Logprob]]], output[3]) + assert token_datas[0] is None + 
token_log_probs = [] + for token_data in token_datas[1:]: + assert token_data is not None + assert len(token_data) == 1 + token_log_prob = list(token_data.values())[0].logprob + token_log_probs.append(token_log_prob) + + perplexity = math.exp(-sum(token_log_probs) / len(token_log_probs)) + perplexities.append(perplexity) + + return perplexities + def generate_encoder_decoder_greedy_logprobs( self, encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, - num_logprobs: int, + num_logprobs: Optional[int], num_prompt_logprobs: Optional[int] = None, skip_special_tokens: bool = True, ) -> Union[list[TokensTextLogprobs], diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b9774b7ee2631..484f53246f349 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -719,3 +719,25 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args): output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) assert output + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test is skipped on non-CUDA platform.") +@pytest.mark.parametrize("model,prompt,exp_perplexity", [ + ( + "nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16", + "Flat is better than nested.\nSparse is better than dense.", + 150.0, + ), + ( + "nm-testing/Llama-3.2-1B-Instruct-quip-w4a16", + "Flat is better than nested.\nSparse is better than dense.", + 150.0, + ), +]) +def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt, + exp_perplexity): + with vllm_runner(model, enforce_eager=True) as llm: + perplexity = llm.generate_prompt_perplexity([prompt])[0] + print(perplexity) + assert perplexity <= exp_perplexity \ No newline at end of file diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c0fcacd1e6ee9..19ff63145024f 100644 --- a/vllm/model_executor/layers/linear.py +++ 
b/vllm/model_executor/layers/linear.py @@ -35,6 +35,7 @@ logger = init_logger(__name__) WEIGHT_LOADER_V2_SUPPORTED = [ "CompressedTensorsLinearMethod", + "CompressedTensorsLinearTransformMethod", "BitBLASLinearMethod", "GPTQBitBLASLinearMethod", "AWQMarlinLinearMethod", @@ -199,6 +200,7 @@ class UnquantizedLinearMethod(LinearMethodBase): set_weight_attrs(weight, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # special postprocessing for CPU SGL if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL: from vllm.model_executor.layers.utils import check_cpu_sgl_kernel N, K = layer.weight.size() @@ -1470,7 +1472,7 @@ class QKVCrossParallelLinear(LinearBase): self.bias = torch.nn.Parameter() set_weight_attrs(self.bias, { "output_dim": 0, - "weight_loader": self.weight_loader, + "weight_loader": self.weight_loader_v1, }) else: self.bias = None @@ -1580,6 +1582,18 @@ class QKVCrossParallelLinear(LinearBase): k, v = kv_enc.split(self.kv_size, dim=-1) return q, k, v + def weight_loader_v1(self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): + # just like all other parameters, does not yet + # support loading bias with weight_loader_v2 + layer = (self.q_proj_decoder + if loaded_shard_id == "q" else self.kv_proj_encoder) + target_param = self.select_proj_params(layer, param) + shard_id_args = (loaded_shard_id, ) if loaded_shard_id != "q" else () + layer.weight_loader(target_param, loaded_weight, *shard_id_args) + def weight_loader(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 245cf122ebab1..230572041c80d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -11,6 +11,7 @@ from compressed_tensors.config import (CompressionFormat, from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy, QuantizationType) +from compressed_tensors.transform import TransformConfig from pydantic import BaseModel import vllm.envs as envs @@ -30,6 +31,8 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) +from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import ( # noqa: E501 + CompressedTensorsLinearTransformMethod, get_linear_transform_schemes) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( find_matched_target, is_activation_quantization_format, should_ignore_layer) @@ -60,6 +63,7 @@ class CompressedTensorsConfig(QuantizationConfig): sparsity_ignore_list: list[str], kv_cache_scheme: Optional[dict[str, Any]] = None, config: Optional[dict[str, Any]] = None, + transform_config: Optional[TransformConfig] = None, ): super().__init__() self.ignore = ignore @@ -71,6 +75,12 @@ class CompressedTensorsConfig(QuantizationConfig): self.sparsity_ignore_list = sparsity_ignore_list self.config = config + if transform_config is not None: + self.transform_config = TransformConfig.model_validate( + transform_config) + else: + self.transform_config = None + def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) @@ -103,18 +113,27 @@ class CompressedTensorsConfig(QuantizationConfig): ) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import - # Check if the layer is skipped for quantization. 
- # TODO (@robertgshaw2): support module names - if should_ignore_layer(prefix, - ignore=self.ignore, - fused_mapping=self.packed_modules_mapping): - return UnquantizedLinearMethod() if isinstance(layer, LinearBase): - scheme = self.get_scheme(layer=layer, layer_name=prefix) - if scheme is None: - return UnquantizedLinearMethod() - layer.scheme = scheme - return CompressedTensorsLinearMethod(self) + # collect schemes + quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) + input_tfms, output_tfms = get_linear_transform_schemes( + layer, prefix, self.transform_config, + self.packed_modules_mapping) + + # choose quantization method + quant_method: LinearMethodBase = UnquantizedLinearMethod() + if quant_scheme is not None: + layer.scheme = quant_scheme + quant_method = CompressedTensorsLinearMethod(self) + + # choose transform method + if any((input_tfms, output_tfms)): + return CompressedTensorsLinearTransformMethod.from_schemes( + quant_method, input_tfms, output_tfms) + + else: + return quant_method + if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) if isinstance(layer, FusedMoE): @@ -129,6 +148,7 @@ class CompressedTensorsConfig(QuantizationConfig): config=config) sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config( config=config) + transform_config = config.get("transform_config") return cls( target_scheme_map=target_scheme_map, @@ -137,6 +157,7 @@ class CompressedTensorsConfig(QuantizationConfig): sparsity_scheme_map=sparsity_scheme_map, sparsity_ignore_list=sparsity_ignore_list, config=config, + transform_config=transform_config, ) @classmethod @@ -537,9 +558,11 @@ class CompressedTensorsConfig(QuantizationConfig): # Find the "target" in the compressed-tensors config # that our layer conforms to. 
- # TODO (@robertgshaw): add compressed-tensors as dep - # so we do not have to re-write these functions - # need to make accelerate optional in ct to do this + # TODO (@kylesayrs): support ignore module names with ct matching utils + if should_ignore_layer(layer_name, + ignore=self.ignore, + fused_mapping=self.packed_modules_mapping): + return None # Will be empty for models with only sparsity weight_quant = input_quant = None @@ -722,7 +745,6 @@ class CompressedTensorsLinearMethod(LinearMethodBase): layer input. See LinearMethodBase for param details """ - scheme = layer.scheme if scheme is None: raise ValueError("A scheme must be defined for each layer") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py new file mode 100644 index 0000000000000..2fc94b3c257e6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py @@ -0,0 +1,227 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Generator +from itertools import accumulate +from typing import Callable, Optional + +import torch +from compressed_tensors.transform import (TransformArgs, TransformConfig, + TransformLocation, TransformScheme) +from compressed_tensors.utils import is_match + +from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED, + LinearMethodBase, + QKVCrossParallelLinear) +from vllm.model_executor.layers.quantization.compressed_tensors.transform.module import ( # noqa: E501 + HadamardTransform) +from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501 + TransformTuple) + + +class CompressedTensorsLinearTransformMethod(LinearMethodBase): + """ + Wraps `CompressedTensorsLinearMethod` or `UnquantizedLinearMethod` and adds + input and output transforms to either side of the original apply 
method + """ + + @classmethod + def from_schemes( + cls, quant_method: LinearMethodBase, input_tfms: dict[int, + TransformTuple], + output_tfms: dict[int, TransformTuple] + ) -> "CompressedTensorsLinearTransformMethod": + assert input_tfms or output_tfms + + # TODO (@ksayers): implement QutlassLinearMethodNvFP4 + # hadacore and fwht can be selected by Transform module + + return cls(quant_method, input_tfms, output_tfms) + + def __init__(self, quant_method: LinearMethodBase, + input_tfms: dict[int, TransformTuple], + output_tfms: dict[int, TransformTuple]): + self.quant_method = quant_method + self.input_tfms = input_tfms + self.output_tfms = output_tfms + + self.input_transform: Optional[HadamardTransform] = None + self.output_transform: Optional[HadamardTransform] = None + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + + # get weight loader for transforms + weight_loader: Callable = extra_weight_attrs.get( + "weight_loader") # type: ignore[assignment] + + # HACK: UnquantizedLinearMethod does not support weight loader v2, but + # transforms (specifically SharedWeightParameter) requires + # weight loader v2. 
Until UnquantizedLinearMethod supports v2, we must + # hack around this by getting weight loader v1 so ULM can load correctly + quant_method_name = self.quant_method.__class__.__name__ + if quant_method_name not in WEIGHT_LOADER_V2_SUPPORTED: + if isinstance(layer, QKVCrossParallelLinear): + weight_loader_v1 = layer.weight_loader_v1 + else: + weight_loader_v1 = layer.weight_loader + extra_weight_attrs["weight_loader"] = weight_loader_v1 + + self.quant_method.create_weights( + layer=layer, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + input_size=input_size, + output_size=output_size, + params_dtype=params_dtype, + **extra_weight_attrs) + + # validate schemes + num_partitions = len(output_partition_sizes) + self._validate_tfm_schemes(num_partitions) + + # create submodules for weight loading + if len(self.input_tfms) > 0: + scheme_name = list(self.input_tfms.values())[0].scheme_name + location = list(self.input_tfms.values())[0].args.location + transform_name = f"{scheme_name}_{location}" + + transform = HadamardTransform(self.input_tfms, layer, + weight_loader, + input_size_per_partition, + output_partition_sizes) + layer.register_module(transform_name, transform) + self.input_transform = transform + + if len(self.output_tfms) > 0: + scheme_name = list(self.output_tfms.values())[0].scheme_name + location = list(self.output_tfms.values())[0].args.location + transform_name = f"{scheme_name}_{location}" + + transform = HadamardTransform(self.output_tfms, layer, + weight_loader, + input_size_per_partition, + output_partition_sizes) + layer.register_module(transform_name, transform) + self.output_transform = transform + + # compute partition ranges for slicing activations + starts = [0] + list(accumulate(output_partition_sizes))[:-1] + self.partition_ranges = list(zip(starts, output_partition_sizes)) + + def process_weights_after_loading(self, layer): + self.quant_method.process_weights_after_loading(layer) + + 
for submodule in layer.children(): + if isinstance(submodule, HadamardTransform): + submodule.process_weights_after_loading() + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + if self.input_transform is not None: + x = self.input_transform(x) + + assert bias is None + x = self.quant_method.apply(layer, x, bias) + + # TODO (@ksayers): Write a triton kernel to do this in parallel + if self.output_transform is not None: + for part_id, (start, length) in enumerate(self.partition_ranges): + x[:, start:start + length] = self.output_transform( + x[:, start:start + length], part_id=part_id) + + return x + + def _validate_tfm_schemes(self, num_partitions: int): + if len(self.input_tfms) > 0: + if 0 not in self.input_tfms: + raise ValueError("Must have same input") + + for part_index in range(num_partitions): + if self.input_tfms[part_index] != self.input_tfms[0]: + raise ValueError("Must have same input") + + if len(self.output_tfms) > 0: + scheme_name = list(self.output_tfms.values())[0].scheme_name + location = list(self.output_tfms.values())[0].args.location + + for tfm in self.output_tfms.values(): + if tfm.scheme_name != scheme_name: + raise ValueError("Must have same scheme name") + if tfm.args.location != location: + raise ValueError("Must have same location") + + return self.input_tfms, self.output_tfms + + +def get_linear_transform_schemes( + layer: torch.nn.Module, layer_name: str, + transform_config: Optional[TransformConfig], + packed_modules_mapping: dict[str, list[str]] +) -> tuple[dict[int, TransformTuple], dict[ + int, TransformTuple]]: # [input_transform, [output_transform, ...]] + # there can only be one transform input scheme per (fused) module + input_tfms = {} + output_tfms = {} + + partition_names = get_layer_partition_names(layer_name, + packed_modules_mapping) + + for scheme_name, scheme, args in get_schemes_args(transform_config): + for part_index, part_name in 
enumerate(partition_names): + if is_match(part_name, layer, args.targets, + args.ignore) and args.is_online(): + if args.location == TransformLocation.INPUT: + input_tfms[part_index] = TransformTuple( + scheme_name, scheme, args) + + elif args.location == TransformLocation.OUTPUT: + output_tfms[part_index] = TransformTuple( + scheme_name, scheme, args) + + else: + raise ValueError(f"Cannot apply `{args.location}` " + f"transform to `{layer_name}`") + + return (input_tfms, output_tfms) + + +def get_schemes_args( + transform_config: Optional[TransformConfig] +) -> Generator[tuple[str, TransformScheme, TransformArgs]]: + if transform_config is None: + return + + for scheme_name, scheme in transform_config.config_groups.items(): + for args in scheme.apply: + yield (scheme_name, scheme, args) + + +def get_layer_partition_names( + layer_name: str, packed_modules_mapping: dict[str, + list[str]]) -> list[str]: + """ + Get all partition names associated with this layer. + Names are returned in order of their partition indices. 
+ + ```python + mapping = {"gate_up_proj": ["gate_proj", "up_proj"]} + + assert get_layer_partition_names( + "mlp.gate_up_proj", mapping) == ["mlp.gate_proj", "mlp.up_proj"] + assert get_layer_partition_names( + "mlp.down_proj", mapping) == ["mlp.down_proj"] + """ + for fused_suffix, part_suffixes in packed_modules_mapping.items(): + if layer_name.endswith(fused_suffix): + return [ + layer_name.removesuffix(fused_suffix) + part_suffix + for part_suffix in part_suffixes + ] + + return [layer_name] diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py new file mode 100644 index 0000000000000..b3be254717734 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math +from collections.abc import Hashable +from typing import Callable, Optional + +import torch +from compressed_tensors.transform import TransformLocation, TransformScheme +from torch import Tensor + +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501 + TransformTuple) +from vllm.model_executor.layers.utils import dispatch_unquantized_gemm +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parameter import SharedWeightParameter + + +class HadamardTransform(torch.nn.Module): + """ + Class which handles weight loading, postprocessing, and application of + transforms. 
Meant to be used with `CompressedTensorsLinearTransformMethod` + and attention transforms method (not implemented yet) + """ + transforms: dict[int, TransformTuple] # info parsed from transforms config + weight: SharedWeightParameter # container for shared tensors + + kernel: Callable # function used during application + scales: dict[int, float] # hadamard scale, usually sqrt(matrix.size(0)) + + def __init__(self, + transforms: dict[int, TransformTuple], + layer: torch.nn.Module, + weight_loader: Callable, + input_size_per_partition: int, + output_partition_sizes: list[int], + kernel: Optional[Callable] = None): + super().__init__() + self.transforms = transforms + self.scales = {} + + if get_tensor_model_parallel_world_size() > 1: + raise NotImplementedError("Online transforms with tensor " + "parallelism is not supported") + + # Similar to row/col parallel params, but tensors are separate + # to allow for loading with shared memory + self.weight = SharedWeightParameter(weight_loader=weight_loader) + + # create shared partition data for each partition of the original weight + input_size = input_size_per_partition + for part_index, (_scheme_name, scheme, + args) in self.transforms.items(): + output_size = output_partition_sizes[part_index] + weight_size = self._get_weight_size(layer, args.location, + input_size, output_size) + + data_key = self._get_data_key(scheme, weight_size) + self.weight.add_partition( + part_index, + data_key, + size=(weight_size, weight_size), + dtype=scheme.precision, + ) + + # validate that shared tensors and schemes are correct + self._validate_input_transforms() + + # select kernel based on transform schemes + self.kernel = self._infer_kernel() if kernel is None else kernel + + def process_weights_after_loading(self): + for part_id in self.weight.partitions: + data = self.weight.partitions[part_id].data + + # required by torch.compile + self.weight.process_weights_after_loading() + + # precompute scale as a runtime multiply, not division 
+ # do not fold into weight in order to utilize FWHT + self.scales[part_id] = 1 / math.sqrt(data.size(0)) + + # FUTURE: avoid runtime transpose by processing weights + # prior to apply + + def forward(self, value: Tensor, part_id: int = 0) -> Tensor: + if part_id not in self.weight.partitions: + return value + + weight = self.weight.partitions[part_id] + weight = weight if self.transforms[ + part_id].args.inverse else weight.T # linear := x(W.T) + scale = self.scales[part_id] + return self.kernel(self, value.to(weight.dtype), weight, None).to( + value.dtype) * scale + + def _get_data_key(self, scheme: TransformScheme, + weight_size: int) -> Hashable: + return (id(scheme), weight_size) + + def _get_weight_size(self, layer: torch.nn.Module, + location: TransformLocation, input_size: int, + output_size: int) -> int: + if isinstance(layer, LinearBase): + if location == TransformLocation.INPUT: + return input_size + + elif location == TransformLocation.OUTPUT: + return output_size + + elif isinstance(layer, VocabParallelEmbedding): + if location == TransformLocation.INPUT: + return output_size + + elif location == TransformLocation.OUTPUT: + return input_size + + raise ValueError() + + def _validate_input_transforms(self): + assert len(self.transforms) > 0 + location = list(self.transforms.values())[0].args.location + + if location == TransformLocation.INPUT: + first_data = self.weight.partitions[0].data + for partition in self.weight.partitions.values(): + if partition.data.data_ptr() != first_data.data_ptr(): + raise ValueError("") + + def _infer_kernel(self) -> Callable: + # TODO (@ksayers): use fwht, hadacore + return dispatch_unquantized_gemm() diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py new file mode 100644 index 0000000000000..f42258f9f9d7f --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import ( # noqa: E501 + CompressedTensorsLinearTransformMethod) + + +# Because qutlass fuses hadamard with quantization, it cannot automatically be +# composed with kernels in the way CompressedTensorsLinearTransformMethod does. +# Therefore, a separate scheme must be created for each quantized dtype +class QutlassLinearMethodNvFP4(CompressedTensorsLinearTransformMethod): + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + # fused hadamard quant linear method + raise NotImplementedError() diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py new file mode 100644 index 0000000000000..2f353de1e6a74 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import NamedTuple + +from compressed_tensors.transform import TransformArgs, TransformScheme + +__all__ = ["TransformTuple"] + + +class TransformTuple(NamedTuple): + scheme_name: str + scheme: TransformScheme + args: TransformArgs diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 750ee78502688..9465308e94e65 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -1,13 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Hashable from fractions 
import Fraction from typing import Callable, Optional, Union +from weakref import WeakValueDictionary import torch from torch.nn import Parameter -from vllm.distributed import get_tensor_model_parallel_rank +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) from vllm.logger import init_logger from vllm.model_executor.utils import _make_synced_weight_loader @@ -27,7 +30,7 @@ class BasevLLMParameter(Parameter): into the parameter when the provided weight loader is called. """ - def __new__(cls, data: torch.Tensor, **kwargs): + def __new__(cls, data: Optional[torch.Tensor], **kwargs): return super().__new__(cls, data=data, requires_grad=False) @@ -81,6 +84,17 @@ class BasevLLMParameter(Parameter): def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): self._assert_and_load(loaded_weight) + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + # if not int, assume shard_id for qkv + # map to int and return + qkv_idxs = {"q": 0, "k": 1, "v": 2} + assert isinstance(shard_id, str) + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + class _ColumnvLLMParameter(BasevLLMParameter): """ @@ -113,6 +127,7 @@ class _ColumnvLLMParameter(BasevLLMParameter): shard_offset = kwargs.get("shard_offset") shard_size = kwargs.get("shard_size") + # TODO: move these to PackedColumnParameter and PackedvLLMParameter if isinstance( self, (PackedColumnParameter, @@ -137,6 +152,7 @@ class _ColumnvLLMParameter(BasevLLMParameter): shard_id = kwargs.get("shard_id") num_heads = kwargs.get("num_heads") + # TODO: move these to PackedColumnParameter and PackedvLLMParameter if isinstance( self, (PackedColumnParameter, @@ -224,19 +240,8 @@ class PerTensorScaleParameter(BasevLLMParameter): """ def __init__(self, **kwargs): - self.qkv_idxs = {"q": 0, "k": 1, "v": 2} super().__init__(**kwargs) - def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if 
isinstance(shard_id, int): - return shard_id - - # if not int, assume shard_id for qkv - # map to int and return - assert isinstance(shard_id, str) - assert shard_id in self.qkv_idxs - return self.qkv_idxs[shard_id] - # For row parallel layers, no sharding needed # load weight into parameter as is def load_row_parallel_weight(self, *args, **kwargs): @@ -373,6 +378,141 @@ class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter): pass +class SharedWeightParameter(BasevLLMParameter): + """ + Parameter for weights with many shared tensors across a model + + For example, when applying transforms to the "gate" and "up" partitions of + `MergedColumnParallelLinear`, the transform weights must stay separate + tensors in order to allow for tensor memory sharing between layers. + """ + # global registry for sharing tensors based on passed `data_key` + # this dict holds weaksrefs to avoid memory leak after model cleanup + tensors_registry: WeakValueDictionary = WeakValueDictionary() + + # local container for strong references to shared tensors + # this set compensates for the fact that torch.nn.Parameter + # and Parameter subclasses do not hold reliable references to tensors + local_tensors: set[torch.Tensor] + + # dictionary mapping partition indices to associated parameters + partitions: dict[int, Union[ModelWeightParameter, Parameter]] + + def __new__(cls, **kwargs): + return super().__new__(cls, data=None, **kwargs) + + def __init__(self, input_dim: int = 1, output_dim: int = 0, **kwargs): + weight_loader: Callable = kwargs.get( + "weight_loader") # type: ignore[assignment] + super().__init__(data=None, weight_loader=weight_loader) + + self.local_tensors = set() + self.partitions = {} + self.kwargs = { + "input_dim": input_dim, + "output_dim": output_dim, + "weight_loader": self._fake_weight_loader + } + + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + + if self.tp_size > 1: + raise 
NotImplementedError(f"{self.__class__.__name__} does not " + "currently support tensor parallelism") + + def add_partition(self, index: int, data_key: Hashable, *args, **kwargs): + """ + Add a partition to the weight parameter. Partitions whose `data_key` + is the same will share tensor data + + :param index: index of partition to add + :param data_key: hashable key used to key shared tensors + :param *args: arguments for `torch.empty` + :param **kwargs: keyword arguments for `torch.empty` + """ + # load (shared) tensor using `data_key` + if data_key not in self.tensors_registry: + data = torch.empty(*args, **kwargs) + self.tensors_registry[data_key] = data + else: + data = self.tensors_registry[data_key] + + # create associated model parameter + self.partitions[index] = ModelWeightParameter( + data=data, **self.kwargs) # type: ignore[arg-type] + + # hold local reference, since ModelWeightParameter does not + # see https://github.com/pytorch/pytorch/issues/75932 + self.local_tensors.add(data) + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + assert len(self.partitions) == 1 and 0 in self.partitions + partition = self.partitions[0] + + ModelWeightParameter.load_column_parallel_weight( + partition, loaded_weight) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + assert len(self.partitions) == 1 and 0 in self.partitions + partition = self.partitions[0] + + ModelWeightParameter.load_row_parallel_weight(partition, loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + partition_id = kwargs.pop("shard_id") + partition_id = self._shard_id_as_int(partition_id) + partition = self.partitions[partition_id] + + input_dim = self.kwargs.get("input_dim") + shard_size = partition.data.size(input_dim) // self.tp_size + shard_offset = self.tp_rank * shard_size + + ModelWeightParameter.load_merged_column_weight( + partition, + loaded_weight, + shard_offset=shard_offset, + shard_size=shard_size) + + 
def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + partition_id = self._shard_id_as_int(kwargs.pop("shard_id")) + partition = self.partitions[partition_id] + + input_dim = self.kwargs.get("input_dim") + shard_size = partition.data.size(input_dim) // self.tp_size + shard_offset = self.tp_rank * shard_size + shard_id = "q" # fake first partition + num_heads = kwargs.get("num_heads") + + ModelWeightParameter.load_qkv_weight( + partition, + loaded_weight, + shard_offset=shard_offset, + shard_size=shard_size, + shard_id=shard_id, + num_heads=num_heads, + ) + + def process_weights_after_loading(self): + for key in self.partitions: + self.partitions[key] = torch.nn.Parameter( + data=self.partitions[key].data, requires_grad=False) + + @property + def data(self): + raise ValueError("Accessing `data` of a " + "`PartitionedModelWeightParameter` is not allowed. " + "Instead, use `get_partition` to get the weight of " + "the particular partition you want to access") + + def _fake_weight_loader(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_weight_shard_id: Optional[Union[str, int]]): + raise ValueError("When loading partition weights of " + f"{self.__class__.__name__}, use methods provided by " + f"{self.__class__.__name__}, not partition loader") + + def permute_param_layout_(param: BasevLLMParameter, input_dim: int, output_dim: int, **kwargs) -> BasevLLMParameter: """ @@ -456,4 +596,4 @@ def _adjust_shard_indexes_for_packing(shard_size, shard_offset, packed_factor, shard_offset=shard_offset, bitblas_tile_size=bitblas_tile_size) - return shard_size, shard_offset \ No newline at end of file + return shard_size, shard_offset From c07a73317d202c2dad67f12893fcddb6d3664950 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Thu, 28 Aug 2025 14:51:24 +0800 Subject: [PATCH 04/19] [CI] enable idefics3 and fuyu-8b test in multimodal test (#23790) Signed-off-by: zjy0516 --- .../multimodal/generation/test_common.py | 36 ++++++++----------- 1 file 
changed, 15 insertions(+), 21 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 2b60faae8ec0b..d61b182761e44 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -189,23 +189,21 @@ VLM_TEST_SETTINGS = { }, marks=[pytest.mark.core_model], ), - # FIXME(Isotr0py): Enable this test after - # https://github.com/huggingface/transformers/pull/39470 released - # "idefics3-transformers": VLMTestInfo( - # models=["HuggingFaceTB/SmolVLM-256M-Instruct"], - # test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - # prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 - # img_idx_to_prompt=lambda idx: "", - # max_model_len=8192, - # max_num_seqs=2, - # auto_cls=AutoModelForImageTextToText, - # hf_output_post_proc=model_utils.idefics3_trunc_hf_output, - # image_size_factors=[(0.25, 0.5, 1.0)], - # vllm_runner_kwargs={ - # "model_impl": "transformers", - # }, - # marks=[pytest.mark.core_model], - # ), + "idefics3-transformers": VLMTestInfo( + models=["HuggingFaceTB/SmolVLM-256M-Instruct"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + hf_output_post_proc=model_utils.idefics3_trunc_hf_output, + image_size_factors=[(0.25, 0.5, 1.0)], + vllm_runner_kwargs={ + "model_impl": "transformers", + }, + marks=[pytest.mark.core_model], + ), # Pixel values from processor are not 4D or 5D arrays "qwen2_5_vl-transformers": VLMTestInfo( models=["Qwen/Qwen2.5-VL-3B-Instruct"], @@ -322,10 +320,6 @@ VLM_TEST_SETTINGS = { vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], - # 
FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we - # should enable this again after the fix is released: - # https://github.com/huggingface/transformers/pull/39915 - marks=[pytest.mark.skip("HF model is broken")], ), "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], From daa1273b14da5bdf643aa4b1bcbef3985b1edd75 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Thu, 28 Aug 2025 15:27:45 +0800 Subject: [PATCH 05/19] [Bugfix] when set offline model running error (#23711) Signed-off-by: rongfu.leng --- vllm/entrypoints/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index d8905fc141245..d2d7dba3ae46f 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -313,12 +313,14 @@ def log_non_default_args(args: Union[argparse.Namespace, EngineArgs]): # Handle EngineArgs instance elif isinstance(args, EngineArgs): - default_args = EngineArgs() # Create default instance + default_args = EngineArgs(model=args.model) # Create default instance for field in dataclasses.fields(args): current_val = getattr(args, field.name) default_val = getattr(default_args, field.name) if current_val != default_val: non_default_args[field.name] = current_val + if default_args.model != EngineArgs.model: + non_default_args["model"] = default_args.model else: raise TypeError("Unsupported argument type. 
" \ "Must be argparse.Namespace or EngineArgs instance.") From 186aced5ffb62b62b41eb1beaf2a598ada43351b Mon Sep 17 00:00:00 2001 From: yzds <41983536+youzhedian@users.noreply.github.com> Date: Thu, 28 Aug 2025 15:29:11 +0800 Subject: [PATCH 06/19] [Kernel] cuda kernels for upcoming decode context parallel feature (#23791) Co-authored-by: hongchao --- csrc/cache.h | 17 +- csrc/cache_kernels.cu | 247 ++++++++++++++++++++++++++ csrc/torch_bindings.cpp | 15 ++ tests/kernels/attention/test_cache.py | 72 ++++++++ vllm/_custom_ops.py | 24 +++ 5 files changed, 374 insertions(+), 1 deletion(-) diff --git a/csrc/cache.h b/csrc/cache.h index fb0c353b96137..e8e069aefd9c5 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -36,6 +36,13 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe, const std::string& kv_cache_dtype, torch::Tensor& scale); +void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe, + torch::Tensor& cp_local_token_select_indices, + torch::Tensor& kv_cache, + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype, + torch::Tensor& scale); + // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, const double scale, const std::string& kv_cache_dtype); @@ -47,4 +54,12 @@ void gather_and_maybe_dequant_cache( torch::Tensor const& cu_seq_lens, // [BATCH+1] int64_t batch_size, const std::string& kv_cache_dtype, torch::Tensor const& scale, - std::optional seq_starts = std::nullopt); \ No newline at end of file + std::optional seq_starts = std::nullopt); + +// TODO(hc): cp_gather_cache need support scaled kvcahe in the future. +void cp_gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] 
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, std::optional seq_starts = std::nullopt); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index b3a985c2d5bbb..fc82a1fa8ed78 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include "cuda_utils.h" #include "cuda_compat.h" @@ -395,6 +396,51 @@ __global__ void concat_and_cache_mla_kernel( copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); } +template +__global__ void cp_fused_concat_and_cache_mla_kernel( + const scalar_t* __restrict__ kv_c, // [num_full_tokens, kv_lora_rank] + const scalar_t* __restrict__ k_pe, // [num_full_tokens, pe_dim] + const int64_t* __restrict__ cp_local_token_select_indices, // [num_tokens] + cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank + // + pe_dim)] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int block_stride, // + const int entry_stride, // + const int kv_c_stride, // + const int k_pe_stride, // + const int kv_lora_rank, // + const int pe_dim, // + const int block_size, // + const float* scale // +) { + const int64_t token_idx = cp_local_token_select_indices[blockIdx.x]; + const int64_t slot_idx = slot_mapping[blockIdx.x]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + return; + } + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + + auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst, + int src_stride, int dst_stride, int size, int offset) { + for (int i = threadIdx.x; i < size; i += blockDim.x) { + const int64_t src_idx = token_idx * src_stride + i; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst[dst_idx] = src[src_idx]; + } else { + 
dst[dst_idx] = + fp8::scaled_convert(src[src_idx], *scale); + } + } + }; + + copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0); + copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); +} + } // namespace vllm // KV_T is the data type of key and value tensors. @@ -508,6 +554,20 @@ void reshape_and_cache_flash( kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ reinterpret_cast(scale.data_ptr())); +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. +#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ + vllm::cp_fused_concat_and_cache_mla_kernel \ + <<>>( \ + reinterpret_cast(kv_c.data_ptr()), \ + reinterpret_cast(k_pe.data_ptr()), \ + cp_local_token_select_indices.data_ptr(), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, entry_stride, \ + kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ + reinterpret_cast(scale.data_ptr())); + void concat_and_cache_mla( torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] torch::Tensor& k_pe, // [num_tokens, pe_dim] @@ -546,6 +606,50 @@ void concat_and_cache_mla( CALL_CONCAT_AND_CACHE_MLA); } +// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel +// calls into one: +// k_c_normed.index_select(0, cp_local_token_select_indices) + \ +// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \ +// concat_and_cache_mla. 
+void cp_fused_concat_and_cache_mla( + torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank] + torch::Tensor& k_pe, // [num_total_tokens, pe_dim] + torch::Tensor& cp_local_token_select_indices, // [num_tokens] + torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + + // pe_dim)] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] + const std::string& kv_cache_dtype, torch::Tensor& scale) { + // NOTE(woosuk): In vLLM V1, key.size(0) can be different from + // slot_mapping.size(0) because of padding for CUDA graphs. + // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because + // both include padding. + // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) + // since key includes padding for CUDA graphs, while slot_mapping does not. + // In this case, slot_mapping.size(0) represents the actual number of tokens + // before padding. + // For compatibility with both cases, we use slot_mapping.size(0) as the + // number of tokens. + int num_tokens = slot_mapping.size(0); + int kv_lora_rank = kv_c.size(1); + int pe_dim = k_pe.size(1); + int block_size = kv_cache.size(1); + + TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); + + int kv_c_stride = kv_c.stride(0); + int k_pe_stride = k_pe.stride(0); + int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); + + dim3 grid(num_tokens); + dim3 block(std::min(kv_lora_rank, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, + CALL_CP_FUSED_CONCAT_AND_CACHE_MLA); +} + namespace vllm { template @@ -779,3 +883,146 @@ void gather_and_maybe_dequant_cache( DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE); } + +namespace vllm { +template +// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by +// block_size. 
+__global__ void cp_gather_cache( + const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, + // ENTRY_SIZE] + scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRY_SIZE] + const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] + const int32_t block_size, const int32_t entry_size, + const int64_t block_table_stride, const int64_t cache_block_stride, + const int64_t cache_entry_stride, const int64_t dst_entry_stride, + const int32_t* __restrict__ seq_starts // Optional: starting offsets per + // batch +) { + const int64_t bid = blockIdx.x; // Batch ID + const int32_t num_splits = gridDim.y; + const int32_t split = blockIdx.y; + const int32_t seq_start = cu_seq_lens[bid]; + const int32_t seq_end = cu_seq_lens[bid + 1]; + const int32_t seq_len = seq_end - seq_start; + const int32_t tot_slots = seq_len; + const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits); + + const int32_t split_start = split * split_slots; + const int32_t split_end = min((split + 1) * split_slots, tot_slots); + + const bool is_active_split = (split_start < tot_slots); + const bool is_last_split = (split_end == tot_slots); + + if (!is_active_split) return; + + // Adjust the pointer for the block_table for this batch. + // If seq_starts is provided, compute an offset based on it + const int32_t batch_offset = bid * block_table_stride; + int32_t offset = split_start; + if (seq_starts != nullptr) { + offset += seq_starts[bid]; + } + int32_t offset_div = offset / block_size; + offset = offset % block_size; + const int32_t* batch_block_table = block_table + batch_offset; + + // Adjust dst pointer based on the cumulative sequence lengths. 
+ dst += seq_start * dst_entry_stride; + + auto copy_entry = [&](const scalar_t* __restrict__ _src, + scalar_t* __restrict__ _dst) { + for (int i = threadIdx.x; i < entry_size; i += blockDim.x) + _dst[i] = _src[i]; + }; + + for (int pid = split_start; pid < split_end; ++pid) { + auto block_id = batch_block_table[offset_div]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = dst + pid * dst_entry_stride; + copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr); + offset += 1; + // bump to next block + if (offset == block_size) { + offset_div += 1; + offset = 0; + } + } +} +} // namespace vllm + +// Macro to dispatch the kernel based on the data type. +#define CALL_CP_GATHER_CACHE(CPY_DTYPE) \ + vllm::cp_gather_cache<<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + block_size, entry_size, block_table_stride, cache_block_stride, \ + cache_entry_stride, dst_entry_stride, seq_starts_ptr); + +// Gather sequences from the cache into the destination tensor. +// - cu_seq_lens contains the cumulative sequence lengths for each batch +// - block_table contains the cache block indices for each sequence +// - Optionally, seq_starts (if provided) offsets the starting slot index by +// seq_starts[bid] +void cp_gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] 
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, + std::optional seq_starts = std::nullopt) { + at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int32_t block_size = src_cache.size(1); + int32_t entry_size = src_cache.flatten(2, -1).size(2); + + TORCH_CHECK(block_table.dtype() == torch::kInt32, + "block_table must be int32"); + TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32, + "cu_seq_lens must be int32"); + if (seq_starts.has_value()) { + TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, + "seq_starts must be int32"); + } + + TORCH_CHECK(src_cache.device() == dst.device(), + "src_cache and dst must be on the same device"); + TORCH_CHECK(src_cache.device() == block_table.device(), + "src_cache and block_table must be on the same device"); + TORCH_CHECK(src_cache.device() == cu_seq_lens.device(), + "src_cache and cu_seq_lens must be on the same device"); + if (seq_starts.has_value()) { + TORCH_CHECK(src_cache.device() == seq_starts.value().device(), + "src_cache and seq_starts must be on the same device"); + } + + int64_t block_table_stride = block_table.stride(0); + int64_t cache_block_stride = src_cache.stride(0); + int64_t cache_entry_stride = src_cache.stride(1); + int64_t dst_entry_stride = dst.stride(0); + + // Decide on the number of splits based on the batch size. + int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; + dim3 grid(batch_size, num_splits); + dim3 block(1024); + + TORCH_CHECK(src_cache.dtype() == dst.dtype(), + "src_cache and dst must have the same dtype"); + + const int dtype_bits = src_cache.element_size() * 8; + const int32_t* seq_starts_ptr = + seq_starts.has_value() ? 
seq_starts.value().data_ptr() : nullptr; + + if (dtype_bits == 32) { + CALL_CP_GATHER_CACHE(uint32_t); + } else if (dtype_bits == 16) { + CALL_CP_GATHER_CACHE(uint16_t); + } else if (dtype_bits == 8) { + CALL_CP_GATHER_CACHE(uint8_t); + } else { + TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits); + } +} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7ae054dc19fbd..608b724403076 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -686,6 +686,16 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor scale) -> ()"); cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla); + cache_ops.def( + "cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe," + " Tensor cp_local_token_select_indices," + " Tensor! kv_cache," + " Tensor slot_mapping," + " str kv_cache_dtype," + " Tensor scale) -> ()"); + cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA, + &cp_fused_concat_and_cache_mla); + // Convert the key and value cache to fp8 data type. cache_ops.def( "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, " @@ -702,6 +712,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor scale, Tensor? seq_starts) -> ()"); cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA, &gather_and_maybe_dequant_cache); + + cache_ops.def( + "cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, " + "Tensor cu_seq_lens, int batch_size, Tensor? 
seq_starts) -> ()"); + cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache); } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index cbf11da63cab9..69e96dfd2cb13 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -790,6 +790,78 @@ def test_gather_and_maybe_dequant_cache_mla(kv_lora_rank, qk_rope_head_dim, torch.testing.assert_close(dst, expected) +@pytest.mark.parametrize("kv_lora_rank", [512]) +@pytest.mark.parametrize("qk_rope_head_dim", [64]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_blocks", [1024]) +@pytest.mark.parametrize("max_seq_len", [512]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("dtype", [torch.float32]) +@pytest.mark.parametrize("kv_cache_dtype", + ["auto"]) # You can also test "fp8" if needed. +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_cp_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size, + num_blocks, max_seq_len, batch_size, dtype, + kv_cache_dtype, device): + entry_size = kv_lora_rank + qk_rope_head_dim + src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device) + _fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype) + + seq_len_tensor = torch.randint(0, + max_seq_len + 1, (batch_size, ), + device=device) + + total_tokens = seq_len_tensor.sum() + cu_seq_lens = torch.empty((batch_size + 1), + dtype=torch.int32, + device=device) + cu_seq_lens[0] = 0 + cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32) + print("seq_len_tensor", seq_len_tensor) + + tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size + block_table = torch.empty((batch_size, num_blocks), + dtype=torch.int32, + device=device) + + for b in range(batch_size): + perm = torch.randperm(num_blocks, device=device) + 
block_table[b, :] = perm + + dst = torch.zeros((total_tokens, entry_size), + dtype=src_cache.dtype, + device=device) + + expected_batches = [] + for b in range(batch_size): + s = seq_len_tensor[b] + if s == 0: + continue + tot = tot_blocks_tensor[b] + blocks = block_table[b, :tot].tolist() + + gathered_rows = [] + for i in range(tot - 1): + gathered_rows.append(src_cache[blocks[i]]) + remaining = s - (tot - 1) * block_size + gathered_rows.append(src_cache[blocks[-1], :remaining, :]) + + batch_expected = torch.cat(gathered_rows, dim=0) + expected_batches.append(batch_expected) + expected = torch.cat(expected_batches, dim=0) + + opcheck( + torch.ops._C_cache_ops.cp_gather_cache, + (src_cache, dst, block_table, cu_seq_lens, batch_size, None), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + + ops.cp_gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size) + torch.testing.assert_close(dst, expected) + + @pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) @pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 054dc9d985a4c..340d6e1164e4f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1625,6 +1625,20 @@ def concat_and_cache_mla( scale) +def cp_fused_concat_and_cache_mla( + kv_c: torch.Tensor, + k_pe: torch.Tensor, + cp_local_token_select_indices: torch.Tensor, + kv_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + scale: torch.Tensor, +) -> None: + torch.ops._C_cache_ops.cp_fused_concat_and_cache_mla( + kv_c, k_pe, cp_local_token_select_indices, kv_cache, slot_mapping, + kv_cache_dtype, scale) + + def copy_blocks(key_caches: list[torch.Tensor], value_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: @@ -1662,6 +1676,16 @@ def gather_and_maybe_dequant_cache( scale, seq_starts) +def cp_gather_cache(src_cache: torch.Tensor, + dst: torch.Tensor, + block_table: torch.Tensor, + 
cu_seq_lens: torch.Tensor, + batch_size: int, + seq_starts: Optional[torch.Tensor] = None) -> None: + torch.ops._C_cache_ops.cp_gather_cache(src_cache, dst, block_table, + cu_seq_lens, batch_size, seq_starts) + + def get_device_attribute(attribute: int, device: int) -> int: return torch.ops._C_cuda_utils.get_device_attribute(attribute, device) From 11a7fafaa8807bfeea4b60466c576ec6a7031bfd Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 28 Aug 2025 15:36:42 +0800 Subject: [PATCH 07/19] [New Model]: Support GteNewModelForSequenceClassification (#23524) Signed-off-by: wang.yuqi --- docs/models/supported_models.md | 4 + tests/conftest.py | 5 +- tests/models/language/pooling/embed_utils.py | 3 + tests/models/language/pooling/mteb_utils.py | 6 ++ .../pooling/test_bge_reranker_v2_gemma.py | 24 ++---- tests/models/language/pooling/test_gte.py | 24 +++--- .../language/pooling/test_mxbai_rerank.py | 19 ++--- .../language/pooling/test_qwen3_reranker.py | 26 ++---- tests/models/registry.py | 4 + tests/models/utils.py | 28 ++++--- vllm/model_executor/models/bert_with_rope.py | 83 +++++++++++++++++-- vllm/model_executor/models/config.py | 1 + vllm/model_executor/models/registry.py | 6 +- 13 files changed, 157 insertions(+), 76 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 20cf75873af76..34e465584888b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -497,6 +497,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | +| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. 
| | | ✅︎ | | `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ | @@ -513,6 +514,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}' ``` +!!! note + The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. + !!! note Load the official original `mxbai-rerank-v2` by using the following command. 
diff --git a/tests/conftest.py b/tests/conftest.py index 6052ada1c5fd7..9fed43cba54b6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -456,11 +456,10 @@ class HfRunner: # output is final logits all_inputs = self.get_inputs(prompts) outputs = [] + problem_type = getattr(self.config, "problem_type", "") + for inputs in all_inputs: output = self.model(**self.wrap_device(inputs)) - - problem_type = getattr(self.config, "problem_type", "") - if problem_type == "regression": logits = output.logits[0].tolist() elif problem_type == "multi_label_classification": diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 61c5fcab4f8a4..a74ad2aa25972 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -51,6 +51,9 @@ def correctness_test_embed_models(hf_runner, vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype + if model_info.hf_overrides is not None: + vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + with vllm_runner(model_info.name, runner="pooling", max_model_len=None, diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 4a1f8a53d024c..640858125bfca 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -172,6 +172,9 @@ def mteb_test_embed_models(hf_runner, vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype + if model_info.hf_overrides is not None: + vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + with vllm_runner(model_info.name, runner="pooling", max_model_len=None, @@ -284,6 +287,9 @@ def mteb_test_rerank_models(hf_runner, vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype + if model_info.hf_overrides is not None: + vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + with 
vllm_runner(model_info.name, runner="pooling", max_model_len=None, diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py index 206524d7caad3..f473e0ba01ffa 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py @@ -13,7 +13,14 @@ from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models RERANK_MODELS = [ LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma", - architecture="GemmaForSequenceClassification"), + architecture="GemmaForSequenceClassification", + hf_overrides={ + "architectures": + ["GemmaForSequenceClassification"], + "classifier_from_token": ["Yes"], + "method": + "no_post_processing", + }), ] PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501 @@ -119,22 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder): @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo, - monkeypatch) -> None: - monkeypatch.setenv("VLLM_USE_V1", "0") - - assert model_info.architecture == "GemmaForSequenceClassification" - - vllm_extra_kwargs: dict[str, Any] = { - "hf_overrides": { - "architectures": ["GemmaForSequenceClassification"], - "classifier_from_token": ["Yes"], - "method": "no_post_processing", - } - } +def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: mteb_test_rerank_models(GemmaRerankerHfRunner, vllm_runner, model_info, - vllm_extra_kwargs, vllm_mteb_encoder=GemmaMtebEncoder) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index f805a64103c06..9911620c018ef 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any import pytest @@ -33,12 +32,15 @@ MODELS = [ ########### NewModel CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base", architecture="GteNewModel", + hf_overrides={"architectures": ["GteNewModel"]}, enable_test=True), CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", architecture="GteNewModel", + hf_overrides={"architectures": ["GteNewModel"]}, enable_test=True), CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", architecture="GteNewModel", + hf_overrides={"architectures": ["GteNewModel"]}, enable_test=True), ########### Qwen2ForCausalLM LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", @@ -60,11 +62,16 @@ MODELS = [ ] RERANK_MODELS = [ - # classifier_pooling: mean CLSPoolingRerankModelInfo( + # classifier_pooling: mean "Alibaba-NLP/gte-reranker-modernbert-base", architecture="ModernBertForSequenceClassification", enable_test=True), + CLSPoolingRerankModelInfo( + "Alibaba-NLP/gte-multilingual-reranker-base", + architecture="GteNewForSequenceClassification", + hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, + enable_test=True), ] @@ -75,12 +82,7 @@ def test_embed_models_mteb(hf_runner, vllm_runner, check_transformers_version(model_info.name, max_transformers_version="4.53.2") - vllm_extra_kwargs: dict[str, Any] = {} - if model_info.architecture == "GteNewModel": - vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} - - mteb_test_embed_models(hf_runner, vllm_runner, model_info, - vllm_extra_kwargs) + mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) @@ -91,12 +93,8 @@ def test_embed_models_correctness(hf_runner, vllm_runner, check_transformers_version(model_info.name, max_transformers_version="4.53.2") - vllm_extra_kwargs: dict[str, Any] = {} - if model_info.architecture == "GteNewModel": - vllm_extra_kwargs["hf_overrides"] = {"architectures": 
["GteNewModel"]} - correctness_test_embed_models(hf_runner, vllm_runner, model_info, - example_prompts, vllm_extra_kwargs) + example_prompts) @pytest.mark.parametrize("model_info", RERANK_MODELS) diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling/test_mxbai_rerank.py index 480bd5e4567cb..73823deeff4e0 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling/test_mxbai_rerank.py @@ -10,12 +10,20 @@ from tests.conftest import HfRunner from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo from .mteb_utils import mteb_test_rerank_models +mxbai_rerank_hf_overrides = { + "architectures": ["Qwen2ForSequenceClassification"], + "classifier_from_token": ["0", "1"], + "method": "from_2_way_softmax", +} + RERANK_MODELS = [ LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", architecture="Qwen2ForSequenceClassification", + hf_overrides=mxbai_rerank_hf_overrides, enable_test=True), LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", architecture="Qwen2ForSequenceClassification", + hf_overrides=mxbai_rerank_hf_overrides, enable_test=False) ] @@ -71,13 +79,4 @@ class MxbaiRerankerHfRunner(HfRunner): @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: - vllm_extra_kwargs: dict[str, Any] = {} - if model_info.architecture == "Qwen2ForSequenceClassification": - vllm_extra_kwargs["hf_overrides"] = { - "architectures": ["Qwen2ForSequenceClassification"], - "classifier_from_token": ["0", "1"], - "method": "from_2_way_softmax", - } - - mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info, - vllm_extra_kwargs) + mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 37f5566a330d0..8c6537f3193f8 100644 --- 
a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -11,12 +11,20 @@ from tests.utils import multi_gpu_test from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo from .mteb_utils import mteb_test_rerank_models +qwen3_reranker_hf_overrides = { + "architectures": ["Qwen3ForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": True, +} + RERANK_MODELS = [ LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B", architecture="Qwen3ForSequenceClassification", + hf_overrides=qwen3_reranker_hf_overrides, enable_test=True), LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B", architecture="Qwen3ForSequenceClassification", + hf_overrides=qwen3_reranker_hf_overrides, enable_test=False) ] @@ -74,18 +82,7 @@ class Qwen3RerankerHfRunner(HfRunner): @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: - assert model_info.architecture == "Qwen3ForSequenceClassification" - - vllm_extra_kwargs: dict[str, Any] = { - "hf_overrides": { - "architectures": ["Qwen3ForSequenceClassification"], - "classifier_from_token": ["no", "yes"], - "is_original_qwen3_reranker": True, - } - } - - mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, - vllm_extra_kwargs) + mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", RERANK_MODELS) @@ -96,11 +93,6 @@ def test_rerank_models_mteb_tp(vllm_runner, assert model_info.architecture == "Qwen3ForSequenceClassification" vllm_extra_kwargs: dict[str, Any] = { - "hf_overrides": { - "architectures": ["Qwen3ForSequenceClassification"], - "classifier_from_token": ["no", "yes"], - "is_original_qwen3_reranker": True, - }, "tensor_parallel_size": 2, } diff --git a/tests/models/registry.py b/tests/models/registry.py index 2538e71692c4e..85b4c96e3b1c3 100644 --- a/tests/models/registry.py +++ 
b/tests/models/registry.py @@ -365,6 +365,10 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { # [Cross-encoder] "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501 + "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501 + trust_remote_code=True, + hf_overrides={ + "architectures": ["GteNewForSequenceClassification"]}),# noqa: E501 "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501 "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501 "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501 diff --git a/tests/models/utils.py b/tests/models/utils.py index 84aeb927c5fa9..0fb1f5b3753b5 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -3,7 +3,8 @@ import warnings from collections.abc import Sequence -from typing import Any, NamedTuple, Optional, Union +from dataclasses import dataclass +from typing import Any, Optional, Union import torch import torch.nn.functional as F @@ -339,36 +340,43 @@ def softmax(data): return F.softmax(data, dim=-1) -class EmbedModelInfo(NamedTuple): +@dataclass +class ModelInfo: name: str - is_matryoshka: bool = False - matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" dtype: str = "auto" + hf_overrides: Optional[dict[str, Any]] = None default_pooling_type: str = "" enable_test: bool = True +@dataclass +class EmbedModelInfo(ModelInfo): + is_matryoshka: bool = False + matryoshka_dimensions: Optional[list[int]] = None + + +@dataclass class CLSPoolingEmbedModelInfo(EmbedModelInfo): default_pooling_type: str = "CLS" +@dataclass class LASTPoolingEmbedModelInfo(EmbedModelInfo): default_pooling_type: str = "LAST" -class RerankModelInfo(NamedTuple): - name: str - architecture: str = "" - dtype: str = "auto" - 
default_pooling_type: str = "" - enable_test: bool = True +@dataclass +class RerankModelInfo(ModelInfo): + pass +@dataclass class CLSPoolingRerankModelInfo(RerankModelInfo): default_pooling_type: str = "CLS" +@dataclass class LASTPoolingRerankModelInfo(RerankModelInfo): default_pooling_type: str = "LAST" diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index dcb7e75456cde..3be7e11d947d5 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -27,12 +27,15 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.utils import WeightsMapper +from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, + maybe_prefix) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from .interfaces import SupportsQuant +from ..layers.pooler import ClassifierPooler, DispatchPooler, Pooler +from .bert import BertPooler +from .interfaces import SupportsCrossEncoding, SupportsQuant from .interfaces_base import default_pooling_type @@ -406,9 +409,14 @@ class BertWithRopeEncoder(nn.Module): class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + add_pooling_layer: bool = False): super().__init__() self.vllm_config = vllm_config + self.add_pooling_layer = add_pooling_layer self.config = vllm_config.model_config.hf_config self.embeddings = BertWithRopeEmbedding(self.config) self.encoder = BertWithRopeEncoder( @@ -416,6 +424,7 @@ class BertWithRope(nn.Module, 
SupportsQuant): bias=getattr(self.config, "bias", True), rotary_kwargs=self.config.rotary_kwargs, prefix=f"{prefix}.encoder") + self.pooler = BertPooler(self.config) if add_pooling_layer else None def forward( self, @@ -448,7 +457,7 @@ class BertWithRope(nn.Module, SupportsQuant): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if "pooler" in name: + if not self.add_pooling_layer and "pooler" in name: continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -508,8 +517,8 @@ class GteNewModel(BertWithRope): "attention.o_proj": "attn.out_proj", }) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs): + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) # GteNewModel only gate_up_proj does not have bias. # Hack method learned from vllm/model_executor/models/glm.py @@ -614,3 +623,65 @@ class JinaRobertaModel(BertWithRope): torch.Tensor]]) -> set[str]: weights = self.jina_merge_lora_weights(weights) return super().load_weights(weights) + + +@default_pooling_type("CLS") +class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding): + is_pooling_model = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.new = GteNewModel(vllm_config=vllm_config, + prefix=prefix, + add_pooling_layer=True) + self.classifier = RowParallelLinear(config.hidden_size, + config.num_labels, + input_is_parallel=False, + bias=True, + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "classifier"), + return_bias=False) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + 
Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=self.new.pooler, + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=self.new.pooler, + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loaded_params = loader.load_weights(weights) + return loaded_params + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + return self.new(input_ids=input_ids, + positions=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index b0dbfacece3ab..377b7bf26a07a 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -406,6 +406,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GteModel": SnowflakeGteNewModelConfig, "GteNewModel": GteNewModelConfig, + "GteNewForSequenceClassification": GteNewModelConfig, "NomicBertModel": NomicBertModelConfig, "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 12c0c77784db8..9040189ee5585 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -191,12 +191,14 @@ _EMBEDDING_MODELS = { _CROSS_ENCODER_MODELS = { "BertForSequenceClassification": ("bert", "BertForSequenceClassification"), + "GteNewForSequenceClassification": ("bert_with_rope", + 
"GteNewForSequenceClassification"), + "ModernBertForSequenceClassification": ("modernbert", + "ModernBertForSequenceClassification"), "RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"), "XLMRobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"), - "ModernBertForSequenceClassification": ("modernbert", - "ModernBertForSequenceClassification"), # [Auto-converted (see adapters.py)] "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501, } From c5d004aaaf3b2106d33974c673bec0568c18f762 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 28 Aug 2025 16:03:28 +0800 Subject: [PATCH 08/19] [Model] Add PP support and VLM backbone compatability for GPT-OSS (#23680) Signed-off-by: Isotr0py --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/gpt_oss.py | 119 +++++++++++++++++++------- 2 files changed, 87 insertions(+), 34 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 34e465584888b..17947e8cfad72 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -358,7 +358,7 @@ th { | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | | `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. 
| | ✅︎ | ✅︎ | -| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | | ✅︎ | +| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | ✅︎ | | `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 9c1c05320cf36..2b118d8491edd 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -11,7 +11,8 @@ from transformers import GptOssConfig from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_ep_group, get_tensor_model_parallel_rank, +from vllm.distributed import (get_ep_group, get_pp_group, + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -27,7 +28,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import cdiv +from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -75,8 +79,6 @@ class OAIAttention(nn.Module): dtype=torch.bfloat16, requires_grad=False)) - self.norm = RMSNorm(config.hidden_size, eps=1e-5) - self.q_size = self.num_attention_heads * 
self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.scaling = self.head_dim**-0.5 @@ -119,16 +121,13 @@ class OAIAttention(nn.Module): def forward(self, hidden_states: torch.Tensor, positions: torch.Tensor) -> torch.Tensor: - t = self.norm(hidden_states) - - qkv, _ = self.qkv(t) + qkv, _ = self.qkv(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) v = v.contiguous() attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) - - return output + hidden_states + return output class MLPBlock(torch.nn.Module): @@ -145,7 +144,6 @@ class MLPBlock(torch.nn.Module): self.num_experts = config.num_local_experts self.experts_per_token = config.num_experts_per_tok self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - self.norm = RMSNorm(config.hidden_size, eps=1e-5) self.router = torch.nn.Linear(config.hidden_size, config.num_local_experts, dtype=torch.bfloat16) @@ -163,10 +161,9 @@ class MLPBlock(torch.nn.Module): activation="swigluoai") def forward(self, x: torch.Tensor) -> torch.Tensor: - t = self.norm(x) - g = self.router(t) - t = self.experts(hidden_states=t, router_logits=g) - return x + t + g = self.router(x) + x = self.experts(hidden_states=x, router_logits=g) + return x class TransformerBlock(torch.nn.Module): @@ -187,12 +184,28 @@ class TransformerBlock(torch.nn.Module): self.layer_idx, quant_config=quant_config, prefix=f"{prefix}.mlp") + self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=1e-5) - def forward(self, hidden_states: torch.Tensor, - positions: torch.Tensor) -> torch.Tensor: - attn_output = self.attn(hidden_states, positions) - output = self.mlp(attn_output) - return output + def forward( + self, + hidden_states: torch.Tensor, + positions: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if 
residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.attn(hidden_states, positions) + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + output = self.mlp(hidden_states) + return output, residual @support_torch_compile @@ -214,22 +227,52 @@ class GptOssModel(nn.Module): self.config.vocab_size, self.config.hidden_size, ) - self.layers = torch.nn.ModuleList([ - TransformerBlock( + self.start_layer, self.end_layer, self.layers = make_layers( + self.config.num_hidden_layers, + lambda prefix: TransformerBlock( self.config, cache_config=self.cache_config, quant_config=self.quant_config, - prefix=maybe_prefix(prefix, f"block.{layer_idx}"), - ) for layer_idx in range(self.config.num_hidden_layers) - ]) + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) self.norm = RMSNorm(self.config.hidden_size, eps=1e-5) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], self.config.hidden_size)) - def forward(self, input_ids: torch.Tensor, - positions: torch.Tensor) -> torch.Tensor: - x = self.embedding(input_ids) - for layer in self.layers: - x = layer(x, positions) - x = self.norm(x) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embedding(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + x = inputs_embeds + else: + x = self.get_input_embeddings(input_ids) + + residual = None + else: + assert intermediate_tensors is not None + x = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in 
range(self.start_layer, self.end_layer): + layer = self.layers[i] + x, residual = layer(x, positions, residual) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": x, + "residual": residual + }) + x, _ = self.norm(x, residual) return x def _load_weights_mxfp4( @@ -264,6 +307,10 @@ class GptOssModel(nn.Module): intermediate_size) for name, weight in weights: + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # FIXME(woosuk): Remove this after testing. weight = weight.cuda() @@ -445,6 +492,10 @@ class GptOssModel(nn.Module): intermediate_size) for name, weight in weights: + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + if ".w13_weight" in name: # Handle MLP gate and up projection weights # Extract gate and up projection parts @@ -562,18 +613,15 @@ class GptOssModel(nn.Module): weights, stacked_params_mapping) -class GptOssForCausalLM(nn.Module): +class GptOssForCausalLM(nn.Module, SupportsPP): packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]} hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".self_attn.": ".attn.", - ".post_attention_layernorm.": ".mlp.norm.", }, orig_to_new_suffix={ ".embed_tokens.weight": ".embedding.weight", - ".input_layernorm.weight": ".attn.norm.weight", - ".post_attention_layernorm.weight": ".mlp.norm.weight", # MoE MXFP4 weights ".gate_up_proj_blocks": ".w13_weight", @@ -609,6 +657,11 @@ class GptOssForCausalLM(nn.Module): self.config.hidden_size, ) self.logits_processor = LogitsProcessor(self.config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) def forward(self, input_ids: torch.Tensor, From 3462c1c522d214755f1dfce3d645ab5afe7f00ae Mon Sep 17 00:00:00 2001 From: JartX Date: Thu, 28 Aug 2025 11:03:22 +0200 Subject: [PATCH 
09/19] [FIXBUG] Add return_success parameter to moe_wna16_weight_loader function (#22797) Signed-off-by: JartX Co-authored-by: Michael Goin --- .../layers/quantization/moe_wna16.py | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 364d1ac314d2d..0cde104cc75d7 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -124,7 +124,7 @@ class MoeWNA16Config(QuantizationConfig): awq_min_capability = AWQConfig.get_min_capability() gptq_compatible = quant_method == "gptq" and \ - not desc_act and num_bits in [4, 8] + not desc_act and num_bits in [4, 8] awq_compatible = quant_method == "awq" and num_bits == 4 and \ device_capability >= awq_min_capability @@ -175,11 +175,8 @@ class MoeWNA16Method(FusedMoEMethodBase): quant_config: The MOE WNA16 (W8A16/W4A16) quantization config. 
""" - def __init__( - self, - quant_config: MoeWNA16Config, - moe: FusedMoEConfig, - ): + def __init__(self, quant_config: MoeWNA16Config, + moe: "FusedMoEConfig") -> None: super().__init__(moe) self.quant_config = quant_config @@ -187,6 +184,7 @@ class MoeWNA16Method(FusedMoEMethodBase): hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): + self.moe = layer layer.quant_config = self.quant_config bit8_pack_factor = self.quant_config.bit8_pack_factor group_size = self.quant_config.group_size @@ -308,7 +306,6 @@ class MoeWNA16Method(FusedMoEMethodBase): logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `MoeWNA16Method` yet.") @@ -404,12 +401,14 @@ class MoeWNA16Method(FusedMoEMethodBase): def moe_wna16_weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor, - weight_name: str, shard_id: str, - expert_id: int): + weight_name: str, + shard_id: str, + expert_id: int, + return_success: bool = False): if "g_idx" in weight_name: - return + return False if return_success else None if not layer.quant_config.has_zp and "qzeros" in weight_name: - return + return False if return_success else None device = get_tp_group().device tp_rank = get_tensor_model_parallel_rank() @@ -455,11 +454,18 @@ class MoeWNA16Method(FusedMoEMethodBase): param.data[expert_id, :shard_size // 2] = tensor else: param.data[expert_id, shard_size // 2:] = tensor + return True if return_success else None elif "w2_qzeros" in weight_name: param.data[expert_id] = loaded_weight.view( loaded_weight.size(0), layer.tp_size, -1)[:, tp_rank] + return True if return_success else None else: - weight_loader(param, loaded_weight, weight_name, shard_id, - expert_id) + # Delegate to the original loader, passing return_success + return weight_loader(param, + loaded_weight, + weight_name, + shard_id, + expert_id, + 
return_success=return_success) return moe_wna16_weight_loader From d99c3a4f7bd33e3e3acf7c2c82d52d15ba501eaf Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Thu, 28 Aug 2025 13:38:19 +0200 Subject: [PATCH 10/19] [Doc]: fix typos in .md files (including those of #23751) (#23825) Signed-off-by: Didier Durand --- docs/contributing/ci/update_pytorch_version.md | 2 +- docs/contributing/model/multimodal.md | 2 +- docs/deployment/frameworks/lobe-chat.md | 2 +- docs/deployment/k8s.md | 2 +- docs/design/fused_moe_modular_kernel.md | 2 +- docs/design/metrics.md | 4 ++-- docs/features/lora.md | 2 +- docs/features/reasoning_outputs.md | 2 +- docs/features/structured_outputs.md | 2 +- docs/getting_started/installation/aws_neuron.md | 4 ++-- docs/getting_started/installation/cpu/apple.inc.md | 2 +- docs/getting_started/installation/gpu/cuda.inc.md | 2 +- docs/getting_started/installation/gpu/rocm.inc.md | 4 ++-- docs/models/pooling_models.md | 2 +- docs/models/supported_models.md | 2 +- docs/usage/usage_stats.md | 2 +- 16 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 7ef22d6f8c3f5..3dae62dd5d944 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -90,7 +90,7 @@ address the long build time at its source, the current workaround is to set `VLL to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`) when manually triggering a build on Buildkite. This branch accomplishes two things: -1. Increase the timeout limit to 10 hours so that the build doesn't timeout. +1. Increase the timeout limit to 10 hours so that the build doesn't time out. 2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket to warm it up so that future builds are faster. 
diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 76d0f067fd452..dc742c8fcf2cd 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -855,7 +855,7 @@ Examples: ### Custom HF processor -Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]. +Some models don't define an HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]. Examples: diff --git a/docs/deployment/frameworks/lobe-chat.md b/docs/deployment/frameworks/lobe-chat.md index e3e7dbe6e1e80..8ecd1484eab06 100644 --- a/docs/deployment/frameworks/lobe-chat.md +++ b/docs/deployment/frameworks/lobe-chat.md @@ -6,6 +6,6 @@ Supports speech-synthesis, multi-modal, and extensible (function call) plugin sy One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application. -It supports vLLM as a AI model provider to efficiently serve large language models. +It supports vLLM as an AI model provider to efficiently serve large language models. For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm). 
diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index cad801a4312cc..ca23e0b9fd8af 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -380,7 +380,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated" -If the startup or readiness probe failureThreshold is too low for the time needed to startup the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened: +If the startup or readiness probe failureThreshold is too low for the time needed to start up the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened: 1. container log contains "KeyboardInterrupt: terminated" 2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted` diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 202e9c1caf113..b03483d1c9b21 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C #### Step 1: Add an All2All manager -The purpose of the All2All Manager is to setup the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py). +The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. 
Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py). #### Step 2: Add a FusedMoEPrepareAndFinalize Type diff --git a/docs/design/metrics.md b/docs/design/metrics.md index b24364247b3f8..90b2fd32f2979 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -99,11 +99,11 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201 ### Multi-process Mode -In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See . +In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See . ### Built in Python/Process Metrics -The following metrics are supported by default by `prometheus_client`, but they are not exposed when multi-process mode is used: +The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used: - `python_gc_objects_collected_total` - `python_gc_objects_uncollectable_total` diff --git a/docs/features/lora.md b/docs/features/lora.md index 668460a368a77..db794b2ebd71d 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -52,7 +52,7 @@ Check out for an exa ## Serving LoRA Adapters LoRA adapted models can also be served with the Open-AI compatible vLLM server. 
To do so, we use -`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kickoff the server: +`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server: ```bash vllm serve meta-llama/Llama-2-7b-hf \ diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 04b943efbbbb4..d9a785eb73fbe 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -143,7 +143,7 @@ OpenAI Python client library does not officially support `reasoning_content` att print(content, end="", flush=True) ``` -Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). +Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). ## Tool Calling diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 8a934d406f382..0d6294a5fdd79 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -205,7 +205,7 @@ This section covers the OpenAI beta wrapper over the `client.chat.completions.cr At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104). 
-For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.1-8B-Instruct` +For the following examples, vLLM was set up using `vllm serve meta-llama/Llama-3.1-8B-Instruct` Here is a simple example demonstrating how to get structured output using Pydantic models: diff --git a/docs/getting_started/installation/aws_neuron.md b/docs/getting_started/installation/aws_neuron.md index b8bd76bd5bcbe..ff2500f035270 100644 --- a/docs/getting_started/installation/aws_neuron.md +++ b/docs/getting_started/installation/aws_neuron.md @@ -140,8 +140,8 @@ Alternatively, users can directly call the NxDI library to trace and compile you - `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the - artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set, - but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts + artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set, + but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts under this specified path. - `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend). - `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend). 
diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md index 2828173a76a9a..124a41adf1ae2 100644 --- a/docs/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS. +vLLM has experimental support for macOS with Apple Silicon. For now, users must build from source to natively run on macOS. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 69a9842e4719b..275232e12e08c 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -48,7 +48,7 @@ uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VE #### Install the latest code -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. 
```bash uv pip install -U vllm \ diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 560883d3caf9e..80e99d3034d39 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -149,7 +149,7 @@ Build a docker image from which setup ROCm **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.** If you choose to build this rocm_base image yourself, the steps are as follows. -It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: +It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: ```json { @@ -170,7 +170,7 @@ DOCKER_BUILDKIT=1 docker build \ #### Build an image with vLLM First, build a docker image from and launch a docker container from the image. -It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: +It is important that the user kicks off the docker build using buildkit. 
Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: ```bash { diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index fbb5f6f6dd171..d2fbb1870dde6 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -258,4 +258,4 @@ Expected output: {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` -A openai client example can be found here: +An OpenAI client example can be found here: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 17947e8cfad72..01c1090c6fca8 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -40,7 +40,7 @@ If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it mean #### Custom models -If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! +If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM! 
For a model to be compatible with the Transformers backend for vLLM it must: diff --git a/docs/usage/usage_stats.md b/docs/usage/usage_stats.md index e78c67522f61b..4c7a7ff019e8c 100644 --- a/docs/usage/usage_stats.md +++ b/docs/usage/usage_stats.md @@ -51,7 +51,7 @@ tail ~/.config/vllm/usage_stats.json ## Opting out -You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: +You can opt out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: ```bash # Any of the following methods can disable usage stats collection From 67cee40da035b7478483c76dfbe0bfc321c3822f Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 28 Aug 2025 19:57:05 +0800 Subject: [PATCH 11/19] [CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 20 +++++++++---------- vllm/model_executor/models/utils.py | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 9dec9f8e9eb32..8b8f0e8c6578d 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -49,23 +49,23 @@ function cpu_tests() { # Run kernel tests docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -v -s tests/kernels/test_onednn.py" + pytest -x -v -s tests/kernels/test_onednn.py" # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e # Note: disable until supports V1 - # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model - # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model + # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m 
cpu_model # Note: disable Bart until supports V1 - pytest -v -s tests/models/language/generation -m cpu_model \ + pytest -x -v -s tests/models/language/generation -m cpu_model \ --ignore=tests/models/language/generation/test_bart.py - VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ + VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \ --ignore=tests/models/language/generation/test_bart.py - pytest -v -s tests/models/language/pooling -m cpu_model - pytest -v -s tests/models/multimodal/generation \ + pytest -x -v -s tests/models/language/pooling -m cpu_model + pytest -x -v -s tests/models/multimodal/generation \ --ignore=tests/models/multimodal/generation/test_mllama.py \ --ignore=tests/models/multimodal/generation/test_pixtral.py \ -m cpu_model" @@ -73,20 +73,20 @@ function cpu_tests() { # Run compressed-tensor test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + pytest -x -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" # Note: disable it until supports V1 # Run AWQ test # docker exec cpu-test-"$NUMA_NODE" bash -c " # set -e - # VLLM_USE_V1=0 pytest -s -v \ + # VLLM_USE_V1=0 pytest -x -s -v \ # tests/quantization/test_ipex_quant.py" # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + pytest -x -s -v \ tests/lora/test_qwen2vl.py" # online serving diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 11e098f1d7bdb..28cfefac30ddb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -507,10 +507,10 @@ def merge_multimodal_embeddings( This updates ``inputs_embeds`` in place. 
""" if isinstance(placeholder_token_id, list): - placeholder_token_id = torch.tensor(placeholder_token_id, - pin_memory=True).to( - device=input_ids.device, - non_blocking=True) + placeholder_token_id = torch.tensor( + placeholder_token_id, + pin_memory=is_pin_memory_available()).to(device=input_ids.device, + non_blocking=True) return _merge_multimodal_embeddings( inputs_embeds, torch.isin(input_ids, placeholder_token_id), From a3432f18fdd85eb18e29fc32327507fe1063ad57 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Aug 2025 05:26:45 -0700 Subject: [PATCH 12/19] [BugFix][Spec Decode] Use float64 for uniform_probs (#23803) Signed-off-by: Woosuk Kwon --- examples/offline_inference/spec_decode.py | 2 +- vllm/v1/sample/rejection_sampler.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index c4972f02d0f8e..5af232cb6af6a 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -138,7 +138,7 @@ def main(): sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len) if not args.custom_mm_prompts: outputs = llm.generate( - TokensPrompt(prompt_token_ids=prompt_ids), + [TokensPrompt(prompt_token_ids=x) for x in prompt_ids], sampling_params=sampling_params, ) else: diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 2d9ce3101b6c9..511cdb3234253 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -365,9 +365,14 @@ def generate_uniform_probs( A tensor of shape `(num_tokens, )` containing uniform random values in the range [0, 1). """ + # NOTE(woosuk): We deliberately use float64 instead of float32 here + # because when using float32, there's a non-negligible chance that + # uniform_prob is sampled to be exact 0.0 as reported in + # https://github.com/pytorch/pytorch/issues/16706. 
Using float64 + # mitigates the issue. uniform_probs = torch.rand( (num_tokens, ), - dtype=torch.float32, + dtype=torch.float64, device=device, ) start_idx = 0 From bfab219648fdd6d398c09cd022117b0e663c9e36 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Thu, 28 Aug 2025 20:36:55 +0800 Subject: [PATCH 13/19] [Model] [gpt-oss] fix gpt-oss pp support (#23815) Signed-off-by: zjy0516 --- vllm/model_executor/models/gpt_oss.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 2b118d8491edd..e0b4df7728757 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -668,9 +668,8 @@ class GptOssForCausalLM(nn.Module, SupportsPP): positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: - assert intermediate_tensors is None - assert inputs_embeds is None - return self.model(input_ids, positions) + return self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: From d3da2eea546b33b9444519f99c26721f7344117f Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Thu, 28 Aug 2025 14:37:38 +0200 Subject: [PATCH 14/19] [Doc]: fix typos in Python scripts (#23828) Signed-off-by: Didier Durand --- vllm/compilation/backends.py | 4 ++-- vllm/config/cache.py | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/entrypoints/chat_utils.py | 2 +- vllm/entrypoints/openai/api_server.py | 2 +- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +- vllm/v1/cudagraph_dispatcher.py | 4 ++-- vllm/v1/worker/block_table.py | 2 +- vllm/v1/worker/cpu_model_runner.py | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git 
a/vllm/compilation/backends.py b/vllm/compilation/backends.py index fa86773d24743..3361b65a9b885 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule, outputs.append( SplitItem(name, graph_id, (graph_id in split_op_graphs), module)) - # sort by intetger graph_id, rather than string name + # sort by integer graph_id, rather than string name outputs.sort(key=lambda x: x.graph_id) return split_gm, outputs @@ -424,7 +424,7 @@ class VllmBackend: # if the model is initialized with a non-empty prefix, # then usually it's enough to use that prefix, - # e.g. launguage_model, vision_model, etc. + # e.g. language_model, vision_model, etc. # when multiple parts are initialized as independent # models, we need to use the model_tag to distinguish # them, e.g. backbone (default), eagle_head, etc. diff --git a/vllm/config/cache.py b/vllm/config/cache.py index a9550d4390ad6..3d2aa6b17be79 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -115,7 +115,7 @@ class CacheConfig: In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254), some layers can skip tokens corresponding to prefill. This flag enables - attention metadata for eligible layers to be overriden with metadata + attention metadata for eligible layers to be overridden with metadata necessary for implementing this optimization in some models (e.g. 
Gemma3n) """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e4d205aeb8633..7802802f138b7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1053,7 +1053,7 @@ class EngineArgs: self.trust_remote_code, self.revision, self.code_revision, self.config_format) - # if loading a SpeculatorsConfig, load the specualtive_config + # if loading a SpeculatorsConfig, load the speculative_config # details from the config directly # no user input required / expected if isinstance(hf_config, SpeculatorsConfig): diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 7b11a50642de9..1954cbcbf1edd 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC): def __init__(self) -> None: super().__init__() - # stores model placehodlers list with corresponding + # stores model placeholders list with corresponding # general MM placeholder: # { # "<##IMAGE##>": ["", "", ""], diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9a2470649c8d2..a28d38729f9f0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE: raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, detail="Missing 'method' in request body") # For security reason, only serialized string args/kwargs are passed. - # User-defined `method` is responsible for deseralization if needed. + # User-defined `method` is responsible for deserialization if needed. 
args: list[str] = body.get("args", []) kwargs: dict[str, str] = body.get("kwargs", {}) timeout: Optional[float] = body.get("timeout") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 230572041c80d..b07bf675ca47d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -579,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig): format = scheme_dict.get("format") # Find the sparsity scheme of the layer - # assume that fused layers inerhit first component's sparsity scheme + # assume that fused layers inherit first component's sparsity scheme sparsity_targets = (self.sparsity_scheme_map.keys() - set(self.sparsity_ignore_list)) sparsity_scheme: Optional[SparsityCompressionConfig] = None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index af9d1c46f68f4..2cad9ff0d321e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): ) -> "CompressedTensorsMoEMethod": # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. 
- # Check if a using "Linear" to select scheems + # Check if a using "Linear" to select schemes if "Linear" in quant_config.target_scheme_map: matched_target = "Linear" else: diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 02e65820b7c00..3b4f1d20b64fb 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -11,7 +11,7 @@ logger = init_logger(__name__) class CudagraphDispatcher: """ - Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs. + Runtime cudagraph dispatcher to dispatch keys for multiple set of cudagraphs. The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one for FULL cudagraph runtime mode. The keys are initialized depending on @@ -21,7 +21,7 @@ class CudagraphDispatcher: At runtime, the dispatch method generates the runtime cudagraph mode (FULL, PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor) - based on the input key. After dispatching (commuicate via forward context), + based on the input key. After dispatching (communicate via forward context), the cudagraph wrappers will trust the dispatch key to do either capturing or replaying (if mode matched), or pass through to the underlying runnable without cudagraph (if mode no match or mode is NONE). 
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 5662fc350e198..6ab5ce2748a4a 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -110,7 +110,7 @@ class BlockTable: self.block_table_cpu.fill_(0) def get_device_tensor(self) -> torch.Tensor: - """Ruturns the device tensor of the block table.""" + """Returns the device tensor of the block table.""" return self.block_table def get_cpu_tensor(self) -> torch.Tensor: diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 742e553b77e09..7d0726112704a 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner): Args: scheduler_output: The scheduler output. """ - # Attention free models have zero kv_cache_goups, however models + # Attention free models have zero kv_cache_groups, however models # like Mamba are also attention free but use the kv_cache for # keeping its internal state. This is why we check the number # of kv_cache groups instead of solely checking From 66548f66031006ca873ac799d2dc8497fec33339 Mon Sep 17 00:00:00 2001 From: "YUQI.CHENG" <420985011@qq.com> Date: Thu, 28 Aug 2025 21:44:09 +0800 Subject: [PATCH 15/19] [Bugfix] Fix benchmark_moe.py for blockwise fp8. (#23823) Signed-off-by: crischeng <420985011@qq.com> Co-authored-by: cris --- benchmarks/kernels/benchmark_moe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 752c2d0082167..710d30adfd846 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -419,8 +419,10 @@ class BenchmarkWorker: ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. 
+ block_n = block_quant_shape[0] if block_quant_shape else None + block_k = block_quant_shape[1] if block_quant_shape else None op_config = get_moe_configs( - num_experts, shard_intermediate_size // 2, dtype_str + num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k ) if op_config is None: config = get_default_config( @@ -430,6 +432,7 @@ class BenchmarkWorker: hidden_size, topk, dtype_str, + block_quant_shape, ) else: config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] From 1f096f9b9536aa8f520f89b178a518da294a7dce Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 28 Aug 2025 15:52:01 +0200 Subject: [PATCH 16/19] [CI] Fix linting error on main (#23835) Signed-off-by: Thomas Parnell --- vllm/v1/cudagraph_dispatcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 3b4f1d20b64fb..d2db7dcb3f091 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -11,7 +11,8 @@ logger = init_logger(__name__) class CudagraphDispatcher: """ - Runtime cudagraph dispatcher to dispatch keys for multiple set of cudagraphs. + Runtime cudagraph dispatcher to dispatch keys for multiple set of + cudagraphs. The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one for FULL cudagraph runtime mode. 
The keys are initialized depending on From 95089607fa307c5facfb9706ea919292fb56e78c Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Thu, 28 Aug 2025 21:56:20 +0800 Subject: [PATCH 17/19] [Model][gpt-oss] Support DP+EP for GPT-OSS with FlashInfer trtllm-gen MoE (#23819) Signed-off-by: Po-Han Huang --- vllm/model_executor/layers/fused_moe/config.py | 15 ++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 8 ++++---- vllm/model_executor/layers/quantization/mxfp4.py | 6 ++---- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index cab610decf901..0b501cd87fb5d 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -190,12 +190,6 @@ class FusedMoEParallelConfig: return (self.use_all2all_kernels and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency") - @property - def use_flashinfer_cutlass_kernels(self): - return (envs.VLLM_USE_FLASHINFER_MOE_FP4 - and has_flashinfer_cutlass_fused_moe() - and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput") - @staticmethod def make(tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig": @@ -404,7 +398,14 @@ class FusedMoEConfig: @property def use_flashinfer_cutlass_kernels(self): - return self.moe_parallel_config.use_flashinfer_cutlass_kernels + """ + Whether to use FlashInfer cutlass kernels for NVFP4 MoE. 
+ """ + return (self.quant_config is not None + and self.quant_config.quant_dtype == "nvfp4" + and envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutlass_fused_moe() + and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput") @staticmethod def make( diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b9de03ddd216e..28123d3958adc 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -920,7 +920,7 @@ class FusedMoE(CustomOp): self.batched_router_logits: Optional[torch.Tensor] = None if (self.moe_parallel_config.use_pplx_kernels or self.moe_parallel_config.use_deepep_ll_kernels - or self.moe_parallel_config.use_flashinfer_cutlass_kernels): + or self.moe_config.use_flashinfer_cutlass_kernels): self.batched_hidden_states = torch.zeros( (moe.max_num_tokens, self.hidden_size), dtype=moe.in_dtype, @@ -974,7 +974,7 @@ class FusedMoE(CustomOp): @property def use_flashinfer_cutlass_kernels(self): - return self.moe_parallel_config.use_flashinfer_cutlass_kernels + return self.moe_config.use_flashinfer_cutlass_kernels def update_expert_map(self): # ep_size and ep_rank should already be updated @@ -1665,7 +1665,7 @@ class FusedMoE(CustomOp): # only when data parallelism (DP) is enabled. 
use_flashinfer_cutlass_kernels = ( self.dp_size > 1 - and self.moe_parallel_config.use_flashinfer_cutlass_kernels) + and self.moe_config.use_flashinfer_cutlass_kernels) if (self.moe_parallel_config.use_pplx_kernels or self.moe_parallel_config.use_deepep_ll_kernels or use_flashinfer_cutlass_kernels): @@ -1674,7 +1674,7 @@ class FusedMoE(CustomOp): do_naive_dispatch_combine: bool = ( self.dp_size > 1 and not self.moe_parallel_config.use_deepep_ht_kernels - and not self.moe_parallel_config.use_flashinfer_cutlass_kernels) + and not self.moe_config.use_flashinfer_cutlass_kernels) if do_naive_dispatch_combine: hidden_states, router_logits = get_ep_group().dispatch( hidden_states, router_logits) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 6724796904f01..f7d591328f93a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -623,8 +623,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): if should_use_flashinfer_mxfp4(): from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe - assert not self.moe.use_ep, ( - "EP is not supported for flashinfer mxfp4 moe backend yet.") if _should_use_flashinfer_mxfp4_bf16(): assert x.dtype == torch.bfloat16 x_quant = x @@ -650,12 +648,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): None, # output1_scale_scalar None, # output1_scale_gate_scalar None, # output2_scale_scalar - self.num_experts, + global_num_experts, top_k, None, # n_group None, # topk_group self.intermediate_size, # padded to multiple of 256 - 0, # local_expert_offset + layer.ep_rank * layer.local_num_experts, # local_expert_offset self.num_experts, # local num experts None, self._get_tile_tokens_dim(x, top_k), From db74d604900d397e4ee524f93bcb256537679ce4 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Thu, 28 Aug 2025 08:25:56 -0700 Subject: [PATCH 18/19] [Bugfix] Add fake mode around passes (#23349) Signed-off-by: angelayi --- 
vllm/compilation/activation_quant_fusion.py | 2 + vllm/compilation/collective_fusion.py | 6 ++ vllm/compilation/fusion.py | 2 + vllm/compilation/fusion_attn.py | 71 ++++++++++----------- vllm/compilation/inductor_pass.py | 20 ++++++ vllm/compilation/sequence_parallelism.py | 2 + 6 files changed, 64 insertions(+), 39 deletions(-) diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py index ce4e50a2b02d1..826014f770df3 100644 --- a/vllm/compilation/activation_quant_fusion.py +++ b/vllm/compilation/activation_quant_fusion.py @@ -10,6 +10,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform +from .inductor_pass import enable_fake_mode from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) @@ -61,6 +62,7 @@ class ActivationQuantFusionPass(VllmInductorPass): https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 """ + @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 0c545d8cffd24..7a99aaff707dc 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -19,6 +19,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +from .inductor_pass import enable_fake_mode from .vllm_inductor_pass import VllmInductorPass FP8_DTYPE = current_platform.fp8_dtype() @@ -349,6 +350,7 @@ class AllGatherCutlassScaledMMPattern(BasePattern): class AsyncTPPass(VllmInductorPass): + @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) @@ -1121,6 +1123,10 @@ class AllReduceFusionPass(VllmInductorPass): # in fallback path, when we don't use flashinfer fuse_rms_quant=config.compilation_config.pass_config.enable_fusion) + self.register_patterns() + + @enable_fake_mode + def 
register_patterns(self): for epsilon in [1e-5, 1e-6]: AllReduceFusedRMSNormStaticQuantFP8Pattern( epsilon, diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 0d8d562514e31..afa739c966a5b 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from .fx_utils import find_getitem_maybe +from .inductor_pass import enable_fake_mode from .multi_output_match import MultiOutputMatch from .vllm_inductor_pass import VllmInductorPass @@ -528,6 +529,7 @@ class FusionPass(VllmInductorPass): cls._instance.pass_config = config.compilation_config.pass_config return cls._instance + @enable_fake_mode def __init__(self, config: VllmConfig): assert self.__class__._instance is None, \ "FusionPass singleton instance already exists" diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index f942afe6a28ee..3095f17110fde 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -7,8 +7,6 @@ import torch import torch._inductor.pattern_matcher as pm from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass -from torch._subclasses.fake_tensor import (FakeTensorMode, - unset_fake_temporarily) from vllm.attention import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config @@ -19,6 +17,7 @@ from vllm.platforms import current_platform from vllm.utils import round_up from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 +from .inductor_pass import enable_fake_mode from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) @@ -139,24 +138,21 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): output_block_scale=None) return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size]) - # Need custom fake mode, otherwise tracing 
happens with real tensors. - # That would not work for the unified_attention custom op. - with unset_fake_temporarily(), FakeTensorMode(): - inputs = [ - empty_bf16(5, self.num_heads, self.head_size), # q - empty_bf16(5, self.num_heads, self.head_size), # k - empty_bf16(5, self.num_heads, self.head_size), # v - empty_bf16(5, self.num_heads, self.head_size), # attn_output - self.empty_quant(5, self.num_heads * - self.head_size), # quant_output - empty_fp32(1, 1) # scale - ] + inputs = [ + empty_bf16(5, self.num_heads, self.head_size), # q + empty_bf16(5, self.num_heads, self.head_size), # k + empty_bf16(5, self.num_heads, self.head_size), # v + empty_bf16(5, self.num_heads, self.head_size), # attn_output + self.empty_quant(5, + self.num_heads * self.head_size), # quant_output + empty_fp32(1, 1) # scale + ] - pm.register_replacement( - pattern, replacement, inputs, - AttentionQuantPattern.wrap_trace_fn( - AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only), - pm_pass) + pm.register_replacement( + pattern, replacement, inputs, + AttentionQuantPattern.wrap_trace_fn( + AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only), + pm_pass) class AttentionNvfp4QuantPattern(AttentionQuantPattern): @@ -219,27 +215,23 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): [-1, self.num_heads * self.head_size // 2]) return output, at2[2] - # Need custom fake mode, otherwise tracing happens with real tensors. - # That would not work for the unified_attention custom op. 
- with unset_fake_temporarily(), FakeTensorMode(): - inputs = [ - empty_bf16(5, self.num_heads, self.head_size), # q - empty_bf16(5, self.num_heads, self.head_size), # k - empty_bf16(5, self.num_heads, self.head_size), # v - empty_bf16(5, self.num_heads, self.head_size), # output_attn - self.empty_quant(5, self.num_heads * self.head_size // - 2), # output_quant - empty_i32(128, - round_up(self.num_heads * self.head_size // 16, - 4)), # output_scale - empty_fp32(1, 1), # input_scale - ] + inputs = [ + empty_bf16(5, self.num_heads, self.head_size), # q + empty_bf16(5, self.num_heads, self.head_size), # k + empty_bf16(5, self.num_heads, self.head_size), # v + empty_bf16(5, self.num_heads, self.head_size), # output_attn + self.empty_quant(5, self.num_heads * self.head_size // + 2), # output_quant + empty_i32(128, round_up(self.num_heads * self.head_size // 16, + 4)), # output_scale + empty_fp32(1, 1), # input_scale + ] - pm.register_replacement( - pattern, replacement, inputs, - AttentionQuantPattern.wrap_trace_fn( - AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only), - pm_pass) + pm.register_replacement( + pattern, replacement, inputs, + AttentionQuantPattern.wrap_trace_fn( + AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only), + pm_pass) class AttnFusionPass(VllmInductorPass): @@ -255,6 +247,7 @@ class AttnFusionPass(VllmInductorPass): support are attention kernels, which need to support fusing output quant. 
""" + @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 2a149c65b3877..e1b691df385d7 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools import hashlib import inspect import json @@ -10,6 +11,8 @@ from typing import Any, Callable, Optional, Union import torch from torch import fx +from torch._subclasses.fake_tensor import (FakeTensorMode, + unset_fake_temporarily) from vllm.utils import is_torch_equal_or_newer @@ -114,3 +117,20 @@ class CallableInductorPass(InductorPass): def uuid(self) -> Any: return self._uuid + + +def enable_fake_mode(fn: Callable[..., Any]) -> Callable[..., Any]: + """ + Applies a FakeTensorMode context. This is useful when you don't want to + create or run things with real tensors. + """ + + @functools.wraps(fn) + def fn_new(*args, **kwargs) -> Any: + with torch._guards.tracing( + None), unset_fake_temporarily(), FakeTensorMode(): + result = fn(*args, **kwargs) + + return result + + return fn_new diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index ebc025cba71ed..1758ed4c86d27 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -14,6 +14,7 @@ from vllm.distributed.parallel_state import ( from vllm.logger import init_logger from vllm.platforms import current_platform +from .inductor_pass import enable_fake_mode from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) @@ -436,6 +437,7 @@ class SequenceParallelismPass(VllmInductorPass): performance. 
""" + @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) From 0583578f42fb23cc8a6d612e041c6be402551282 Mon Sep 17 00:00:00 2001 From: Jean Schmidt <4520845+jeanschmidt@users.noreply.github.com> Date: Thu, 28 Aug 2025 17:59:19 +0200 Subject: [PATCH 19/19] [ci] breaks down V1 Test into 3 groups of approx 30 minutes runtime (#23757) Signed-off-by: Jean Schmidt --- .buildkite/test-pipeline.yaml | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index cf90505257e90..24cc57e9dfb97 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -234,7 +234,26 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test +- label: V1 Test e2e + engine + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test entrypoints + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -242,8 +261,6 @@ steps: commands: # split the test to avoid interference - pytest -v -s v1/core - - pytest -v -s v1/engine - - pytest -v -s v1/entrypoints - pytest -v -s v1/executor - pytest -v -s v1/sample - pytest -v -s v1/logits_processors @@ -256,9 +273,6 @@ steps: - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_metrics_reader.py - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e # Integration test for streaming correctness (requires special branch). 
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine