From 970713d4a40b1d83244eb0ed4eb4d690b6bb4f14 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:34:08 +0000 Subject: [PATCH 1/6] Remove `SkipValidation` from `ModelConfig` (#30695) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 7ff095bcb9ccd..1de9d15cf8c52 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -8,7 +8,7 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, Literal, cast, get_args import torch -from pydantic import ConfigDict, SkipValidation, field_validator, model_validator +from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE from transformers.configuration_utils import ALLOWED_LAYER_TYPES @@ -109,7 +109,7 @@ class ModelConfig: """Convert the model using adapters defined in [vllm.model_executor.models.adapters][]. The most common use case is to adapt a text generation model to be used for pooling tasks.""" - tokenizer: SkipValidation[str] = None # type: ignore + tokenizer: str = Field(default=None) """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" tokenizer_mode: TokenizerMode | str = "auto" @@ -164,7 +164,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: SkipValidation[int] = None # type: ignore + max_model_len: int = Field(default=None, gt=0) """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -175,7 +175,7 @@ class ModelConfig: - 25.6k -> 25,600""" spec_target_max_model_len: int | None = None """Specify the maximum length for spec decoding draft models.""" - quantization: SkipValidation[QuantizationMethods | None] = None + quantization: QuantizationMethods | str | None = None """Method used to quantize the weights. If `None`, we first check the `quantization_config` attribute in the model config file. If that is `None`, we assume the model weights are not quantized and use `dtype` to @@ -597,6 +597,14 @@ class ModelConfig: self._verify_cuda_graph() self._verify_bnb_config() + @field_validator("tokenizer", "max_model_len", mode="wrap") + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + """Skip validation if the value is `None` when initialisation is delayed.""" + if value is None: + return value + return handler(value) + @field_validator("tokenizer_mode", mode="after") def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str: return tokenizer_mode.lower() @@ -610,13 +618,14 @@ class ModelConfig: @model_validator(mode="after") def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": + """Called after __post_init__""" if not isinstance(self.tokenizer, str): raise ValueError( f"tokenizer must be a string, got " f"{type(self.tokenizer).__name__}: {self.tokenizer!r}. " "Please provide a valid tokenizer path or HuggingFace model ID." ) - if not isinstance(self.max_model_len, int) or self.max_model_len <= 0: + if not isinstance(self.max_model_len, int): raise ValueError( f"max_model_len must be a positive integer, " f"got {type(self.max_model_len).__name__}: {self.max_model_len!r}. " From ec154c36ee74f35def28e4ddc1c16a0dc7a8c112 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 16 Dec 2025 01:36:07 +0800 Subject: [PATCH 2/6] [Platform] Refactor Platform attention backend selection to avoid breakpoint for OOT platform (#30212) Signed-off-by: Isotr0py Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/attention/selector.py | 59 +++++++++++++++++------------ vllm/platforms/cpu.py | 15 ++------ vllm/platforms/cuda.py | 74 ++++++++----------------------------- vllm/platforms/interface.py | 12 +----- vllm/platforms/rocm.py | 22 +++++------ vllm/platforms/tpu.py | 13 ++----- vllm/platforms/xpu.py | 15 ++------ 7 files changed, 73 insertions(+), 137 deletions(-) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index bbf95ff009001..e66f698add99d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -2,11 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import cache -from typing import cast, get_args +from typing import NamedTuple, cast, get_args import torch -from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.abstract import AttentionBackend, AttentionType from vllm.attention.backends.registry import ( MAMBA_TYPE_TO_BACKEND_MAP, MambaAttentionBackendEnum, @@ -18,6 +18,31 @@ from vllm.utils.import_utils import resolve_obj_by_qualname logger = init_logger(__name__) +class AttentionSelectorConfig(NamedTuple): + head_size: int + dtype: torch.dtype + kv_cache_dtype: CacheDType | None + block_size: int | None + use_mla: bool = False + has_sink: bool = False + use_sparse: bool = False + use_mm_prefix: bool = False + attn_type: str = AttentionType.DECODER + + def __repr__(self): + return ( + f"AttentionSelectorConfig(head_size={self.head_size}, " + f"dtype={self.dtype}, " + f"kv_cache_dtype={self.kv_cache_dtype}, " + f"block_size={self.block_size}, " + f"use_mla={self.use_mla}, " + f"has_sink={self.has_sink}, " + f"use_sparse={self.use_sparse}, " + f"use_mm_prefix={self.use_mm_prefix}, " + f"attn_type={self.attn_type})" + ) + + def get_attn_backend( head_size: int, dtype: torch.dtype, @@ -43,8 +68,7 @@ def get_attn_backend( vllm_config = get_current_vllm_config() backend_enum = vllm_config.attention_config.backend - return _cached_get_attn_backend( - backend=backend_enum, + attn_selector_config = AttentionSelectorConfig( head_size=head_size, dtype=dtype, kv_cache_dtype=cast(CacheDType | None, kv_cache_dtype), @@ -53,36 +77,25 @@ def get_attn_backend( has_sink=has_sink, use_sparse=use_sparse, use_mm_prefix=use_mm_prefix, - attn_type=attn_type, + attn_type=attn_type or AttentionType.DECODER, + ) + + return _cached_get_attn_backend( + backend=backend_enum, + attn_selector_config=attn_selector_config, ) @cache def _cached_get_attn_backend( backend, - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: CacheDType | None, - block_size: int | None, - use_mla: bool = False, - has_sink: bool = False, - use_sparse: bool = False, - use_mm_prefix: bool = False, - attn_type: str | None = None, + attn_selector_config: AttentionSelectorConfig, ) -> type[AttentionBackend]: from vllm.platforms import current_platform attention_cls = current_platform.get_attn_backend_cls( backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type, + attn_selector_config=attn_selector_config, ) if not attention_cls: raise ValueError( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index d961dcf13e53e..e1b461d79a655 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -23,6 +23,7 @@ from .interface import CpuArchEnum, Platform, PlatformEnum logger = init_logger(__name__) if TYPE_CHECKING: + from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig else: VllmConfig = None @@ -126,21 +127,13 @@ class CpuPlatform(Platform): def get_attn_backend_cls( cls, selected_backend: "AttentionBackendEnum", - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: str | None, - block_size: int, - use_mla: bool, - has_sink: bool, - use_sparse: bool, - use_mm_prefix: bool, - attn_type: str | None = None, + attn_selector_config: "AttentionSelectorConfig", ) -> str: if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN: logger.info("Cannot use %s backend on CPU.", selected_backend) - if use_mla: + if attn_selector_config.use_mla: raise NotImplementedError("MLA is not supported on CPU.") - if use_sparse: + if attn_selector_config.use_sparse: raise NotImplementedError("Sparse Attention is not supported on CPU.") return AttentionBackendEnum.CPU_ATTN.get_path() diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index ad5a6789b2023..2dc4ba5d70cac 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -14,7 +14,6 @@ from typing_extensions import ParamSpec # import custom ops, trigger op registration import vllm._C # noqa -from vllm.attention.backends.abstract import AttentionType from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.import_utils import import_pynvml @@ -23,6 +22,7 @@ from vllm.utils.torch_utils import cuda_device_count_stateless from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: + from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig from vllm.config.cache import CacheDType else: @@ -258,16 +258,8 @@ class CudaPlatformBase(Platform): @classmethod def get_valid_backends( cls, - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - device_capability, - attn_type, + device_capability: DeviceCapability, + attn_selector_config: "AttentionSelectorConfig", ) -> tuple[ list[tuple["AttentionBackendEnum", int]], dict["AttentionBackendEnum", list[str]], @@ -275,21 +267,15 @@ class CudaPlatformBase(Platform): valid_backends_priorities = [] invalid_reasons = {} - backend_priorities = _get_backend_priorities(use_mla, device_capability) + backend_priorities = _get_backend_priorities( + attn_selector_config.use_mla, device_capability + ) for priority, backend in enumerate(backend_priorities): try: backend_class = backend.get_class() invalid_reasons_i = backend_class.validate_configuration( - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - device_capability, - attn_type, + device_capability=device_capability, + **attn_selector_config._asdict(), ) except ImportError: invalid_reasons_i = ["ImportError"] @@ -304,37 +290,19 @@ class CudaPlatformBase(Platform): def get_attn_backend_cls( cls, selected_backend: "AttentionBackendEnum", - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: "CacheDType | None", - block_size: int | None, - use_mla: bool, - has_sink: bool, - use_sparse: bool, - use_mm_prefix: bool, - attn_type: str | None = None, + attn_selector_config: "AttentionSelectorConfig", ) -> str: - if attn_type is None: - attn_type = AttentionType.DECODER - device_capability = cls.get_device_capability() assert device_capability is not None + attn_selector_config = attn_selector_config._replace(block_size=None) # First try checking just the selected backend, if there is one. if selected_backend is not None: try: backend_class = selected_backend.get_class() invalid_reasons = backend_class.validate_configuration( - head_size, - dtype, - kv_cache_dtype, - None, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - device_capability, - attn_type, + device_capability=device_capability, + **attn_selector_config._asdict(), ) except ImportError: invalid_reasons = ["ImportError"] @@ -350,16 +318,8 @@ class CudaPlatformBase(Platform): # No selected backend or the selected backend is invalid, # so we try finding a valid backend. valid_backends_priorities, invalid_reasons = cls.get_valid_backends( - head_size, - dtype, - kv_cache_dtype, - None, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - device_capability, - attn_type, + device_capability=device_capability, + attn_selector_config=attn_selector_config, ) reasons_str = ( "{" @@ -369,11 +329,7 @@ class CudaPlatformBase(Platform): ) + "}" ) - config_str = ( - f"head_size: {head_size}, dtype: {dtype}, " - f"kv_cache_dtype: {kv_cache_dtype}, block_size: {block_size}, " - f"use_mla: {use_mla}, has_sink: {has_sink}, use_sparse: {use_sparse}" - ) + config_str = attn_selector_config.__repr__() logger.debug_once( f"Some attention backends are not valid for {cls.device_name} with " f"{config_str}. Reasons: {reasons_str}." diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 9788e5b564165..d4b40045df384 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -18,8 +18,8 @@ from vllm.logger import init_logger if TYPE_CHECKING: from torch.distributed import PrefixStore, ProcessGroup + from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig - from vllm.config.cache import CacheDType from vllm.inputs import ProcessorInputs, PromptType from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -226,15 +226,7 @@ class Platform: def get_attn_backend_cls( cls, selected_backend: "AttentionBackendEnum", - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: "CacheDType | None", - block_size: int, - use_mla: bool, - has_sink: bool, - use_sparse: bool, - use_mm_prefix: bool, - attn_type: str | None = None, + attn_selector_config: "AttentionSelectorConfig", ) -> str: """Get the attention backend class of a device.""" return "" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index b90fb3686c280..e469a928da229 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -15,6 +15,7 @@ from vllm.utils.torch_utils import cuda_device_count_stateless from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: + from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig logger = init_logger(__name__) @@ -190,21 +191,16 @@ class RocmPlatform(Platform): @classmethod def get_attn_backend_cls( cls, - selected_backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type: str | None = None, + selected_backend: "AttentionBackendEnum", + attn_selector_config: "AttentionSelectorConfig", ) -> str: from vllm._aiter_ops import rocm_aiter_ops - if use_sparse: - if kv_cache_dtype.startswith("fp8"): + block_size = attn_selector_config.block_size + kv_cache_dtype = attn_selector_config.kv_cache_dtype + + if attn_selector_config.use_sparse: + if kv_cache_dtype and kv_cache_dtype.startswith("fp8"): raise ValueError( "ROCMAiterMLASparseBackend doesn't support fp8 kv_cache_dtype." ) @@ -214,7 +210,7 @@ class RocmPlatform(Platform): logger.info_once("Using Sparse MLA backend on V1 engine.") return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path() - if use_mla: + if attn_selector_config.use_mla: if selected_backend is None: selected_backend = ( AttentionBackendEnum.ROCM_AITER_MLA diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 50de87098f05c..7c479bf2b6a0e 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -16,6 +16,7 @@ from .interface import Platform, PlatformEnum if TYPE_CHECKING: from typing import TypeAlias + from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig from vllm.config.cache import BlockSize from vllm.pooling_params import PoolingParams @@ -57,17 +58,9 @@ class TpuPlatform(Platform): def get_attn_backend_cls( cls, selected_backend: "AttentionBackendEnum", - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: str | None, - block_size: int, - use_mla: bool, - has_sink: bool, - use_sparse: bool, - use_mm_prefix: bool, - attn_type: str | None = None, + attn_selector_config: "AttentionSelectorConfig", ) -> str: - if use_sparse: + if attn_selector_config.use_sparse: raise NotImplementedError("Sparse Attention is not supported on TPU.") if selected_backend != AttentionBackendEnum.PALLAS: logger.info("Cannot use %s backend on TPU.", selected_backend) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index c1ec2d41c73b0..af8979af36643 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -14,6 +14,7 @@ from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: + from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig else: VllmConfig = None @@ -42,15 +43,7 @@ class XPUPlatform(Platform): def get_attn_backend_cls( cls, selected_backend: "AttentionBackendEnum", - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: str | None, - block_size: int, - use_mla: bool, - has_sink: bool, - use_sparse: bool, - use_mm_prefix: bool, - attn_type: str | None = None, + attn_selector_config: "AttentionSelectorConfig", ) -> str: from vllm.v1.attention.backends.utils import set_kv_cache_layout @@ -60,7 +53,7 @@ class XPUPlatform(Platform): "only NHD layout is supported by XPU attention kernels." ) - if use_sparse: + if attn_selector_config.use_sparse: raise NotImplementedError("Sparse Attention is not supported on XPU.") if selected_backend == AttentionBackendEnum.TRITON_ATTN: logger.info_once("Using Triton backend.") @@ -71,7 +64,7 @@ class XPUPlatform(Platform): elif selected_backend: raise ValueError( f"Invalid attention backend for {cls.device_name}, " - f"with use_mla: {use_mla}" + f"with use_mla: {attn_selector_config.use_mla}" ) logger.info("Using Flash Attention backend.") From 51e5b3e3c422cdd81e3c1bd2b9abd025e53ae986 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 15 Dec 2025 14:45:21 -0500 Subject: [PATCH 3/6] [Bugfix] Fix ViT with FlashAttention on ROCm (#30703) Signed-off-by: Matthew Bonanni --- vllm/attention/layer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 47daf6d138431..7ef77db8fbb5b 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -464,7 +464,10 @@ class MultiHeadAttention(nn.Module): } self.fa_version = None - if self.attn_backend == AttentionBackendEnum.FLASH_ATTN: + if ( + self.attn_backend == AttentionBackendEnum.FLASH_ATTN + and current_platform.is_cuda() + ): self.fa_version = get_flash_attn_version() assert self._flash_attn_varlen_func is not None self._flash_attn_varlen_func = functools.partial( From b2191abdcae73fe80f751b463c488159f4dd08a4 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Mon, 15 Dec 2025 19:46:25 +0000 Subject: [PATCH 4/6] [docs][fix] Update Arm CPU vLLM wheel installation docs (#30594) Signed-off-by: Fadi Arafeh --- .../installation/cpu.arm.inc.md | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index ad9c7d9ef21be..657bf2509db01 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -16,15 +16,15 @@ vLLM offers basic model inferencing and serving on Arm CPU platform, with suppor # --8<-- [start:pre-built-wheels] Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels contain pre-compiled C++ binaries. -Please replace `` in the commands below with a specific version string (e.g., `0.11.2`). ```bash -uv pip install --pre vllm==+cpu --extra-index-url https://wheels.vllm.ai/%2Bcpu/ +export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu ``` ??? console "pip" ```bash - pip install --pre vllm==+cpu --extra-index-url https://wheels.vllm.ai/%2Bcpu/ + pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu ``` The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. @@ -35,20 +35,28 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe * `https://wheels.vllm.ai/nightly/cpu/vllm` -To install from nightly index, copy the link address of the `*.whl` under this index to run, for example: - +To install from nightly index, run: ```bash -uv pip install -U https://wheels.vllm.ai/c756fb678184b867ed94e5613a529198f1aee423/vllm-0.13.0rc2.dev11%2Bgc756fb678.cpu-cp38-abi3-manylinux_2_31_aarch64.whl # current nightly build (the filename will change!) +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu ``` +??? console "pip (there's a caveat)" + + Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). + + If you insist on using `pip`, you have to specify the full URL (link address) of the wheel file (which can be obtained from https://wheels.vllm.ai/nightly/cpu/vllm). + + ```bash + pip install https://wheels.vllm.ai/4fa7ce46f31cbd97b4651694caf9991cc395a259/vllm-0.13.0rc2.dev104%2Bg4fa7ce46f.cpu-cp38-abi3-manylinux_2_35_aarch64.whl # current nightly build (the filename will change!) + ``` + **Install specific revisions** -If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), specify the full commit hash in the index: -https://wheels.vllm.ai/${VLLM_COMMIT}/cpu/vllm . -Then, copy the link address of the `*.whl` under this index to run: +If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```bash -uv pip install -U +export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu ``` # --8<-- [end:pre-built-wheels] @@ -103,10 +111,10 @@ Testing has been conducted on AWS Graviton3 instances for compatibility. See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image. Stable vLLM Docker images are being pre-built for Arm from version 0.12.0. Available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo). -Please replace `` in the command below with a specific version string (e.g., `0.12.0`). ```bash -docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v +export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') +docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${VLLM_VERSION} ``` You can also access the latest code with Docker images. These are not intended for production use and are meant for CI and testing only. They will expire after several days. From a450c64a30ab6d450b23587611a726af965618b1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 15 Dec 2025 15:18:02 -0500 Subject: [PATCH 5/6] [Bugfix] Fail instead of ignoring when CompilationConfig gets invalid args (#30708) Signed-off-by: mgoin --- tests/benchmarks/test_param_sweep.py | 8 -------- vllm/config/compilation.py | 8 ++++---- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/benchmarks/test_param_sweep.py b/tests/benchmarks/test_param_sweep.py index 0d47cfd9d6230..467797d9915c9 100644 --- a/tests/benchmarks/test_param_sweep.py +++ b/tests/benchmarks/test_param_sweep.py @@ -23,14 +23,6 @@ class TestParameterSweepItem: {"compilation_config.use_inductor_graph_partition": True}, "--compilation-config.use_inductor_graph_partition=true", ), - ( - {"compilation_config.use_inductor": False}, - "--compilation-config.use_inductor=false", - ), - ( - {"compilation_config.use_inductor": True}, - "--compilation-config.use_inductor=true", - ), ], ) def test_nested_boolean_params(self, input_dict, expected): diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 568a01bd9db91..1fdb843e1a7c7 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -8,7 +8,7 @@ from dataclasses import field from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal -from pydantic import Field, TypeAdapter, field_validator +from pydantic import ConfigDict, Field, TypeAdapter, field_validator from pydantic.dataclasses import dataclass import vllm.envs as envs @@ -96,7 +96,7 @@ class CUDAGraphMode(enum.Enum): @config -@dataclass +@dataclass(config=ConfigDict(extra="forbid")) class PassConfig: """Configuration for custom Inductor passes. @@ -251,7 +251,7 @@ class DynamicShapesType(str, enum.Enum): @config -@dataclass +@dataclass(config=ConfigDict(extra="forbid")) class DynamicShapesConfig: """Configuration to control/debug torch compile dynamic shapes.""" @@ -290,7 +290,7 @@ class DynamicShapesConfig: @config -@dataclass +@dataclass(config=ConfigDict(extra="forbid")) class CompilationConfig: """Configuration for compilation. From 60dbf7d8f13689b17c88840f3ae4e7a222305f2b Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 15 Dec 2025 15:24:16 -0500 Subject: [PATCH 6/6] Update batch invariant to use attention config (#30704) Signed-off-by: Matthew Bonanni Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/batch_invariant.py | 39 +++++++++++-------- vllm/v1/worker/gpu_worker.py | 3 +- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 4f31e5afa1ac9..fde0826779eb1 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -6,7 +6,7 @@ from typing import Any import torch -import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.triton_utils import tl, triton @@ -1004,27 +1004,30 @@ def vllm_is_batch_invariant() -> bool: return VLLM_BATCH_INVARIANT -def override_envs_for_invariance(): - curr_attn_backend = envs.VLLM_ATTENTION_BACKEND +def override_envs_for_invariance( + attention_backend: AttentionBackendEnum | None, +): supported_backends = [ - "FLASH_ATTN", # best supported backend - "FLASHINFER", - "FLASH_ATTN_MLA", - "TRITON_MLA", + AttentionBackendEnum.FLASH_ATTN, # best supported backend + AttentionBackendEnum.FLASHINFER, + AttentionBackendEnum.FLASH_ATTN_MLA, + AttentionBackendEnum.TRITON_MLA, # Not yet supported MLA backends - # "FLASHMLA", - # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance - # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 + # AttentionBackendEnum.FLASHMLA, + # AttentionBackendEnum.FLEX_ATTENTION, # IMA issue + # AttentionBackendEnum.FLASHINFER_MLA, # PR #28967 ] - if curr_attn_backend not in supported_backends: + if attention_backend not in supported_backends: + supported_names = [b.name for b in supported_backends] + backend_name = attention_backend.name if attention_backend else None error = ( "VLLM batch_invariant mode requires an attention backend in " - f"{supported_backends}, but got '{curr_attn_backend}'. " - "Please set the 'VLLM_ATTENTION_BACKEND' environment variable " - "to one of the supported backends before enabling batch_invariant." + f"{supported_names}, but got '{backend_name}'. " + "Please use --attention-backend or attention_config to set " + "one of the supported backends before enabling batch_invariant." ) raise RuntimeError(error) - if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]: + if attention_backend != supported_backends[0]: warning = ( "You are using a decode-invariant form of batch invariance. " "This will not be invariant between prefill and decode." @@ -1050,10 +1053,12 @@ def override_envs_for_invariance(): os.environ["VLLM_USE_AOT_COMPILE"] = "0" -def init_batch_invariance(): +def init_batch_invariance( + attention_backend: AttentionBackendEnum | None, +): # this will hit all the csrc overrides as well if vllm_is_batch_invariant(): - override_envs_for_invariance() + override_envs_for_invariance(attention_backend) enable_batch_invariant_mode() # Disable TF32 for batch invariance - it causes non-deterministic rounding diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 21a8564f83c40..1e13650cd083e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -931,10 +931,11 @@ def init_worker_distributed_environment( backend: str = "nccl", ) -> None: """Initialize the distributed environment.""" + attention_config = vllm_config.attention_config parallel_config = vllm_config.parallel_config from vllm.model_executor.layers.batch_invariant import init_batch_invariance - init_batch_invariance() + init_batch_invariance(attention_config.backend) set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_method = distributed_init_method or "env://"