[V0 deprecation] Deprecate use_v1 parameter (#28112)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan 2025-11-12 22:03:52 +08:00 committed by GitHub
parent a9d18b5107
commit 10138c92a5
8 changed files with 31 additions and 35 deletions
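
For an out-of-tree platform plugin, the migration requested by the new deprecation warning is simply dropping use_v1 from the get_attn_backend_cls override. The sketch below is illustrative only: MyPlatform and the returned backend path are hypothetical placeholders, not part of this commit.

# Hypothetical plugin platform after this change: the use_v1 parameter is
# removed from the get_attn_backend_cls override.
from vllm.platforms.interface import Platform


class MyPlatform(Platform):
    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend,
        head_size,
        dtype,
        kv_cache_dtype,
        block_size,
        use_mla,
        has_sink,
        use_sparse,
    ) -> str:
        # Return the dotted path of the plugin's attention backend class;
        # the path below is a placeholder.
        return "my_plugin.attention.MyAttentionBackend"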

@@ -27,7 +27,6 @@ class DummyPlatform(Platform):
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
         use_mla,
         has_sink,
         use_sparse,

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import inspect
 import os
 from collections.abc import Generator
 from contextlib import contextmanager
@@ -141,17 +142,35 @@ def _cached_get_attn_backend(
     # get device-specific attn_backend
     from vllm.platforms import current_platform

-    attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend,
-        head_size,
-        dtype,
-        kv_cache_dtype,
-        block_size,
-        True,
-        use_mla,
-        has_sink,
-        use_sparse,
-    )
+    sig = inspect.signature(current_platform.get_attn_backend_cls)
+    if "use_v1" in sig.parameters:
+        logger.warning_once(
+            "use_v1 parameter for get_attn_backend_cls is deprecated and will "
+            "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please "
+            "remove it from your plugin code."
+        )
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            True,  # use_v1
+            use_mla,
+            has_sink,
+            use_sparse,
+        )
+    else:
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            use_mla,
+            has_sink,
+            use_sparse,
+        )
     if not attention_cls:
         raise ValueError(
             f"Invalid attention backend for {current_platform.device_name}"

@@ -131,7 +131,6 @@ class CpuPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -144,8 +143,6 @@ class CpuPlatform(Platform):
             raise NotImplementedError("MLA is not supported on CPU.")
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on CPU.")
-        if not use_v1:
-            raise ValueError("CPU backend only supports V1.")
         return AttentionBackendEnum.CPU_ATTN.get_path()

     @classmethod

@@ -336,17 +336,10 @@ class CudaPlatformBase(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
         block_size: int | None,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
     ) -> str:
-        if not use_v1:
-            raise RuntimeError(
-                "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-                "to select a supported backend."
-            )
         device_capability = cls.get_device_capability()
         assert device_capability is not None

@@ -215,7 +215,6 @@ class Platform:
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,

@@ -213,7 +213,6 @@ class RocmPlatform(Platform):
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
         use_mla,
         has_sink,
         use_sparse,
@@ -224,12 +223,6 @@ class RocmPlatform(Platform):
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on ROCm.")
-        if not use_v1:
-            raise RuntimeError(
-                "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-                "to select a supported backend."
-            )
         if use_mla:
             if selected_backend is None:
                 selected_backend = (

@@ -58,7 +58,6 @@ class TpuPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink,
         use_sparse,
@@ -70,8 +69,6 @@ class TpuPlatform(Platform):
         if selected_backend != AttentionBackendEnum.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
-        if not use_v1:
-            raise ValueError("TPU backend only supports V1.")
         logger.info("Using Pallas V1 backend.")
         return AttentionBackendEnum.PALLAS.get_path()

@@ -48,7 +48,6 @@ class XPUPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse,
@@ -76,7 +75,7 @@ class XPUPlatform(Platform):
         elif selected_backend:
             raise ValueError(
                 f"Invalid attention backend for {cls.device_name}, "
-                f"with use_v1: {use_v1} use_mla: {use_mla}"
+                f"with use_mla: {use_mla}"
             )
         logger.info("Using Flash Attention backend.")