[V0 deprecation] Deprecate use_v1 parameter (#28112)
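This change drops the use_v1 argument from get_attn_backend_cls across the in-tree platforms (CPU, CUDA, ROCm, TPU, XPU, the base Platform interface, and the dummy test plugin). The attention selector now inspects the platform's get_attn_backend_cls signature: if an out-of-tree plugin still declares use_v1, the selector keeps passing True for it and emits a deprecation warning asking that it be removed before v0.13.0 or v1.0.0, whichever comes first.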
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
parent a9d18b5107
commit 10138c92a5
@@ -27,7 +27,6 @@ class DummyPlatform(Platform):
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
         use_mla,
         has_sink,
         use_sparse,
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import inspect
 import os
 from collections.abc import Generator
 from contextlib import contextmanager
@@ -141,17 +142,35 @@ def _cached_get_attn_backend(
     # get device-specific attn_backend
     from vllm.platforms import current_platform

-    attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend,
-        head_size,
-        dtype,
-        kv_cache_dtype,
-        block_size,
-        True,
-        use_mla,
-        has_sink,
-        use_sparse,
-    )
+    sig = inspect.signature(current_platform.get_attn_backend_cls)
+    if "use_v1" in sig.parameters:
+        logger.warning_once(
+            "use_v1 parameter for get_attn_backend_cls is deprecated and will "
+            "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please "
+            "remove it from your plugin code."
+        )
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            True,  # use_v1
+            use_mla,
+            has_sink,
+            use_sparse,
+        )
+    else:
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            use_mla,
+            has_sink,
+            use_sparse,
+        )
     if not attention_cls:
         raise ValueError(
             f"Invalid attention backend for {current_platform.device_name}"
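The selector above chooses a call shape based on the plugin's declared signature. Below is a minimal, self-contained sketch of that dispatch pattern; old_get_cls, new_get_cls, and resolve_backend are hypothetical stand-ins for a plugin's get_attn_backend_cls override and are not vLLM APIs.

import inspect
import warnings


def old_get_cls(selected_backend, use_v1, use_mla):
    # Legacy plugin override: still declares the deprecated use_v1 parameter.
    return "plugin.attn.OldBackend"


def new_get_cls(selected_backend, use_mla):
    # Updated plugin override: use_v1 has been removed.
    return "plugin.attn.NewBackend"


def resolve_backend(get_cls, selected_backend, use_mla):
    # Dispatch on the declared signature, mirroring the selector shim above.
    sig = inspect.signature(get_cls)
    if "use_v1" in sig.parameters:
        warnings.warn(
            "use_v1 is deprecated; remove it from your plugin code.",
            DeprecationWarning,
            stacklevel=2,
        )
        return get_cls(selected_backend, use_v1=True, use_mla=use_mla)
    return get_cls(selected_backend, use_mla=use_mla)


print(resolve_backend(old_get_cls, None, use_mla=False))  # warns, legacy path
print(resolve_backend(new_get_cls, None, use_mla=False))  # no warning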
@@ -131,7 +131,6 @@ class CpuPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -144,8 +143,6 @@ class CpuPlatform(Platform):
             raise NotImplementedError("MLA is not supported on CPU.")
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on CPU.")
-        if not use_v1:
-            raise ValueError("CPU backend only supports V1.")
         return AttentionBackendEnum.CPU_ATTN.get_path()

     @classmethod
@@ -336,17 +336,10 @@ class CudaPlatformBase(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
         block_size: int | None,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
     ) -> str:
-        if not use_v1:
-            raise RuntimeError(
-                "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-                "to select a supported backend."
-            )
-
         device_capability = cls.get_device_capability()
         assert device_capability is not None

@@ -215,7 +215,6 @@ class Platform:
         dtype: torch.dtype,
         kv_cache_dtype: "CacheDType | None",
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
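For out-of-tree platforms, an override should now match the trimmed base signature above. A minimal sketch of an updated plugin override, assuming a hypothetical MyPlatform plugin (the class name and returned backend path are illustrative, not vLLM code):

class MyPlatform:
    # In a real plugin this would subclass vllm.platforms.interface.Platform;
    # trimmed here to the one method affected by this change.
    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend,
        head_size,
        dtype,
        kv_cache_dtype,
        block_size,
        use_mla,
        has_sink,
        use_sparse,
    ) -> str:
        # use_v1 no longer appears in the signature; V1 is implied.
        if use_sparse:
            raise NotImplementedError("Sparse attention is not supported here.")
        return "my_plugin.attention.MyAttentionBackend"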
@@ -213,7 +213,6 @@ class RocmPlatform(Platform):
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
         use_mla,
         has_sink,
         use_sparse,
@@ -224,12 +223,6 @@ class RocmPlatform(Platform):
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on ROCm.")

-        if not use_v1:
-            raise RuntimeError(
-                "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
-                "to select a supported backend."
-            )
-
         if use_mla:
             if selected_backend is None:
                 selected_backend = (
@@ -58,7 +58,6 @@ class TpuPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink,
         use_sparse,
@@ -70,8 +69,6 @@ class TpuPlatform(Platform):
         if selected_backend != AttentionBackendEnum.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)

-        if not use_v1:
-            raise ValueError("TPU backend only supports V1.")
         logger.info("Using Pallas V1 backend.")
         return AttentionBackendEnum.PALLAS.get_path()

@@ -48,7 +48,6 @@ class XPUPlatform(Platform):
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse,
@@ -76,7 +75,7 @@ class XPUPlatform(Platform):
         elif selected_backend:
             raise ValueError(
                 f"Invalid attention backend for {cls.device_name}, "
-                f"with use_v1: {use_v1} use_mla: {use_mla}"
+                f"with use_mla: {use_mla}"
             )

         logger.info("Using Flash Attention backend.")