mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-26 15:49:11 +08:00
Remove the _use_pallas variable, as the PALLAS attention backend is deprecated
Signed-off-by: Wei-Yu Lin <weiyulin@google.com>
This commit is contained in:
parent
0f7ee9d247
commit
9aaed80cc8
@ -11,7 +11,6 @@ from typing import TYPE_CHECKING, Literal
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.abstract import AttentionBackend
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.config import get_current_vllm_config
|
||||
from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
|
||||
from vllm.logger import init_logger
|
||||
@ -251,9 +250,6 @@ class TpKVTopology:
|
||||
len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
|
||||
)
|
||||
|
||||
attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
|
||||
self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
|
||||
|
||||
@property
|
||||
def is_kv_layout_blocks_first(self) -> bool:
|
||||
return self._is_kv_layout_blocks_first
|
||||
@ -261,7 +257,7 @@ class TpKVTopology:
|
||||
@property
|
||||
def split_k_and_v(self) -> bool:
|
||||
# Whether to register regions for K and V separately (when present).
|
||||
return not (self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first)
|
||||
return not (self.is_mla or self.is_kv_layout_blocks_first)
|
||||
|
||||
@property
|
||||
def tp_size(self) -> int:
|
||||
|
||||
@ -499,7 +499,6 @@ class MooncakeConnectorWorker:
|
||||
total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
|
||||
attn_backend=backend,
|
||||
)
|
||||
self._use_pallas = self.kv_topo._use_pallas
|
||||
|
||||
self.zmq_ctx = zmq.Context()
|
||||
self.async_zmq_ctx = zmq.asyncio.Context()
|
||||
|
||||
@ -983,7 +983,6 @@ class NixlConnectorWorker:
|
||||
total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
|
||||
attn_backend=backend,
|
||||
)
|
||||
self._use_pallas = self.kv_topo._use_pallas
|
||||
self._physical_blocks_per_logical_kv_block = 1
|
||||
|
||||
def _nixl_handshake(
|
||||
@ -1641,9 +1640,6 @@ class NixlConnectorWorker:
|
||||
# Num kv_heads > tp_size and P TP > D TP case, not supported
|
||||
assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id))
|
||||
|
||||
assert not self._use_pallas or tp_ratio == 1, (
|
||||
"TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
|
||||
)
|
||||
kv_cache_layout = (
|
||||
self.kv_cache_layout
|
||||
if not self.use_host_buffer
|
||||
@ -1814,9 +1810,7 @@ class NixlConnectorWorker:
|
||||
|
||||
if len(self.device_kv_caches) == 0:
|
||||
return
|
||||
split_k_and_v = not (
|
||||
self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first
|
||||
)
|
||||
split_k_and_v = not (self.use_mla or self.kv_topo.is_kv_layout_blocks_first)
|
||||
sample_cache = list(self.device_kv_caches.values())[0][0]
|
||||
for block_size_ratio, block_ids_list in block_ids_per_ratio.items():
|
||||
assert block_size_ratio > 1, "Only nP < nD supported currently."
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user