Remove the _use_pallas variable, as the PALLAS attention backend is deprecated

Signed-off-by: Wei-Yu Lin <weiyulin@google.com>
This commit is contained in:
Wei-Yu Lin 2025-12-22 23:42:26 +00:00
parent 0f7ee9d247
commit 9aaed80cc8
3 changed files with 2 additions and 13 deletions

View File

@@ -11,7 +11,6 @@ from typing import TYPE_CHECKING, Literal
import torch
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import get_current_vllm_config
from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
from vllm.logger import init_logger
@@ -251,9 +250,6 @@ class TpKVTopology:
len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
)
attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
@property
def is_kv_layout_blocks_first(self) -> bool:
return self._is_kv_layout_blocks_first
@@ -261,7 +257,7 @@ class TpKVTopology:
@property
def split_k_and_v(self) -> bool:
# Whether to register regions for K and V separately (when present).
return not (self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first)
return not (self.is_mla or self.is_kv_layout_blocks_first)
@property
def tp_size(self) -> int:

View File

@@ -499,7 +499,6 @@ class MooncakeConnectorWorker:
total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
attn_backend=backend,
)
self._use_pallas = self.kv_topo._use_pallas
self.zmq_ctx = zmq.Context()
self.async_zmq_ctx = zmq.asyncio.Context()

View File

@@ -983,7 +983,6 @@ class NixlConnectorWorker:
total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
attn_backend=backend,
)
self._use_pallas = self.kv_topo._use_pallas
self._physical_blocks_per_logical_kv_block = 1
def _nixl_handshake(
@@ -1641,9 +1640,6 @@ class NixlConnectorWorker:
# Num kv_heads > tp_size and P TP > D TP case, not supported
assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id))
assert not self._use_pallas or tp_ratio == 1, (
"TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
)
kv_cache_layout = (
self.kv_cache_layout
if not self.use_host_buffer
@@ -1814,9 +1810,7 @@ class NixlConnectorWorker:
if len(self.device_kv_caches) == 0:
return
split_k_and_v = not (
self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first
)
split_k_and_v = not (self.use_mla or self.kv_topo.is_kv_layout_blocks_first)
sample_cache = list(self.device_kv_caches.values())[0][0]
for block_size_ratio, block_ids_list in block_ids_per_ratio.items():
assert block_size_ratio > 1, "Only nP < nD supported currently."