diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index 4f1ea1a0240c4..914ab91b1563c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -11,7 +11,6 @@ from typing import TYPE_CHECKING, Literal
 import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import get_current_vllm_config
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.logger import init_logger
@@ -251,9 +250,6 @@ class TpKVTopology:
             len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
         )
 
-        attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
-        self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
-
     @property
     def is_kv_layout_blocks_first(self) -> bool:
         return self._is_kv_layout_blocks_first
@@ -261,7 +257,7 @@ class TpKVTopology:
     @property
     def split_k_and_v(self) -> bool:
         # Whether to register regions for K and V separately (when present).
-        return not (self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first)
+        return not (self.is_mla or self.is_kv_layout_blocks_first)
 
     @property
     def tp_size(self) -> int:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
index 9a15d3fa6ed09..38ce02a2fef76 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
@@ -499,7 +499,6 @@ class MooncakeConnectorWorker:
             total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
             attn_backend=backend,
         )
-        self._use_pallas = self.kv_topo._use_pallas
 
         self.zmq_ctx = zmq.Context()
         self.async_zmq_ctx = zmq.asyncio.Context()
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 757ca41e9844b..0f33cde7d3221 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -983,7 +983,6 @@ class NixlConnectorWorker:
             total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
             attn_backend=backend,
         )
-        self._use_pallas = self.kv_topo._use_pallas
         self._physical_blocks_per_logical_kv_block = 1
 
     def _nixl_handshake(
@@ -1641,9 +1640,6 @@ class NixlConnectorWorker:
 
         # Num kv_heads > tp_size and P TP > D TP case, not supported
         assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id))
-        assert not self._use_pallas or tp_ratio == 1, (
-            "TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
-        )
 
         kv_cache_layout = (
             self.kv_cache_layout if not self.use_host_buffer
@@ -1814,9 +1810,7 @@ class NixlConnectorWorker:
 
         if len(self.device_kv_caches) == 0:
             return
-        split_k_and_v = not (
-            self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first
-        )
+        split_k_and_v = not (self.use_mla or self.kv_topo.is_kv_layout_blocks_first)
         sample_cache = list(self.device_kv_caches.values())[0][0]
         for block_size_ratio, block_ids_list in block_ids_per_ratio.items():
             assert block_size_ratio > 1, "Only nP < nD supported currently."
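
The net effect of this diff is that `split_k_and_v` no longer consults a Pallas/TPU flag and depends only on MLA and the blocks-first KV layout. The following is a minimal standalone sketch (not the vLLM classes themselves; `TopologySketch` is a hypothetical name introduced here for illustration) showing the simplified predicate after the change:

```python
# Minimal sketch, assuming only the two remaining inputs matter after this diff:
# MLA mode and the blocks-first KV-cache layout. The `_use_pallas` branch that
# previously forced K and V into a single region is gone.
from dataclasses import dataclass


@dataclass
class TopologySketch:
    is_mla: bool
    is_kv_layout_blocks_first: bool

    @property
    def split_k_and_v(self) -> bool:
        # Whether to register regions for K and V separately (when present).
        return not (self.is_mla or self.is_kv_layout_blocks_first)


if __name__ == "__main__":
    # Standard caches (non-MLA, blocks-last layout) register K and V separately.
    assert TopologySketch(False, False).split_k_and_v
    # MLA or blocks-first layouts keep K and V in a single region.
    assert not TopologySketch(True, False).split_k_and_v
    assert not TopologySketch(False, True).split_k_and_v
```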