diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 0556c191ddea6..09717a1121d05 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
 from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.logger import init_logger
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)
 
+logger = init_logger(__name__)
+
 if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                           ModelInputForGPUWithSamplingMetadata)
@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in FlashInfer is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index f948fbc0a1096..55b03bbf32e4b 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         super(AttentionImpl, self).__init__()
+        if use_irope:
+            logger.warning_once(
+                "Using irope in HPU is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.kv_cache_dtype = kv_cache_dtype
         self.num_heads = num_heads
         self.head_size = head_size
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index d3c61ea26a02a..99917a92af5f9 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -14,6 +14,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 _PARTITION_SIZE = 512
 
@@ -119,7 +122,12 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Ipex is not supported yet, it will fall"
+                " back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "IPEX backend does not support block-sparse attention.")
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 2ee66ab9e966e..91d20a4e7bfc0 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -11,6 +11,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType,
                                               is_quantized_kv_cache)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class PallasAttentionBackend(AttentionBackend):
@@ -105,7 +108,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 9a4ee2ae70612..ebeefdf4fd327 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -462,7 +462,12 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in ROCm Flash Attention is not supported yet, it "
+                "will fall back to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index afe2acff4ab3d..c1bd638f2605d 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -404,6 +404,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -411,6 +412,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         if logits_soft_cap is not None:
             logger.warning_once("Torch SPDA does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Torch SPDA is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 14c94c9ac4cab..cd152e57d7496 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -389,6 +389,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -396,6 +397,10 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         if logits_soft_cap is not None:
             logger.warning_once("XFormers does not support logits soft cap. "
                                 "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in XFormers is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index af729ee9910f4..3e8149a24ebf7 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -10,6 +10,9 @@ import torch_xla.experimental.custom_kernel  # noqa: F401
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class PallasAttentionBackend(AttentionBackend):
@@ -80,7 +83,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError("Paged attention Pallas kernel does "
                              "not support block-sparse attention.")