diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 22dc6dcbc8d6..15c0ce33e965 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -391,8 +391,8 @@ class MultiHeadAttention(nn.Module):
             backend = _Backend.FLASH_ATTN
             use_upstream_fa = True
 
-        if current_platform.is_rocm():
-            # currently, only torch_sdpa is supported on rocm
+        if current_platform.is_rocm() or current_platform.is_xpu():
+            # currently, only torch_sdpa is supported on rocm/xpu
             self.attn_backend = _Backend.TORCH_SDPA
         else:
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index fc831a73a75e..b76ac633892f 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -282,7 +282,7 @@ def bind_kv_cache(
             # TODO - analyze where runner_kv_caches is used and the right
             # way to ensure it properly reflects multiple attention layers
             # in the same decoder block.
-            if current_platform.is_cuda():
+            if current_platform.is_cuda() or current_platform.is_xpu():
                 # We know that the GPU runner is not impacted by this
                 # case. Some test code depends on runner_kv_caches, but
                 # not in a way that's impacted by ignoring this.
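
For context, both hunks extend the same platform-dispatch pattern: platforms without a supported flash-attention path for `MultiHeadAttention` fall back to `TORCH_SDPA`, and XPU now joins ROCm in that fallback. Below is a minimal, self-contained sketch of that selection logic; the `Platform` class and `select_mha_backend` helper are illustrative stand-ins, not vLLM's actual classes.

```python
from dataclasses import dataclass
from enum import Enum, auto


class _Backend(Enum):
    # Mirrors the two backends referenced in the patch.
    TORCH_SDPA = auto()
    FLASH_ATTN = auto()


@dataclass
class Platform:
    """Illustrative stand-in for vllm.platforms.current_platform."""
    device_type: str

    def is_cuda(self) -> bool:
        return self.device_type == "cuda"

    def is_rocm(self) -> bool:
        return self.device_type == "rocm"

    def is_xpu(self) -> bool:
        return self.device_type == "xpu"


def select_mha_backend(platform: Platform, preferred: _Backend) -> _Backend:
    # After this patch, XPU joins ROCm in falling back to torch SDPA,
    # since neither currently has a supported flash-attention path for
    # MultiHeadAttention.
    if platform.is_rocm() or platform.is_xpu():
        return _Backend.TORCH_SDPA
    return preferred


if __name__ == "__main__":
    for device in ("cuda", "rocm", "xpu"):
        backend = select_mha_backend(Platform(device), _Backend.FLASH_ATTN)
        print(f"{device}: {backend.name}")
```

Running the sketch prints `FLASH_ATTN` for CUDA and `TORCH_SDPA` for ROCm and XPU, which is the behavior the first hunk introduces; the second hunk applies the same "treat XPU like the known-good GPU case" reasoning to the `runner_kv_caches` check in `bind_kv_cache`.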