From 3bc18127ff1c644257abcf84a1a56fab8c0d3f0c Mon Sep 17 00:00:00 2001
From: Chaojun Zhang
Date: Thu, 18 Sep 2025 12:30:10 +0800
Subject: [PATCH] [XPU] Whisper model support on XPU Platform (#25123)

Signed-off-by: chzhang
---
 vllm/attention/layer.py | 4 ++--
 vllm/v1/worker/utils.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 22dc6dcbc8d6..15c0ce33e965 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -391,8 +391,8 @@ class MultiHeadAttention(nn.Module):
             backend = _Backend.FLASH_ATTN
             use_upstream_fa = True
 
-        if current_platform.is_rocm():
-            # currently, only torch_sdpa is supported on rocm
+        if current_platform.is_rocm() or current_platform.is_xpu():
+            # currently, only torch_sdpa is supported on rocm/xpu
             self.attn_backend = _Backend.TORCH_SDPA
         else:
 
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index fc831a73a75e..b76ac633892f 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -282,7 +282,7 @@ def bind_kv_cache(
         # TODO - analyze where runner_kv_caches is used and the right
         # way to ensure it properly reflects multiple attention layers
         # in the same decoder block.
-        if current_platform.is_cuda():
+        if current_platform.is_cuda() or current_platform.is_xpu():
            # We know that the GPU runner is not impacted by this
            # case. Some test code depends on runner_kv_caches, but
            # not in a way that's impacted by ignoring this.
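
Both hunks extend an existing platform check so that XPU takes the same path as ROCm (MultiHeadAttention backend selection) and CUDA (the runner_kv_caches special case in bind_kv_cache). The snippet below is a minimal, self-contained sketch of that selection pattern only; Platform, Backend, and select_mha_backend are simplified stand-ins invented for illustration, not vLLM's actual current_platform, _Backend, or layer code.

    # Minimal sketch of the platform-gated backend selection these hunks touch.
    # Platform and Backend are simplified stand-ins for vLLM's current_platform
    # and _Backend; the real logic lives in vllm/attention/layer.py.
    from enum import Enum, auto


    class Backend(Enum):
        FLASH_ATTN = auto()
        TORCH_SDPA = auto()


    class Platform:
        def __init__(self, name: str) -> None:
            self.name = name

        def is_cuda(self) -> bool:
            return self.name == "cuda"

        def is_rocm(self) -> bool:
            return self.name == "rocm"

        def is_xpu(self) -> bool:
            return self.name == "xpu"


    def select_mha_backend(platform: Platform) -> Backend:
        # With this patch, XPU joins ROCm in falling back to torch SDPA for
        # MultiHeadAttention, since FlashAttention is unavailable on both.
        if platform.is_rocm() or platform.is_xpu():
            return Backend.TORCH_SDPA
        return Backend.FLASH_ATTN


    if __name__ == "__main__":
        for name in ("cuda", "rocm", "xpu"):
            print(f"{name}: {select_mha_backend(Platform(name)).name}")

Running the sketch prints FLASH_ATTN for cuda and TORCH_SDPA for rocm and xpu, which mirrors the behavior change in the first hunk.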