[XPU] Whisper model support on XPU Platform (#25123)

Signed-off-by: chzhang <chaojun.zhang@intel.com>
Author: Chaojun Zhang <chaojun.zhang@intel.com>, 2025-09-18 12:30:10 +08:00 (committed by GitHub)
parent bec060fd99
commit 3bc18127ff
2 changed files with 3 additions and 3 deletions


@@ -391,8 +391,8 @@ class MultiHeadAttention(nn.Module):
             backend = _Backend.FLASH_ATTN
             use_upstream_fa = True
-        if current_platform.is_rocm():
-            # currently, only torch_sdpa is supported on rocm
+        if current_platform.is_rocm() or current_platform.is_xpu():
+            # currently, only torch_sdpa is supported on rocm/xpu
             self.attn_backend = _Backend.TORCH_SDPA
         else:

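Note: for readers unfamiliar with this code path, here is a minimal, self-contained sketch of the backend selection after this change. The _Backend and _Platform definitions below are hypothetical stubs standing in for vLLM's real _Backend enum and current_platform object, included only so the snippet runs on its own:

    from enum import Enum, auto

    class _Backend(Enum):
        # Hypothetical stub mirroring the subset of vLLM's _Backend used here.
        FLASH_ATTN = auto()
        TORCH_SDPA = auto()

    class _Platform:
        # Hypothetical stub standing in for vLLM's current_platform.
        def __init__(self, name: str) -> None:
            self.name = name

        def is_rocm(self) -> bool:
            return self.name == "rocm"

        def is_xpu(self) -> bool:
            return self.name == "xpu"

    def select_attn_backend(platform: _Platform) -> _Backend:
        # After this commit, XPU takes the same fallback path as ROCm:
        # MultiHeadAttention drops to torch SDPA because FLASH_ATTN
        # kernels are not available on these platforms.
        if platform.is_rocm() or platform.is_xpu():
            return _Backend.TORCH_SDPA
        return _Backend.FLASH_ATTN

    assert select_attn_backend(_Platform("xpu")) is _Backend.TORCH_SDPA
    assert select_attn_backend(_Platform("cuda")) is _Backend.FLASH_ATTN

Whisper's encoder attention presumably routes through MultiHeadAttention rather than the paged decoder attention, which would explain why this fallback is the piece needed for XPU support.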

@@ -282,7 +282,7 @@ def bind_kv_cache(
     # TODO - analyze where runner_kv_caches is used and the right
     # way to ensure it properly reflects multiple attention layers
     # in the same decoder block.
-    if current_platform.is_cuda():
+    if current_platform.is_cuda() or current_platform.is_xpu():
         # We know that the GPU runner is not impacted by this
         # case. Some test code depends on runner_kv_caches, but
         # not in a way that's impacted by ignoring this.
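
Note: to make this hunk's intent concrete, here is a hedged sketch of the guarded branch. The function below is a simplified stand-in, not vLLM's actual bind_kv_cache signature; the platform-name string and cache shapes are illustrative assumptions:

    import torch

    def bind_kv_cache_sketch(platform_name: str,
                             kv_caches: dict[str, torch.Tensor],
                             runner_kv_caches: list[torch.Tensor]) -> None:
        # Simplified stand-in for the guarded branch above: per-layer caches
        # are copied into the flat runner-level list only on platforms whose
        # runners are known not to be affected by the
        # multiple-attention-layers-per-block ambiguity noted in the TODO.
        # After this commit, that set is CUDA and XPU.
        if platform_name in ("cuda", "xpu"):
            runner_kv_caches.extend(kv_caches.values())

    caches = {"model.layers.0.self_attn": torch.zeros(2, 4)}
    runner: list[torch.Tensor] = []
    bind_kv_cache_sketch("xpu", caches, runner)
    assert len(runner) == 1  # XPU now behaves like CUDA here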