From f1c2c20136cca6ea8798a64855eaf52ee9a42210 Mon Sep 17 00:00:00 2001
From: Yan Ma
Date: Tue, 23 Dec 2025 13:22:15 +0800
Subject: [PATCH] [XPU] decrease IGC_ForceOCLSIMDWidth for speculative
 decoding triton-xpu kernel compilation (#30538)

Signed-off-by: Yan Ma
---
 docker/Dockerfile.xpu   | 7 ++++++-
 docs/features/README.md | 2 +-
 vllm/platforms/xpu.py   | 4 +++-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 72d2053102c22..4168c1570d874 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-    add-apt-repository -y ppa:kobuk-team/intel-graphics
+    add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
 
 RUN apt clean && apt-get update -y && \
     apt-get install -y --no-install-recommends --fix-missing \
@@ -47,6 +47,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
     -r requirements/xpu.txt
 
+# arctic-inference is built from source, which requires torch-xpu to be
+# installed first; it provides the suffix method for speculative decoding
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --no-cache-dir arctic-inference==0.1.1
+
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
 
 COPY . .
diff --git a/docs/features/README.md b/docs/features/README.md
index e9e5232929b72..b9083b9993159 100644
--- a/docs/features/README.md
+++ b/docs/features/README.md
@@ -64,7 +64,7 @@ th:not(:first-child) {
 | [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
+| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
 | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) |
 | [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 2d67551eed9f6..2e39a216a10a0 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -156,7 +156,9 @@ class XPUPlatform(Platform):
 
         if vllm_config.lora_config is not None:
             compilation_config.mode = CompilationMode.NONE
-
+        # decrease SIMD width to reduce triton kernel compilation scratch space for speculative decoding
+        if vllm_config.speculative_config is not None:
+            os.environ["IGC_ForceOCLSIMDWidth"] = "16"  # noqa: SIM112
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
         # Only override worker_cls if it's still the default "auto"
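
A minimal end-to-end sketch of the path this patch exercises. This is not
part of the patch itself: the model name and generation settings are
illustrative assumptions, and the explicit env-var assignment only mirrors
what XPUPlatform now does automatically when a speculative config is set.

    import os

    # Mirrors the xpu.py change above: force a narrower SIMD width so IGC
    # needs less scratch space when compiling the triton-xpu speculative-
    # decoding kernels. The variable must be set before any Triton kernel
    # is compiled, which is why the platform hook applies it while the
    # engine config is still being checked.
    os.environ["IGC_ForceOCLSIMDWidth"] = "16"

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # assumption: any supported model
        speculative_config={"method": "suffix"},   # suffix decoding via arctic-inference
    )
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)

With the Dockerfile change in place, arctic-inference==0.1.1 is already
present in the image, so the suffix method resolves without extra setup.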