[XPU] Upgrade NIXL to remove CUDA dependency (#26570)

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
2026-03-16 11:37:12 +08:00 · 2025-10-11 13:15:23 +08:00 · 2025-10-11 13:15:23 +08:00 · 27ed39a347
commit 27ed39a347
parent 8f8474fbe3
5 changed files with 14 additions and 9 deletions
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -44,6 +44,5 @@ docker run \
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_metrics
    pytest -v -s v1/test_serial_utils.py
 '
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 # install nixl from source code
 RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
 ENTRYPOINT ["vllm", "serve"]
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@ -10,7 +10,6 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 numba == 0.61.2 # Required for N-gram speculative decoding
 nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@ -135,6 +135,7 @@ def build_and_install_prerequisites(args):
        "--enable-devel-headers",
        "--with-verbs",
        "--enable-mt",
        "--with-ze=no",
    ]
    run_command(configure_command, cwd=ucx_source_path)
    run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@ -54,6 +54,14 @@ class XPUPlatform(Platform):
        has_sink: bool,
        use_sparse,
    ) -> str:
        from vllm.v1.attention.backends.utils import set_kv_cache_layout
        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
            "only NHD layout is supported by XPU attention kernels."
        )
        from vllm.attention.backends.registry import _Backend
        if use_sparse:
@ -190,13 +198,6 @@ class XPUPlatform(Platform):
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )
        from vllm.v1.attention.backends.utils import set_kv_cache_layout
        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
            "only NHD layout is supported by XPU attention kernels."
        )
    @classmethod
    def support_hybrid_kv_cache(cls) -> bool: