diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 2fd7265fa5366..250a64fdd071c 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -44,6 +44,5 @@ docker run \ pytest -v -s v1/structured_output pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py - pytest -v -s v1/test_metrics pytest -v -s v1/test_serial_utils.py ' diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index ffc3abd389653..49ea39cad5128 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils + +# install nixl from source code +RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/" + ENTRYPOINT ["vllm", "serve"] diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 5d52400e50bc6..d14b631aa9364 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -10,7 +10,6 @@ wheel jinja2>=3.1.6 datasets # for benchmark scripts numba == 0.61.2 # Required for N-gram speculative decoding -nixl==0.3.0 # for PD disaggregation torch==2.8.0+xpu torchaudio torchvision diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index c903e3f1d3f18..c808b01d2e94b 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -135,6 +135,7 @@ def build_and_install_prerequisites(args): "--enable-devel-headers", "--with-verbs", "--enable-mt", + "--with-ze=no", ] run_command(configure_command, cwd=ucx_source_path) run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index e0c8a6605b7d4..b75b52938839b 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -54,6 +54,14 @@ class XPUPlatform(Platform): has_sink: bool, use_sparse, ) -> str: + from vllm.v1.attention.backends.utils import set_kv_cache_layout + + set_kv_cache_layout("NHD") + logger.info( + "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " + "only NHD layout is supported by XPU attention kernels." + ) + from vllm.attention.backends.registry import _Backend if use_sparse: @@ -190,13 +198,6 @@ class XPUPlatform(Platform): vllm_config.scheduler_config.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS, ) - from vllm.v1.attention.backends.utils import set_kv_cache_layout - - set_kv_cache_layout("NHD") - logger.info( - "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " - "only NHD layout is supported by XPU attention kernels." - ) @classmethod def support_hybrid_kv_cache(cls) -> bool: