From 81c57f60a2c77d169dbec021bb58a467edf580f6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 9 Aug 2025 08:03:45 +0800 Subject: [PATCH] [XPU] upgrade torch to 2.8 for XPU (#22300) Signed-off-by: Kunshang Ji --- docker/Dockerfile.xpu | 17 +++++++++++------ requirements/xpu.txt | 11 +++-------- vllm/plugins/__init__.py | 9 --------- vllm/v1/worker/xpu_worker.py | 2 +- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 7d5a589eb1d7d..65d2e5036b783 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -1,9 +1,12 @@ -# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. -FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base +FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base RUN rm /etc/apt/sources.list.d/intel-graphics.list -RUN apt-get update -y && \ +RUN apt clean && apt-get update -y && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install -y python3.10 python3.10-distutils && \ + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ ffmpeg \ @@ -14,11 +17,13 @@ RUN apt-get update -y && \ libgl1 \ lsb-release \ numactl \ - python3 \ - python3-dev \ - python3-pip \ + python3.10-dev \ wget + +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 + WORKDIR /workspace/vllm COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt COPY requirements/common.txt /workspace/vllm/requirements/common.txt diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 0d95dc57152de..4607c3efdf14c 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ 
-10,15 +10,10 @@ wheel jinja2>=3.1.6 datasets # for benchmark scripts numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding - -torch==2.7.0+xpu +--extra-index-url=https://download.pytorch.org/whl/xpu +torch==2.8.0+xpu torchaudio torchvision pytorch-triton-xpu ---extra-index-url=https://download.pytorch.org/whl/xpu - -# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu -# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. -intel-extension-for-pytorch==2.7.10+xpu -oneccl_bind_pt==2.7.0+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +intel-extension-for-pytorch==2.8.10+xpu diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 51c78ddc1a9d5..1a1760df82c03 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -4,8 +4,6 @@ import logging from typing import Any, Callable -import torch - import vllm.envs as envs logger = logging.getLogger(__name__) @@ -68,13 +66,6 @@ def load_general_plugins(): return plugins_loaded = True - # some platform-specific configurations - from vllm.platforms import current_platform - - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 - torch._dynamo.config.disable = True - plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP) # general plugins, we only need to execute the loaded functions for func in plugins.values(): diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 2a7e0625b2f87..134d839252653 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -152,7 +152,7 @@ class XPUWorker(Worker): raise RuntimeError( f"Not support device type: {self.device_config.device}") - ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "drmfd") + ENV_CCL_ZE_IPC_EXCHANGE = 
os.getenv("CCL_ZE_IPC_EXCHANGE", "pidfd") ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", str(self.parallel_config.world_size))