From ed6cfb90c8ad13e77dcbfa0e211075a3e2f1ee7e Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Wed, 30 Apr 2025 15:03:58 +0800
Subject: [PATCH] [Hardware][Intel GPU] Upgrade to torch 2.7 (#17444)

Signed-off-by: Kunshang Ji
Co-authored-by: Qiming Zhang
---
 docker/Dockerfile.xpu                |  6 ------
 .../installation/gpu/xpu.inc.md      |  9 ---------
 requirements/xpu.txt                 |  6 +++---
 vllm/_ipex_ops.py                    | 18 +++++++++---------
 vllm/attention/backends/ipex_attn.py | 14 ++++++--------
 5 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index ad4abf16b43b..681102b9d18b 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -40,12 +40,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 setup.py install
 
-# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
-# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install intel-extension-for-pytorch==2.6.10+xpu \
-    --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
 CMD ["/bin/bash"]
 
 FROM vllm-base AS vllm-openai
diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
index fbf5421eeec5..4ab41a21c2a1 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -35,13 +35,6 @@ pip install -v -r requirements/xpu.txt
 VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
-- Finally, due to a known issue of conflict dependency(oneapi related) in torch-xpu 2.6 and ipex-xpu 2.6, we install ipex here. This will be fixed in the ipex-xpu 2.7.
-
-```console
-pip install intel-extension-for-pytorch==2.6.10+xpu \
-  --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-```
-
 :::{note}
 - FP16 is the default data type in the current XPU backend. The BF16 data type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
 :::
@@ -81,5 +74,3 @@ python -m vllm.entrypoints.openai.api_server \
 ```
 
 By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script.
-
-There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index 723ffcfc9393..d9f2c007e9fa 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -10,7 +10,7 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 
-torch==2.6.0+xpu
+torch==2.7.0+xpu
 torchaudio
 torchvision
 pytorch-triton-xpu
@@ -18,6 +18,6 @@ pytorch-triton-xpu
 
 # Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
 # FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
-# intel-extension-for-pytorch==2.6.10+xpu
-oneccl_bind_pt==2.6.0+xpu
+intel-extension-for-pytorch==2.7.10+xpu
+oneccl_bind_pt==2.7.0+xpu
 --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index c3d210c27cab..505ebec34d2f 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -177,6 +177,7 @@ class ipex_ops:
         out: torch.Tensor,
         seqlen_q: torch.Tensor,
         seqlen_k: torch.Tensor,
+        alibi_slopes: torch.Tensor,
         max_seqlen_q: int,
         max_seqlen_k: int,
         pdropout: float,
@@ -185,6 +186,8 @@ class ipex_ops:
         is_causal: bool,
         return_softmax: bool,
         gen_: torch.Generator,
+        window_size_left: float,
+        window_size_right: float,
         logits_soft_cap: float,
     ) -> None:
         if ipex.__version__.endswith("cpu"):
@@ -200,15 +203,12 @@ class ipex_ops:
                                                  is_causal, return_softmax,
                                                  gen_)
         else:  # XPU build
-            ipex.llm.functional.varlen_attention(query.contiguous(),
-                                                 key.contiguous(),
-                                                 value.contiguous(), out,
-                                                 seqlen_q.int(),
-                                                 seqlen_k.int(), max_seqlen_q,
-                                                 max_seqlen_k, pdropout,
-                                                 softmax_scale, zero_tensors,
-                                                 is_causal, return_softmax,
-                                                 gen_, logits_soft_cap)
+            ipex.llm.functional.varlen_attention(
+                query.contiguous(), key.contiguous(), value.contiguous(), out,
+                seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q,
+                max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal,
+                return_softmax, gen_, window_size_left, window_size_right,
+                logits_soft_cap)
 
     @staticmethod
     def reshape_and_cache(
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 27959caa651a..f322c7b3dd6a 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -143,10 +143,9 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        self.need_mask = (self.alibi_slopes is not None
-                          or self.sliding_window is not None)
+        self.need_mask = (self.sliding_window is not None)
         if logits_soft_cap is None:
-            logits_soft_cap = 0
+            logits_soft_cap = -1
         self.logits_soft_cap = logits_soft_cap
 
         supported_head_sizes = PagedAttention.get_supported_head_sizes()
@@ -234,11 +233,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
                                                 dim=1)
 
             if attn_metadata.attn_bias is None:
-                if self.alibi_slopes is not None:
-                    att_masks = _make_alibi_bias(
-                        self.alibi_slopes, query.dtype,
-                        attn_metadata.seq_lens)  # type: ignore
-                elif self.sliding_window is not None:
+                if self.sliding_window is not None:
                     att_masks = _make_sliding_window_bias(
                         attn_metadata.seq_lens, self.sliding_window,
                         query.dtype)  # type: ignore
@@ -258,6 +253,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
                     output,
                     attn_metadata.seqlen_q,
                     attn_metadata.seqlen_q,
+                    self.alibi_slopes,
                     attn_metadata.max_seqlen,
                     attn_metadata.max_seqlen,
                     pdropout=0.0,
@@ -266,6 +262,8 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
                     is_causal=True,
                     return_softmax=False,
                     gen_=None,
+                    window_size_left=-1,
+                    window_size_right=-1,
                     logits_soft_cap=self.logits_soft_cap,
                 )
             else:
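Note on the Dockerfile, docs and requirements hunks above: after this patch a single `pip install -v -r requirements/xpu.txt` is expected to pull torch 2.7.0+xpu, intel-extension-for-pytorch 2.7.10+xpu and oneccl_bind_pt 2.7.0+xpu from the Intel extra index, with no separate ipex install step. A minimal sanity-check sketch, not part of the patch; the expected version strings are assumptions taken from the pins above:

```python
# Quick post-install check on an Intel GPU host after installing requirements/xpu.txt.
import torch
import intel_extension_for_pytorch as ipex

print(torch.__version__)         # expected: 2.7.0+xpu
print(ipex.__version__)          # expected: 2.7.10+xpu
print(torch.xpu.is_available())  # should print True on a supported Intel GPU
```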
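The vllm/_ipex_ops.py and vllm/attention/backends/ipex_attn.py hunks change the XPU `varlen_attention` wrapper signature: ALiBi slopes are now passed to the kernel directly instead of being folded into an attention mask, new `window_size_left`/`window_size_right` parameters are threaded through (the backend passes -1 for both and still builds its own sliding-window mask), and an unset `logits_soft_cap` now defaults to -1 instead of 0. The sketch below mirrors the ipex_attn.py call site to show the new argument order; the tensor shapes, the "xpu" device placement and the dummy values are illustrative assumptions, and the call only works against an XPU build of intel-extension-for-pytorch 2.7.10:

```python
# Illustrative call into the updated wrapper; argument order follows the diff above.
import torch

from vllm._ipex_ops import ipex_ops

num_tokens, num_heads, head_size = 8, 4, 64
q = torch.randn(num_tokens, num_heads, head_size, device="xpu", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)
out = torch.empty_like(q)
# Cumulative sequence lengths for a single 8-token sequence.
seqlen = torch.tensor([0, num_tokens], device="xpu", dtype=torch.int32)

ipex_ops.varlen_attention(
    q, k, v, out,
    seqlen, seqlen,
    None,                    # alibi_slopes: new argument; None here, as in the non-ALiBi call site
    num_tokens, num_tokens,  # max_seqlen_q, max_seqlen_k
    pdropout=0.0,
    softmax_scale=head_size**-0.5,
    zero_tensors=False,
    is_causal=True,
    return_softmax=False,
    gen_=None,
    window_size_left=-1,     # new parameter; the backend passes -1
    window_size_right=-1,    # new parameter; the backend passes -1
    logits_soft_cap=-1,      # new default when unset (was 0 before this patch)
)
```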