[XPU] Update latest IPEX 2.8 release (#27735)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
commit b5bae42f91 (parent d7fb10c574)
@@ -20,7 +20,10 @@ trap remove_docker_container EXIT

 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
     --net=host \
+    --ipc=host \
+    --privileged \
+    -v /dev/dri/by-path:/dev/dri/by-path \
     --entrypoint="" \
     -e "HF_TOKEN=${HF_TOKEN}" \
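The CI run now maps the DRI device node explicitly, bind-mounts /dev/dri/by-path (PCI-path symlinks to the card/render nodes), and adds --ipc=host/--privileged, apparently so the container can see and open the XPU devices. A minimal in-container sanity check (a hypothetical snippet, not part of the script):

```python
import os

# List the device nodes and by-path symlinks the new docker flags expose.
print(sorted(os.listdir("/dev/dri")))          # expect card*/renderD* nodes
print(sorted(os.listdir("/dev/dri/by-path")))  # pci-*-card/-render symlinks
```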
@@ -42,7 +45,7 @@ docker run \
 pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
 pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
 pytest -v -s v1/structured_output
-pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
 pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
 pytest -v -s v1/test_serial_utils.py
 '
@@ -56,8 +56,10 @@ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
 docker run -it \
     --rm \
     --network=host \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    --ipc=host \
+    --privileged \
     vllm-xpu-env
 ```
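The documented docker run gets the same device-passthrough flags as the CI script. A quick way to confirm the device is usable from inside vllm-xpu-env (a sketch, assuming the image ships PyTorch with the XPU backend enabled):

```python
import torch

# Fails fast if the --device/--privileged flags did not expose an XPU.
assert torch.xpu.is_available(), "no XPU visible inside the container"
print(torch.xpu.device_count(), torch.xpu.get_device_name(0))
```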
@@ -15,4 +15,4 @@ torchaudio
 torchvision
 --extra-index-url=https://download.pytorch.org/whl/xpu

-intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
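The pin moves the IPEX wheel from 2.8.10.post0+xpu to 2.8.10.post1+xpu. A post-install check (a minimal sketch):

```python
import intel_extension_for_pytorch as ipex

# Should report the bumped wheel after reinstalling requirements/xpu.txt.
print(ipex.__version__)  # expected: 2.8.10.post1+xpu
```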
@@ -151,7 +151,9 @@ class ipex_ops:
     def rms_norm(
         input: torch.Tensor, weight: torch.Tensor, epsilon: float
     ) -> torch.Tensor:
-        return ipex.llm.functional.rms_norm(input, weight, epsilon)
+        out = torch.empty_like(input)
+        torch.ops.torch_ipex.rms_norm_vllm(out, input.contiguous(), weight, epsilon)
+        return out

     @staticmethod
     def fused_add_rms_norm(
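rms_norm now allocates the output itself and calls the torch_ipex.rms_norm_vllm kernel instead of ipex.llm.functional.rms_norm. For reference, a plain-PyTorch sketch of the semantics the kernel is expected to match (rms_norm_ref is a hypothetical helper, assuming the standard RMSNorm definition vLLM uses):

```python
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, epsilon: float) -> torch.Tensor:
    # Normalize each row by its root mean square over the last dim, then scale.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + epsilon)).to(x.dtype) * weight
```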
@@ -160,10 +162,7 @@ class ipex_ops:
         weight: torch.Tensor,
         epsilon: float,
     ) -> None:
-        tmp = ipex.llm.functional.add_rms_norm(
-            residual, input, weight, None, epsilon, True
-        )
-        input.copy_(tmp)
+        torch.ops.torch_ipex.fused_add_rms_norm_vllm(input, residual, weight, epsilon)

     @staticmethod
     def varlen_attention(
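fused_add_rms_norm likewise drops the ipex.llm.functional path for a single fused kernel call. A sketch of the in-place contract, assuming the usual vLLM convention that both tensors are updated (fused_add_rms_norm_ref is hypothetical):

```python
import torch

def fused_add_rms_norm_ref(
    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, epsilon: float
) -> None:
    # residual <- x + residual, then x <- RMSNorm(residual) * weight, both in place.
    residual.add_(x)
    variance = residual.float().pow(2).mean(dim=-1, keepdim=True)
    x.copy_((residual.float() * torch.rsqrt(variance + epsilon)).to(x.dtype) * weight)
```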
@@ -296,16 +295,6 @@ class ipex_ops:
         num_splits=0,
         s_aux: torch.Tensor | None = None,
     ):
-        if cu_seqlens_k is None:
-            # cu_seqlens_k is not used in ipex kernel.
-            cu_seqlens_k = torch.cumsum(seqused_k, dim=0)
-            cu_seqlens_k = torch.cat(
-                [
-                    torch.tensor([0], device=seqused_k.device, dtype=torch.int32),
-                    cu_seqlens_k,
-                ]
-            ).to(torch.int32)
-
         real_window_size: tuple[int, int]
         if window_size is None:
             real_window_size = (-1, -1)
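The removed fallback built cu_seqlens_k (cumulative sequence offsets) from seqused_k (per-sequence KV lengths). For reference, the conversion it performed, extracted into a standalone sketch:

```python
import torch

def cu_seqlens_from_seqused(seqused_k: torch.Tensor) -> torch.Tensor:
    # Exclusive prefix sums of per-sequence KV lengths, as int32 on the same
    # device, e.g. [3, 5, 2] -> [0, 3, 8, 10].
    zero = torch.zeros(1, device=seqused_k.device, dtype=torch.int32)
    return torch.cat([zero, torch.cumsum(seqused_k, dim=0)]).to(torch.int32)
```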
@@ -318,7 +307,7 @@ class ipex_ops:
             k,
             v,
             cu_seqlens_q,
-            cu_seqlens_k,
+            seqused_k,
             max_seqlen_q,
             max_seqlen_k,
             softmax_scale,
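Note: consistent with dropping the Python-side conversion above, the call site now passes seqused_k (per-sequence KV lengths) where cu_seqlens_k used to go; presumably the varlen attention kernel in IPEX 2.8.10.post1 consumes per-sequence lengths directly.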