mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-21 03:17:00 +08:00
[XPU] Update latest IPEX 2.8 release (#27735)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
parent
d7fb10c574
commit
b5bae42f91
@ -20,7 +20,10 @@ trap remove_docker_container EXIT
|
|||||||
|
|
||||||
# Run the image and test offline inference/tensor parallel
|
# Run the image and test offline inference/tensor parallel
|
||||||
docker run \
|
docker run \
|
||||||
--device /dev/dri \
|
--device /dev/dri:/dev/dri \
|
||||||
|
--net=host \
|
||||||
|
--ipc=host \
|
||||||
|
--privileged \
|
||||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||||
--entrypoint="" \
|
--entrypoint="" \
|
||||||
-e "HF_TOKEN=${HF_TOKEN}" \
|
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||||
@ -42,7 +45,7 @@ docker run \
|
|||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
pytest -v -s v1/structured_output
|
pytest -v -s v1/structured_output
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||||
pytest -v -s v1/test_serial_utils.py
|
pytest -v -s v1/test_serial_utils.py
|
||||||
'
|
'
|
||||||
|
|||||||
@ -56,8 +56,10 @@ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
|
|||||||
docker run -it \
|
docker run -it \
|
||||||
--rm \
|
--rm \
|
||||||
--network=host \
|
--network=host \
|
||||||
--device /dev/dri \
|
--device /dev/dri:/dev/dri \
|
||||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||||
|
--ipc=host \
|
||||||
|
--privileged \
|
||||||
vllm-xpu-env
|
vllm-xpu-env
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -15,4 +15,4 @@ torchaudio
|
|||||||
torchvision
|
torchvision
|
||||||
--extra-index-url=https://download.pytorch.org/whl/xpu
|
--extra-index-url=https://download.pytorch.org/whl/xpu
|
||||||
|
|
||||||
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
|
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
|
||||||
|
|||||||
@ -151,7 +151,9 @@ class ipex_ops:
|
|||||||
def rms_norm(
|
def rms_norm(
|
||||||
input: torch.Tensor, weight: torch.Tensor, epsilon: float
|
input: torch.Tensor, weight: torch.Tensor, epsilon: float
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
return ipex.llm.functional.rms_norm(input, weight, epsilon)
|
out = torch.empty_like(input)
|
||||||
|
torch.ops.torch_ipex.rms_norm_vllm(out, input.contiguous(), weight, epsilon)
|
||||||
|
return out
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def fused_add_rms_norm(
|
def fused_add_rms_norm(
|
||||||
@ -160,10 +162,7 @@ class ipex_ops:
|
|||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
epsilon: float,
|
epsilon: float,
|
||||||
) -> None:
|
) -> None:
|
||||||
tmp = ipex.llm.functional.add_rms_norm(
|
torch.ops.torch_ipex.fused_add_rms_norm_vllm(input, residual, weight, epsilon)
|
||||||
residual, input, weight, None, epsilon, True
|
|
||||||
)
|
|
||||||
input.copy_(tmp)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def varlen_attention(
|
def varlen_attention(
|
||||||
@ -296,16 +295,6 @@ class ipex_ops:
|
|||||||
num_splits=0,
|
num_splits=0,
|
||||||
s_aux: torch.Tensor | None = None,
|
s_aux: torch.Tensor | None = None,
|
||||||
):
|
):
|
||||||
if cu_seqlens_k is None:
|
|
||||||
# cu_seqlens_k is not used in ipex kernel.
|
|
||||||
cu_seqlens_k = torch.cumsum(seqused_k, dim=0)
|
|
||||||
cu_seqlens_k = torch.cat(
|
|
||||||
[
|
|
||||||
torch.tensor([0], device=seqused_k.device, dtype=torch.int32),
|
|
||||||
cu_seqlens_k,
|
|
||||||
]
|
|
||||||
).to(torch.int32)
|
|
||||||
|
|
||||||
real_window_size: tuple[int, int]
|
real_window_size: tuple[int, int]
|
||||||
if window_size is None:
|
if window_size is None:
|
||||||
real_window_size = (-1, -1)
|
real_window_size = (-1, -1)
|
||||||
@ -318,7 +307,7 @@ class ipex_ops:
|
|||||||
k,
|
k,
|
||||||
v,
|
v,
|
||||||
cu_seqlens_q,
|
cu_seqlens_q,
|
||||||
cu_seqlens_k,
|
seqused_k,
|
||||||
max_seqlen_q,
|
max_seqlen_q,
|
||||||
max_seqlen_k,
|
max_seqlen_k,
|
||||||
softmax_scale,
|
softmax_scale,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user