From ff47aab05640394a513a5b2ac772a115ddc2e05a Mon Sep 17 00:00:00 2001
From: "Li, Jiang"
Date: Wed, 12 Mar 2025 18:41:13 +0800
Subject: [PATCH] [CPU] Upgrade CPU backend to torch-2.6 (#13381)

Signed-off-by: jiang1.li
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 .buildkite/run-cpu-test.sh                    | 8 +++++---
 Dockerfile.cpu                                | 2 +-
 cmake/cpu_extension.cmake                     | 2 +-
 requirements/cpu.txt                          | 2 +-
 tests/lora/test_qwen2vl.py                    | 2 +-
 vllm/attention/ops/ipex_attn.py               | 2 +-
 vllm/executor/multiproc_worker_utils.py       | 9 +++++----
 vllm/model_executor/layers/fused_moe/layer.py | 6 +++++-
 vllm/platforms/cpu.py                         | 3 +++
 9 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index f6dad818ddc05..e45e184852f29 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -19,13 +19,14 @@ remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
@@ -36,6 +37,7 @@ function cpu_tests() {
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pip install -r vllm/requirements/test.txt
+    pip install -r vllm/requirements/cpu.txt
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -85,4 +87,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 08a4e188f4c14..a10090529d8a9 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
 
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 
-RUN pip install intel_extension_for_pytorch==2.5.0
+RUN pip install intel_extension_for_pytorch==2.6.0
 
 WORKDIR /workspace
 
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index ca2ffb1bc3c8c..345b75d622331 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
     FetchContent_Declare(
         oneDNN
         GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.6
+        GIT_TAG v3.7.1
         GIT_PROGRESS TRUE
         GIT_SHALLOW TRUE
     )
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index ba059d3ff72ee..b4e6abb6e3d66 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -2,7 +2,7 @@
 -r common.txt
 
 # Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" and platform_machine != "s390x"
+torch==2.6.0+cpu; platform_machine == "x86_64"
 torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
 torch==2.7.0.dev20250304; platform_machine == "s390x"
 
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 90735d55be712..7bd3e3d0fe27f 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -12,7 +12,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 
-@pytest.fixture(autouse=True)
+@pytest.fixture(autouse=not current_platform.is_cpu())
 def v1(run_with_both_engines_lora):
     # Simple autouse wrapper to run both engines for each test
     # This can be promoted up to conftest.py to run for every
diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py
index 598ceea130d97..6d96f58320c84 100644
--- a/vllm/attention/ops/ipex_attn.py
+++ b/vllm/attention/ops/ipex_attn.py
@@ -17,7 +17,7 @@ class _PagedAttention:
 
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
-        return [32, 64, 80, 96, 112, 128, 256]
+        return [32, 64, 80, 96, 112, 128, 192, 256]
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 68a83bb610a49..74237f9eb451b 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -254,10 +254,11 @@ def _run_worker_process(
     # online (in situ) tuning is enabled.
     # Offline tuning API (record_untuned_is_enabled()) only
    # available in PyTorch 2.6 or later.
-    import torch.cuda.tunable as tunable
-    if (tunable.is_enabled() and tunable.tuning_is_enabled()
-            and not tunable.record_untuned_is_enabled()):
-        tunable.write_file()
+    if torch.cuda.is_available():
+        import torch.cuda.tunable as tunable
+        if (tunable.is_enabled() and tunable.tuning_is_enabled()
+                and not tunable.record_untuned_is_enabled()):
+            tunable.write_file()
 
     logger.info("Worker exiting")
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 51c4df9d4a5e2..2c5fa509c595d 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -193,10 +193,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
         **kwargs,
     ):
-        assert custom_routing_function is None
         assert activation == "silu", f"{activation} is not supported."
         return layer.ipex_fusion(
             x,
@@ -206,6 +207,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            custom_routing_function,
+            scoring_func,
+            e_score_correction_bias,
         )
 
     def forward_tpu(
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index ab8982a3a6e1c..140335dfb64a6 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -121,6 +121,9 @@ class CpuPlatform(Platform):
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
+        # MLA attention is not supported
+        os.environ["VLLM_MLA_DISABLE"] = "1"
+
         # Intel OpenMP setting
         ld_prealod_str = os.getenv("LD_PRELOAD", "")
         if "libiomp5.so" in ld_prealod_str: