[CPU] Upgrade CPU backend to torch-2.6 (#13381)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Li, Jiang 2025-03-12 18:41:13 +08:00 committed by GitHub
parent debd6bbf09
commit ff47aab056
9 changed files with 23 additions and 13 deletions


@@ -19,13 +19,14 @@ remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
+export BUILDKITE_BUILD_NUMBER=$3
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
@@ -36,6 +37,7 @@ function cpu_tests() {
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install -r vllm/requirements/test.txt
+pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -85,4 +87,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"


@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
RUN echo 'ulimit -c 0' >> ~/.bashrc
-RUN pip install intel_extension_for_pytorch==2.5.0
+RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
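
Note: IPEX releases track the matching PyTorch minor version, so the pin above moves in lockstep with the torch 2.6 upgrade. A minimal sketch of checking that alignment at runtime (hypothetical helper, not part of this commit; assumes both packages are importable):

# Sketch: confirm intel_extension_for_pytorch matches the installed torch
# minor version (e.g. torch 2.6.x pairs with IPEX 2.6.x).
import torch
import intel_extension_for_pytorch as ipex

def major_minor(version: str) -> str:
    # Strip any local-version suffix such as "+cpu" before comparing.
    return ".".join(version.split("+")[0].split(".")[:2])

assert major_minor(torch.__version__) == major_minor(ipex.__version__), (
    f"torch {torch.__version__} and IPEX {ipex.__version__} are out of step")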


@@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-GIT_TAG v3.6
+GIT_TAG v3.7.1
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)


@@ -2,7 +2,7 @@
-r common.txt
# Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" and platform_machine != "s390x"
+torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.7.0.dev20250304; platform_machine == "s390x"
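
Note: the pins above are selected through PEP 508 environment markers, so each platform resolves exactly one torch build. A small sketch of how such a marker evaluates (uses the packaging library, which pip itself depends on; not part of this commit):

# Sketch: evaluating PEP 508 environment markers like the ones in this file.
from packaging.markers import Marker

x86_marker = Marker('platform_machine == "x86_64"')

# With no argument the marker is evaluated against the running interpreter.
print(x86_marker.evaluate())

# An explicit environment can be supplied to test other platforms.
print(x86_marker.evaluate({"platform_machine": "aarch64"}))  # False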


@@ -12,7 +12,7 @@ from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
-@pytest.fixture(autouse=True)
+@pytest.fixture(autouse=not current_platform.is_cpu())
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
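
Note: the change above computes the autouse flag at collection time, so on CPU the fixture is still defined but only applied when a test requests it by name. A minimal standalone sketch of that pattern (hypothetical test module; IS_CPU stands in for current_platform.is_cpu()):

# Sketch: a pytest fixture whose autouse flag is decided when the module loads.
import pytest

IS_CPU = True  # stand-in for current_platform.is_cpu()

@pytest.fixture(autouse=not IS_CPU)
def v1():
    # e.g. toggle an engine flag for the duration of a test
    yield

def test_default():
    # With IS_CPU=True, the fixture is not auto-applied to this test.
    pass

def test_opt_in(v1):
    # A test may still request the fixture explicitly by name.
    pass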


@@ -17,7 +17,7 @@ class _PagedAttention:
@staticmethod
def get_supported_head_sizes() -> List[int]:
-return [32, 64, 80, 96, 112, 128, 256]
+return [32, 64, 80, 96, 112, 128, 192, 256]
@staticmethod
def get_kv_cache_shape(


@@ -254,10 +254,11 @@ def _run_worker_process(
# online (in situ) tuning is enabled.
# Offline tuning API (record_untuned_is_enabled()) only
# available in PyTorch 2.6 or later.
-import torch.cuda.tunable as tunable
-if (tunable.is_enabled() and tunable.tuning_is_enabled()
-        and not tunable.record_untuned_is_enabled()):
-    tunable.write_file()
+if torch.cuda.is_available():
+    import torch.cuda.tunable as tunable
+    if (tunable.is_enabled() and tunable.tuning_is_enabled()
+            and not tunable.record_untuned_is_enabled()):
+        tunable.write_file()
logger.info("Worker exiting")
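
Note: the guard above keeps the worker from touching torch.cuda.tunable on builds without CUDA (such as this CPU backend), while still flushing TunableOp results after online tuning on GPU. A standalone sketch of the guarded flush (same calls as the diff, wrapped in a hypothetical helper):

# Sketch: flush TunableOp results only when CUDA is present and online
# (in situ) tuning actually ran; on CPU-only builds this is a no-op.
import torch

def flush_tunableop_results() -> None:
    if not torch.cuda.is_available():
        return
    import torch.cuda.tunable as tunable
    if (tunable.is_enabled() and tunable.tuning_is_enabled()
            and not tunable.record_untuned_is_enabled()):
        tunable.write_file()

flush_tunableop_results()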


@@ -193,10 +193,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
custom_routing_function: Optional[Callable] = None,
+scoring_func: str = "softmax",
+e_score_correction_bias: Optional[torch.Tensor] = None,
activation: str = "silu",
**kwargs,
):
-assert custom_routing_function is None
assert activation == "silu", f"{activation} is not supported."
return layer.ipex_fusion(
x,
@@ -206,6 +207,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
renormalize,
topk_group,
num_expert_group,
+custom_routing_function,
+scoring_func,
+e_score_correction_bias,
)
def forward_tpu(


@@ -121,6 +121,9 @@ class CpuPlatform(Platform):
# Disable torch async compiling which won't work with daemonic processes
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+# MLA attention is not supported
+os.environ["VLLM_MLA_DISABLE"] = "1"
# Intel OpenMP setting
ld_prealod_str = os.getenv("LD_PRELOAD", "")
if "libiomp5.so" in ld_prealod_str: