[CPU] Refine default config for the CPU backend (#19539)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Li, Jiang 2025-06-13 13:27:39 +08:00 committed by GitHub
parent bb4a0decef
commit 6458721108
4 changed files with 60 additions and 25 deletions

.buildkite/scripts/hardware_ci/run-cpu-test.sh

@@ -24,13 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
 
+  # list packages
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+    set -e
+    pip list"
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"
+
   # offline inference
   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
     set -e
@@ -72,7 +81,7 @@ function cpu_tests() {
     set -e
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    python3 benchmarks/benchmark_serving.py \
+    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
       --backend vllm \
       --dataset-name random \
       --model facebook/opt-125m \

vllm/engine/arg_utils.py

@@ -1562,14 +1562,20 @@ class EngineArgs:
                 UsageContext.LLM_CLASS: 16384,
                 UsageContext.OPENAI_API_SERVER: 8192,
             }
-            default_max_num_seqs = 1024
+            default_max_num_seqs = {
+                UsageContext.LLM_CLASS: 1024,
+                UsageContext.OPENAI_API_SERVER: 1024,
+            }
         else:
             # TODO(woosuk): Tune the default values for other hardware.
             default_max_num_batched_tokens = {
                 UsageContext.LLM_CLASS: 8192,
                 UsageContext.OPENAI_API_SERVER: 2048,
             }
-            default_max_num_seqs = 256
+            default_max_num_seqs = {
+                UsageContext.LLM_CLASS: 256,
+                UsageContext.OPENAI_API_SERVER: 256,
+            }
 
         # tpu specific default values.
         if current_platform.is_tpu():
@@ -1586,6 +1592,17 @@ class EngineArgs:
                 }
             }
 
+        # cpu specific default values.
+        if current_platform.is_cpu():
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 4096,
+                UsageContext.OPENAI_API_SERVER: 2048,
+            }
+            default_max_num_seqs = {
+                UsageContext.LLM_CLASS: 128,
+                UsageContext.OPENAI_API_SERVER: 32,
+            }
+
         use_context_value = usage_context.value if usage_context else None
         if (self.max_num_batched_tokens is None
                 and usage_context in default_max_num_batched_tokens):
@@ -1606,8 +1623,9 @@ class EngineArgs:
                 "Setting max_num_batched_tokens to %d for %s usage context.",
                 self.max_num_batched_tokens, use_context_value)
 
-        if self.max_num_seqs is None:
-            self.max_num_seqs = default_max_num_seqs
+        if (self.max_num_seqs is None
+                and usage_context in default_max_num_seqs):
+            self.max_num_seqs = default_max_num_seqs[usage_context]
 
             logger.debug("Setting max_num_seqs to %d for %s usage context.",
                          self.max_num_seqs, use_context_value)
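
Note on the change above: scheduler defaults are now resolved per usage context on every platform, and an explicit user value always wins. A minimal sketch of that lookup, simplified from the diff (DEFAULT_MAX_NUM_SEQS and resolve_max_num_seqs are illustrative names, not vLLM APIs):

from enum import Enum
from typing import Optional

class UsageContext(Enum):
    LLM_CLASS = "LLM_CLASS"
    OPENAI_API_SERVER = "OPENAI_API_SERVER"

# CPU-specific defaults introduced by this commit.
DEFAULT_MAX_NUM_SEQS = {
    UsageContext.LLM_CLASS: 128,
    UsageContext.OPENAI_API_SERVER: 32,
}

def resolve_max_num_seqs(
        user_value: Optional[int],
        usage_context: Optional[UsageContext]) -> Optional[int]:
    # A default applies only when the user left the value unset and the
    # usage context appears in the table; otherwise the value passes through.
    if user_value is None and usage_context in DEFAULT_MAX_NUM_SEQS:
        return DEFAULT_MAX_NUM_SEQS[usage_context]
    return user_value

assert resolve_max_num_seqs(None, UsageContext.OPENAI_API_SERVER) == 32
assert resolve_max_num_seqs(64, UsageContext.OPENAI_API_SERVER) == 64  # user value wins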

vllm/platforms/cpu.py

@@ -89,10 +89,6 @@ class CpuPlatform(Platform):
         import vllm.envs as envs
         from vllm.utils import GiB_bytes
 
         model_config = vllm_config.model_config
 
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        if not model_config.enforce_eager:
-            model_config.enforce_eager = True
         model_config.disable_cascade_attn = True
@@ -171,9 +167,21 @@ class CpuPlatform(Platform):
         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
                 == CompilationLevel.PIECEWISE):
 
+            # Note: vLLM V1 uses PIECEWISE-level compilation, which takes
+            # time to compile kernels just-in-time with the inductor
+            # backend. Most CPU CI tests finish quickly, so compilation
+            # consumes a disproportionate share of the runtime, even with
+            # the torch compile cache. VLLM_CPU_CI_ENV therefore marks the
+            # CI environment, where the model runs with dynamo + eager
+            # mode instead to save time. VLLM_CPU_CI_ENV is only used as
+            # an internal variable.
+            if os.environ.get("VLLM_CPU_CI_ENV", "0") != "0":
+                backend = "eager"
+            else:
+                backend = "inductor"
+
             compilation_config.level = CompilationLevel.DYNAMO_ONCE
-            compilation_config.backend = "eager"
             compilation_config.custom_ops += ["none"]
+            compilation_config.backend = backend
             compilation_config.inductor_compile_config.update({
                 "dce":
                 True,
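
The hunk above reduces to a single environment gate on the torch.compile backend. A self-contained sketch of just that gate (choose_cpu_compile_backend is a hypothetical helper name; in the diff the logic is inline in CpuPlatform):

import os

def choose_cpu_compile_backend() -> str:
    # Any non-"0" value of VLLM_CPU_CI_ENV selects dynamo + eager mode,
    # skipping the slow inductor JIT compilation; otherwise inductor is
    # kept for better steady-state performance.
    if os.environ.get("VLLM_CPU_CI_ENV", "0") != "0":
        return "eager"
    return "inductor"

os.environ["VLLM_CPU_CI_ENV"] = "1"   # as set in the CI containers above
assert choose_cpu_compile_backend() == "eager"
os.environ["VLLM_CPU_CI_ENV"] = "0"   # as the serving benchmark overrides it
assert choose_cpu_compile_backend() == "inductor"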

vllm/v1/worker/cpu_model_runner.py

@@ -60,7 +60,8 @@ class CPUModelRunner(GPUModelRunner):
     def warming_up_model(self) -> None:
         logger.info("Warming up model for the compilation...")
         # Only generate graph for the generic shape
-        self._dummy_run(max(16, self.max_num_reqs))
+        with _set_global_compilation_settings(self.vllm_config):
+            self._dummy_run(max(16, self.max_num_reqs))
         logger.info("Warming up done.")
 
     def _init_device_properties(self) -> None:
@@ -71,16 +72,15 @@ class CPUModelRunner(GPUModelRunner):
 @contextmanager
-def _set_global_compilation_settings():
+def _set_global_compilation_settings(config: VllmConfig):
     import torch._inductor.config
 
-    # Note: The CPPGEMM backend requires freezing parameters.
-    freezing_value = torch._inductor.config.freezing
-    torch._inductor.config.freezing = True
-    # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects
-    # including object type dict"
-    force_disable_caches = torch._inductor.config.force_disable_caches
-    torch._inductor.config.force_disable_caches = True
-    yield
-    torch._inductor.config.freezing = freezing_value
-    torch._inductor.config.force_disable_caches = force_disable_caches
+    inductor_config = config.compilation_config.inductor_compile_config
+    try:
+        # Note: The MKLDNN and CPPGEMM backends require freezing parameters.
+        freezing_value = torch._inductor.config.freezing
+        if inductor_config.get("max_autotune", False):
+            torch._inductor.config.freezing = True
+        yield
+    finally:
+        torch._inductor.config.freezing = freezing_value
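
The rewritten helper is the standard save/patch/restore pattern for process-global configuration: snapshot the flag, patch it only when the inductor config requests max_autotune, and restore it in a finally block so a failing warmup cannot leak the setting (the old version also toggled force_disable_caches; that workaround is dropped here). A minimal sketch of the pattern, assuming a toy settings object in place of torch._inductor.config (patched_freezing is a hypothetical name):

from contextlib import contextmanager
from types import SimpleNamespace

# Toy stand-in for the process-global torch._inductor.config module.
settings = SimpleNamespace(freezing=False)

@contextmanager
def patched_freezing(enable: bool):
    saved = settings.freezing          # snapshot the global state
    try:
        if enable:
            settings.freezing = True   # patch only when requested
        yield
    finally:
        settings.freezing = saved      # restore even if the body raises

with patched_freezing(enable=True):
    assert settings.freezing is True
assert settings.freezing is False      # global state restored after the block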