diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index bbcde4009c0eb..8db8c3a05fb30 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -24,13 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 + # 
list packages + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " + set -e + pip list" + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" + # offline inference docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " set -e @@ -72,7 +81,7 @@ function cpu_tests() { set -e python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ + VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ --backend vllm \ --dataset-name random \ --model facebook/opt-125m \ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 85b7bbfbd93d1..f599d7a3bb5ed 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1562,14 +1562,20 @@ class EngineArgs: UsageContext.LLM_CLASS: 16384, UsageContext.OPENAI_API_SERVER: 8192, } - default_max_num_seqs = 1024 + default_max_num_seqs = { + UsageContext.LLM_CLASS: 1024, + UsageContext.OPENAI_API_SERVER: 1024, + } else: # TODO(woosuk): Tune the default values for other hardware. default_max_num_batched_tokens = { UsageContext.LLM_CLASS: 8192, UsageContext.OPENAI_API_SERVER: 2048, } - default_max_num_seqs = 256 + default_max_num_seqs = { + UsageContext.LLM_CLASS: 256, + UsageContext.OPENAI_API_SERVER: 256, + } # tpu specific default values. if current_platform.is_tpu(): @@ -1586,6 +1592,17 @@ class EngineArgs: } } + # cpu specific default values. 
+ if current_platform.is_cpu(): + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 4096, + UsageContext.OPENAI_API_SERVER: 2048, + } + default_max_num_seqs = { + UsageContext.LLM_CLASS: 128, + UsageContext.OPENAI_API_SERVER: 32, + } + use_context_value = usage_context.value if usage_context else None if (self.max_num_batched_tokens is None and usage_context in default_max_num_batched_tokens): @@ -1606,8 +1623,9 @@ class EngineArgs: "Setting max_num_batched_tokens to %d for %s usage context.", self.max_num_batched_tokens, use_context_value) - if self.max_num_seqs is None: - self.max_num_seqs = default_max_num_seqs + if (self.max_num_seqs is None + and usage_context in default_max_num_seqs): + self.max_num_seqs = default_max_num_seqs[usage_context] logger.debug("Setting max_num_seqs to %d for %s usage context.", self.max_num_seqs, use_context_value) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 27c591e3babd4..2d10d700fa2a3 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -89,10 +89,6 @@ class CpuPlatform(Platform): import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if not model_config.enforce_eager: - model_config.enforce_eager = True model_config.disable_cascade_attn = True @@ -171,9 +167,21 @@ class CpuPlatform(Platform): compilation_config = vllm_config.compilation_config if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE): + + # Note: vLLM V1 is using PIECEWISE level compilation, which will + # take time to compile kernels just-in-time with the inductor + # backend. For CPU CI tests, most of them are executed fast and + # compilations consume too much time, even with torch compile + # cache. So use VLLM_CPU_CI_ENV to indicate the CI environment, + # and just execute model with dynamo + eager mode to save time. 
+ # VLLM_CPU_CI_ENV is only used as an internal variable. + if os.environ.get("VLLM_CPU_CI_ENV", "0") != "0": + backend = "eager" + else: + backend = "inductor" + compilation_config.level = CompilationLevel.DYNAMO_ONCE - compilation_config.backend = "eager" - compilation_config.custom_ops += ["none"] + compilation_config.backend = backend compilation_config.inductor_compile_config.update({ "dce": True, diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 607cfc0ef69cd..6631c9636eacd 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -60,7 +60,8 @@ class CPUModelRunner(GPUModelRunner): def warming_up_model(self) -> None: logger.info("Warming up model for the compilation...") # Only generate graph for the generic shape - self._dummy_run(max(16, self.max_num_reqs)) + with _set_global_compilation_settings(self.vllm_config): + self._dummy_run(max(16, self.max_num_reqs)) logger.info("Warming up done.") def _init_device_properties(self) -> None: @@ -71,16 +72,15 @@ class CPUModelRunner(GPUModelRunner): @contextmanager -def _set_global_compilation_settings(): +def _set_global_compilation_settings(config: VllmConfig): import torch._inductor.config - # Note: The CPPGEMM backend requires freezing parameters. - freezing_value = torch._inductor.config.freezing - torch._inductor.config.freezing = True - # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects - # including object type dict" - force_disable_caches = torch._inductor.config.force_disable_caches - torch._inductor.config.force_disable_caches = True - yield - torch._inductor.config.freezing = freezing_value - torch._inductor.config.force_disable_caches = force_disable_caches + inductor_config = config.compilation_config.inductor_compile_config + try: + # Note: The MKLDNN and CPPGEMM backends require freezing parameters. 
+ freezing_value = torch._inductor.config.freezing + if inductor_config.get("max_autotune", False): + torch._inductor.config.freezing = True + yield + finally: + torch._inductor.config.freezing = freezing_value