mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-31 04:17:05 +08:00
[CPU] Refine default config for the CPU backend (#19539)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
parent
bb4a0decef
commit
6458721108
@ -24,13 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# list packages
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
@ -72,7 +81,7 @@ function cpu_tests() {
|
||||
set -e
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
|
||||
@ -1562,14 +1562,20 @@ class EngineArgs:
|
||||
UsageContext.LLM_CLASS: 16384,
|
||||
UsageContext.OPENAI_API_SERVER: 8192,
|
||||
}
|
||||
default_max_num_seqs = 1024
|
||||
default_max_num_seqs = {
|
||||
UsageContext.LLM_CLASS: 1024,
|
||||
UsageContext.OPENAI_API_SERVER: 1024,
|
||||
}
|
||||
else:
|
||||
# TODO(woosuk): Tune the default values for other hardware.
|
||||
default_max_num_batched_tokens = {
|
||||
UsageContext.LLM_CLASS: 8192,
|
||||
UsageContext.OPENAI_API_SERVER: 2048,
|
||||
}
|
||||
default_max_num_seqs = 256
|
||||
default_max_num_seqs = {
|
||||
UsageContext.LLM_CLASS: 256,
|
||||
UsageContext.OPENAI_API_SERVER: 256,
|
||||
}
|
||||
|
||||
# tpu specific default values.
|
||||
if current_platform.is_tpu():
|
||||
@ -1586,6 +1592,17 @@ class EngineArgs:
|
||||
}
|
||||
}
|
||||
|
||||
# cpu specific default values.
|
||||
if current_platform.is_cpu():
|
||||
default_max_num_batched_tokens = {
|
||||
UsageContext.LLM_CLASS: 4096,
|
||||
UsageContext.OPENAI_API_SERVER: 2048,
|
||||
}
|
||||
default_max_num_seqs = {
|
||||
UsageContext.LLM_CLASS: 128,
|
||||
UsageContext.OPENAI_API_SERVER: 32,
|
||||
}
|
||||
|
||||
use_context_value = usage_context.value if usage_context else None
|
||||
if (self.max_num_batched_tokens is None
|
||||
and usage_context in default_max_num_batched_tokens):
|
||||
@ -1606,8 +1623,9 @@ class EngineArgs:
|
||||
"Setting max_num_batched_tokens to %d for %s usage context.",
|
||||
self.max_num_batched_tokens, use_context_value)
|
||||
|
||||
if self.max_num_seqs is None:
|
||||
self.max_num_seqs = default_max_num_seqs
|
||||
if (self.max_num_seqs is None
|
||||
and usage_context in default_max_num_seqs):
|
||||
self.max_num_seqs = default_max_num_seqs[usage_context]
|
||||
|
||||
logger.debug("Setting max_num_seqs to %d for %s usage context.",
|
||||
self.max_num_seqs, use_context_value)
|
||||
|
||||
@ -89,10 +89,6 @@ class CpuPlatform(Platform):
|
||||
import vllm.envs as envs
|
||||
from vllm.utils import GiB_bytes
|
||||
model_config = vllm_config.model_config
|
||||
# Reminder: Please update docs/features/compatibility_matrix.md
|
||||
# If the feature combo become valid
|
||||
if not model_config.enforce_eager:
|
||||
model_config.enforce_eager = True
|
||||
|
||||
model_config.disable_cascade_attn = True
|
||||
|
||||
@ -171,9 +167,21 @@ class CpuPlatform(Platform):
|
||||
compilation_config = vllm_config.compilation_config
|
||||
if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE):
|
||||
|
||||
# Note: vLLM V1 is using PIECEWISE level compilation, which will
|
||||
# take time to compile kernels just-in-time with the inductor
|
||||
# backend. For CPU CI tests, most of them are executed fast and
|
||||
# compilations consume too much time, even with torch compile
|
||||
# cache. So use VLLM_CPU_CI_ENV to indicate the CI environment,
|
||||
# and just execute model with dynamo + eager mode to save time.
|
||||
# VLLM_CPU_CI_ENV is only used as an internal variable.
|
||||
if os.environ.get("VLLM_CPU_CI_ENV", "0") != "0":
|
||||
backend = "eager"
|
||||
else:
|
||||
backend = "inductor"
|
||||
|
||||
compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
compilation_config.backend = "eager"
|
||||
compilation_config.custom_ops += ["none"]
|
||||
compilation_config.backend = backend
|
||||
compilation_config.inductor_compile_config.update({
|
||||
"dce":
|
||||
True,
|
||||
|
||||
@ -60,7 +60,8 @@ class CPUModelRunner(GPUModelRunner):
|
||||
def warming_up_model(self) -> None:
|
||||
logger.info("Warming up model for the compilation...")
|
||||
# Only generate graph for the generic shape
|
||||
self._dummy_run(max(16, self.max_num_reqs))
|
||||
with _set_global_compilation_settings(self.vllm_config):
|
||||
self._dummy_run(max(16, self.max_num_reqs))
|
||||
logger.info("Warming up done.")
|
||||
|
||||
def _init_device_properties(self) -> None:
|
||||
@ -71,16 +72,15 @@ class CPUModelRunner(GPUModelRunner):
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _set_global_compilation_settings():
|
||||
def _set_global_compilation_settings(config: VllmConfig):
|
||||
import torch._inductor.config
|
||||
|
||||
# Note: The CPPGEMM backend requires freezing parameters.
|
||||
freezing_value = torch._inductor.config.freezing
|
||||
torch._inductor.config.freezing = True
|
||||
# Note: workaround for "ValueError: fast mode: can't pickle cyclic objects
|
||||
# including object type dict"
|
||||
force_disable_caches = torch._inductor.config.force_disable_caches
|
||||
torch._inductor.config.force_disable_caches = True
|
||||
yield
|
||||
torch._inductor.config.freezing = freezing_value
|
||||
torch._inductor.config.force_disable_caches = force_disable_caches
|
||||
inductor_config = config.compilation_config.inductor_compile_config
|
||||
try:
|
||||
# Note: The MKLDNN and CPPGEMM backend requires freezing parameters.
|
||||
freezing_value = torch._inductor.config.freezing
|
||||
if inductor_config.get("max_autotune", False):
|
||||
torch._inductor.config.freezing = True
|
||||
yield
|
||||
finally:
|
||||
torch._inductor.config.freezing = freezing_value
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user