From cfb7e55515a5558be3a7199044411953017352d3 Mon Sep 17 00:00:00 2001
From: "Li, Jiang"
Date: Thu, 18 Dec 2025 12:59:09 +0800
Subject: [PATCH] [Doc][CPU] Update CPU doc (#30765)

Signed-off-by: jiang1.li
Signed-off-by: Li, Jiang
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 docker/Dockerfile.cpu | 4 +-
 .../installation/cpu.arm.inc.md | 34 ++++++++-
 docs/getting_started/installation/cpu.md | 11 +--
 .../installation/cpu.x86.inc.md | 71 +++++++++++++++++--
 .../installation/python_env_setup.inc.md | 2 +-
 5 files changed, 106 insertions(+), 16 deletions(-)

diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 8d55ecfba3e52..bd5bc43916eac 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -17,7 +17,7 @@
 # VLLM_CPU_DISABLE_AVX512=false (default)|true
 # VLLM_CPU_AVX512BF16=false (default)|true
 # VLLM_CPU_AVX512VNNI=false (default)|true
-# VLLM_CPU_AMXBF16=false (default)|true
+# VLLM_CPU_AMXBF16=false|true (default)
 #
 ######################### COMMON BASE IMAGE #########################
@@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
 ARG VLLM_CPU_AVX512VNNI=0
 ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
 # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
-ARG VLLM_CPU_AMXBF16=0
+ARG VLLM_CPU_AMXBF16=1
 ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}

 WORKDIR /workspace/vllm
diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md
index 657bf2509db01..4940e5781b29a 100644
--- a/docs/getting_started/installation/cpu.arm.inc.md
+++ b/docs/getting_started/installation/cpu.arm.inc.md
@@ -19,7 +19,7 @@ Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels c

 ```bash
 export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index
 ```

 ??? console "pip"
     ```bash
     pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
     ```

+!!! warning "set `LD_PRELOAD`"
+    Before using vLLM CPU installed via wheels, make sure TCMalloc is installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the path
+    sudo find / -iname '*libtcmalloc_minimal.so.4'
+    TC_PATH=...
+
+    # add it to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$LD_PRELOAD"
+    ```
+
 The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
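As a quick sanity check after either install path, you can confirm which build was resolved; an illustrative snippet (the exact local version tag depends on the release you installed):

```bash
# the CPU wheels carry a +cpu local version suffix, e.g. 0.11.2+cpu
python -c "import vllm; print(vllm.__version__)"
```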
 **Install the latest code**

@@ -37,7 +51,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
 To install from nightly index, run:

 ```bash
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index
 ```

 ??? console "pip (there's a caveat)"
@@ -56,7 +70,7 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
 ```bash
 export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index
 ```

 # --8<-- [end:pre-built-wheels]
@@ -105,6 +119,20 @@ VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation

 Testing has been conducted on AWS Graviton3 instances for compatibility.

+!!! warning "set `LD_PRELOAD`"
+    Before using vLLM CPU installed from the built wheel, make sure TCMalloc is installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the path
+    sudo find / -iname '*libtcmalloc_minimal.so.4'
+    TC_PATH=...
+
+    # add it to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$LD_PRELOAD"
+    ```
+
 # --8<-- [end:build-wheel-from-source]

 # --8<-- [start:pre-built-images]
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 210f720e2d92a..affb94593dd42 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -18,6 +18,12 @@ vLLM is a Python library that supports the following CPU variants. Select your C

 --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"

+## Technical Discussions
+
+The main discussions happen in the `#sig-cpu` channel of [vLLM Slack](https://slack.vllm.ai/).
+
+When opening a GitHub issue about the CPU backend, please add `[CPU Backend]` to the title so that it can be labeled with `cpu` for better visibility.
+
 ## Requirements

 - Python: 3.10 -- 3.13
@@ -258,11 +264,6 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel
 - GPTQ (x86 only)
 - compressed-tensor INT8 W8A8 (x86, s390x)

-### (x86 only) What is the purpose of `VLLM_CPU_SGL_KERNEL`?
-
-- Both of them require `amx` CPU flag.
-  - `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios.
-
 ### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?

 In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal.
diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md
index 1fad7f4338822..01e34eee10539 100644
--- a/docs/getting_started/installation/cpu.x86.inc.md
+++ b/docs/getting_started/installation/cpu.x86.inc.md
@@ -17,7 +17,51 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data

 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]

-Currently, there are no pre-built x86 CPU wheels.
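The x86 wheels introduced below target AVX512-capable CPUs, so it is worth confirming that the host exposes this ISA before installing; a small check, assuming a Linux host where `lscpu` is available:

```bash
# look for the avx512f flag before installing the pre-built x86 wheel
lscpu | grep -q avx512f && echo "AVX512 supported" || echo "AVX512 not found"
```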
+Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install the release wheels:
+
+```bash
+export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
+
+# use uv
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index --torch-backend cpu
+```
+??? console "pip"
+    ```bash
+    # use pip
+    pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+!!! warning "set `LD_PRELOAD`"
+    Before using vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc; Intel OpenMP is installed together with vLLM CPU
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the paths
+    sudo find / -iname '*libtcmalloc_minimal.so.4'
+    sudo find / -iname '*libiomp5.so'
+    TC_PATH=...
+    IOMP_PATH=...
+
+    # add them to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD"
+    ```
+
+**Install the latest code**
+
+To install the wheel built from the latest main branch:
+
+```bash
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index --torch-backend cpu
+```
+
+**Install specific revisions**
+
+If you want to access the wheels for previous commits (e.g. to bisect a behavior change or performance regression), you can specify the commit hash in the URL:
+
+```bash
+export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index --torch-backend cpu
+```

 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
@@ -26,10 +70,12 @@ Install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the def
 ```bash
 sudo apt-get update -y
-sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 ```

+--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
+
 Clone the vLLM project:

 ```bash
@@ -82,6 +128,22 @@ uv pip install dist/*.whl
     pip install dist/*.whl
     ```

+!!! warning "set `LD_PRELOAD`"
+    Before using vLLM CPU installed from the built wheel, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:
+    ```bash
+    # install TCMalloc; Intel OpenMP is installed together with vLLM CPU
+    sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
+
+    # manually find the paths
+    sudo find / -iname '*libtcmalloc_minimal.so.4'
+    sudo find / -iname '*libiomp5.so'
+    TC_PATH=...
+    IOMP_PATH=...
+
+    # add them to LD_PRELOAD
+    export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD"
+    ```
+
 !!! example "Troubleshooting"
     - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
     - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
@@ -95,7 +157,6 @@ uv pip install dist/*.whl
         "torch==X.Y.Z+cpu" # <-------
     ]
     ```
-
- If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
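The two preload paths above can also be resolved without scanning the whole filesystem. This is only a sketch, assuming TCMalloc is registered in the linker cache and Intel OpenMP sits inside the active virtual environment (`$VIRTUAL_ENV`, falling back to `.venv`):

```bash
# resolve TCMalloc via the linker cache and Intel OpenMP from the Python environment
TC_PATH=$(ldconfig -p | awk '/libtcmalloc_minimal\.so\.4/ {print $NF; exit}')
IOMP_PATH=$(find "${VIRTUAL_ENV:-.venv}" -name 'libiomp5.so' 2>/dev/null | head -n 1)
export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD"

# then start the server as usual, e.g. with the model used elsewhere in these docs
vllm serve meta-llama/Llama-3.2-1B-Instruct --dtype=bfloat16
```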
# --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] @@ -112,6 +173,7 @@ uv pip install dist/*.whl docker build -f docker/Dockerfile.cpu \ --build-arg VLLM_CPU_AVX512BF16=false (default)|true \ --build-arg VLLM_CPU_AVX512VNNI=false (default)|true \ + --build-arg VLLM_CPU_AMXBF16=false|true (default) \ --build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \ --tag vllm-cpu-env \ --target vllm-openai . @@ -123,9 +185,8 @@ docker run --rm \ --shm-size=4g \ -p 8000:8000 \ -e VLLM_CPU_KVCACHE_SPACE= \ - -e VLLM_CPU_OMP_THREADS_BIND= \ vllm-cpu-env \ - --model=meta-llama/Llama-3.2-1B-Instruct \ + meta-llama/Llama-3.2-1B-Instruct \ --dtype=bfloat16 \ other vLLM OpenAI server arguments ``` diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md index ba78c329723ed..06794f8d3120e 100644 --- a/docs/getting_started/installation/python_env_setup.inc.md +++ b/docs/getting_started/installation/python_env_setup.inc.md @@ -1,4 +1,4 @@ -On NVIDIA CUDA only, it's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands: +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands: ```bash uv venv --python 3.12 --seed
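# a typical follow-up, sketched here with the x86 release-wheel command from the sections
# above (substitute the Arm command where appropriate): activate the environment that
# `uv venv` created in .venv, then install vLLM CPU into it
source .venv/bin/activate
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index --torch-backend cpu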