diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 74a7a3a3530f5..d5736c0aee208 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -7,7 +7,7 @@ permissions: jobs: lint-and-deploy: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04-arm steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/docker/Dockerfile.arm b/docker/Dockerfile.arm deleted file mode 100644 index bad093684239c..0000000000000 --- a/docker/Dockerfile.arm +++ /dev/null @@ -1,62 +0,0 @@ -# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. - -FROM ubuntu:22.04 AS cpu-test-arm - -ENV CCACHE_DIR=/root/.cache/ccache - -ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache - -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. 
-RUN --mount=type=cache,target=/root/.cache/pip \ - pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores - -# Set LD_PRELOAD for tcmalloc on ARM -ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" - -RUN echo 'ulimit -c 0' >> ~/.bashrc - -WORKDIR /workspace - -ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" -ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ - pip install --upgrade pip && \ - pip install -r requirements/build.txt - -FROM cpu-test-arm AS build - -WORKDIR /workspace/vllm - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ - --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ - pip install -v -r requirements/cpu.txt - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -# Disabling AVX512 specific optimizations for ARM -ARG VLLM_CPU_DISABLE_AVX512="true" -ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ - pip install dist/*.whl && \ - rm -rf dist - -WORKDIR /workspace/ - -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 982c1ddf27438..5e49e87131ece 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -1,4 +1,11 @@ -# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
+# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms. +# +# Supported platforms: +# - linux/amd64 (x86_64) +# - linux/arm64 (aarch64) +# +# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.: +# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu . # # Build targets: # vllm-openai (default): used for serving deployment @@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --upgrade pip && \ uv pip install -r requirements/cpu.txt -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" +ARG TARGETARCH +ENV TARGETARCH=${TARGETARCH} + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \ + else \ + PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \ + fi && \ + echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc + +# Ensure that the LD_PRELOAD environment variable for export is in effect. +SHELL ["/bin/bash", "-c"] + +ENV LD_PRELOAD=${LD_PRELOAD} RUN echo 'ulimit -c 0' >> ~/.bashrc diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index 63ae351b395fb..cac578eefb1d7 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility. # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] ```bash -docker build -f docker/Dockerfile.arm \ +docker build -f docker/Dockerfile.cpu \ --tag vllm-cpu-env . 
# Launching OpenAI server diff --git a/requirements/cpu.txt b/requirements/cpu.txt index d80354342bc20..6860275acab6f 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 torch==2.7.0; platform_system == "Darwin" -torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.7.0; platform_machine == "ppc64le" +torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has an issue: https://github.com/vllm-project/vllm/issues/17960 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" @@ -25,3 +26,6 @@ datasets # for benchmark scripts intel-openmp==2024.2.1; platform_machine == "x86_64" intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile. + +# Use this to gather CPU info and optimize based on ARM Neoverse cores +py-cpuinfo; platform_machine == "aarch64"