WIP: [CI/Build] simplify Dockerfile build for ARM64 / GH200 (#11212)

Signed-off-by: drikster80 <ed.sealing@gmail.com>
Co-authored-by: drikster80 <ed.sealing@gmail.com>
This commit is contained in:
cennn 2024-12-16 17:20:49 +08:00 committed by GitHub
parent 17138af7c4
commit b3b1526f03
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 64 additions and 11 deletions

View File

@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
@ -46,9 +47,14 @@ WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
# Only when the build targets linux/arm64, additionally install the packages
# listed in requirements-cuda-arm64.txt. Every other target platform skips
# this step (the conditional has no else branch).
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
# install build dependencies
COPY requirements-build.txt requirements-build.txt
@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
# Wheel-build stage: when the build targets linux/arm64, install the packages
# listed in requirements-cuda-arm64.txt. This is a no-op on all other target
# platforms.
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
@ -134,8 +146,8 @@ COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
@ -143,6 +155,9 @@ ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# install vllm wheel first, so that torch etc will be installed
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip uninstall -y torch && \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi
# The FlashInfer wheel fetched here is a linux_x86_64 build, so it is installed
# only when the target platform is not linux/arm64. /etc/environment is sourced
# first so that PYTHON_VERSION_STR is available for the cpXY tags in the wheel
# file name.
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
fi
COPY examples examples
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
@ -209,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
@ -218,8 +239,11 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' timm==0.9.10
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
else \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -37,6 +37,32 @@ You can build and run vLLM from source via the provided `Dockerfile <https://git
current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
for vLLM to find the current GPU type and build for that.
Building for Arm64/aarch64
--------------------------
A Docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of this writing, this requires the use
of PyTorch Nightly and should be considered **experimental**. Using the flag ``--platform "linux/arm64"`` will attempt to build for arm64.
.. note::
Multiple modules must be compiled, so this process can take a while. We recommend using the ``--build-arg max_jobs=`` and ``--build-arg nvcc_threads=``
flags to speed up the build process. However, ensure your ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefit.
Keep an eye on memory usage with parallel jobs, as it can be substantial (see the example below).
.. code-block:: console
# Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
$ DOCKER_BUILDKIT=1 sudo docker build . \
--target vllm-openai \
--platform "linux/arm64" \
-t vllm/vllm-gh200-openai:latest \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
To run vLLM:

View File

@ -4,6 +4,6 @@ ninja
packaging
setuptools>=61
setuptools-scm>=8
torch==2.5.1
torch==2.5.1; platform_machine != 'aarch64'
wheel
jinja2

View File

@ -0,0 +1,3 @@
--index-url https://download.pytorch.org/whl/nightly/cu124
torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'

View File

@ -4,7 +4,7 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.5.1
torch == 2.5.1; platform_machine != 'aarch64'
# These must be updated alongside torch
torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1