# The vLLM Dockerfile is used to construct vLLM image that can be directly used # to run the OpenAI compatible server. #################### BASE BUILD IMAGE #################### FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev RUN apt-get update -y \ && apt-get install -y python3-pip git WORKDIR /workspace # install build and runtime dependencies COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt # install development dependencies COPY requirements-dev.txt requirements-dev.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-dev.txt #################### BASE BUILD IMAGE #################### #################### EXTENSION BUILD IMAGE #################### FROM dev AS build # install build dependencies COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-build.txt # copy input files COPY csrc csrc COPY setup.py setup.py COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py # cuda arch list used by torch ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} # max jobs used by Ninja to build extensions ARG max_jobs=2 ENV MAX_JOBS=${max_jobs} # number of threads used by nvcc ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads RUN python3 setup.py build_ext --inplace #################### EXTENSION Build IMAGE #################### #################### TEST IMAGE #################### # image to run unit testing suite FROM dev AS test # copy pytorch extensions separately to avoid having to rebuild # when python code changes WORKDIR /vllm-workspace # ADD is used to preserve directory structure ADD . /vllm-workspace/ COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose #################### TEST IMAGE #################### #################### RUNTIME BASE IMAGE #################### # use CUDA base as CUDA runtime dependencies are already installed via pip FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base # libnccl required for ray RUN apt-get update -y \ && apt-get install -y python3-pip WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt #################### RUNTIME BASE IMAGE #################### #################### OPENAI API SERVER #################### # openai api server alternative FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ pip install accelerate COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER ####################