diff --git a/docker/Dockerfile b/docker/Dockerfile
index 006481b23cb9..8bcd7f118f1e 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -150,8 +150,8 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
 
-#################### WHEEL BUILD IMAGE ####################
-FROM base AS build
+#################### CSRC BUILD IMAGE ####################
+FROM base AS csrc-build
 ARG TARGETPLATFORM
 
 ARG PIP_INDEX_URL UV_INDEX_URL
@@ -172,10 +172,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+WORKDIR /workspace
+
+COPY pyproject.toml setup.py CMakeLists.txt ./
+COPY cmake cmake/
+COPY csrc csrc/
+COPY vllm/envs.py vllm/envs.py
+COPY vllm/__init__.py vllm/__init__.py
 
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -195,9 +198,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 ARG VLLM_USE_PRECOMPILED=""
 ARG VLLM_MAIN_CUDA_VERSION=""
 
+# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
+ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
+
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
         && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
@@ -223,7 +228,6 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" != "1" ]; then \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
@@ -232,6 +236,52 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
+#################### CSRC BUILD IMAGE ####################
+
+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+ARG TARGETPLATFORM
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
+# install build dependencies
+COPY requirements/build.txt requirements/build.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+WORKDIR /workspace
+
+COPY --from=csrc-build /workspace/dist /precompiled-wheels
+
+COPY . .
+
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+
+ARG vllm_target_device="cuda"
+ENV VLLM_TARGET_DEVICE=${vllm_target_device}
+
+# Skip adding +precompiled suffix to version (preserves git-derived version)
+ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "${vllm_target_device}" = "cuda" ]; then \
+        export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
+    fi && \
+    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
 
 # Install DeepGEMM from source
 ARG DEEPGEMM_GIT_REF
diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
index b327eb2151f5..7420ca4d8944 100644
Binary files a/docs/assets/contributing/dockerfile-stages-dependency.png and b/docs/assets/contributing/dockerfile-stages-dependency.png differ
diff --git a/setup.py b/setup.py
index 5b7d12bb373e..8b2b4f7e5def 100644
--- a/setup.py
+++ b/setup.py
@@ -461,14 +461,22 @@ class precompiled_wheel_utils:
             "vllm/cumem_allocator.abi3.so",
         ]
 
-        compiled_regex = re.compile(
+        flash_attn_regex = re.compile(
             r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
         )
+        triton_kernels_regex = re.compile(
+            r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+        )
         file_members = list(
             filter(lambda x: x.filename in files_to_copy, wheel.filelist)
         )
         file_members += list(
-            filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
+            filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
+        )
+        file_members += list(
+            filter(
+                lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
+            )
         )
 
         for file in file_members:
@@ -648,7 +656,7 @@ def get_vllm_version() -> str:
     if envs.VLLM_TARGET_DEVICE == "empty":
         version += f"{sep}empty"
     elif _is_cuda():
-        if envs.VLLM_USE_PRECOMPILED:
+        if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
             version += f"{sep}precompiled"
         else:
             cuda_version = str(get_nvcc_cuda_version())
diff --git a/vllm/envs.py b/vllm/envs.py
index 8b954fa14f28..4b594e54f721 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -78,6 +78,7 @@ if TYPE_CHECKING:
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
    VLLM_USE_PRECOMPILED: bool = False
+    VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -462,6 +463,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     .lower()
     in ("1", "true")
     or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    # If set, skip adding +precompiled suffix to version string
+    "VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
+        int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
+    ),
     # Used to mark that setup.py is running in a Docker build context,
     # in order to force the use of precompiled binaries.
     "VLLM_DOCKER_BUILD_CONTEXT": lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "")
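Note (not part of the patch): a minimal standalone sketch of how the two file-selection regexes added in setup.py behave. The regex patterns are copied from the hunk above; the sample wheel member paths are hypothetical and only illustrate which entries precompiled_wheel_utils would extract.

import re

# Regexes copied from the setup.py hunk above. They select .py files under
# vllm/vllm_flash_attn/ and vllm/third_party/triton_kernels/, skipping hidden
# files and hidden directories (names starting with ".").
flash_attn_regex = re.compile(r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
triton_kernels_regex = re.compile(
    r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)

# Hypothetical wheel member names, for illustration only.
sample_members = [
    "vllm/vllm_flash_attn/flash_attn_interface.py",   # extracted (flash_attn_regex)
    "vllm/third_party/triton_kernels/matmul_ogs.py",  # extracted (triton_kernels_regex)
    "vllm/third_party/triton_kernels/.hidden.py",     # skipped (hidden file)
    "vllm/attention/layer.py",                        # skipped (outside both trees)
]

for name in sample_members:
    keep = bool(flash_attn_regex.match(name) or triton_kernels_regex.match(name))
    print(f"{name}: {'extract' if keep else 'skip'}")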