From a1873db23dd597930a7e4731a53314ace92baf49 Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Tue, 29 Jul 2025 17:45:19 -0400 Subject: [PATCH] docker: docker-aware precompiled wheel support (#21127) Signed-off-by: dougbtv --- docker/Dockerfile | 26 +++++++++++++-------- setup.py | 58 +++++++++++++++++++++++++++++++++++------------ vllm/envs.py | 11 +++++++-- 3 files changed, 68 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0cd2cfad66fdd..75b5ab0230c87 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -209,16 +209,7 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED -# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed -ENV VLLM_USE_PRECOMPILED="" -RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ - export VLLM_USE_PRECOMPILED=1 && \ - echo "Using precompiled wheels"; \ - else \ - unset VLLM_USE_PRECOMPILED && \ - echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ - fi +ARG VLLM_USE_PRECOMPILED="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -235,6 +226,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -248,9 +241,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ + export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi +# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others +RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \ + echo "Cleaning up extra wheels in dist/..." && \ + # Identify the most recent manylinux1_x86_64 wheel + KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \ + if [ -n "$KEEP_WHEEL" ]; then \ + echo "Keeping wheel: $KEEP_WHEEL"; \ + find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \ + fi; \ + fi + # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py diff --git a/setup.py b/setup.py index d46e678e7aa40..58e5833f16ae1 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ import json import logging import os import re +import shutil import subprocess import sys from pathlib import Path @@ -297,6 +298,10 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] + # In Docker build context, .git may be immutable or missing. + if envs.VLLM_DOCKER_BUILD_CONTEXT: + return upstream_main_commit + # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -357,19 +362,48 @@ class repackage_wheel(build_ext): # create a temporary directory to store the wheel temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") wheel_path = os.path.join(temp_dir, wheel_filename) - print(f"Downloading wheel from {wheel_location} to {wheel_path}") - from urllib.request import urlretrieve - try: urlretrieve(wheel_location, filename=wheel_path) except Exception as e: from setuptools.errors import SetupError - raise SetupError( f"Failed to get vLLM wheel from {wheel_location}") from e + # During a docker build: determine correct filename, copy wheel. + if envs.VLLM_DOCKER_BUILD_CONTEXT: + dist_dir = "/workspace/dist" + os.makedirs(dist_dir, exist_ok=True) + # Determine correct wheel filename from METADATA + with zipfile.ZipFile(wheel_path, "r") as z: + metadata_file = next( + (n for n in z.namelist() + if n.endswith(".dist-info/METADATA")), + None, + ) + if not metadata_file: + raise RuntimeError( + "Could not find METADATA in precompiled wheel.") + metadata = z.read(metadata_file).decode() + version_line = next((line for line in metadata.splitlines() + if line.startswith("Version: ")), None) + if not version_line: + raise RuntimeError( + "Could not determine version from METADATA.") + version = version_line.split(": ")[1].strip() + + # Build correct filename using internal version + arch_tag = "cp38-abi3-manylinux1_x86_64" + corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl" + final_wheel_path = os.path.join(dist_dir, corrected_wheel_name) + + print(f"Docker build context detected, copying precompiled wheel " + f"({version}) to {final_wheel_path}") + shutil.copy2(wheel_path, final_wheel_path) + return + + # Unzip the wheel when not in Docker context with zipfile.ZipFile(wheel_path) as wheel: files_to_copy = [ "vllm/_C.abi3.so", @@ -378,15 +412,9 @@ class repackage_wheel(build_ext): "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/cumem_allocator.abi3.so", - # "vllm/_version.py", # not available in nightly wheels yet ] - file_members = list( filter(lambda x: x.filename in files_to_copy, wheel.filelist)) - - # vllm_flash_attn python code: - # Regex from - # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` compiled_regex = re.compile( r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") file_members += list( @@ -403,11 +431,8 @@ class repackage_wheel(build_ext): package_data[package_name] = [] wheel.extract(file) - if file_name.endswith(".py"): - # python files shouldn't be added to package_data - continue - - package_data[package_name].append(file_name) + if not file_name.endswith(".py"): + package_data[package_name].append(file_name) def _no_device() -> bool: @@ -415,6 +440,9 @@ def _no_device() -> bool: def _is_cuda() -> bool: + # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda + if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT: + return True has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not (_is_neuron() or _is_tpu())) diff --git a/vllm/envs.py b/vllm/envs.py index fcfad4eec1621..9b6d8c8be242a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -68,6 +68,7 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False + VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -222,8 +223,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( - os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in + ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + + # Used to mark that setup.py is running in a Docker build context, + # in order to force the use of precompiled binaries. + "VLLM_DOCKER_BUILD_CONTEXT": + lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in + ("1", "true"), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build.