From da31f6ad3dacea8579adfb36d64d28759dc5c095 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 1 Aug 2025 01:26:24 -0700 Subject: [PATCH] Revert precompile wheel changes (#22055) --- docker/Dockerfile | 27 +++---- requirements/test.txt | 24 ++---- setup.py | 182 ++++++++++++++++++++---------------------- vllm/envs.py | 11 +-- 4 files changed, 107 insertions(+), 137 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 413151b3edb0..0d6afca74e86 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -206,7 +206,16 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED="" +ARG VLLM_USE_PRECOMPILED +# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed +ENV VLLM_USE_PRECOMPILED="" +RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ + export VLLM_USE_PRECOMPILED=1 && \ + echo "Using precompiled wheels"; \ + else \ + unset VLLM_USE_PRECOMPILED && \ + echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ + fi # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -223,8 +232,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ - && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ - && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -238,22 +245,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ - export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ - export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi -# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others -RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \ - echo "Cleaning up extra wheels in dist/..." && \ - # Identify the most recent manylinux1_x86_64 wheel - KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \ - if [ -n "$KEEP_WHEEL" ]; then \ - echo "Keeping wheel: $KEEP_WHEEL"; \ - find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \ - fi; \ - fi - # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py @@ -369,7 +363,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi # Install vllm wheel first, so that torch etc will be installed. -# !bang RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --system dist/*.whl --verbose \ diff --git a/requirements/test.txt b/requirements/test.txt index 4aaca2afea26..d45048aae580 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -22,7 +22,9 @@ aiohttp==3.10.11 aiohttp-cors==0.8.1 # via ray aiosignal==1.3.1 - # via aiohttp + # via + # aiohttp + # ray albucore==0.0.16 # via terratorch albumentations==1.4.6 @@ -137,7 +139,7 @@ contourpy==1.3.0 # via matplotlib cramjam==2.9.0 # via fastparquet -cupy-cuda12x==13.5.1 +cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 # via matplotlib @@ -224,6 +226,7 @@ frozenlist==1.5.0 # via # aiohttp # aiosignal + # ray fsspec==2024.9.0 # via # datasets @@ -600,18 +603,10 @@ opencv-python-headless==4.11.0.86 opentelemetry-api==1.35.0 # via # mlflow-skinny - # opentelemetry-exporter-prometheus # opentelemetry-sdk # opentelemetry-semantic-conventions -opentelemetry-exporter-prometheus==0.56b0 - # via ray -opentelemetry-proto==1.36.0 - # via ray opentelemetry-sdk==1.35.0 - # via - # mlflow-skinny - # opentelemetry-exporter-prometheus - # ray + # via mlflow-skinny opentelemetry-semantic-conventions==0.56b0 # via opentelemetry-sdk packaging==24.2 @@ -702,9 +697,7 @@ pqdm==0.2.0 pretrainedmodels==0.7.4 # via segmentation-models-pytorch prometheus-client==0.22.0 - # via - # opentelemetry-exporter-prometheus - # ray + # via ray propcache==0.2.0 # via yarl proto-plus==1.26.1 @@ -714,7 +707,6 @@ protobuf==5.28.3 # google-api-core # googleapis-common-protos # mlflow-skinny - # opentelemetry-proto # proto-plus # ray # tensorboardx @@ -862,7 +854,7 @@ rasterio==1.4.3 # rioxarray # terratorch # torchgeo -ray==2.48.0 +ray==2.43.0 # via -r requirements/test.in redis==5.2.0 # via tensorizer diff --git a/setup.py b/setup.py index bfa195d4395f..64cfbb8db962 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,6 @@ import json import logging import os import re -import shutil import subprocess import sys from pathlib import Path @@ -282,69 +281,10 @@ class cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class precompiled_wheel_utils: +class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - @staticmethod - def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: - import tempfile - import zipfile - - temp_dir = None - try: - if not os.path.isfile(wheel_url_or_path): - wheel_filename = wheel_url_or_path.split("/")[-1] - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - print(f"Downloading wheel from {wheel_url_or_path} " - f"to {wheel_path}") - from urllib.request import urlretrieve - urlretrieve(wheel_url_or_path, filename=wheel_path) - else: - wheel_path = wheel_url_or_path - print(f"Using existing wheel at {wheel_path}") - - package_data_patch = {} - - with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - ] - - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members = list( - filter(lambda x: x.filename in files_to_copy, - wheel.filelist)) - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"[extract] {file.filename}") - target_path = os.path.join(".", file.filename) - os.makedirs(os.path.dirname(target_path), exist_ok=True) - with wheel.open(file.filename) as src, open( - target_path, "wb") as dst: - shutil.copyfileobj(src, dst) - - pkg = os.path.dirname(file.filename).replace("/", ".") - package_data_patch.setdefault(pkg, []).append( - os.path.basename(file.filename)) - - return package_data_patch - finally: - if temp_dir is not None: - print(f"Removing temporary directory {temp_dir}") - shutil.rmtree(temp_dir) - - @staticmethod - def get_base_commit_in_main_branch() -> str: + def get_base_commit_in_main_branch(self) -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -357,10 +297,6 @@ class precompiled_wheel_utils: ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] - # In Docker build context, .git may be immutable or missing. - if envs.VLLM_DOCKER_BUILD_CONTEXT: - return upstream_main_commit - # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -393,15 +329,92 @@ class precompiled_wheel_utils: "wheel may not be compatible with your dev branch: %s", err) return "nightly" + def run(self) -> None: + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is None: + base_commit = self.get_base_commit_in_main_branch() + wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + # Fallback to nightly wheel if latest commit wheel is unavailable, + # in this rare case, the nightly release CI hasn't finished on main. + if not is_url_available(wheel_location): + wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + import zipfile + + if os.path.isfile(wheel_location): + wheel_path = wheel_location + print(f"Using existing wheel={wheel_path}") + else: + # Download the wheel from a given URL, assume + # the filename is the last part of the URL + wheel_filename = wheel_location.split("/")[-1] + + import tempfile + + # create a temporary directory to store the wheel + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + + print(f"Downloading wheel from {wheel_location} to {wheel_path}") + + from urllib.request import urlretrieve + + try: + urlretrieve(wheel_location, filename=wheel_path) + except Exception as e: + from setuptools.errors import SetupError + + raise SetupError( + f"Failed to get vLLM wheel from {wheel_location}") from e + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + # "vllm/_version.py", # not available in nightly wheels yet + ] + + file_members = list( + filter(lambda x: x.filename in files_to_copy, wheel.filelist)) + + # vllm_flash_attn python code: + # Regex from + # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"Extracting and including {file.filename} " + "from existing wheel") + package_name = os.path.dirname(file.filename).replace("/", ".") + file_name = os.path.basename(file.filename) + + if package_name not in package_data: + package_data[package_name] = [] + + wheel.extract(file) + if file_name.endswith(".py"): + # python files shouldn't be added to package_data + continue + + package_data[package_name].append(file_name) + def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" def _is_cuda() -> bool: - # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda - if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT: - return True has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not (_is_neuron() or _is_tpu())) @@ -626,37 +639,16 @@ package_data = { ] } -# If using precompiled, extract and patch package_data (in advance of setup) -if envs.VLLM_USE_PRECOMPILED: - assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is not None: - wheel_url = wheel_location - else: - base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() - wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - from urllib.request import urlopen - try: - with urlopen(wheel_url) as resp: - if resp.status != 200: - wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - except Exception as e: - print(f"[warn] Falling back to nightly wheel: {e}") - wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( - wheel_url) - for pkg, files in patch.items(): - package_data.setdefault(pkg, []).extend(files) - if _no_device(): ext_modules = [] -if not ext_modules or envs.VLLM_USE_PRECOMPILED: - # Disable build_ext when using precompiled wheel +if not ext_modules: cmdclass = {} else: - cmdclass = {"build_ext": cmake_build_ext} + cmdclass = { + "build_ext": + repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + } setup( # static metadata should rather go in pyproject.toml diff --git a/vllm/envs.py b/vllm/envs.py index 19bc9156b258..7553eccf16ea 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -68,7 +68,6 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False - VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -228,14 +227,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in - ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), - - # Used to mark that setup.py is running in a Docker build context, - # in order to force the use of precompiled binaries. - "VLLM_DOCKER_BUILD_CONTEXT": - lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in - ("1", "true"), + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( + os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build.