enable Docker-aware precompiled wheel setup (#22106)

Signed-off-by: dougbtv <dosmith@redhat.com>
Doug Smith, 2025-08-10 19:29:02 -04:00 (committed by GitHub)
parent 68b254d673
commit d1af8b7be9
3 changed files with 116 additions and 95 deletions

docker/Dockerfile

@@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 # Flag to control whether to use pre-built vLLM wheels
-ARG VLLM_USE_PRECOMPILED
-# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
-ENV VLLM_USE_PRECOMPILED=""
-RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
-        export VLLM_USE_PRECOMPILED=1 && \
-        echo "Using precompiled wheels"; \
-    else \
-        unset VLLM_USE_PRECOMPILED && \
-        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
-    fi
+ARG VLLM_USE_PRECOMPILED=""
 
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
         && export SCCACHE_IDLE_TIMEOUT=0 \
         && export CMAKE_BUILD_TYPE=Release \
+        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
         && sccache --show-stats \
         && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
         && sccache --show-stats; \
@@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
         mkdir -p .deps && \
+        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
+        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
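
Note: the RUN guard removed above only existed because setup.py treated any non-empty VLLM_USE_PRECOMPILED, even "0", as true (see the TODO it carried). With the stricter parsing added in the envs module below, the build arg can be exported as-is, e.g. docker build --build-arg VLLM_USE_PRECOMPILED=1. A minimal sketch of the difference, reusing the two expressions from this diff (values here are illustrative only):

    import os

    os.environ["VLLM_USE_PRECOMPILED"] = "0"

    # Old reader: any non-empty string counts as true, so "0" still enabled precompiled mode.
    old_reader = bool(os.environ.get("VLLM_USE_PRECOMPILED"))
    # New reader: only "1" or "true" (case-insensitive, whitespace-stripped) enables it.
    new_reader = os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")

    print(old_reader, new_reader)  # True False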

setup.py

@@ -7,6 +7,7 @@ import json
 import logging
 import os
 import re
+import shutil
 import subprocess
 import sys
 from pathlib import Path
@@ -281,10 +282,81 @@ class cmake_build_ext(build_ext):
             self.copy_file(file, dst_file)
 
 
-class repackage_wheel(build_ext):
+class precompiled_build_ext(build_ext):
+    """Disables extension building when using precompiled binaries."""
+
+    def run(self) -> None:
+        assert _is_cuda(
+        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+
+    def build_extensions(self) -> None:
+        print("Skipping build_ext: using precompiled extensions.")
+        return
+
+
+class precompiled_wheel_utils:
     """Extracts libraries and other files from an existing wheel."""
 
-    def get_base_commit_in_main_branch(self) -> str:
+    @staticmethod
+    def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
+        import tempfile
+        import zipfile
+
+        temp_dir = None
+        try:
+            if not os.path.isfile(wheel_url_or_path):
+                wheel_filename = wheel_url_or_path.split("/")[-1]
+                temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
+                wheel_path = os.path.join(temp_dir, wheel_filename)
+                print(f"Downloading wheel from {wheel_url_or_path} "
+                      f"to {wheel_path}")
+                from urllib.request import urlretrieve
+                urlretrieve(wheel_url_or_path, filename=wheel_path)
+            else:
+                wheel_path = wheel_url_or_path
+                print(f"Using existing wheel at {wheel_path}")
+
+            package_data_patch = {}
+
+            with zipfile.ZipFile(wheel_path) as wheel:
+                files_to_copy = [
+                    "vllm/_C.abi3.so",
+                    "vllm/_moe_C.abi3.so",
+                    "vllm/_flashmla_C.abi3.so",
+                    "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
+                    "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
+                    "vllm/cumem_allocator.abi3.so",
+                ]
+
+                compiled_regex = re.compile(
+                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+                file_members = list(
+                    filter(lambda x: x.filename in files_to_copy,
+                           wheel.filelist))
+                file_members += list(
+                    filter(lambda x: compiled_regex.match(x.filename),
+                           wheel.filelist))
+
+                for file in file_members:
+                    print(f"[extract] {file.filename}")
+                    target_path = os.path.join(".", file.filename)
+                    os.makedirs(os.path.dirname(target_path), exist_ok=True)
+                    with wheel.open(file.filename) as src, open(
+                            target_path, "wb") as dst:
+                        shutil.copyfileobj(src, dst)
+
+                    pkg = os.path.dirname(file.filename).replace("/", ".")
+                    package_data_patch.setdefault(pkg, []).append(
+                        os.path.basename(file.filename))
+
+            return package_data_patch
+        finally:
+            if temp_dir is not None:
+                print(f"Removing temporary directory {temp_dir}")
+                shutil.rmtree(temp_dir)
+
+    @staticmethod
+    def get_base_commit_in_main_branch() -> str:
         # Force to use the nightly wheel. This is mainly used for CI testing.
         if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
             return "nightly"
@@ -297,6 +369,10 @@ class repackage_wheel(build_ext):
             ]).decode("utf-8")
             upstream_main_commit = json.loads(resp_json)["sha"]
 
+            # In Docker build context, .git may be immutable or missing.
+            if envs.VLLM_DOCKER_BUILD_CONTEXT:
+                return upstream_main_commit
+
             # Check if the upstream_main_commit exists in the local repo
             try:
                 subprocess.check_output(
@@ -329,86 +405,6 @@ class repackage_wheel(build_ext):
                 "wheel may not be compatible with your dev branch: %s", err)
             return "nightly"
 
-    def run(self) -> None:
-        assert _is_cuda(
-        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
-
-        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
-        if wheel_location is None:
-            base_commit = self.get_base_commit_in_main_branch()
-            wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-            # Fallback to nightly wheel if latest commit wheel is unavailable,
-            # in this rare case, the nightly release CI hasn't finished on main.
-            if not is_url_available(wheel_location):
-                wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-
-        import zipfile
-
-        if os.path.isfile(wheel_location):
-            wheel_path = wheel_location
-            print(f"Using existing wheel={wheel_path}")
-        else:
-            # Download the wheel from a given URL, assume
-            # the filename is the last part of the URL
-            wheel_filename = wheel_location.split("/")[-1]
-
-            import tempfile
-
-            # create a temporary directory to store the wheel
-            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
-            wheel_path = os.path.join(temp_dir, wheel_filename)
-            print(f"Downloading wheel from {wheel_location} to {wheel_path}")
-
-            from urllib.request import urlretrieve
-
-            try:
-                urlretrieve(wheel_location, filename=wheel_path)
-            except Exception as e:
-                from setuptools.errors import SetupError
-
-                raise SetupError(
-                    f"Failed to get vLLM wheel from {wheel_location}") from e
-
-        with zipfile.ZipFile(wheel_path) as wheel:
-            files_to_copy = [
-                "vllm/_C.abi3.so",
-                "vllm/_moe_C.abi3.so",
-                "vllm/_flashmla_C.abi3.so",
-                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
-                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
-                "vllm/cumem_allocator.abi3.so",
-                # "vllm/_version.py", # not available in nightly wheels yet
-            ]
-            file_members = list(
-                filter(lambda x: x.filename in files_to_copy, wheel.filelist))
-
-            # vllm_flash_attn python code:
-            # Regex from
-            # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
-            compiled_regex = re.compile(
-                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
-            file_members += list(
-                filter(lambda x: compiled_regex.match(x.filename),
-                       wheel.filelist))
-
-            for file in file_members:
-                print(f"Extracting and including {file.filename} "
-                      "from existing wheel")
-                package_name = os.path.dirname(file.filename).replace("/", ".")
-                file_name = os.path.basename(file.filename)
-
-                if package_name not in package_data:
-                    package_data[package_name] = []
-
-                wheel.extract(file)
-                if file_name.endswith(".py"):
-                    # python files shouldn't be added to package_data
-                    continue
-
-                package_data[package_name].append(file_name)
-
 
 def _no_device() -> bool:
     return VLLM_TARGET_DEVICE == "empty"
@@ -639,6 +635,29 @@ package_data = {
     ]
 }
 
+# If using precompiled, extract and patch package_data (in advance of setup)
+if envs.VLLM_USE_PRECOMPILED:
+    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+    wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
+    if wheel_location is not None:
+        wheel_url = wheel_location
+    else:
+        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
+        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+        from urllib.request import urlopen
+        try:
+            with urlopen(wheel_url) as resp:
+                if resp.status != 200:
+                    wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+        except Exception as e:
+            print(f"[warn] Falling back to nightly wheel: {e}")
+            wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+
+    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
+        wheel_url)
+    for pkg, files in patch.items():
+        package_data.setdefault(pkg, []).extend(files)
+
 if _no_device():
     ext_modules = []
@@ -647,7 +666,7 @@ if not ext_modules:
 else:
     cmdclass = {
         "build_ext":
-        repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+        precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
     }
 
 setup(
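
For orientation, extract_precompiled_and_patch_package() returns a mapping from dotted package names to the basenames it pulled out of the wheel, which the block above merges into package_data. A sketch of the expected shape based on the files_to_copy list (illustrative only; a real patch also carries the matched vllm_flash_attn *.py files and depends on the wheel contents):

    # Hypothetical return value for the .so entries in files_to_copy.
    patch = {
        "vllm": [
            "_C.abi3.so",
            "_moe_C.abi3.so",
            "_flashmla_C.abi3.so",
            "cumem_allocator.abi3.so",
        ],
        "vllm.vllm_flash_attn": [
            "_vllm_fa2_C.abi3.so",
            "_vllm_fa3_C.abi3.so",
        ],
    }

    # Merged the same way as in the diff above.
    package_data = {"vllm": ["py.typed"]}  # illustrative starting contents
    for pkg, files in patch.items():
        package_data.setdefault(pkg, []).extend(files)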

vllm/envs.py

@@ -70,6 +70,7 @@ if TYPE_CHECKING:
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
+    VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
     CMAKE_BUILD_TYPE: Optional[str] = None
@@ -234,8 +235,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, vllm will use precompiled binaries (*.so)
     "VLLM_USE_PRECOMPILED":
-    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
-        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
+    ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+
+    # Used to mark that setup.py is running in a Docker build context,
+    # in order to force the use of precompiled binaries.
+    "VLLM_DOCKER_BUILD_CONTEXT":
+    lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
+    ("1", "true"),
 
     # Whether to force using nightly wheel in python build.
     # This is used for testing the nightly wheel in python build.
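
The two readers above compose as follows: setting VLLM_PRECOMPILED_WHEEL_LOCATION alone is enough to enable precompiled mode, while VLLM_DOCKER_BUILD_CONTEXT is a strict "1"/"true" flag that setup.py uses only to skip the local .git checks when resolving the base commit. A small sketch of how they evaluate a Docker-style environment (the wheel path is a made-up example):

    import os

    # Example environment: a prebuilt wheel is supplied, VLLM_USE_PRECOMPILED stays unset.
    os.environ.pop("VLLM_USE_PRECOMPILED", None)
    os.environ["VLLM_PRECOMPILED_WHEEL_LOCATION"] = \
        "/tmp/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"  # illustrative path
    os.environ["VLLM_DOCKER_BUILD_CONTEXT"] = "1"

    use_precompiled = (os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower()
                       in ("1", "true")
                       or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")))
    docker_build_context = (os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "")
                            .strip().lower() in ("1", "true"))

    print(use_precompiled, docker_build_context)  # True True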