Revert precompile wheel changes (#22055)

Simon Mo 2025-08-01 01:26:24 -07:00 committed by GitHub
parent 98df153abf
commit da31f6ad3d
4 changed files with 107 additions and 137 deletions

Dockerfile

@@ -206,7 +206,16 @@ ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 # Flag to control whether to use pre-built vLLM wheels
-ARG VLLM_USE_PRECOMPILED=""
+ARG VLLM_USE_PRECOMPILED
+# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
+ENV VLLM_USE_PRECOMPILED=""
+RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
+        export VLLM_USE_PRECOMPILED=1 && \
+        echo "Using precompiled wheels"; \
+    else \
+        unset VLLM_USE_PRECOMPILED && \
+        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
+    fi
 
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
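The restored TODO above is a real pitfall: after this revert, `vllm/envs.py` (last hunk of this diff) goes back to parsing `VLLM_USE_PRECOMPILED` with a bare `bool(...)`, so any non-empty value, including `0`, enables precompiled mode. The restored Dockerfile guard works around it by only exporting the variable when the ARG is exactly `1`. A minimal sketch contrasting the two parses, both taken verbatim from the envs.py hunk below:

```python
import os

# Simulate a user passing --build-arg VLLM_USE_PRECOMPILED=0.
os.environ["VLLM_USE_PRECOMPILED"] = "0"

# Post-revert parse: any non-empty string is truthy, so "0" still enables it.
naive = bool(os.environ.get("VLLM_USE_PRECOMPILED"))

# Pre-revert (removed) parse: only "1" or "true" enable it.
strict = (os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower()
          in ("1", "true"))

print(naive, strict)  # True False
```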
@@ -223,8 +232,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
         && export SCCACHE_IDLE_TIMEOUT=0 \
         && export CMAKE_BUILD_TYPE=Release \
-        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
-        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
         && sccache --show-stats \
         && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
         && sccache --show-stats; \
@@ -238,22 +245,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
         mkdir -p .deps && \
-        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
-        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
-# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
-RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
-        echo "Cleaning up extra wheels in dist/..." && \
-        # Identify the most recent manylinux1_x86_64 wheel
-        KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
-        if [ -n "$KEEP_WHEEL" ]; then \
-            echo "Keeping wheel: $KEEP_WHEEL"; \
-            find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
-        fi; \
-    fi
-
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
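For reference, a hypothetical Python rendering of the removed cleanup step above (the Dockerfile used `ls -t` and `find`; this sketch mirrors the same mtime-based selection and is illustrative only):

```python
from pathlib import Path

# Keep only the newest manylinux1_x86_64 wheel in dist/ and delete the rest,
# mirroring KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl | head -n1).
wheels = sorted(Path("dist").glob("*manylinux1_x86_64.whl"),
                key=lambda p: p.stat().st_mtime, reverse=True)
if wheels:
    keep = wheels[0]
    print(f"Keeping wheel: {keep}")
    for whl in Path("dist").glob("*.whl"):
        if whl != keep:
            whl.unlink()
```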
@@ -369,7 +363,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     fi
 
 # Install vllm wheel first, so that torch etc will be installed.
-# !bang
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system dist/*.whl --verbose \

requirements/test.txt

@@ -22,7 +22,9 @@ aiohttp==3.10.11
 aiohttp-cors==0.8.1
     # via ray
 aiosignal==1.3.1
-    # via aiohttp
+    # via
+    #   aiohttp
+    #   ray
 albucore==0.0.16
     # via terratorch
 albumentations==1.4.6
@@ -137,7 +139,7 @@ contourpy==1.3.0
     # via matplotlib
 cramjam==2.9.0
     # via fastparquet
-cupy-cuda12x==13.5.1
+cupy-cuda12x==13.3.0
 cycler==0.12.1
     # via matplotlib
@@ -224,6 +226,7 @@ frozenlist==1.5.0
     # via
     #   aiohttp
     #   aiosignal
+    #   ray
 fsspec==2024.9.0
     # via
     #   datasets
@@ -600,18 +603,10 @@ opencv-python-headless==4.11.0.86
 opentelemetry-api==1.35.0
     # via
     #   mlflow-skinny
-    #   opentelemetry-exporter-prometheus
     #   opentelemetry-sdk
     #   opentelemetry-semantic-conventions
-opentelemetry-exporter-prometheus==0.56b0
-    # via ray
-opentelemetry-proto==1.36.0
-    # via ray
 opentelemetry-sdk==1.35.0
-    # via
-    #   mlflow-skinny
-    #   opentelemetry-exporter-prometheus
-    #   ray
+    # via mlflow-skinny
 opentelemetry-semantic-conventions==0.56b0
     # via opentelemetry-sdk
 packaging==24.2
@@ -702,9 +697,7 @@ pqdm==0.2.0
 pretrainedmodels==0.7.4
     # via segmentation-models-pytorch
 prometheus-client==0.22.0
-    # via
-    #   opentelemetry-exporter-prometheus
-    #   ray
+    # via ray
 propcache==0.2.0
     # via yarl
 proto-plus==1.26.1
@@ -714,7 +707,6 @@ protobuf==5.28.3
     #   google-api-core
     #   googleapis-common-protos
     #   mlflow-skinny
-    #   opentelemetry-proto
     #   proto-plus
     #   ray
     #   tensorboardx
@@ -862,7 +854,7 @@ rasterio==1.4.3
     #   rioxarray
     #   terratorch
     #   torchgeo
-ray==2.48.0
+ray==2.43.0
     # via -r requirements/test.in
 redis==5.2.0
     # via tensorizer

setup.py

@@ -7,7 +7,6 @@ import json
 import logging
 import os
 import re
-import shutil
 import subprocess
 import sys
 from pathlib import Path
@@ -282,69 +281,10 @@ class cmake_build_ext(build_ext):
             self.copy_file(file, dst_file)
 
 
-class precompiled_wheel_utils:
+class repackage_wheel(build_ext):
     """Extracts libraries and other files from an existing wheel."""
 
-    @staticmethod
-    def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
-        import tempfile
-        import zipfile
-
-        temp_dir = None
-        try:
-            if not os.path.isfile(wheel_url_or_path):
-                wheel_filename = wheel_url_or_path.split("/")[-1]
-                temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
-                wheel_path = os.path.join(temp_dir, wheel_filename)
-                print(f"Downloading wheel from {wheel_url_or_path} "
-                      f"to {wheel_path}")
-                from urllib.request import urlretrieve
-                urlretrieve(wheel_url_or_path, filename=wheel_path)
-            else:
-                wheel_path = wheel_url_or_path
-                print(f"Using existing wheel at {wheel_path}")
-
-            package_data_patch = {}
-
-            with zipfile.ZipFile(wheel_path) as wheel:
-                files_to_copy = [
-                    "vllm/_C.abi3.so",
-                    "vllm/_moe_C.abi3.so",
-                    "vllm/_flashmla_C.abi3.so",
-                    "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
-                    "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
-                    "vllm/cumem_allocator.abi3.so",
-                ]
-
-                compiled_regex = re.compile(
-                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
-
-                file_members = list(
-                    filter(lambda x: x.filename in files_to_copy,
-                           wheel.filelist))
-                file_members += list(
-                    filter(lambda x: compiled_regex.match(x.filename),
-                           wheel.filelist))
-
-                for file in file_members:
-                    print(f"[extract] {file.filename}")
-                    target_path = os.path.join(".", file.filename)
-                    os.makedirs(os.path.dirname(target_path), exist_ok=True)
-                    with wheel.open(file.filename) as src, open(
-                            target_path, "wb") as dst:
-                        shutil.copyfileobj(src, dst)
-
-                    pkg = os.path.dirname(file.filename).replace("/", ".")
-                    package_data_patch.setdefault(pkg, []).append(
-                        os.path.basename(file.filename))
-            return package_data_patch
-        finally:
-            if temp_dir is not None:
-                print(f"Removing temporary directory {temp_dir}")
-                shutil.rmtree(temp_dir)
-
-    @staticmethod
-    def get_base_commit_in_main_branch() -> str:
+    def get_base_commit_in_main_branch(self) -> str:
         # Force to use the nightly wheel. This is mainly used for CI testing.
         if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
             return "nightly"
@@ -357,10 +297,6 @@
         ]).decode("utf-8")
         upstream_main_commit = json.loads(resp_json)["sha"]
 
-        # In Docker build context, .git may be immutable or missing.
-        if envs.VLLM_DOCKER_BUILD_CONTEXT:
-            return upstream_main_commit
-
         # Check if the upstream_main_commit exists in the local repo
         try:
             subprocess.check_output(
@@ -393,15 +329,92 @@
                 "wheel may not be compatible with your dev branch: %s", err)
             return "nightly"
+
+    def run(self) -> None:
+        assert _is_cuda(
+        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+
+        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
+        if wheel_location is None:
+            base_commit = self.get_base_commit_in_main_branch()
+            wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+            # Fallback to nightly wheel if latest commit wheel is unavailable,
+            # in this rare case, the nightly release CI hasn't finished on main.
+            if not is_url_available(wheel_location):
+                wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+
+        import zipfile
+
+        if os.path.isfile(wheel_location):
+            wheel_path = wheel_location
+            print(f"Using existing wheel={wheel_path}")
+        else:
+            # Download the wheel from a given URL, assume
+            # the filename is the last part of the URL
+            wheel_filename = wheel_location.split("/")[-1]
+
+            import tempfile
+
+            # create a temporary directory to store the wheel
+            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
+            wheel_path = os.path.join(temp_dir, wheel_filename)
+            print(f"Downloading wheel from {wheel_location} to {wheel_path}")
+
+            from urllib.request import urlretrieve
+
+            try:
+                urlretrieve(wheel_location, filename=wheel_path)
+            except Exception as e:
+                from setuptools.errors import SetupError
+
+                raise SetupError(
+                    f"Failed to get vLLM wheel from {wheel_location}") from e
+
+        with zipfile.ZipFile(wheel_path) as wheel:
+            files_to_copy = [
+                "vllm/_C.abi3.so",
+                "vllm/_moe_C.abi3.so",
+                "vllm/_flashmla_C.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
+                "vllm/cumem_allocator.abi3.so",
+                # "vllm/_version.py", # not available in nightly wheels yet
+            ]
+            file_members = list(
+                filter(lambda x: x.filename in files_to_copy, wheel.filelist))
+
+            # vllm_flash_attn python code:
+            # Regex from
+            # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
+            compiled_regex = re.compile(
+                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+            file_members += list(
+                filter(lambda x: compiled_regex.match(x.filename),
+                       wheel.filelist))
+
+            for file in file_members:
+                print(f"Extracting and including {file.filename} "
+                      "from existing wheel")
+                package_name = os.path.dirname(file.filename).replace("/", ".")
+                file_name = os.path.basename(file.filename)
+
+                if package_name not in package_data:
+                    package_data[package_name] = []
+                wheel.extract(file)
+                if file_name.endswith(".py"):
+                    # python files shouldn't be added to package_data
+                    continue
+
+                package_data[package_name].append(file_name)
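A quick sanity check of the glob-derived regex used above (the file paths here are illustrative, not taken from an actual wheel listing):

```python
import re

# Matches vllm/vllm_flash_attn/**/*.py while excluding dot-prefixed entries,
# per glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True).
compiled_regex = re.compile(
    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")

assert compiled_regex.match("vllm/vllm_flash_attn/flash_attn_interface.py")
assert compiled_regex.match("vllm/vllm_flash_attn/layers/rotary.py")
assert not compiled_regex.match("vllm/vllm_flash_attn/.hidden.py")
assert not compiled_regex.match("vllm/_C.abi3.so")
```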
 
 def _no_device() -> bool:
     return VLLM_TARGET_DEVICE == "empty"
 
 
 def _is_cuda() -> bool:
-    # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
-    if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
-        return True
     has_cuda = torch.version.cuda is not None
     return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
             and not (_is_neuron() or _is_tpu()))
@@ -626,37 +639,16 @@ package_data = {
     ]
 }
 
-# If using precompiled, extract and patch package_data (in advance of setup)
-if envs.VLLM_USE_PRECOMPILED:
-    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
-    wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
-    if wheel_location is not None:
-        wheel_url = wheel_location
-    else:
-        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
-        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-        from urllib.request import urlopen
-        try:
-            with urlopen(wheel_url) as resp:
-                if resp.status != 200:
-                    wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-        except Exception as e:
-            print(f"[warn] Falling back to nightly wheel: {e}")
-            wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url)
-    for pkg, files in patch.items():
-        package_data.setdefault(pkg, []).extend(files)
-
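Both the removed block above and the restored `repackage_wheel.run` probe the commit-specific wheel URL before falling back to the nightly wheel; the restored code delegates to an `is_url_available` helper. A minimal sketch of such a probe (the actual helper in setup.py may differ):

```python
from urllib.request import Request, urlopen

def is_url_available(url: str) -> bool:
    # HEAD request: we only need to know the wheel exists, not its contents.
    try:
        with urlopen(Request(url, method="HEAD"), timeout=10) as resp:
            return resp.status == 200
    except Exception:
        return False
```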
 if _no_device():
     ext_modules = []
 
-if not ext_modules or envs.VLLM_USE_PRECOMPILED:
-    # Disable build_ext when using precompiled wheel
+if not ext_modules:
     cmdclass = {}
 else:
-    cmdclass = {"build_ext": cmake_build_ext}
+    cmdclass = {
+        "build_ext":
+        repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+    }
 
 setup(
     # static metadata should rather go in pyproject.toml
vllm/envs.py

@@ -68,7 +68,6 @@ if TYPE_CHECKING:
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
-    VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_NO_DEPRECATION_WARNING: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -228,14 +227,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, vllm will use precompiled binaries (*.so)
     "VLLM_USE_PRECOMPILED":
-    lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
-    ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
-
-    # Used to mark that setup.py is running in a Docker build context,
-    # in order to force the use of precompiled binaries.
-    "VLLM_DOCKER_BUILD_CONTEXT":
-    lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
-    ("1", "true"),
+    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
+        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
 
     # Whether to force using nightly wheel in python build.
     # This is used for testing the nightly wheel in python build.
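For context on how these entries are read: `vllm/envs.py` exposes each key as a module attribute through a module-level `__getattr__`, calling the stored lambda on access so the environment is re-read every time. A simplified sketch of that pattern (not the full module):

```python
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_PRECOMPILED":
    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
}

def __getattr__(name: str) -> Any:
    # envs.VLLM_USE_PRECOMPILED evaluates the lambda lazily on each access.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```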