[UX] Add FlashInfer as default CUDA dependency (#26443)

Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
Michael Goin 2025-10-09 17:10:02 -04:00 committed by GitHub
parent 2e54db4d2b
commit c9d33c60dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 19 additions and 71 deletions

View File

@ -356,75 +356,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
uv pip install --system dist/*.whl --verbose \ uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # Install FlashInfer pre-compiled kernel cache and binaries
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # https://docs.flashinfer.ai/installation.html
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' RUN --mount=type=cache,target=/root/.cache/uv \
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive uv pip install --system flashinfer-cubin==0.4.0 \
# $ cd flashinfer && uv pip install --system flashinfer-jit-cache==0.4.0 \
# $ git checkout v0.2.6.post1 --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
# $ python -m flashinfer.aot && flashinfer show-config
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.4.0"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
ARG FLASHINFER_AOT_COMPILE=false
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
pushd flashinfer
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
fi
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
# Build AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
# Install with no-build-isolation since we already built AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip install --system --no-build-isolation . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
else
echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
uv pip install --system . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
fi
popd
rm -rf flashinfer
BASH
COPY examples examples COPY examples examples
COPY benchmarks benchmarks COPY benchmarks benchmarks
COPY ./vllm/collect_env.py . COPY ./vllm/collect_env.py .

View File

@ -11,3 +11,5 @@ torchaudio==2.8.0
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 # https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.4.0

View File

@ -714,8 +714,7 @@ setup(
"mistral_common[audio]", "mistral_common[audio]",
], # Required for audio processing ], # Required for audio processing
"video": [], # Kept for backwards compatibility "video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile "flashinfer": [], # Kept for backwards compatibility
"flashinfer": ["flashinfer-python==0.4.0"],
# Optional deps for AMD FP4 quantization support # Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"], "petit-kernel": ["petit-kernel"],
}, },

View File

@ -12,6 +12,7 @@ import functools
import importlib import importlib
import importlib.util import importlib.util
import os import os
import shutil
from typing import Any, Callable, NoReturn from typing import Any, Callable, NoReturn
import requests import requests
@ -37,7 +38,14 @@ def has_flashinfer() -> bool:
"""Return ``True`` if FlashInfer is available.""" """Return ``True`` if FlashInfer is available."""
# Use find_spec to check if the module exists without importing it # Use find_spec to check if the module exists without importing it
# This avoids potential CUDA initialization side effects # This avoids potential CUDA initialization side effects
return importlib.util.find_spec("flashinfer") is not None if importlib.util.find_spec("flashinfer") is None:
logger.debug_once("FlashInfer unavailable since package was not found")
return False
# Also check if nvcc is available since it's required to JIT compile flashinfer
if shutil.which("nvcc") is None:
logger.debug_once("FlashInfer unavailable since nvcc was not found")
return False
return True
def _missing(*_: Any, **__: Any) -> NoReturn: def _missing(*_: Any, **__: Any) -> NoReturn: