Mirror of https://git.datalinker.icu/vllm-project/vllm.git, last synced 2025-12-24 07:25:02 +08:00.
Add flashinfer-build.sh and register precompiled cu128 wheel in Dockerfile (#25782)
Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
dbb7782d5b
commit
55971f85c9
@ -391,18 +391,28 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
|||||||
git clone --depth 1 --recursive --shallow-submodules \
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
--branch ${FLASHINFER_GIT_REF} \
|
||||||
${FLASHINFER_GIT_REPO} flashinfer
|
${FLASHINFER_GIT_REPO} flashinfer
|
||||||
|
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||||
|
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||||
|
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||||
|
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||||
|
else
|
||||||
|
# CUDA 12.8+ supports 10.0a and 12.0
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||||
|
fi
|
||||||
pushd flashinfer
|
pushd flashinfer
|
||||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
|
||||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
|
||||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
|
||||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
# Download pre-compiled cubins
|
||||||
else
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
# CUDA 12.8+ supports 10.0a and 12.0
|
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
|
||||||
fi
|
fi
|
||||||
|
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||||
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
||||||
|
|||||||
63
tools/flashinfer-build.sh
Normal file
63
tools/flashinfer-build.sh
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env bash
# Build FlashInfer with AOT (ahead-of-time) compiled kernels, either as a
# distributable wheel or installed directly into the current environment.
#
# Required environment variables:
#   FLASHINFER_GIT_REF - git tag/branch of FlashInfer to build
#   CUDA_VERSION       - CUDA toolkit version (e.g. "12.8.1"); selects arch list
# Optional:
#   BUILD_WHEEL        - "true" (default) builds a wheel into flashinfer-dist/;
#                        anything else installs directly (for Dockerfile use)

set -ex

# FlashInfer configuration
FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF}"
CUDA_VERSION="${CUDA_VERSION}"
BUILD_WHEEL="${BUILD_WHEEL:-true}"

if [[ -z "${FLASHINFER_GIT_REF}" ]]; then
    echo "❌ FLASHINFER_GIT_REF must be specified" >&2
    exit 1
fi

if [[ -z "${CUDA_VERSION}" ]]; then
    echo "❌ CUDA_VERSION must be specified" >&2
    exit 1
fi

echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"

# Clone FlashInfer (shallow clone; submodules needed for vendored kernels)
git clone --depth 1 --recursive --shallow-submodules \
    --branch "${FLASHINFER_GIT_REF}" \
    "${FLASHINFER_GIT_REPO}" flashinfer

# Set CUDA arch list based on CUDA version
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
if [[ "${CUDA_VERSION}" == 11.* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
    # CUDA 12.8+ supports 10.0a and 12.0
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi

echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"

pushd flashinfer
    # Make sure the wheel is built for the correct CUDA version:
    # "12.8.1" -> "cu128" (major.minor with the dot stripped)
    export UV_TORCH_BACKEND="cu$(echo "${CUDA_VERSION}" | cut -d. -f1,2 | tr -d '.')"

    # Build AOT kernels
    export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
    export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
    python3 -m flashinfer.aot

    if [[ "${BUILD_WHEEL}" == "true" ]]; then
        # Build wheel for distribution
        uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist .
        echo "✅ FlashInfer wheel built successfully in flashinfer-dist/"
    else
        # Install directly (for Dockerfile)
        uv pip install --system --no-build-isolation --force-reinstall .
        echo "✅ FlashInfer installed successfully"
    fi
popd

# Cleanup the build tree (`--` guards against option-like names)
rm -rf -- flashinfer
|
||||||
Loading…
x
Reference in New Issue
Block a user