vllm/tools/flashinfer-build.sh
mgoin b15005dc12 Simplify!
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-07-30 12:41:14 -04:00

53 lines
1.8 KiB
Bash
Executable File

#!/usr/bin/env bash
set -ex
# Build FlashInfer with AOT kernels
# This script is used by both the Dockerfile and standalone wheel building
FLASHINFER_GIT_REPO="${FLASHINFER_GIT_REPO:-https://github.com/flashinfer-ai/flashinfer.git}"
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF:-v0.2.9rc2}"
CUDA_VERSION="${CUDA_VERSION:-12.8.1}"
BUILD_WHEEL="${BUILD_WHEEL:-false}"
echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"
# Clone FlashInfer
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
# Set CUDA arch list based on CUDA version
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
# Build AOT kernels and install/build wheel
pushd flashinfer
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
if [[ "${BUILD_WHEEL}" == "true" ]]; then
# Build wheel for distribution
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip wheel --no-deps --wheel-dir /wheels .
mkdir -p /output && cp /wheels/*.whl /output/
echo "✅ FlashInfer wheel built successfully"
else
# Install directly (for Dockerfile)
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip install --system --no-build-isolation --force-reinstall --no-deps .
echo "✅ FlashInfer installed successfully"
fi
popd
# Cleanup
rm -rf flashinfer