diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 6314afd652340..47866b209e69a 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -41,6 +41,40 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
+  - block: "Build FlashInfer wheels"
+    key: block-build-flashinfer-wheels
+    depends_on: ~
+
+  - label: "Build FlashInfer wheels - CUDA 12.8"
+    depends_on: block-build-flashinfer-wheels
+    key: build-flashinfer-wheel-cuda-12-8
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/build-flashinfer-wheel.sh 12.8.1"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build FlashInfer wheels - CUDA 12.6"
+    depends_on: block-build-flashinfer-wheels
+    key: build-flashinfer-wheel-cuda-12-6
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/build-flashinfer-wheel.sh 12.6.3"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build FlashInfer wheels - CUDA 11.8"
+    depends_on: block-build-flashinfer-wheels
+    key: build-flashinfer-wheel-cuda-11-8
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/build-flashinfer-wheel.sh 11.8.0"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - block: "Build release image"
     depends_on: ~
     key: block-release-image-build
diff --git a/.buildkite/scripts/build-flashinfer-wheel.sh b/.buildkite/scripts/build-flashinfer-wheel.sh
new file mode 100755
index 0000000000000..c941bff0fc3e4
--- /dev/null
+++ b/.buildkite/scripts/build-flashinfer-wheel.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+# Build a FlashInfer wheel for the given CUDA version via the vLLM Dockerfile's
+# flashinfer-wheel-builder stage, then extract it and upload it to S3.
+set -euxo pipefail
+
+CUDA_VERSION="${1:-12.8.1}"
+FLASHINFER_VERSION="${FLASHINFER_VERSION:-v0.2.9rc2}"
+
+echo "Building FlashInfer wheel for CUDA ${CUDA_VERSION} using vLLM Dockerfile"
+
+# Build the FlashInfer wheel using the existing Dockerfile stage
+DOCKER_BUILDKIT=1 docker build \
+    --build-arg max_jobs=16 \
+    --build-arg USE_SCCACHE=1 \
+    --build-arg CUDA_VERSION="${CUDA_VERSION}" \
+    --build-arg FLASHINFER_GIT_REF="${FLASHINFER_VERSION}" \
+    --tag "flashinfer-wheel-builder:${CUDA_VERSION}" \
+    --target flashinfer-wheel-builder \
+    --progress plain \
+    -f docker/Dockerfile .
+
+# Extract the wheel; quote $(pwd) so workdirs with spaces survive
+mkdir -p artifacts/dist
+docker run --rm -v "$(pwd)/artifacts:/output_host" "flashinfer-wheel-builder:${CUDA_VERSION}" \
+    bash -c 'cp /output/*.whl /output_host/dist/ && chmod -R a+rw /output_host'
+
+# Upload the wheel
+bash .buildkite/scripts/upload-flashinfer-wheels.sh
+
+echo "FlashInfer wheel built and uploaded successfully for CUDA ${CUDA_VERSION}"
+ls -la artifacts/dist/
diff --git a/.buildkite/scripts/upload-flashinfer-wheels.sh b/.buildkite/scripts/upload-flashinfer-wheels.sh
new file mode 100755
index 0000000000000..495c20fec2e6e
--- /dev/null
+++ b/.buildkite/scripts/upload-flashinfer-wheels.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+# Upload the FlashInfer wheel from artifacts/dist/ to S3 and regenerate the
+# PEP 503-style index.html listing every wheel under s3://vllm-wheels/flashinfer/.
+set -euxo pipefail
+
+# Assume wheels are in artifacts/dist/*.whl; nullglob makes the array empty
+# (instead of containing the literal pattern) when no wheel was produced.
+shopt -s nullglob
+wheel_files=(artifacts/dist/*.whl)
+shopt -u nullglob
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+  exit 1
+fi
+
+# Get the single wheel file
+wheel="${wheel_files[0]}"
+
+echo "Processing FlashInfer wheel: $wheel"
+
+# Rename 'linux' to 'manylinux1' in the wheel filename for compatibility
+new_wheel="${wheel/linux/manylinux1}"
+if [[ "$wheel" != "$new_wheel" ]]; then
+  mv -- "$wheel" "$new_wheel"
+  wheel="$new_wheel"
+  echo "Renamed wheel to: $wheel"
+fi
+
+# Extract the version from the wheel metadata (first match only)
+version=$(unzip -p "$wheel" '**/METADATA' | grep -m1 '^Version: ' | cut -d' ' -f2)
+echo "FlashInfer version: $version"
+
+# Upload the wheel to S3 under the flashinfer directory
+aws s3 cp "$wheel" "s3://vllm-wheels/flashinfer/"
+
+# Regenerate the index from *all* wheels currently in the bucket so that
+# uploads for the other CUDA versions are not dropped from the index.
+all_wheels=$(aws s3 ls "s3://vllm-wheels/flashinfer/" | awk '$NF ~ /\.whl$/ {print $NF}' | sort -u)
+{
+  printf '<!DOCTYPE html>\n<html>\n<head>\n<title>Links for flashinfer-python</title>\n</head>\n<body>\n'
+  printf '<h1>Links for flashinfer-python</h1>\n'
+  for name in $all_wheels; do
+    printf '<a href="%s">%s</a><br>\n' "$name" "$name"
+  done
+  printf '</body>\n</html>\n'
+} > flashinfer_index.html
+
+aws s3 cp flashinfer_index.html "s3://vllm-wheels/flashinfer/index.html"
+
+# Clean up
+rm -f flashinfer_index.html
+
+wheel_name=$(basename "$wheel")
+echo "Successfully uploaded FlashInfer wheel $wheel_name (version $version)"
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 75b5ab0230c87..9d8c024126fa9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -268,6 +268,20 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
     else \
         echo "Skipping wheel size check."; \
     fi
+#################### FLASHINFER WHEEL BUILD IMAGE ####################
+# Standalone stage that produces a distributable FlashInfer wheel instead of
+# installing it in place (BUILD_WHEEL=true). Consumed by
+# .buildkite/release-pipeline.yaml, which copies the wheel out of /output.
+FROM base AS flashinfer-wheel-builder
+ARG CUDA_VERSION
+ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
+ARG FLASHINFER_GIT_REF="v0.2.9rc2"
+
+COPY tools/build-flashinfer.sh /tmp/build-flashinfer.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    . /etc/environment && \
+    BUILD_WHEEL=true /tmp/build-flashinfer.sh
+
 #################### EXTENSION Build IMAGE ####################
 #################### DEV IMAGE ####################
@@ -395,31 +406,10 @@ ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
 # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
 ARG FLASHINFER_GIT_REF="v0.2.9rc2"
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-    . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-    # Needed to build AOT kernels
-    pushd flashinfer
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation --force-reinstall --no-deps .
-        popd
-    rm -rf flashinfer
-BASH
+COPY tools/build-flashinfer.sh /tmp/build-flashinfer.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    . /etc/environment && \
+    /tmp/build-flashinfer.sh && rm /tmp/build-flashinfer.sh
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
diff --git a/tools/build-flashinfer.sh b/tools/build-flashinfer.sh
new file mode 100755
index 0000000000000..4182bc22bf067
--- /dev/null
+++ b/tools/build-flashinfer.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+# Build FlashInfer with AOT kernels.
+# Used by docker/Dockerfile (BUILD_WHEEL=false: install into the image) and by
+# the release pipeline (BUILD_WHEEL=true: produce a wheel in /output).
+set -euxo pipefail
+
+FLASHINFER_GIT_REPO="${FLASHINFER_GIT_REPO:-https://github.com/flashinfer-ai/flashinfer.git}"
+FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF:-v0.2.9rc2}"
+CUDA_VERSION="${CUDA_VERSION:-12.8.1}"
+BUILD_WHEEL="${BUILD_WHEEL:-false}"
+
+echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"
+
+# Clone FlashInfer
+git clone --depth 1 --recursive --shallow-submodules \
+    --branch "${FLASHINFER_GIT_REF}" \
+    "${FLASHINFER_GIT_REPO}" flashinfer
+
+# Set CUDA arch list based on CUDA version
+# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+if [[ "${CUDA_VERSION}" == 11.* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+else
+    # CUDA 12.8+ supports 10.0a and 12.0
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+fi
+
+echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+
+# Build AOT kernels and install/build wheel
+pushd flashinfer
+    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+        python3 -m flashinfer.aot
+
+    if [[ "${BUILD_WHEEL}" == "true" ]]; then
+        # Build a wheel for distribution. NOTE: `uv pip` has no `wheel`
+        # subcommand, so use `uv build`. --no-build-isolation reuses the
+        # already-installed torch and the AOT artifacts produced above.
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            uv build --wheel --no-build-isolation --out-dir /wheels .
+        mkdir -p /output && cp /wheels/*.whl /output/
+        echo "✅ FlashInfer wheel built successfully"
+    else
+        # Install directly into the current image (Dockerfile path)
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            uv pip install --system --no-build-isolation --force-reinstall --no-deps .
+        echo "✅ FlashInfer installed successfully"
+    fi
+popd
+
+# Cleanup
+rm -rf flashinfer