From e360316ab9902ecfc564710ae4b1539db867efd9 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Thu, 31 Jul 2025 21:01:55 -0400
Subject: [PATCH] Add DeepGEMM to Dockerfile in vllm-base image (#21533)

Signed-off-by: Matthew Bonanni
Signed-off-by: mgoin
Co-authored-by: mgoin
---
 docker/Dockerfile                             | 30 +++++++++++++++++--
 tests/kernels/moe/test_deepep_deepgemm_moe.py |  5 ++--
 tests/kernels/moe/test_deepgemm.py            |  6 ++--
 vllm/utils/deep_gemm.py                       | 12 ++++++++
 4 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 69aeee67a430..413151b3edb0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,4 +1,3 @@
-
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 
@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12
 # Example:
 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
 # By parameterizing the Deadsnakes repository URL, we allow third-party to use
@@ -289,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM ${FINAL_BASE_IMAGE} AS vllm-base
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
@@ -435,6 +434,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -r requirements/build.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
+# Install DeepGEMM from source
+ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
+ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+    . /etc/environment
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"
+    CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
+    CUDA_MINOR="${CUDA_MINOR%%.*}"
+    if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
+        git clone --recursive --shallow-submodules \
+            ${DEEPGEMM_GIT_REPO} deepgemm
+        echo "🏗️ Building DeepGEMM"
+        pushd deepgemm
+        git checkout ${DEEPGEMM_GIT_REF}
+        # Build DeepGEMM
+        # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
+        rm -rf build dist
+        rm -rf *.egg-info
+        python3 setup.py bdist_wheel
+        uv pip install --system dist/*.whl
+        popd
+        rm -rf deepgemm
+    else
+        echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
+    fi
+BASH
+
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index 074771e49a06..266f1161a684 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -20,7 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_ep, has_deep_gemm
-from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
+from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
+                                  is_deep_gemm_supported)
 
 from .parallel_utils import ProcessGroupInfo, parallel_launch
 from .utils import make_test_weights
@@ -46,7 +47,7 @@ requires_deep_ep = pytest.mark.skipif(
 )
 
 requires_deep_gemm = pytest.mark.skipif(
-    not has_deep_gemm(),
+    not is_deep_gemm_supported(),
     reason="Requires deep_gemm kernels",
 )
 
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index f7578e226917..759d2814eefb 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -15,13 +15,13 @@ import torch
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8)
-from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
+                                  per_block_cast_to_fp8)
 
 BLOCK_SIZE = [128, 128]
 
 requires_deep_gemm = pytest.mark.skipif(
-    not has_deep_gemm(),
+    not is_deep_gemm_supported(),
     reason="Requires deep_gemm kernels",
 )
 
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 169b083017e4..a49a59bd8125 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -17,6 +17,17 @@ from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
 
 
+@functools.cache
+def is_deep_gemm_supported() -> bool:
+    """Return ``True`` if DeepGEMM is supported on the current platform.
+    Currently, only Hopper and Blackwell GPUs are supported.
+    """
+    supported_arch = current_platform.is_cuda() and (
+        current_platform.is_device_capability(90)
+        or current_platform.is_device_capability(100))
+    return has_deep_gemm() and supported_arch
+
+
 @functools.cache
 def is_blackwell_deep_gemm_used() -> bool:
     """Return ``True`` if vLLM is configured to use DeepGEMM on a
@@ -142,4 +153,5 @@ __all__ = [
     "fp8_m_grouped_gemm_nt_masked",
     "per_block_cast_to_fp8",
     "is_blackwell_deep_gemm_used",
+    "is_deep_gemm_supported",
 ]
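
Usage note (illustrative sketch, not part of the patch): the new is_deep_gemm_supported() helper gates on both DeepGEMM being importable and the GPU being Hopper (SM90) or Blackwell (SM100). A downstream test module would consume it the same way the markers above do; the test function below is a placeholder.

    # Sketch only: mirrors the skipif pattern introduced in the patch.
    import pytest

    from vllm.utils.deep_gemm import is_deep_gemm_supported

    # Skip when DeepGEMM is not installed or the GPU is not SM90/SM100.
    requires_deep_gemm = pytest.mark.skipif(
        not is_deep_gemm_supported(),
        reason="Requires deep_gemm kernels",
    )


    @requires_deep_gemm
    def test_deep_gemm_path():
        ...  # exercise a DeepGEMM-backed kernel here (placeholder)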