Add DeepGEMM to Dockerfile in vllm-base image (#21533)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
parent c3e0e9337e
commit e360316ab9
@@ -1,4 +1,3 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

# By parameterizing the Deadsnakes repository URL, we allow third-party to use

@@ -289,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \

#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION

@@ -435,6 +434,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
    . /etc/environment
    CUDA_MAJOR="${CUDA_VERSION%%.*}"
    CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
    CUDA_MINOR="${CUDA_MINOR%%.*}"
    if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
        git clone --recursive --shallow-submodules \
            ${DEEPGEMM_GIT_REPO} deepgemm
        echo "🏗️ Building DeepGEMM"
        pushd deepgemm
        git checkout ${DEEPGEMM_GIT_REF}
        # Build DeepGEMM
        # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
        rm -rf build dist
        rm -rf *.egg-info
        python3 setup.py bdist_wheel
        uv pip install --system dist/*.whl
        popd
        rm -rf deepgemm
    else
        echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
    fi
BASH

#################### vLLM installation IMAGE ####################

#################### TEST IMAGE ####################
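For context, the version gate in the new RUN step compiles DeepGEMM only when the image's CUDA toolkit is 12.8 or newer; older toolkits fall through to the echo-and-skip branch. A minimal Python sketch of the same major/minor check (the helper name is illustrative and not part of the diff):

# Sketch of the CUDA-version gate applied by the RUN step above:
# build DeepGEMM only for CUDA 12.8+, as parsed from CUDA_VERSION.
def should_build_deepgemm(cuda_version: str) -> bool:
    major, minor = (int(part) for part in cuda_version.split(".")[:2])
    # Mirrors the bash test `[ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]`
    return major >= 12 and minor >= 8

assert should_build_deepgemm("12.8.1")
assert not should_build_deepgemm("12.6.0")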
@@ -20,7 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
                                  is_deep_gemm_supported)

from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights

@@ -46,7 +47,7 @@ requires_deep_ep = pytest.mark.skipif(
)

requires_deep_gemm = pytest.mark.skipif(
    not has_deep_gemm(),
    not is_deep_gemm_supported(),
    reason="Requires deep_gemm kernels",
)

@@ -15,13 +15,13 @@ import torch
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8)
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
                                  per_block_cast_to_fp8)

BLOCK_SIZE = [128, 128]

requires_deep_gemm = pytest.mark.skipif(
    not has_deep_gemm(),
    not is_deep_gemm_supported(),
    reason="Requires deep_gemm kernels",
)
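The two test hunks above swap the plain has_deep_gemm() check for is_deep_gemm_supported(), so the kernels are also skipped when DeepGEMM is installed but the GPU architecture is unsupported. A hedged sketch of how such a marker is applied (the test name and body are made-up placeholders, not tests from the diff):

import pytest
from vllm.utils.deep_gemm import is_deep_gemm_supported

requires_deep_gemm = pytest.mark.skipif(
    not is_deep_gemm_supported(),
    reason="Requires deep_gemm kernels",
)

@requires_deep_gemm
def test_deep_gemm_smoke():
    # Placeholder body; the real tests exercise the fused-MoE / FP8 kernels.
    assert is_deep_gemm_supported()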
@@ -17,6 +17,17 @@ from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm


@functools.cache
def is_deep_gemm_supported() -> bool:
    """Return ``True`` if DeepGEMM is supported on the current platform.

    Currently, only Hopper and Blackwell GPUs are supported.
    """
    supported_arch = current_platform.is_cuda() and (
        current_platform.is_device_capability(90)
        or current_platform.is_device_capability(100))
    return has_deep_gemm() and supported_arch


@functools.cache
def is_blackwell_deep_gemm_used() -> bool:
    """Return ``True`` if vLLM is configured to use DeepGEMM on a

@@ -142,4 +153,5 @@ __all__ = [
    "fp8_m_grouped_gemm_nt_masked",
    "per_block_cast_to_fp8",
    "is_blackwell_deep_gemm_used",
    "is_deep_gemm_supported",
]
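Both helpers are exported from vllm.utils.deep_gemm, so callers can gate a DeepGEMM code path on them. A hedged usage sketch, assuming a caller-side dispatch function (the function and backend names are illustrative, not vLLM APIs):

from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
                                  is_deep_gemm_supported)

def pick_fp8_gemm_backend() -> str:
    # Fall back when DeepGEMM is not installed or the GPU is not Hopper/Blackwell.
    if not is_deep_gemm_supported():
        return "fallback"
    # is_blackwell_deep_gemm_used() additionally reflects whether vLLM is
    # configured to run DeepGEMM on a Blackwell (SM100) device.
    if is_blackwell_deep_gemm_used():
        return "deep_gemm_blackwell"
    return "deep_gemm_hopper"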