Add DeepGEMM to Dockerfile in vllm-base image (#21533)

Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>

parent c3e0e9337e
commit e360316ab9
@@ -1,4 +1,3 @@
-
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 
@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12
 # Example:
 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
 # By parameterizing the Deadsnakes repository URL, we allow third-party to use
@@ -289,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM ${FINAL_BASE_IMAGE} AS vllm-base
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
@@ -435,6 +434,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -r requirements/build.txt \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
+# Install DeepGEMM from source
+ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
+ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+    . /etc/environment
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"
+    CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
+    CUDA_MINOR="${CUDA_MINOR%%.*}"
+    if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
+        git clone --recursive --shallow-submodules \
+            ${DEEPGEMM_GIT_REPO} deepgemm
+        echo "🏗️ Building DeepGEMM"
+        pushd deepgemm
+        git checkout ${DEEPGEMM_GIT_REF}
+        # Build DeepGEMM
+        # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
+        rm -rf build dist
+        rm -rf *.egg-info
+        python3 setup.py bdist_wheel
+        uv pip install --system dist/*.whl
+        popd
+        rm -rf deepgemm
+    else
+        echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
+    fi
+BASH
+
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
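The new RUN step gates the DeepGEMM build on the CUDA toolkit version parsed from ${CUDA_VERSION}. As a minimal illustration (not part of the commit), here is the same major/minor gate expressed in Python; the function name is hypothetical:

```python
# Hypothetical Python rendering of the bash gate above, for illustration only.
def should_build_deep_gemm(cuda_version: str) -> bool:
    major, minor = (int(part) for part in cuda_version.split(".")[:2])
    # Mirrors `[ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]`: both the
    # major and the minor component must pass, exactly as the shell test does.
    return major >= 12 and minor >= 8


print(should_build_deep_gemm("12.8.1"))  # True  -> DeepGEMM wheel is built
print(should_build_deep_gemm("12.6.0"))  # False -> installation is skipped
```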
@@ -20,7 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_ep, has_deep_gemm
-from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
+from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
+                                  is_deep_gemm_supported)
 
 from .parallel_utils import ProcessGroupInfo, parallel_launch
 from .utils import make_test_weights
@@ -46,7 +47,7 @@ requires_deep_ep = pytest.mark.skipif(
 )
 
 requires_deep_gemm = pytest.mark.skipif(
-    not has_deep_gemm(),
+    not is_deep_gemm_supported(),
     reason="Requires deep_gemm kernels",
 )
 
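The test guard is tightened from has_deep_gemm() (package importable) to is_deep_gemm_supported() (package importable and a supported GPU architecture, per the helper added later in this diff). A small hypothetical test module showing the pattern the changed files follow:

```python
# Hypothetical test module sketch; the marker mirrors the one updated in this diff.
import pytest

from vllm.utils.deep_gemm import is_deep_gemm_supported

requires_deep_gemm = pytest.mark.skipif(
    not is_deep_gemm_supported(),
    reason="Requires deep_gemm kernels",
)


@requires_deep_gemm
def test_deep_gemm_path():
    # Only runs when deep_gemm is importable and the GPU is Hopper or Blackwell.
    ...
```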
@@ -15,13 +15,13 @@ import torch
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8)
-from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
+                                  per_block_cast_to_fp8)
 
 BLOCK_SIZE = [128, 128]
 
 requires_deep_gemm = pytest.mark.skipif(
-    not has_deep_gemm(),
+    not is_deep_gemm_supported(),
     reason="Requires deep_gemm kernels",
 )
 
@@ -17,6 +17,17 @@ from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
 
 
+@functools.cache
+def is_deep_gemm_supported() -> bool:
+    """Return ``True`` if DeepGEMM is supported on the current platform.
+    Currently, only Hopper and Blackwell GPUs are supported.
+    """
+    supported_arch = current_platform.is_cuda() and (
+        current_platform.is_device_capability(90)
+        or current_platform.is_device_capability(100))
+    return has_deep_gemm() and supported_arch
+
+
 @functools.cache
 def is_blackwell_deep_gemm_used() -> bool:
     """Return ``True`` if vLLM is configured to use DeepGEMM on a
@@ -142,4 +153,5 @@ __all__ = [
     "fp8_m_grouped_gemm_nt_masked",
     "per_block_cast_to_fp8",
     "is_blackwell_deep_gemm_used",
+    "is_deep_gemm_supported",
 ]
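As a quick usage sketch (not from the commit, assuming a CUDA build of vLLM), the new helper composes the existing import check with an architecture check, so callers can distinguish "DeepGEMM not installed" from "installed but unsupported GPU":

```python
# Usage sketch, for illustration only: assumes vLLM is installed with CUDA support.
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import is_deep_gemm_supported

if not has_deep_gemm():
    print("deep_gemm is not installed")
elif not is_deep_gemm_supported():
    print("deep_gemm is installed, but this GPU is not Hopper (SM90) or Blackwell (SM100)")
else:
    print("deep_gemm kernels are available on this platform")
```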