From 4ac510f4844ae2ab168c2dbac545e3dd28a0a1b9 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 17 Sep 2025 23:19:52 -0400 Subject: [PATCH] [Kernels] Enable DeepGEMM by default (#24462) Signed-off-by: Bill Nell --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index eeed7771f0453..72e1d5b0ede81 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -135,7 +135,7 @@ if TYPE_CHECKING: VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_TPU_USING_PATHWAYS: bool = False - VLLM_USE_DEEP_GEMM: bool = False + VLLM_USE_DEEP_GEMM: bool = True VLLM_USE_DEEP_GEMM_E8M0: bool = True VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False @@ -1044,7 +1044,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Allow use of DeepGemm kernels for fused moe ops. "VLLM_USE_DEEP_GEMM": - lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "1"))), # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. "VLLM_USE_DEEP_GEMM_E8M0":