[Feature] Enable E8M0 by Default on Hopper for DeepGEMM, 5% E2E throughput improvement (#26197)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Wentao Ye 2025-10-08 03:33:56 -04:00 committed by GitHub
parent 335b28f7d1
commit f8607863d8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 10 additions and 29 deletions

View File

@ -142,7 +142,6 @@ if TYPE_CHECKING:
VLLM_TPU_USING_PATHWAYS: bool = False VLLM_TPU_USING_PATHWAYS: bool = False
VLLM_USE_DEEP_GEMM: bool = True VLLM_USE_DEEP_GEMM: bool = True
VLLM_USE_DEEP_GEMM_E8M0: bool = True VLLM_USE_DEEP_GEMM_E8M0: bool = True
VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
VLLM_USE_FLASHINFER_MOE_FP16: bool = False VLLM_USE_FLASHINFER_MOE_FP16: bool = False
@ -1055,11 +1054,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_DEEP_GEMM_E8M0": lambda: bool( "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1")) int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
), ),
# TODO(wentao): unify the two E8M0 flags after verifying the correctness.
# Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER": lambda: bool(
int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))
),
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
# JIT all the required kernels before model execution so there is no # JIT all the required kernels before model execution so there is no
# JIT'ing in the hot-path. However, this warmup increases the engine # JIT'ing in the hot-path. However, this warmup increases the engine
@ -1433,7 +1427,6 @@ def compute_hash() -> str:
"VLLM_DISABLED_KERNELS", "VLLM_DISABLED_KERNELS",
"VLLM_USE_DEEP_GEMM", "VLLM_USE_DEEP_GEMM",
"VLLM_USE_DEEP_GEMM_E8M0", "VLLM_USE_DEEP_GEMM_E8M0",
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
"VLLM_USE_TRTLLM_FP4_GEMM", "VLLM_USE_TRTLLM_FP4_GEMM",
"VLLM_USE_FUSED_MOE_GROUPED_TOPK", "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
"VLLM_USE_FLASHINFER_MOE_FP16", "VLLM_USE_FLASHINFER_MOE_FP16",

View File

@ -629,25 +629,25 @@ def get_config(
if quantization_config is not None: if quantization_config is not None:
config.quantization_config = quantization_config config.quantization_config = quantization_config
# auto-enable DeepGEMM UE8M0 on Hopper if model config requests it # auto-enable DeepGEMM UE8M0 if model config requests it
scale_fmt = quantization_config.get("scale_fmt", None) scale_fmt = quantization_config.get("scale_fmt", None)
if scale_fmt in ("ue8m0",): if scale_fmt in ("ue8m0",):
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"): if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0"):
os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1" os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "1"
logger.info_once( logger.info_once(
( (
"Detected quantization_config.scale_fmt=%s; " "Detected quantization_config.scale_fmt=%s; "
"enabling Hopper UE8M0." "enabling UE8M0 for DeepGEMM."
), ),
scale_fmt, scale_fmt,
) )
elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER: elif not envs.VLLM_USE_DEEP_GEMM_E8M0:
logger.warning_once( logger.warning_once(
( (
"Model config requests UE8M0 " "Model config requests UE8M0 "
"(quantization_config.scale_fmt=%s), but " "(quantization_config.scale_fmt=%s), but "
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; " "VLLM_USE_DEEP_GEMM_E8M0=0 is set; "
"Hopper UE8M0 disabled." "UE8M0 for DeepGEMM disabled."
), ),
scale_fmt, scale_fmt,
) )

View File

@ -29,12 +29,7 @@ def is_deep_gemm_supported() -> bool:
current_platform.is_device_capability(90) current_platform.is_device_capability(90)
or current_platform.is_device_capability(100) or current_platform.is_device_capability(100)
) )
return ( return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch
envs.VLLM_USE_DEEP_GEMM
and has_deep_gemm()
and is_supported_arch
and not envs.VLLM_USE_FLASHINFER_MOE_FP8
)
@functools.cache @functools.cache
@ -58,15 +53,8 @@ def is_deep_gemm_e8m0_used() -> bool:
logger.info_once("DeepGEMM E8M0 disabled: FlashInfer MOE is enabled.") logger.info_once("DeepGEMM E8M0 disabled: FlashInfer MOE is enabled.")
return False return False
if current_platform.is_device_capability(100) and envs.VLLM_USE_DEEP_GEMM_E8M0: if envs.VLLM_USE_DEEP_GEMM_E8M0:
logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.") logger.info_once("DeepGEMM E8M0 enabled on current platform.")
return True
if (
current_platform.is_device_capability(90)
and envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER
):
logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.")
return True return True
logger.info_once("DeepGEMM E8M0 disabled on current configuration.") logger.info_once("DeepGEMM E8M0 disabled on current configuration.")