Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 07:15:01 +08:00)
[Feature] Enable E8M0 by Default on Hopper for DeepGEMM, 5% E2E throughput improvement (#26197)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
parent
335b28f7d1
commit
f8607863d8
@ -142,7 +142,6 @@ if TYPE_CHECKING:
|
|||||||
VLLM_TPU_USING_PATHWAYS: bool = False
|
VLLM_TPU_USING_PATHWAYS: bool = False
|
||||||
VLLM_USE_DEEP_GEMM: bool = True
|
VLLM_USE_DEEP_GEMM: bool = True
|
||||||
VLLM_USE_DEEP_GEMM_E8M0: bool = True
|
VLLM_USE_DEEP_GEMM_E8M0: bool = True
|
||||||
VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
|
|
||||||
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
|
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
|
||||||
VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
|
VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
|
||||||
VLLM_USE_FLASHINFER_MOE_FP16: bool = False
|
VLLM_USE_FLASHINFER_MOE_FP16: bool = False
|
||||||
@ -1055,11 +1054,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
|
"VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
|
||||||
int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
|
int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
|
||||||
),
|
),
|
||||||
# TODO(wentao): unify the two E8M0 flags after verifying the correctness.
|
|
||||||
# Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
|
|
||||||
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER": lambda: bool(
|
|
||||||
int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))
|
|
||||||
),
|
|
||||||
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
|
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
|
||||||
# JIT all the required kernels before model execution so there is no
|
# JIT all the required kernels before model execution so there is no
|
||||||
# JIT'ing in the hot-path. However, this warmup increases the engine
|
# JIT'ing in the hot-path. However, this warmup increases the engine
|
||||||
@ -1433,7 +1427,6 @@ def compute_hash() -> str:
|
|||||||
"VLLM_DISABLED_KERNELS",
|
"VLLM_DISABLED_KERNELS",
|
||||||
"VLLM_USE_DEEP_GEMM",
|
"VLLM_USE_DEEP_GEMM",
|
||||||
"VLLM_USE_DEEP_GEMM_E8M0",
|
"VLLM_USE_DEEP_GEMM_E8M0",
|
||||||
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
|
|
||||||
"VLLM_USE_TRTLLM_FP4_GEMM",
|
"VLLM_USE_TRTLLM_FP4_GEMM",
|
||||||
"VLLM_USE_FUSED_MOE_GROUPED_TOPK",
|
"VLLM_USE_FUSED_MOE_GROUPED_TOPK",
|
||||||
"VLLM_USE_FLASHINFER_MOE_FP16",
|
"VLLM_USE_FLASHINFER_MOE_FP16",
|
||||||
|
|||||||
@ -629,25 +629,25 @@ def get_config(
|
|||||||
|
|
||||||
if quantization_config is not None:
|
if quantization_config is not None:
|
||||||
config.quantization_config = quantization_config
|
config.quantization_config = quantization_config
|
||||||
# auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
|
# auto-enable DeepGEMM UE8M0 if model config requests it
|
||||||
scale_fmt = quantization_config.get("scale_fmt", None)
|
scale_fmt = quantization_config.get("scale_fmt", None)
|
||||||
if scale_fmt in ("ue8m0",):
|
if scale_fmt in ("ue8m0",):
|
||||||
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
|
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0"):
|
||||||
os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
|
os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "1"
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
(
|
(
|
||||||
"Detected quantization_config.scale_fmt=%s; "
|
"Detected quantization_config.scale_fmt=%s; "
|
||||||
"enabling Hopper UE8M0."
|
"enabling UE8M0 for DeepGEMM."
|
||||||
),
|
),
|
||||||
scale_fmt,
|
scale_fmt,
|
||||||
)
|
)
|
||||||
elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
|
elif not envs.VLLM_USE_DEEP_GEMM_E8M0:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
(
|
(
|
||||||
"Model config requests UE8M0 "
|
"Model config requests UE8M0 "
|
||||||
"(quantization_config.scale_fmt=%s), but "
|
"(quantization_config.scale_fmt=%s), but "
|
||||||
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
|
"VLLM_USE_DEEP_GEMM_E8M0=0 is set; "
|
||||||
"Hopper UE8M0 disabled."
|
"UE8M0 for DeepGEMM disabled."
|
||||||
),
|
),
|
||||||
scale_fmt,
|
scale_fmt,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -29,12 +29,7 @@ def is_deep_gemm_supported() -> bool:
|
|||||||
current_platform.is_device_capability(90)
|
current_platform.is_device_capability(90)
|
||||||
or current_platform.is_device_capability(100)
|
or current_platform.is_device_capability(100)
|
||||||
)
|
)
|
||||||
return (
|
return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch
|
||||||
envs.VLLM_USE_DEEP_GEMM
|
|
||||||
and has_deep_gemm()
|
|
||||||
and is_supported_arch
|
|
||||||
and not envs.VLLM_USE_FLASHINFER_MOE_FP8
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
@ -58,15 +53,8 @@ def is_deep_gemm_e8m0_used() -> bool:
|
|||||||
logger.info_once("DeepGEMM E8M0 disabled: FlashInfer MOE is enabled.")
|
logger.info_once("DeepGEMM E8M0 disabled: FlashInfer MOE is enabled.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if current_platform.is_device_capability(100) and envs.VLLM_USE_DEEP_GEMM_E8M0:
|
if envs.VLLM_USE_DEEP_GEMM_E8M0:
|
||||||
logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.")
|
logger.info_once("DeepGEMM E8M0 enabled on current platform.")
|
||||||
return True
|
|
||||||
|
|
||||||
if (
|
|
||||||
current_platform.is_device_capability(90)
|
|
||||||
and envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER
|
|
||||||
):
|
|
||||||
logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.")
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
|
logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user