diff --git a/vllm/envs.py b/vllm/envs.py
index 1cd03240d7e6..01e93f224a30 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -142,7 +142,6 @@ if TYPE_CHECKING:
     VLLM_TPU_USING_PATHWAYS: bool = False
     VLLM_USE_DEEP_GEMM: bool = True
     VLLM_USE_DEEP_GEMM_E8M0: bool = True
-    VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
     VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
@@ -1055,11 +1054,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
         int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
     ),
-    # TODO(wentao): unify the two E8M0 flags after verifying the correctness.
-    # Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
-    "VLLM_USE_DEEP_GEMM_E8M0_HOPPER": lambda: bool(
-        int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))
-    ),
     # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
     # JIT all the required kernels before model execution so there is no
     # JIT'ing in the hot-path. However, this warmup increases the engine
@@ -1433,7 +1427,6 @@ def compute_hash() -> str:
     "VLLM_DISABLED_KERNELS",
     "VLLM_USE_DEEP_GEMM",
     "VLLM_USE_DEEP_GEMM_E8M0",
-    "VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
    "VLLM_USE_TRTLLM_FP4_GEMM",
     "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
     "VLLM_USE_FLASHINFER_MOE_FP16",
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index b9592f08b9f2..87bbe73d834a 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -629,25 +629,25 @@ def get_config(
     if quantization_config is not None:
         config.quantization_config = quantization_config
 
-        # auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
+        # auto-enable DeepGEMM UE8M0 if model config requests it
         scale_fmt = quantization_config.get("scale_fmt", None)
         if scale_fmt in ("ue8m0",):
-            if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
-                os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
+            if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0"):
+                os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "1"
                 logger.info_once(
                     (
                         "Detected quantization_config.scale_fmt=%s; "
-                        "enabling Hopper UE8M0."
+                        "enabling UE8M0 for DeepGEMM."
                     ),
                     scale_fmt,
                 )
-            elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
+            elif not envs.VLLM_USE_DEEP_GEMM_E8M0:
                 logger.warning_once(
                     (
                         "Model config requests UE8M0 "
                         "(quantization_config.scale_fmt=%s), but "
-                        "VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
-                        "Hopper UE8M0 disabled."
+                        "VLLM_USE_DEEP_GEMM_E8M0=0 is set; "
+                        "UE8M0 for DeepGEMM disabled."
                     ),
                     scale_fmt,
                 )
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 1d7f05cf67bb..8f8f25f1302d 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -29,12 +29,7 @@ def is_deep_gemm_supported() -> bool:
         current_platform.is_device_capability(90)
         or current_platform.is_device_capability(100)
     )
-    return (
-        envs.VLLM_USE_DEEP_GEMM
-        and has_deep_gemm()
-        and is_supported_arch
-        and not envs.VLLM_USE_FLASHINFER_MOE_FP8
-    )
+    return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch
 
 
 @functools.cache
@@ -58,15 +53,8 @@ def is_deep_gemm_e8m0_used() -> bool:
         logger.info_once("DeepGEMM E8M0 disabled: FlashInfer MOE is enabled.")
         return False
 
-    if current_platform.is_device_capability(100) and envs.VLLM_USE_DEEP_GEMM_E8M0:
-        logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.")
-        return True
-
-    if (
-        current_platform.is_device_capability(90)
-        and envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER
-    ):
-        logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.")
+    if envs.VLLM_USE_DEEP_GEMM_E8M0:
+        logger.info_once("DeepGEMM E8M0 enabled on current platform.")
         return True
 
     logger.info_once("DeepGEMM E8M0 disabled on current configuration.")