From 2e7035dd8cc2e6c907873462b4ac0bb9f08e0abb Mon Sep 17 00:00:00 2001
From: ElizaWszola
Date: Wed, 10 Dec 2025 02:17:25 +0100
Subject: [PATCH] [Bugfix] Fix fp8 DeepGemm compilation issues (#30336)

---
 vllm/model_executor/layers/quantization/utils/fp8_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index b459d5947863b..e12fe61bf3d97 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -31,7 +31,6 @@ from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import (
-    DeepGemmQuantScaleFMT,
     fp8_gemm_nt,
     is_deep_gemm_e8m0_used,
     is_deep_gemm_supported,
@@ -248,6 +247,7 @@ class W8A8BlockFp8LinearOp:
         self.act_quant_group_shape = act_quant_group_shape
         self.is_deep_gemm_supported = is_deep_gemm_supported()
         self.is_hopper = current_platform.is_device_capability(90)
+        self.is_blackwell = current_platform.is_device_capability(100)
         self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
 
         # Get the correct blockscale mul and input quant operations.
@@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
     ) -> torch.Tensor:
-        if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
+        if self.use_deep_gemm_e8m0 and self.is_blackwell:
             q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
                 input_2d,
                 group_size=self.act_quant_group_shape.col,
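
Note (appended for illustration, not part of the applied diff): the fix
replaces the removed DeepGemmQuantScaleFMT oracle with two booleans cached on
W8A8BlockFp8LinearOp, so the packed-UE8M0 DeepGEMM input quantization only
runs when E8M0 scales are in use on a Blackwell (SM100) device. The sketch
below is a minimal standalone mock of that gating; every name in it is an
illustrative stand-in, not vLLM's actual code. It assumes vLLM's convention
of encoding a device capability (major, minor) as major * 10 + minor
(Hopper SM90 -> 90, Blackwell SM100 -> 100).

import torch


def is_device_capability(cap: int) -> bool:
    """Stand-in for current_platform.is_device_capability() (exact match)."""
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor == cap


class BlockFp8LinearSketch:
    """Toy model of the dispatch in W8A8BlockFp8LinearOp, not the real class."""

    def __init__(self, use_deep_gemm_e8m0: bool) -> None:
        self.is_hopper = is_device_capability(90)
        self.is_blackwell = is_device_capability(100)
        self.use_deep_gemm_e8m0 = use_deep_gemm_e8m0

    def pick_input_quant(self) -> str:
        # Mirrors the patched condition: the packed-UE8M0 quant path is only
        # taken when DeepGEMM E8M0 scales are used AND the GPU is Blackwell;
        # everything else falls back to the plain per-token-group fp8 quant.
        if self.use_deep_gemm_e8m0 and self.is_blackwell:
            return "per_token_group_quant_fp8_packed_for_deepgemm"
        return "per_token_group_quant_fp8"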