From 981e1d834326ba13a7cb1844f7ad5de86674f23b Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Tue, 23 Dec 2025 17:28:08 +0000 Subject: [PATCH] Disable the broken triton_gemm_a8w8_blockscale Signed-off-by: Gregory Shtrasberg --- .../layers/quantization/utils/fp8_utils.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ea68745585160..82e6de621fc87 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -362,23 +362,10 @@ class W8A8BlockFp8LinearOp: n, k = weight.shape - use_triton = ( - not current_platform.is_fp8_fnuz() - and rocm_aiter_ops.is_triton_gemm_w8a8_tuned(n, k) - ) - - if use_triton: - gemm_a8w8_blockscale_op = rocm_aiter_ops.triton_gemm_a8w8_blockscale - else: - gemm_a8w8_blockscale_op = rocm_aiter_ops.gemm_a8w8_blockscale + gemm_a8w8_blockscale_op = rocm_aiter_ops.gemm_a8w8_blockscale if input_scale is not None: q_input = input_2d - elif use_triton: - q_input, input_scale = torch.ops.vllm.triton_per_token_group_quant_fp8( - input_2d, - self.act_quant_group_shape.col, - ) else: q_input, input_scale = rocm_aiter_ops.group_fp8_quant( input_2d, self.act_quant_group_shape.col