From 1f19d8f899b228a530d256bf9476d9b1ea3039af Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:07:57 -0800 Subject: [PATCH] [Perf] Set split_k to 1 for triton_kernels (#30528) Signed-off-by: Xin Yang --- .../layers/quantization/utils/mxfp4_utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index d0c8b3d1a3093..7a351afb3c415 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -57,12 +57,18 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): mx_axis=1, num_warps=num_warps ) ) - if current_platform.is_cuda() and current_platform.is_device_capability(100): - constraints = { - "is_persistent": True, - "epilogue_subtile": 1, - } - opt_flags.update_opt_flags_constraints(constraints) + if current_platform.is_cuda(): + if current_platform.is_device_capability(90): + constraints = { + "split_k": 1, + } + opt_flags.update_opt_flags_constraints(constraints) + elif current_platform.is_device_capability(100): + constraints = { + "is_persistent": True, + "epilogue_subtile": 1, + } + opt_flags.update_opt_flags_constraints(constraints) # transpose the tensor so that the quantization axis is on dim1 quant_tensor = quant_tensor.transpose(-2, -1) scale = scale.transpose(-2, -1)