diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 255b5aad1785..198feb03be3e 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -132,12 +132,15 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
     )
 
     # If FlashInfer is not available, try either Marlin or Triton
-    if (
-        envs.VLLM_MXFP4_USE_MARLIN
-        or current_platform.get_device_capability()[0] < 9
-        or not has_triton_kernels()
-        or not is_torch_equal_or_newer("2.8.0")
-    ):
+    triton_kernels_supported = (
+        has_triton_kernels()
+        and is_torch_equal_or_newer("2.8.0")
+        # NOTE: triton_kernels are only confirmed to work on SM90 and SM100
+        # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317
+        # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
+        and (9, 0) <= current_platform.get_device_capability() < (11, 0)
+    )
+    if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported:
         logger.info_once("Using Marlin backend")
         return Mxfp4Backend.MARLIN
     else:
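
For reference, a minimal sketch (not part of the diff) of the capability gate introduced above. It assumes the device capability behaves like a (major, minor) tuple that compares lexicographically, which is why a single chained comparison can bound the supported range; the standalone helper name below is hypothetical, and the real condition also requires has_triton_kernels() and is_torch_equal_or_newer("2.8.0").

# Sketch of the range check from the diff, under the assumptions above.
def capability_in_supported_range(capability: tuple[int, int]) -> bool:
    # Tuples compare lexicographically, so this admits SM90-SM100
    # and rejects everything below SM90 or at/above SM110.
    return (9, 0) <= capability < (11, 0)

for cap in [(8, 0), (9, 0), (10, 0), (11, 0), (12, 0)]:
    print(cap, capability_in_supported_range(cap))
# (8, 0)  False  -- pre-Hopper, falls back to Marlin
# (9, 0)  True   -- SM90 (Hopper)
# (10, 0) True   -- SM100 (Blackwell)
# (11, 0) False  -- SM110, see vllm-project/vllm#29317
# (12, 0) False  -- SM120, needs triton-lang/triton#8498

Compared with the old `get_device_capability()[0] < 9` check, which only bounded the capability from below, the chained comparison also excludes SM110 and SM120, where triton_kernels are known to fail per the linked issues.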