diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 7eba8359b92f6..eef7a0896c375 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -282,6 +282,16 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND if flashinfer_moe_backend in backend_map: + if ( + flashinfer_moe_backend == "latency" + and not current_platform.is_device_capability(100) + ): + logger.info_once( + "Flashinfer TRTLLM MOE backend is only supported on " + "SM100 and later, using CUTLASS backend instead", + scope="local", + ) + return FlashinferMoeBackend.CUTLASS return backend_map[flashinfer_moe_backend] elif current_platform.is_device_capability(90): return FlashinferMoeBackend.CUTLASS