diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index cab610decf90..0b501cd87fb5 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -190,12 +190,6 @@ class FusedMoEParallelConfig:
         return (self.use_all2all_kernels
                 and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
 
-    @property
-    def use_flashinfer_cutlass_kernels(self):
-        return (envs.VLLM_USE_FLASHINFER_MOE_FP4
-                and has_flashinfer_cutlass_fused_moe()
-                and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput")
-
     @staticmethod
     def make(tp_size_: int, dp_size_: int,
              vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
@@ -404,7 +398,14 @@ class FusedMoEConfig:
 
     @property
     def use_flashinfer_cutlass_kernels(self):
-        return self.moe_parallel_config.use_flashinfer_cutlass_kernels
+        """
+        Whether to use FlashInfer cutlass kernels for NVFP4 MoE.
+        """
+        return (self.quant_config is not None
+                and self.quant_config.quant_dtype == "nvfp4"
+                and envs.VLLM_USE_FLASHINFER_MOE_FP4
+                and has_flashinfer_cutlass_fused_moe()
+                and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput")
 
     @staticmethod
     def make(
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index b9de03ddd216..28123d3958ad 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -920,7 +920,7 @@ class FusedMoE(CustomOp):
         self.batched_router_logits: Optional[torch.Tensor] = None
         if (self.moe_parallel_config.use_pplx_kernels
                 or self.moe_parallel_config.use_deepep_ll_kernels
-                or self.moe_parallel_config.use_flashinfer_cutlass_kernels):
+                or self.moe_config.use_flashinfer_cutlass_kernels):
             self.batched_hidden_states = torch.zeros(
                 (moe.max_num_tokens, self.hidden_size),
                 dtype=moe.in_dtype,
@@ -974,7 +974,7 @@ class FusedMoE(CustomOp):
 
     @property
     def use_flashinfer_cutlass_kernels(self):
-        return self.moe_parallel_config.use_flashinfer_cutlass_kernels
+        return self.moe_config.use_flashinfer_cutlass_kernels
 
     def update_expert_map(self):
         # ep_size and ep_rank should already be updated
@@ -1665,7 +1665,7 @@ class FusedMoE(CustomOp):
         # only when data parallelism (DP) is enabled.
         use_flashinfer_cutlass_kernels = (
             self.dp_size > 1
-            and self.moe_parallel_config.use_flashinfer_cutlass_kernels)
+            and self.moe_config.use_flashinfer_cutlass_kernels)
         if (self.moe_parallel_config.use_pplx_kernels
                 or self.moe_parallel_config.use_deepep_ll_kernels
                 or use_flashinfer_cutlass_kernels):
@@ -1674,7 +1674,7 @@ class FusedMoE(CustomOp):
         do_naive_dispatch_combine: bool = (
             self.dp_size > 1
             and not self.moe_parallel_config.use_deepep_ht_kernels
-            and not self.moe_parallel_config.use_flashinfer_cutlass_kernels)
+            and not self.moe_config.use_flashinfer_cutlass_kernels)
         if do_naive_dispatch_combine:
             hidden_states, router_logits = get_ep_group().dispatch(
                 hidden_states, router_logits)
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 6724796904f0..f7d591328f93 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -623,8 +623,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
 
         if should_use_flashinfer_mxfp4():
             from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe
-            assert not self.moe.use_ep, (
-                "EP is not supported for flashinfer mxfp4 moe backend yet.")
             if _should_use_flashinfer_mxfp4_bf16():
                 assert x.dtype == torch.bfloat16
                 x_quant = x
@@ -650,12 +648,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 None,  # output1_scale_scalar
                 None,  # output1_scale_gate_scalar
                 None,  # output2_scale_scalar
-                self.num_experts,
+                global_num_experts,
                 top_k,
                 None,  # n_group
                 None,  # topk_group
                 self.intermediate_size,  # padded to multiple of 256
-                0,  # local_expert_offset
+                layer.ep_rank * layer.local_num_experts,  # local_expert_offset
                 self.num_experts,  # local num experts
                 None,
                 self._get_tile_tokens_dim(x, top_k),
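
For context, the two behavioral changes in the patch can be summarized outside the diff. The sketch below is illustrative only: the function names and parameters are stand-ins, not vLLM APIs. It mirrors the new FusedMoEConfig gate (FlashInfer cutlass kernels are selected only for NVFP4 quantization with the "throughput" backend) and the expert-parallel offset now passed as local_expert_offset to trtllm_fp4_block_scale_moe.

# Illustrative sketch (assumed names, not vLLM API): the NVFP4 gating check
# and the expert-parallel offset introduced by the diff above.

def nvfp4_cutlass_gate(quant_dtype, fp4_env_enabled, has_cutlass_fused_moe,
                       backend):
    # Mirrors FusedMoEConfig.use_flashinfer_cutlass_kernels: every condition
    # must hold, including the quantization dtype being "nvfp4".
    return (quant_dtype == "nvfp4" and fp4_env_enabled
            and has_cutlass_fused_moe and backend == "throughput")


def local_expert_offset(ep_rank, local_num_experts):
    # With expert parallelism, each rank owns a contiguous block of experts,
    # so its first global expert index is ep_rank * local_num_experts.
    return ep_rank * local_num_experts


if __name__ == "__main__":
    assert nvfp4_cutlass_gate("nvfp4", True, True, "throughput")
    assert not nvfp4_cutlass_gate("mxfp4", True, True, "throughput")
    # 32 experts over 4 EP ranks: rank 2 starts at global expert 16.
    assert local_expert_offset(ep_rank=2, local_num_experts=8) == 16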