From 4bdf40021821dfb085e2cbe2f4dc0b7ad7b2e635 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Fri, 19 Sep 2025 15:42:01 -0400
Subject: [PATCH] [Bugfix] Fix chunked a2_scales in modular kernels (#25264)

Signed-off-by: Bill Nell
---
 .../model_executor/layers/fused_moe/batched_deep_gemm_moe.py | 1 +
 .../layers/fused_moe/batched_triton_or_deep_gemm_moe.py      | 3 ++-
 vllm/model_executor/layers/fused_moe/cutlass_moe.py          | 4 +++-
 vllm/model_executor/layers/fused_moe/deep_gemm_moe.py        | 3 ++-
 .../layers/fused_moe/flashinfer_cutlass_moe.py               | 1 +
 vllm/model_executor/layers/fused_moe/fused_batched_moe.py    | 4 +++-
 vllm/model_executor/layers/fused_moe/fused_moe.py            | 3 ++-
 .../layers/fused_moe/gpt_oss_triton_kernels_moe.py           | 1 +
 vllm/model_executor/layers/fused_moe/modular_kernel.py       | 5 +++++
 vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py | 2 ++
 vllm/model_executor/layers/fused_moe/trtllm_moe.py           | 1 +
 11 files changed, 23 insertions(+), 5 deletions(-)
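Background for reviewers: every FusedMoEPermuteExpertsUnpermute.apply()
implementation previously read self.a2_scale directly. When
FusedMoEModularKernel runs the input in chunks, the per-chunk slice of the
activation scale (c_a2_scale in modular_kernel.py below) was therefore
ignored, and per-token scales could be paired with the wrong rows. The fix
threads a2_scale through apply() so the chunked path can pass the sliced
tensor. A minimal sketch of the failure pattern, with hypothetical names
rather than vLLM's real API:

    import torch

    class Experts:
        def __init__(self, a2_scale: torch.Tensor):
            self.a2_scale = a2_scale  # per-token scales, shape (num_tokens, 1)

        def apply(self, x: torch.Tensor,
                  a2_scale: torch.Tensor) -> torch.Tensor:
            # Fixed behavior: use the argument (the caller's slice), never
            # self.a2_scale, which covers the whole batch.
            assert a2_scale.shape[0] == x.shape[0]
            return x * a2_scale

    def run_chunked(experts: Experts, x: torch.Tensor,
                    chunk: int) -> torch.Tensor:
        out = torch.empty_like(x)
        for s in range(0, x.shape[0], chunk):
            e = min(s + chunk, x.shape[0])
            # Slice the scale alongside the activations; c_a2_scale plays
            # this role in modular_kernel.py.
            out[s:e] = experts.apply(x[s:e], experts.a2_scale[s:e])
        return out

With the old self.a2_scale lookup, the second chunk would have been scaled
with the rows of the scale tensor that belong to the first chunk.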
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index e9dfb22bea27b..cf0b965cc8c51 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -286,6 +286,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
index 8b9070f098898..c3c4f4a5d190e 100644
--- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
@@ -126,6 +126,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
@@ -136,5 +137,5 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         assert experts is not None
         experts.apply(output, hidden_states, w1, w2, topk_weights, topk_ids,
                       activation, global_num_experts, expert_map, a1q_scale,
-                      workspace13, workspace2, expert_tokens_meta,
+                      a2_scale, workspace13, workspace2, expert_tokens_meta,
                       apply_router_weight_on_input)
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index 957ffca0d1246..8c2ff580575f5 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -241,6 +241,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
@@ -262,7 +263,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
         run_cutlass_moe_fp8(
             output, hidden_states, w1, w2, topk_ids, activation_callable,
             global_num_experts, expert_map, self.w1_scale, self.w2_scale,
-            a1q_scale, self.a2_scale, self.ab_strides1, self.ab_strides2,
+            a1q_scale, a2_scale, self.ab_strides1, self.ab_strides2,
             self.c_strides1, self.c_strides2, workspace13, workspace2,
             expert_num_tokens,
             self.out_dtype if self.out_dtype is not None else in_dtype,
@@ -705,6 +706,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],  # unused
+        a2_scale: Optional[torch.Tensor],  # unused
         workspace13: Optional[torch.Tensor],
         workspace2: Optional[torch.Tensor],
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 8830b95df7cf0..51a4f275e98cc 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -214,13 +214,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
         apply_router_weight_on_input: bool,
     ):
         assert a1q_scale is not None
-        assert self.a2_scale is None
+        assert a2_scale is None
         assert self.block_shape is not None
         assert self.w1_scale is not None
         assert self.w2_scale is not None
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
index 6eeec18a6ec87..a074da883088e 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -129,6 +129,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: Optional[torch.Tensor],
         workspace2: Optional[torch.Tensor],
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index fe6ac458a9593..660bae3146026 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -688,6 +688,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
@@ -879,6 +880,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
@@ -970,7 +972,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
                         intermediate_cache1.view(-1, N))

         qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input(
-            intermediate_cache2, self.a2_scale, max_num_tokens, E, N,
+            intermediate_cache2, a2_scale, max_num_tokens, E, N,
             expert_num_tokens, self.quant_dtype, self.per_act_token_quant,
             self.block_shape)
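Beyond the signature threading, the two *_quantize_input call sites (above in
fused_batched_moe.py, below in fused_moe.py) now receive the a2_scale
argument instead of self.a2_scale. For row-wise (per-token) quantization the
scale tensor must match the rows actually being quantized, which is the
chunk, not the whole batch. An illustrative sketch, not vLLM's
moe_kernel_quantize_input:

    import torch

    def quantize_per_token_fp8(x: torch.Tensor,
                               scale: torch.Tensor) -> torch.Tensor:
        # With row-wise scales there must be exactly one scale row per row
        # of x, so a chunked x needs the matching slice of the scale tensor.
        assert scale.shape == (x.shape[0], 1)
        # 448.0 is the representable max of float8_e4m3fn.
        return (x / scale).clamp(-448.0, 448.0).to(torch.float8_e4m3fn)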
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index d4de3f640865e..6c2a5bda7cbaa 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1598,6 +1598,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
@@ -1690,7 +1691,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         a2q_scale: Optional[torch.Tensor] = None

         qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
-            intermediate_cache2, self.a2_scale, self.quant_dtype,
+            intermediate_cache2, a2_scale, self.quant_dtype,
             self.per_act_token_quant, self.block_shape)

         invoke_fused_moe_kernel(
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index 614a83ad1158c..08a9b34a42457 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -179,6 +179,7 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 729f8e39cf0f7..a16c254fadf66 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -519,6 +519,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[ExpertTokensMetadata],
@@ -634,6 +635,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         local_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         expert_tokens_meta: Optional[ExpertTokensMetadata],
         apply_router_weight_on_input: bool,
     ) -> torch.Tensor:
@@ -671,6 +673,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             global_num_experts=global_num_experts,
             expert_map=expert_map,
             a1q_scale=a1q_scale,
+            a2_scale=a2_scale,
             workspace13=workspace13,
             workspace2=workspace2,
             expert_tokens_meta=expert_tokens_meta,
@@ -718,6 +721,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             local_num_experts=local_num_experts,
             expert_map=expert_map,
             a1q_scale=a1q_scale,
+            a2_scale=self.fused_experts.a2_scale,
             expert_tokens_meta=expert_tokens_meta,
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
@@ -803,6 +807,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             local_num_experts=local_num_experts,
             expert_map=expert_map,
             a1q_scale=c_a1q_scale,
+            a2_scale=c_a2_scale,
             expert_tokens_meta=c_expert_tokens_meta,
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
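modular_kernel.py above is the heart of the fix: the abstract apply() gains
the parameter, the non-chunked path forwards self.fused_experts.a2_scale
unchanged, and the chunked path forwards the per-chunk slice c_a2_scale. A
sketch of the slicing such a path needs (hypothetical helper; the chunking
code itself is not part of this diff):

    from typing import Optional

    import torch

    def slice_scale(a2_scale: Optional[torch.Tensor], start: int,
                    end: int) -> Optional[torch.Tensor]:
        # Per-tensor scales (a single element) are shared by every chunk;
        # per-token scales must be sliced along with their tokens.
        if a2_scale is None or a2_scale.numel() == 1:
            return a2_scale
        return a2_scale[start:end]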
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index b2dbc306a6148..3de80ff85747e 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -111,6 +111,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
@@ -134,6 +135,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             global_num_experts,
             expert_map,
             a1q_scale,
+            a2_scale,
             workspace13,
             workspace2,
             expert_tokens_meta,
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
index 8e5f6acc9df63..05ed93c942c8e 100644
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -103,6 +103,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
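Note for out-of-tree code: any FusedMoEPermuteExpertsUnpermute subclass must
add the new parameter in the same position, even if it ignores it (as
CutlassExpertsFp4 does above). Signature sketch only, with the other
arguments left untyped and the body omitted; the positional order matches
the experts.apply(...) call in batched_triton_or_deep_gemm_moe.py:

    from typing import Optional

    import torch

    def apply(self, output, hidden_states, w1, w2, topk_weights, topk_ids,
              activation, global_num_experts, expert_map,
              a1q_scale: Optional[torch.Tensor],
              a2_scale: Optional[torch.Tensor],  # new in this patch
              workspace13, workspace2, expert_tokens_meta,
              apply_router_weight_on_input):
        ...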