[Kernel][Minor] Re-fuse triton moe weight application (#16071)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-12-23 01:45:02 +08:00 · 2025-04-04 19:27:34 -04:00 · 2025-04-04 19:27:34 -04:00 · d6fc629f4d
commit d6fc629f4d
parent af51d80fa1
1 changed files with 18 additions and 24 deletions
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1297,8 +1297,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
            qintermediate_cache2 = intermediate_cache2
            a2q_scale = a2_scale
-        invoke_fused_moe_kernel(
+        invoke_fused_moe_kernel(qintermediate_cache2,
-            qintermediate_cache2,
                                w2,
                                intermediate_cache3,
                                a2q_scale,
@@ -1308,7 +1307,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                sorted_token_ids,
                                expert_ids,
                                num_tokens_post_padded,
-            False,  #True,
+                                True,
                                1,
                                config,
                                compute_type=compute_type,
@@ -1317,11 +1316,6 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                use_int4_w4a16=use_int4_w4a16,
                                block_shape=block_shape)
-        if True:
-            intermediate_cache3 = intermediate_cache3.view(-1, top_k_num, K)
-            intermediate_cache3.mul_(
-                curr_topk_weights.view(tokens_in_chunk, -1, 1))
        ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape),
                    out_hidden_states[begin_chunk_idx:end_chunk_idx])