Add bias handling to CPUFusedMOE kernel (#26289)

Signed-off-by: Crefeda Rodrigues <crefeda.rodrigues@arm.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Crefeda Rodrigues <65665931+cfRod@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Sharif Inamdar <Sharif.Inamdar@arm.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Crefeda Rodrigues 2025-10-06 19:39:10 +01:00 committed by GitHub
parent b2ea5ba677
commit c02058c222
2 changed files with 78 additions and 2 deletions


@@ -909,3 +909,72 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
     torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0)
 
     opcheck(torch.ops._moe_C.moe_sum, (input, actual))
+
+
+@pytest.mark.parametrize("m", [1, 33])
+@pytest.mark.parametrize("n,k", [(128, 128)])
+@pytest.mark.parametrize("e", [8])
+@pytest.mark.parametrize("topk", [2])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("with_bias", [False, True])
+@pytest.mark.parametrize("activation", ["silu"])
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only test")
+def test_cpu_fused_moe_basic(m, n, k, e, topk, dtype, with_bias, activation):
+    from vllm.model_executor.layers.fused_moe.cpu_fused_moe import CPUFusedMOE
+
+    device = "cpu"
+    torch.manual_seed(7)
+    a = torch.randn((m, k), device=device, dtype=dtype) / 10
+    w13 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
+    router_logits = torch.randn((m, e), device=device, dtype=dtype)
+
+    b1 = b2 = None
+    if with_bias:
+        b1 = torch.randn((e, 2 * n), device=device, dtype=dtype) / 10
+        b2 = torch.randn((e, k), device=device, dtype=dtype) / 10
+
+    ref = (
+        torch_moe(a, w13, w2, router_logits, topk, b1, b2)
+        if with_bias
+        else torch_moe(a, w13, w2, router_logits, topk)
+    )
+
+    class _Dummy(torch.nn.Module):
+        def __init__(self, w13, w2, b1=None, b2=None):
+            super().__init__()
+            self.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
+            self.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
+            if b1 is not None:
+                self.w13_bias = torch.nn.Parameter(b1, requires_grad=False)
+            if b2 is not None:
+                self.w2_bias = torch.nn.Parameter(b2, requires_grad=False)
+
+    layer = _Dummy(w13, w2, b1, b2).to(dtype)
+    fused = CPUFusedMOE(layer)
+
+    out = fused(
+        layer=layer,
+        x=a,
+        use_grouped_topk=False,
+        top_k=topk,
+        router_logits=router_logits,
+        renormalize=False,
+        global_num_experts=e,
+        expert_map=None,
+        custom_routing_function=None,
+        scoring_func="softmax",
+        routed_scaling_factor=1.0,
+        e_score_correction_bias=None,
+        apply_router_weight_on_input=False,
+        activation=activation,
+    )
+
+    # Tolerances: fp32 tight; bf16 looser (esp. with bias)
+    if dtype == torch.float32:
+        atol = 1e-3
+    elif with_bias:
+        atol = 8e-2
+    else:
+        atol = 5e-2
+    torch.testing.assert_close(out, ref, atol=atol, rtol=0)
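
Note: torch_moe is the dense reference helper from the kernel test utilities; its implementation is not part of this diff. As a rough, illustrative sketch only, a bias-aware reference consistent with the test's settings (softmax scoring, top-k selection without renormalization, SiLU gated activation) could look like the code below; the name ref_moe_with_bias and its loop structure are hypothetical, not the actual helper.

# Illustrative only: an assumed dense reference for the bias-aware MoE path,
# not the torch_moe helper used by the test above.
import torch
import torch.nn.functional as F


def ref_moe_with_bias(a, w13, w2, router_logits, topk, b1=None, b2=None):
    num_tokens = a.shape[0]
    n2 = w13.shape[1]  # 2 * n: gate and up projections stacked
    scores = torch.softmax(router_logits.float(), dim=-1)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    out = torch.zeros_like(a)
    for token in range(num_tokens):
        for slot in range(topk):
            e_id = int(topk_ids[token, slot])
            x = a[token]
            # Optional per-expert biases, mirroring w13_bias / w2_bias rows.
            gate_up = F.linear(x, w13[e_id], None if b1 is None else b1[e_id])
            gate, up = gate_up[: n2 // 2], gate_up[n2 // 2:]
            h = F.silu(gate) * up  # matches silu_and_mul's assumed semantics
            y = F.linear(h, w2[e_id], None if b2 is None else b2[e_id])
            out[token] += topk_weights[token, slot].to(a.dtype) * y
    return out

When with_bias is False, b1 and b2 stay None and both F.linear calls reduce to the unbiased path, which is what the else branch of the ref expression in the test exercises.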


@@ -276,6 +276,9 @@ class CPUFusedMOE:
         outputs = []
         start_idx = 0
+        has_w13_bias = hasattr(layer, "w13_bias")
+        has_w2_bias = hasattr(layer, "w2_bias")
         for i, num_tokens in enumerate(tokens_per_expert):
             end_idx = start_idx + num_tokens
             if num_tokens == 0:
@@ -283,11 +286,15 @@ class CPUFusedMOE:
             tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
             layer_w13_weight = layer.w13_weight[i]
+            layer_w13_bias = layer.w13_bias[i] if has_w13_bias else None
             layer_w2_weight = layer.w2_weight[i]
+            layer_w2_bias = layer.w2_bias[i] if has_w2_bias else None
-            gate_up = F.linear(tokens_for_this_expert, layer_w13_weight)
+            gate_up = F.linear(
+                tokens_for_this_expert, layer_w13_weight, bias=layer_w13_bias
+            )
             gate_up = silu_and_mul(gate_up)
-            expert_out = F.linear(gate_up, layer_w2_weight)
+            expert_out = F.linear(gate_up, layer_w2_weight, bias=layer_w2_bias)
             outputs.append(expert_out)
             start_idx = end_idx
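
For orientation, here is a minimal, self-contained shape sketch of the per-expert bias path added above. The sizes (e=8, n=k=128) mirror the new test, and silu_and_mul is assumed to apply SiLU to the first half of the last dimension and multiply by the second half.

# Standalone shape check for the bias-aware expert computation above.
# Sizes are illustrative; F.linear broadcasts the per-expert bias row
# over the token dimension.
import torch
import torch.nn.functional as F

e, n, k, num_tokens = 8, 128, 128, 5
w13, b13 = torch.randn(e, 2 * n, k), torch.randn(e, 2 * n)
w2, b2 = torch.randn(e, k, n), torch.randn(e, k)

x = torch.randn(num_tokens, k)  # tokens routed to expert i
i = 3
gate_up = F.linear(x, w13[i], bias=b13[i])    # -> (num_tokens, 2 * n)
gate, up = gate_up.chunk(2, dim=-1)
h = F.silu(gate) * up                         # silu_and_mul, assumed semantics
expert_out = F.linear(h, w2[i], bias=b2[i])   # -> (num_tokens, k)
assert expert_out.shape == (num_tokens, k)

Because layer_w13_bias and layer_w2_bias fall back to None when the layer does not define w13_bias / w2_bias, existing bias-free checkpoints go through F.linear exactly as before.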