[Kernels] Use empty for modular MoE workspaces (#19667)

Signed-off-by: Bill Nell <bnell@redhat.com>
2026-01-28 18:37:15 +08:00 · 2025-06-16 10:58:01 -04:00 · 2025-06-16 10:58:01 -04:00 · 5e5baa91aa
commit 5e5baa91aa
parent 836d4ce140
2 changed files with 5 additions and 2 deletions
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -716,6 +716,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
        intermediate_cache2 = _resize_cache(workspace2,
                                            (E, max_num_tokens, N // 2))

+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
+
        # MM1
        invoke_moe_batched_triton_kernel(A=hidden_states,
                                         B=w1,
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -426,10 +426,10 @@ class FusedMoEModularKernel(torch.nn.Module):

            # We can reuse the memory between cache1 and cache3 because by the
            # time we need cache3, we're done with cache1.
-            workspace13 = torch.zeros(prod(workspace13_shape),
+            workspace13 = torch.empty(prod(workspace13_shape),
                                      device=a1.device,
                                      dtype=workspace_dtype)
-            workspace2 = torch.zeros(prod(workspace2_shape),
+            workspace2 = torch.empty(prod(workspace2_shape),
                                     device=a1.device,
                                     dtype=workspace_dtype)