[Kernels] Use empty for modular MoE workspaces (#19667)

Signed-off-by: Bill Nell <bnell@redhat.com>
bnellnm 2025-06-16 10:58:01 -04:00 committed by GitHub
parent 836d4ce140
commit 5e5baa91aa
2 changed files with 5 additions and 2 deletions


@@ -716,6 +716,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         intermediate_cache2 = _resize_cache(workspace2,
                                             (E, max_num_tokens, N // 2))
 
+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
+
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
                                          B=w1,
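
Note: the diff does not say why the fp8 path now clears intermediate_cache1; a plausible reading (an assumption, not the author's stated rationale) is that once the workspaces are allocated with torch.empty in the second file below, slots the batched kernel never writes would hold garbage instead of zeros, so code that may read the full padded extent has to zero the buffer explicitly. A minimal Python sketch of that pattern, with hypothetical shapes and a plain loop standing in for invoke_moe_batched_triton_kernel:

    import torch

    # Minimal sketch (illustration, not code from this commit): with ragged
    # per-expert batches, the grouped GEMM only writes the first
    # num_tokens[e] rows of each expert's slab, so an uninitialized
    # workspace must be zeroed before anything reduces over the padding.
    E, max_num_tokens, N = 4, 8, 16
    num_tokens = [8, 3, 0, 5]                  # hypothetical per-expert token counts

    cache = torch.empty(E, max_num_tokens, N)  # uninitialized workspace
    cache.fill_(0)                             # mirrors intermediate_cache1.fill_(0)
    for e in range(E):                         # stand-in for the batched Triton kernel
        cache[e, :num_tokens[e]] = 1.0         # only the valid rows are written

    per_expert = cache.sum(dim=(1, 2))         # padded rows contribute 0, not garbage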


@@ -426,10 +426,10 @@ class FusedMoEModularKernel(torch.nn.Module):
         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
-        workspace13 = torch.zeros(prod(workspace13_shape),
+        workspace13 = torch.empty(prod(workspace13_shape),
                                   device=a1.device,
                                   dtype=workspace_dtype)
-        workspace2 = torch.zeros(prod(workspace2_shape),
+        workspace2 = torch.empty(prod(workspace2_shape),
                                  device=a1.device,
                                  dtype=workspace_dtype)
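
The core of the change: the two shared workspaces are now allocated with torch.empty rather than torch.zeros, skipping a zero-fill pass over buffers that are fully overwritten before they are read (except for the fp8 case handled in the first file). A minimal sketch of the difference, using a hypothetical workspace shape; torch.zeros, torch.empty, and math.prod are standard APIs, everything else is illustrative:

    import torch
    from math import prod

    workspace13_shape = (32, 1024, 4096)   # hypothetical; ~256 MiB of bf16

    # Before: allocate the buffer and memset it to zero.
    ws_before = torch.zeros(prod(workspace13_shape), dtype=torch.bfloat16)

    # After: allocate only; views carved out of it (e.g. via _resize_cache)
    # are expected to be fully written by the kernels before being read.
    ws_after = torch.empty(prod(workspace13_shape), dtype=torch.bfloat16)

    assert ws_before.numel() == ws_after.numel() == prod(workspace13_shape)

The trade-off is the one visible in the first file: any consumer that can read elements it never wrote now has to zero its slice explicitly.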