diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 3bbae4e57ba3..a12cfafd42ab 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -716,6 +716,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         intermediate_cache2 = _resize_cache(workspace2,
                                             (E, max_num_tokens, N // 2))
 
+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
+
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
                                          B=w1,
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 9409b59982d9..ed3b6b8a1af4 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -426,10 +426,10 @@ class FusedMoEModularKernel(torch.nn.Module):
 
         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
-        workspace13 = torch.zeros(prod(workspace13_shape),
+        workspace13 = torch.empty(prod(workspace13_shape),
                                   device=a1.device,
                                   dtype=workspace_dtype)
-        workspace2 = torch.zeros(prod(workspace2_shape),
+        workspace2 = torch.empty(prod(workspace2_shape),
                                  device=a1.device,
                                  dtype=workspace_dtype)
 
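
For context, a minimal sketch of the allocation pattern this diff moves to: workspaces are allocated with torch.empty instead of torch.zeros, and only the fp8 w8a8 path pays for an explicit zero-fill of its intermediate cache. The shapes, dtype, and use_fp8_w8a8 flag below are illustrative placeholders, not vLLM's actual values, and the slice-then-view mirrors the idea of _resize_cache without reproducing it.

    import torch

    E, max_num_tokens, N = 8, 64, 1024
    use_fp8_w8a8 = True

    # Allocate the shared workspace uninitialized; paths that fully
    # overwrite it never need the zero-fill that torch.zeros would do.
    workspace13 = torch.empty(E * max_num_tokens * N, dtype=torch.bfloat16)

    # Carve an intermediate cache view out of the flat workspace.
    intermediate_cache1 = workspace13[: E * max_num_tokens * N].view(
        E, max_num_tokens, N)

    # Assumption: only the fp8 w8a8 path needs the cache to start from
    # defined zeros, so it zero-fills explicitly instead of every caller
    # paying for zero-initialized workspaces up front.
    if use_fp8_w8a8:
        intermediate_cache1.fill_(0)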