diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 3bbae4e57ba3..a12cfafd42ab 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -716,6 +716,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         intermediate_cache2 = _resize_cache(workspace2,
                                             (E, max_num_tokens, N // 2))
 
+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
+
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
                                          B=w1,
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 9409b59982d9..ed3b6b8a1af4 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -426,10 +426,10 @@ class FusedMoEModularKernel(torch.nn.Module):
 
         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
-        workspace13 = torch.zeros(prod(workspace13_shape),
+        workspace13 = torch.empty(prod(workspace13_shape),
                                   device=a1.device,
                                   dtype=workspace_dtype)
-        workspace2 = torch.zeros(prod(workspace2_shape),
+        workspace2 = torch.empty(prod(workspace2_shape),
                                  device=a1.device,
                                  dtype=workspace_dtype)
 
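
For context, a minimal sketch of the allocation pattern this diff moves to: workspaces are allocated with torch.empty instead of torch.zeros, and only the fp8 w8a8 path pays for an explicit zero-fill of its intermediate cache. The shapes, dtype, and use_fp8_w8a8 flag below are illustrative placeholders, not vLLM's actual values, and the slice-then-view mirrors the idea of _resize_cache without reproducing it.

    import torch

    E, max_num_tokens, N = 8, 64, 1024
    use_fp8_w8a8 = True

    # Allocate the shared workspace uninitialized; paths that fully
    # overwrite it never need the zero-fill that torch.zeros would do.
    workspace13 = torch.empty(E * max_num_tokens * N, dtype=torch.bfloat16)

    # Carve an intermediate cache view out of the flat workspace.
    intermediate_cache1 = workspace13[: E * max_num_tokens * N].view(
        E, max_num_tokens, N)

    # Assumption: only the fp8 w8a8 path needs the cache to start from
    # defined zeros, so it zero-fills explicitly instead of every caller
    # paying for zero-initialized workspaces up front.
    if use_fp8_w8a8:
        intermediate_cache1.fill_(0)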