[Bugfix] Allocate less memory in non-batched CUTLASS MoE (#21121)

Signed-off-by: ElizaWszola <ewszola@redhat.com>
This commit is contained in:
ElizaWszola 2025-07-18 12:55:52 +02:00 committed by GitHub
parent 55ad648715
commit 4adc66f64d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -283,8 +283,8 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
                           (N // 2))
             output = (self.max_experts_per_worker, padded_M, K)
         else:
-            workspace1 = (M * topk, max(2 * N, K))
-            workspace2 = (M * topk, N)
+            workspace1 = (M * topk, max(N, K))
+            workspace2 = (M * topk, N // 2)
             output = (M * topk, K)
         return (workspace1, workspace2, output,
                 self.out_dtype if self.out_dtype is not None else a.dtype)