mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 14:54:57 +08:00
[Bugfix] Allocate less memory in non-batched CUTLASS MoE (#21121)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
This commit is contained in:
parent
55ad648715
commit
4adc66f64d
@@ -283,8 +283,8 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
                           (N // 2))
             output = (self.max_experts_per_worker, padded_M, K)
         else:
-            workspace1 = (M * topk, max(2 * N, K))
-            workspace2 = (M * topk, N)
+            workspace1 = (M * topk, max(N, K))
+            workspace2 = (M * topk, N // 2)
             output = (M * topk, K)
         return (workspace1, workspace2, output,
                 self.out_dtype if self.out_dtype is not None else a.dtype)
Loading…
x
Reference in New Issue
Block a user