From 4adc66f64d56338489d00d94de6e13d95741c4be Mon Sep 17 00:00:00 2001
From: ElizaWszola
Date: Fri, 18 Jul 2025 12:55:52 +0200
Subject: [PATCH] [Bugfix] Allocate less memory in non-batched CUTLASS MoE (#21121)

Signed-off-by: ElizaWszola
---
 vllm/model_executor/layers/fused_moe/cutlass_moe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index facc01a5ba84..ff49d7bb7801 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -283,8 +283,8 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
                           (N // 2))
             output = (self.max_experts_per_worker, padded_M, K)
         else:
-            workspace1 = (M * topk, max(2 * N, K))
-            workspace2 = (M * topk, N)
+            workspace1 = (M * topk, max(N, K))
+            workspace2 = (M * topk, N // 2)
             output = (M * topk, K)
         return (workspace1, workspace2, output,
                 self.out_dtype if self.out_dtype is not None else a.dtype)