Optimize memory: size workspace13 by N instead of max(K, N).

The workspace13 tensor is only ever used with one shape, so its last dimension can be N rather than max(K, N).

Signed-off-by: Andrey Khalyavin <halyavin@yandex-team.ru>
2026-07-07 01:47:12 +08:00 · 2025-12-16 16:11:56 +03:00 · 2025-12-16 16:11:56 +03:00 · a4af2e7b3a
commit a4af2e7b3a
parent 676db55eec
1 changed files with 1 additions and 1 deletions
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -312,7 +312,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
        num_dispatchers = self.num_dispatchers
        num_experts = local_num_experts
        max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
-        workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
+        workspace13 = (num_experts, max_num_tokens * num_dispatchers, N)
        workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
        output = (num_experts, max_num_tokens * num_dispatchers, K)
        return (workspace13, workspace2, output)