From a4af2e7b3a7fff90d54e579ba3fb124281c3403f Mon Sep 17 00:00:00 2001
From: Andrey Khalyavin
Date: Tue, 16 Dec 2025 16:11:56 +0300
Subject: [PATCH] Optimize memory. The tensor workspace13 is used only with
 one shape.

Signed-off-by: Andrey Khalyavin
---
 vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index 15f6e3a18ed6c..d35ad5c49d7f7 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -312,7 +312,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         num_dispatchers = self.num_dispatchers
         num_experts = local_num_experts
         max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
-        workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
+        workspace13 = (num_experts, max_num_tokens * num_dispatchers, N)
         workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
         output = (num_experts, max_num_tokens * num_dispatchers, K)
         return (workspace13, workspace2, output)