From c817b1415121cf88178af1e4e78f651d802df4da Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:28:34 -0500 Subject: [PATCH] [Perf] Optimize deepgemm experts initialization, 3.9% TTFT improvement (#30494) Signed-off-by: yewentao256 Co-authored-by: li-jinpeng <3332126450@qq.com> Co-authored-by: youkaichao --- .../layers/fused_moe/deep_gemm_utils.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py index 6cca954123274..57d303cd53fef 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py @@ -84,10 +84,16 @@ def _fwd_kernel_ep_scatter_1( m_indices_start_ptr = m_indices + cur_expert_start off_expert = tl.arange(0, BLOCK_E) + # any rows in the per-expert aligned region that do not correspond to + # real tokens are left untouched here and should remain initialized to + # -1 so DeepGEMM can skip them for start_m in tl.range(0, cur_expert_token_num, BLOCK_E, num_stages=4): + offs = start_m + off_expert + mask = offs < cur_expert_token_num tl.store( - m_indices_start_ptr + start_m + off_expert, + m_indices_start_ptr + offs, cur_expert, + mask=mask, ) @@ -366,12 +372,17 @@ def deepgemm_moe_permute( (M_sum, H // block_k), device=device, dtype=torch.float32 ) - maybe_has_empty_blocks = (expert_tokens_meta is None) or ( - expert_tokens_meta.expert_num_tokens_cpu is None + # DeepGEMM uses negative values in m_indices (here expert_ids) to mark + # completely invalid / padded blocks that should be skipped. We always + # initialize expert_ids to -1 so any row that is not explicitly written + # by the scatter kernel will be treated as invalid and skipped by + # DeepGEMM's scheduler. + expert_ids = torch.full( + (M_sum,), + fill_value=-1, + device=device, + dtype=torch.int32, ) - expert_ids_init = torch.zeros if maybe_has_empty_blocks else torch.empty - - expert_ids = expert_ids_init((M_sum), device=device, dtype=torch.int32) inv_perm = torch.empty(topk_ids.shape, device=device, dtype=torch.int32) expert_num_tokens = None