From f7dac83d95ae38973b425a8bb2d3a3df9fe9a9c2 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Sat, 29 Jun 2024 06:04:20 -0700
Subject: [PATCH] [Kernel] Raise an exception in MoE kernel if the batch size
 is larger than 65k (#5939)

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index b750fc713b43f..ecab77a8b6dfb 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -423,6 +423,11 @@ def fused_experts(hidden_states: torch.Tensor,
     M, _ = hidden_states.shape
     E, N, _ = w1.shape
 
+    if M > 65536:
+        # https://github.com/vllm-project/vllm/issues/5938
+        raise ValueError("MoE kernel does not support more than 65536 tokens, "
+                         f"but got {M}")
+
     if override_config:
         config = override_config
     else:
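
Below is a minimal, self-contained sketch of the behavior this guard introduces,
plus one way a caller could stay under the limit by splitting an oversized batch.
This is illustrative only, not vLLM code: guarded_kernel, chunked_call, and
MAX_TOKENS are hypothetical names, and the real fused_experts takes several more
arguments (expert weights, routing weights, and so on).

import torch

MAX_TOKENS = 65536  # limit enforced by the guard added in this patch


def guarded_kernel(hidden_states: torch.Tensor) -> torch.Tensor:
    """Stand-in for fused_experts: applies the same batch-size check."""
    M, _ = hidden_states.shape
    if M > MAX_TOKENS:
        # Mirrors the check added in the patch; see
        # https://github.com/vllm-project/vllm/issues/5938
        raise ValueError("MoE kernel does not support more than 65536 tokens, "
                         f"but got {M}")
    return hidden_states  # the real kernel work would happen here


def chunked_call(hidden_states: torch.Tensor) -> torch.Tensor:
    """Process an arbitrarily large batch in slices of <= MAX_TOKENS rows."""
    outputs = [
        guarded_kernel(chunk)
        for chunk in torch.split(hidden_states, MAX_TOKENS, dim=0)
    ]
    return torch.cat(outputs, dim=0)


if __name__ == "__main__":
    x = torch.randn(70000, 8)
    try:
        guarded_kernel(x)   # raises ValueError: 70000 > 65536
    except ValueError as e:
        print(e)
    y = chunked_call(x)     # succeeds by splitting into two chunks
    assert y.shape == x.shape

Raising a ValueError up front trades silent corruption for an explicit failure:
per the linked issue, batches above this size previously hit kernel-level errors,
so the guard surfaces the limitation to the caller instead.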