diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 19e71f917eeed..b5602a112ef13 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -717,10 +717,13 @@ class FusedMoEModularKernel(torch.nn.Module):
         get num_chunks == 1. Take max(M, 1) to avoid divide by zero.
         If there are no tokens to process, the number of chunks will be zero.
         """
-        CHUNK_SIZE = (
-            max(M, 1)
-            if not self.fused_experts.supports_chunking()
-            else min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+        CHUNK_SIZE = max(
+            1,
+            (
+                M
+                if not self.fused_experts.supports_chunking()
+                else min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+            ),
         )
         num_chunks = cdiv(M, CHUNK_SIZE)
         # If there are no tokens, then there should be no loop iterations.
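
The fix matters in the branch where `supports_chunking()` is true: with `M == 0` tokens, the old `min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE)` yields `CHUNK_SIZE == 0`, and `cdiv(M, CHUNK_SIZE)` divides by zero. Clamping with `max(1, ...)` around both branches guarantees `CHUNK_SIZE >= 1`, so `cdiv(0, 1) == 0` and the chunk loop simply runs zero iterations. Below is a minimal standalone sketch of the before/after behavior; it assumes `cdiv` is plain ceiling division, simplifies `supports_chunking()` to a bool, and uses a hypothetical constant in place of the `VLLM_FUSED_MOE_CHUNK_SIZE` env setting.

```python
def cdiv(a: int, b: int) -> int:
    """Ceiling division: smallest n such that n * b >= a (assumed semantics)."""
    return -(-a // b)

VLLM_FUSED_MOE_CHUNK_SIZE = 32768  # stand-in for the env-configured value


def num_chunks_old(M: int, supports_chunking: bool) -> int:
    # Old logic: the max(M, 1) clamp only covers the no-chunking branch.
    CHUNK_SIZE = (
        max(M, 1)
        if not supports_chunking
        else min(M, VLLM_FUSED_MOE_CHUNK_SIZE)  # == 0 when M == 0
    )
    return cdiv(M, CHUNK_SIZE)  # ZeroDivisionError when M == 0 here


def num_chunks_new(M: int, supports_chunking: bool) -> int:
    # New logic: clamp around both branches so CHUNK_SIZE >= 1 always.
    CHUNK_SIZE = max(
        1,
        M if not supports_chunking else min(M, VLLM_FUSED_MOE_CHUNK_SIZE),
    )
    return cdiv(M, CHUNK_SIZE)


if __name__ == "__main__":
    # M == 0 with chunking supported: the old code computes CHUNK_SIZE == 0.
    try:
        num_chunks_old(0, supports_chunking=True)
    except ZeroDivisionError:
        print("old: ZeroDivisionError")
    # The new code returns 0 chunks, i.e. no loop iterations.
    print("new:", num_chunks_new(0, supports_chunking=True))
```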