From 299f8291803fc49b2c9ccefd070d374688bffcc8 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Wed, 18 Jun 2025 20:09:51 +0000
Subject: [PATCH] DeepGEMM LL optimizations

- Quantized dispatch
- Fused act-and-mul-and-quant in the right layout for DeepGEMM

Signed-off-by: Tyler Michael Smith
---
 .../layers/fused_moe/batched_deep_gemm_moe.py | 190 ++++++++++++++++--
 1 file changed, 174 insertions(+), 16 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index 5492399efdf86..758cd7c56f71f 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -6,14 +6,183 @@
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache, per_token_group_quant_fp8)
+from vllm.model_executor.layers.fused_moe.utils import _resize_cache
+from vllm.triton_utils import tl, triton
 
 logger = init_logger(__name__)
 
 has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None
 
 
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+        # Pointers ------------------------------------------------------
+        input_ptr,  # *FP32 activations (E, T, 2*H)
+        y_q_ptr,  # *FP8 quantized activations (E, T, H)
+        y_s_ptr,  # *FP32 scales (E, T, G)
+        counts_ptr,  # *INT32 number of valid tokens per expert (E)
+
+        # Sizes ----------------------------------------------------------
+        E: tl.constexpr,  # num_experts
+        T: tl.constexpr,  # max_num_tokens
+        H: tl.constexpr,  # hidden dimension (per output)
+        GROUP_SIZE: tl.constexpr,  # elements per quant group (usually 128)
+
+        # Strides for input (elements) ------------------------------------
+        stride_i_e,
+        stride_i_t,
+        stride_i_h,
+
+        # Strides for y_q (elements) --------------------------------------
+        stride_yq_e,
+        stride_yq_t,
+        stride_yq_h,
+
+        # Strides for y_s (elements) --------------------------------------
+        stride_ys_e,
+        stride_ys_t,
+        stride_ys_g,
+
+        # Stride for counts (elements)
+        stride_counts_e,
+
+        # Numeric params ---------------------------------------------------
+        eps: tl.constexpr,
+        fp8_min: tl.constexpr,
+        fp8_max: tl.constexpr,
+
+        # Meta --------------------------------------------------------------
+        BLOCK: tl.constexpr,
+):
+    G = H // GROUP_SIZE
+
+    # Map program id -> (expert e, H-group g); one program owns one
+    # (expert, group) pair and loops over that expert's valid tokens.
+    pid = tl.program_id(0)
+    e = pid // G
+    g = pid % G
+
+    e = e.to(tl.int64)
+    g = g.to(tl.int64)
+
+    # Number of valid tokens for this expert.
+    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+    cols = tl.arange(0, BLOCK)
+    cols = cols.to(tl.int64)
+    # BLOCK == GROUP_SIZE at every current call site, so this mask is
+    # all-true today; kept in case BLOCK is ever padded past the group.
+    mask_h = cols < BLOCK
+
+    t = tl.zeros([], tl.int64)
+    while t < n_tokens:
+        base_i_offset = (e * stride_i_e + t * stride_i_t +
+                         g * GROUP_SIZE * stride_i_h)
+        base_yq_offset = (e * stride_yq_e + t * stride_yq_t +
+                          g * GROUP_SIZE * stride_yq_h)
+        base_ys_offset = e * stride_ys_e + t * stride_ys_t + g * stride_ys_g
+
+        mask = mask_h
+        # Gate half: y[..., :H]; up half: y[..., H:].
+        x = tl.load(input_ptr + base_i_offset + cols * stride_i_h,
+                    mask=mask,
+                    other=0.0).to(tl.float32)
+        y2 = tl.load(input_ptr + base_i_offset + H * stride_i_h +
+                     cols * stride_i_h,
+                     mask=mask,
+                     other=0.0).to(tl.float32)
+
+        # silu(x) * y2, then per-group FP8 quantization.
+        x = x * (1.0 / (1.0 + tl.exp(-x)))
+        y = x * y2
+
+        _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+        y_s = _absmax / fp8_max
+        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+        tl.store(y_q_ptr + base_yq_offset + cols * stride_yq_h,
+                 y_q,
+                 mask=mask)
+        tl.store(y_s_ptr + base_ys_offset, y_s)
+
+        t += 1
+
+
+def silu_mul_fp8_quant_deep_gemm(
+    y: torch.Tensor,  # (E, T, 2*H) float32
+    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
+    group_size: int = 128,
+    eps: float = 1e-6,
+):
+    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with per-token group
+    scales.
+
+    y has shape (E, T, 2*H). The first half of the last dimension is
+    silu-activated, multiplied elementwise by the second half, and the
+    result is quantized into FP8.
+
+    Returns `(y_q, y_s)` where
+    * `y_q` is the FP8 tensor of shape `(E, T, H)`, same layout as
+      `y[..., :H]`.
+    * `y_s` has shape `(E, T, H // group_size)` and strides `(T*G, 1, T)`,
+      the scale layout expected by DeepGEMM.
+    """
+    assert y.ndim == 3, "y must be (E, T, 2*H)"
+    E, T, H2 = y.shape
+    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+    H = H2 // 2
+    assert H % group_size == 0, "H must be divisible by group_size"
+    G = H // group_size
+    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, \
+        "tokens_per_expert must be shape (E,)"
+    tokens_per_expert = tokens_per_expert.to(device=y.device,
+                                             dtype=torch.int32)
+
+    # Allocate outputs.
+    fp8_dtype = torch.float8_e4m3fn
+    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+    # Strides (elements).
+    stride_i_e, stride_i_t, stride_i_h = y.stride()
+    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+    # Desired scale strides (elements): (T*G, 1, T).
+    stride_ys_e = T * G
+    stride_ys_t = 1
+    stride_ys_g = T
+    y_s = torch.empty_strided((E, T, G),
+                              (stride_ys_e, stride_ys_t, stride_ys_g),
+                              dtype=torch.float32,
+                              device=y.device)
+
+    stride_cnt_e = tokens_per_expert.stride()[0]
+
+    # Static grid over experts and H-groups; a loop inside the kernel
+    # handles the token dimension, so ragged batches need no host sync.
+    grid = (E * G, )
+
+    f_info = torch.finfo(fp8_dtype)
+    fp8_max = f_info.max
+    fp8_min = f_info.min  # == -f_info.max for float8_e4m3fn (no infinities)
+
+    _silu_mul_fp8_quant_deep_gemm[grid](
+        y,
+        y_q,
+        y_s,
+        tokens_per_expert,
+        E,
+        T,
+        H,
+        group_size,
+        stride_i_e,
+        stride_i_t,
+        stride_i_h,
+        stride_yq_e,
+        stride_yq_t,
+        stride_yq_h,
+        stride_ys_e,
+        stride_ys_t,
+        stride_ys_g,
+        stride_cnt_e,
+        eps,
+        fp8_min,
+        fp8_max,
+        BLOCK=group_size,
+        num_warps=4,
+    )
+
+    return y_q, y_s
+
+
 class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     # The Deep Gemm kernels only support block size of 128
@@ -96,7 +265,6 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             hidden_states, w1, w2, topk_ids)
 
         workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))
-        workspace2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2))
 
         # (from deepgemm docs) : A value hint (which is a value on CPU)
         # for the M expectation of each batch, correctly setting this value
@@ -109,19 +277,9 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
                                                 masked_m=expert_num_tokens,
                                                 expected_m=expected_m)
 
-        # TODO (varun) [Optimization]: Use a batched version of activation.
-        # Similarly for the quant below.
-        self.activation(activation, workspace2, workspace1.view(-1, N))
-
-        w2_hidden_size = workspace2.size(-1)
-        workspace2 = workspace2.view(-1, w2_hidden_size)
-
-        a2q_scale: Optional[torch.Tensor] = None
-        a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
-                                                   self.block_shape[1],
-                                                   column_major_scales=False)
-        a2q = a2q.view(E, max_num_tokens, -1)
-        a2q_scale = a2q_scale.view(E, max_num_tokens, -1)
+        assert expert_num_tokens is not None
+        a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm(workspace1,
+                                                      expert_num_tokens)
 
         dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a2q, a2q_scale),
                                                  (w2, w2_scale),
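
Reviewer note (not part of the patch): below is a minimal standalone smoke-test
sketch for the new silu_mul_fp8_quant_deep_gemm entry point. It assumes a CUDA
device with FP8 (torch.float8_e4m3fn) support and a working Triton install;
the E/T/H shapes, token counts, and tolerances are illustrative assumptions,
not values taken from the patch.

    # Sketch: exercise silu_mul_fp8_quant_deep_gemm on a ragged expert batch.
    import torch

    from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
        silu_mul_fp8_quant_deep_gemm)

    E, T, H = 4, 32, 256  # experts, max tokens per expert, hidden per output
    y = torch.randn(E, T, 2 * H, dtype=torch.float32, device="cuda")
    # Ragged batch: each expert has a different number of valid token slots.
    tokens_per_expert = torch.tensor([32, 7, 0, 19],
                                     dtype=torch.int32,
                                     device="cuda")

    y_q, y_s = silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert)

    G = H // 128  # default group_size is 128
    assert y_q.shape == (E, T, H) and y_q.dtype == torch.float8_e4m3fn
    assert y_s.shape == (E, T, G)
    assert y_s.stride() == (T * G, 1, T)  # the (T*G, 1, T) layout from the docstring

    # Dequantize expert 0 (all 32 slots valid) and compare against a
    # float32 reference silu-and-mul.
    ref = torch.nn.functional.silu(y[0, :, :H]) * y[0, :, H:]
    deq = y_q[0].to(torch.float32) * y_s[0].repeat_interleave(128, dim=1)
    torch.testing.assert_close(deq, ref, atol=0.25, rtol=0.25)

Looping over tokens inside the kernel, rather than launching one program per
token, keeps the grid static at E * G programs, so the ragged
tokens_per_expert counts never have to be synced back to the host.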