[Performance][B200] silu_mul_quant: pack scales in int32 (#28358)

Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Varun Sundar Rabindranath 2025-11-13 13:16:55 -05:00 committed by GitHub
parent fdfd5075aa
commit fe1cd7704d
7 changed files with 466 additions and 151 deletions

View File

@ -279,17 +279,17 @@ __device__ __forceinline__ void token_bounds(int32_t n_tokens,
}
template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
int NUM_STAGES = 3>
typename scale_t, int THREADS, typename Idx_t, bool CEIL_UE8M0,
int GROUP_SIZE = 128, int NUM_STAGES = 3>
__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
scale_t* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
// sizes
Idx_t E, Idx_t T, Idx_t H,
// strides (in elements)
Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
Idx_t stride_ys_g, Idx_t stride_counts_e) {
Idx_t stride_ys_g, Idx_t stride_ys_p, Idx_t stride_counts_e) {
#ifndef USE_ROCM
static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
@ -466,9 +466,22 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
__nv_fp8x4_e4m3* y_q_base_ptr =
reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g;
Idx_t scale_group_offset = 0;
if constexpr (std::is_same<scale_t, uint8_t>::value) {
// packed int32_t format
int pack_id = warp_position_scales / 4;
int scale_in_pack = warp_position_scales % 4;
scale_group_offset = pack_id * stride_ys_p + scale_in_pack * stride_ys_g;
} else {
scale_group_offset = warp_position_scales * stride_ys_g;
}
scale_t* const y_scale_base_ptr = _y_s + scale_group_offset;
for (auto j = tokens_lower; j < tokens_upper; j++) {
int current_group_id = warp_position_scales; // Running count of which
// group is being processed
const Idx_t base_ys = expert_id * stride_ys_e;
auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
__nv_fp8x4_e4m3* y_q_ptr =
@ -509,7 +522,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
__nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
if constexpr (USE_UE8M0) {
if constexpr (CEIL_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
@ -527,8 +540,24 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
y_q_ptr += WARP_SIZE * stride_yq_h;
if (!lane_id) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g;
// Store scales.
if constexpr (std::is_same<scale_t, uint8_t>::value) {
// Packed UE8M0 format. Drop the mantissa bits.
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
bool const jump_pack = (current_group_id + 1) % 4 == 0;
// Minus 3 because we need to get to the first group in the
// next pack.
y_s_ptr += jump_pack ? (stride_ys_p - 3) : stride_ys_g;
} else {
// float32 format
static_assert(std::is_same<scale_t, float>::value);
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g;
}
current_group_id += 1;
}
}
}
@ -573,7 +602,7 @@ void persistent_masked_m_silu_mul_quant(
const at::Tensor& tokens_per_expert, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0) {
bool cast_scale_ue8m0) {
#ifndef USE_ROCM
// This kernel currently only supports H % 128 == 0 and assumes a
@ -583,9 +612,12 @@ void persistent_masked_m_silu_mul_quant(
TORCH_CHECK(input.dtype() == torch::kBFloat16);
TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
y_q.dtype() == torch::kFloat8_e4m3fnuz);
TORCH_CHECK(y_s.dtype() == torch::kFloat32);
TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0);
bool const is_packed_ue8m0 =
(y_s.dtype() == torch::kInt32 && cast_scale_ue8m0);
TORCH_CHECK(y_s.dtype() == torch::kFloat32 || is_packed_ue8m0);
using Idx_t = int64_t;
Idx_t E = input.size(0);
@ -597,15 +629,18 @@ void persistent_masked_m_silu_mul_quant(
Idx_t stride_yq_e = y_q.stride(0);
Idx_t stride_yq_t = y_q.stride(1);
Idx_t stride_yq_h = y_q.stride(2);
Idx_t stride_ys_e = y_s.stride(0);
Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_counts_e = tokens_per_expert.stride(0);
int const NUM_GROUPS = H / GROUP_SIZE;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
#define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \
// TODO: Get this from cuda_arch ?
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
#define KERNEL(BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \
STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, STAGES) \
static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \
int sms = SILU_V2_BLOCK_COUNT; \
static constexpr int max_shared_mem_bytes = \
@ -615,43 +650,86 @@ void persistent_masked_m_silu_mul_quant(
VLLM_DISPATCH_FP8_TYPES( \
y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel< \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \
USE_UE8M0, GROUP_SIZE, STAGES> \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, scale_t, THREAD_COUNT, \
Idx_t, CEIL_UE8M0, GROUP_SIZE, STAGES> \
<<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
(fp8_t*)y_q.data_ptr(), \
reinterpret_cast<scale_t*>(y_s.data_ptr()), \
reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \
stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \
stride_ys_g, stride_counts_e); \
stride_yq_t, stride_yq_h, STRIDE_YS_E, STRIDE_YS_T, \
STRIDE_YS_G, STRIDE_YS_P, stride_counts_e); \
});
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
#define LAUNCH_ON_H(scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \
STRIDE_YS_P, CEIL_UE8M0) \
if (H >= 4096 && (NUM_GROUPS % 8) == 0) { \
/* 8 warp config */ \
static constexpr int NUM_STAGES = 4; \
static constexpr int THREAD_COUNT = 256; \
KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \
STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, NUM_STAGES); \
} else { \
/* 1 warp config */ \
static constexpr int THREAD_COUNT = 32; \
KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \
STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, 2); \
}
int const NUM_GROUPS = H / GROUP_SIZE;
if (!use_ue8m0) {
if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
/* 8 warps config */
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
} else {
/* 1 warp config */
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
}
} else {
if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
/* 8 warps config */
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
} else {
/* 1 warp config */
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
}
Idx_t stride_ys_e = y_s.stride(0);
Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_ys_p = 0;
if (!cast_scale_ue8m0) {
TORCH_CHECK(!is_packed_ue8m0);
LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
false);
return;
}
if (!is_packed_ue8m0) {
// UE8M0 but not packed
LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
true);
return;
}
TORCH_CHECK(cast_scale_ue8m0 && is_packed_ue8m0);
TORCH_CHECK(y_s.dtype() == torch::kInt32);
// Int32 packed ue8m0 scales tensor.
// Let E, T, G be the number of experts, tokens and groups respectively.
// With E = 1, T = 4, G = 6, the int32 scales tensor has shape [1, 4, 2] and
// strides [8, 1, 4]. The scales are expected to be arranged as follows,
// [[T0G0-T0G1-T0G2-T0G3, T0G4-T0G5-X-X],
//  [T1G0-T1G1-T1G2-T1G3, T1G4-T1G5-X-X],
//  [T2G0-T2G1-T2G2-T2G3, T2G4-T2G5-X-X],
//  [T3G0-T3G1-T3G2-T3G3, T3G4-T3G5-X-X]]
// where TxGy is the ue8m0 scale value of Token x, Group y.
//
// In memory (in bytes) the scale values are arranged as,
// [T0G0, T0G1, T0G2, T0G3, T1G0, T1G1, T1G2, T1G3, T2G0, T2G1, T2G2, T2G3,
//  T3G0, T3G1, T3G2, T3G3, T0G4, T0G5, X, X, T1G4, T1G5, X, X, T2G4, T2G5,
//  X, X, T3G4, T3G5, X, X]
//
// An int32 tensor of shape [1, 4, 2] and strides [8, 1, 4] can equivalently
// be represented as a uint8 tensor of shape [1, 2, 4, 4] and strides
// [32, 16, 4, 1]. In other words, ignoring the expert dimension, the original
// int32 tensor is simply treated as two packed [4, 4] uint8 tensors (or two
// [4, 1] int32 tensors). The stride settings below reflect this change.
// Caveat: the G dimension is no longer contiguous, i.e. moving from G3 to G4
// requires a jump along the packing dimension. The kernel handles this case.
stride_ys_e *= sizeof(int32_t);
stride_ys_p = T * sizeof(int32_t); // Packing dimension
stride_ys_t = sizeof(int32_t);
stride_ys_g = 1;
LAUNCH_ON_H(uint8_t, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
true);
#endif
}
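To make the int32-to-uint8 stride remapping described in the layout comment above concrete, here is a small PyTorch sketch (reviewer illustration, not part of this diff; little-endian assumed, names are illustrative) using the same T = 4, G = 6 example with a single expert:

import torch

E, T, G = 1, 4, 6
Gp = (G + 3) // 4  # number of int32 packs per token (= 2)

# One flat int32 buffer backs both views.
storage = torch.zeros(E * T * Gp, dtype=torch.int32)

# Kernel-facing view: shape (E, T, Gp), strides (T*Gp, 1, T) == (8, 1, 4).
y_s_i32 = torch.as_strided(storage, (E, T, Gp), (T * Gp, 1, T))

# Same bytes viewed as uint8: shape (E, Gp, T, 4), strides (32, 16, 4, 1).
y_s_u8 = torch.as_strided(
    storage.view(torch.uint8), (E, Gp, T, 4), (4 * T * Gp, 4 * T, 4, 1)
)

# Writing the ue8m0 byte for Token 1, Group 2 (pack 0, byte 2) through the
# byte view lands in byte 2 of y_s_i32[0, 1, 0], exactly as the layout
# comment predicts.
y_s_u8[0, 0, 1, 2] = 124  # biased exponent of 0.125 == 2**-3
assert (int(y_s_i32[0, 1, 0]) >> 16) & 0xFF == 124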

View File

@ -1384,3 +1384,16 @@ def image_urls(request, local_asset_server) -> list[str]:
"""Indirect fixture: takes a list of names, returns list of full URLs."""
names: list[str] = request.param
return [local_asset_server.url_for(name) for name in names]
@pytest.fixture
def disable_deepgemm_ue8m0(monkeypatch):
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
with monkeypatch.context() as monkeypatch_ctx:
monkeypatch_ctx.setenv("VLLM_USE_DEEP_GEMM_E8M0", "0")
is_deep_gemm_e8m0_used.cache_clear()
yield
# Clear the cache so that subsequent calls pick up the default
# VLLM_USE_DEEP_GEMM_E8M0 setting again.
is_deep_gemm_e8m0_used.cache_clear()

View File

@ -21,7 +21,11 @@ from vllm.model_executor.layers.fused_moe.config import (
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported
from vllm.utils.deep_gemm import (
get_mk_alignment_for_contiguous_layout,
is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
)
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from ...utils import multi_gpu_test
@ -413,19 +417,16 @@ NUM_EXPERTS = [32]
@multi_gpu_test(num_gpus=2)
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(
is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM"
)
def test_ht_deepep_deepgemm_moe(
mnk: tuple[int, int, int],
num_experts: int,
topk: int,
world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0,
):
"""
Tests for High-Throughput DeepEP + DeepGemm integration.
"""
import deep_gemm
m, n, k = mnk
current_platform.seed_everything(7)
@ -433,7 +434,7 @@ def test_ht_deepep_deepgemm_moe(
if topk > num_experts:
pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
block_m = get_mk_alignment_for_contiguous_layout()[0]
block_size = [block_m, block_m]
world_size, dp_size = world_dp_size
@ -487,9 +488,6 @@ USE_FP8_DISPATCH = [False]
@multi_gpu_test(num_gpus=2)
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(
is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM"
)
def test_ll_deepep_deepgemm_moe(
mnk: tuple[int, int, int],
num_experts: int,
@ -497,10 +495,12 @@ def test_ll_deepep_deepgemm_moe(
use_fp8_dispatch: bool,
block_size: list[int],
world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0,
):
"""
Tests for Low-Latency DeepEP + DeepGemm integration.
"""
assert not is_deep_gemm_e8m0_used()
m, n, k = mnk
current_platform.seed_everything(7)

View File

@ -294,7 +294,7 @@ def torch_moe_impl(
# blockwise quant and de-quant.
assert not per_act_token_quant
a = test_tensors.rank_tokens
aq, aq_scale = per_token_group_quant_fp8(a, 128)
aq, aq_scale = per_token_group_quant_fp8(a, 128, use_ue8m0=False)
a = (
(aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1))
.view(a.shape)

View File

@ -1,6 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import pytest
import torch
@ -8,27 +11,30 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
from vllm.utils.math_utils import cdiv, round_up
fp8_dtype = torch.float8_e4m3fn
CASES = [
(1, 1, 128, fp8_dtype),
(1, 4, 128, fp8_dtype),
(2, 4, 256, fp8_dtype),
(32, 64, 256, fp8_dtype),
(17, 31, 768, fp8_dtype),
(1, 1, 128 * 1, fp8_dtype),
(1, 1, 128 * 3, fp8_dtype),
(1, 1, 128 * 4, fp8_dtype),
(8, 16, 128 * 1, fp8_dtype),
(8, 16, 128 * 2, fp8_dtype),
(8, 16, 128 * 3, fp8_dtype),
(1, 4, 128 * 1, fp8_dtype),
(2, 4, 128 * 2, fp8_dtype),
(1, 4, 128 * 3, fp8_dtype),
(8, 16, 128 * 4, fp8_dtype),
(8, 16, 128 * 5, fp8_dtype),
(8, 16, 128 * 6, fp8_dtype),
(8, 16, 128 * 7, fp8_dtype),
(8, 16, 128 * 8, fp8_dtype),
(8, 16, 128 * 9, fp8_dtype),
(8, 64, 7168, fp8_dtype),
(8, 128, 128 * 33, fp8_dtype),
(1, 4, 128 * 10, fp8_dtype),
(8, 128, 7168, fp8_dtype),
(8, 512, 7168, fp8_dtype),
(8, 1024, 7168, fp8_dtype),
(17, 31, 768, fp8_dtype),
(32, 64, 256, fp8_dtype),
(256, 8, 7168, fp8_dtype),
(256, 32, 7168, fp8_dtype),
(256, 64, 7168, fp8_dtype),
@ -38,14 +44,159 @@ CASES = [
]
def as_uint8(x) -> torch.Tensor:
"""Copy x into a contiguous buffer and reinterpret its bytes as uint8."""
return (
torch.empty(x.shape, dtype=x.dtype, device=x.device).copy_(x).view(torch.uint8)
)
def silu(x: torch.Tensor) -> torch.Tensor:
one_f32 = torch.tensor([1.0], device=x.device, dtype=torch.float32)
x_f32 = x.to(torch.float32)
act_f32 = x_f32 / (one_f32 + torch.exp(-x_f32))
assert act_f32.dtype == torch.float32
return act_f32.to(torch.bfloat16)
def do_quant(x: torch.Tensor, group_size: int, ceil_ue8m0: bool):
eps_bf16 = torch.tensor([1e-10], device=x.device, dtype=torch.bfloat16)
one_bf16 = torch.tensor([1.0], device=x.device, dtype=torch.bfloat16)
fp8_max_bf16 = torch.tensor(
[torch.finfo(fp8_dtype).max], device=x.device, dtype=torch.bfloat16
)
fp8_min_bf16 = torch.tensor(
[torch.finfo(fp8_dtype).min], device=x.device, dtype=torch.bfloat16
)
fp8_max_inv = one_bf16 / fp8_max_bf16
assert fp8_max_inv.dtype == torch.bfloat16
assert x.size(-1) % group_size == 0
num_groups = x.numel() // group_size
x_og_shape = x.shape
x = x.to(torch.bfloat16)
x = x.view((-1, group_size))
amax = x.abs().amax(dim=1).clamp(min=eps_bf16)
assert amax.dtype == torch.bfloat16
s = amax * fp8_max_inv
if ceil_ue8m0:
s = torch.exp2(
torch.ceil(torch.log2(s).to(torch.bfloat16)).to(torch.bfloat16)
).to(torch.bfloat16)
inv_s = one_bf16 / s
inv_s = inv_s.view((num_groups, 1))
xq = torch.clamp(x * inv_s, min=fp8_min_bf16.item(), max=fp8_max_bf16.item()).to(
fp8_dtype
)
xq = xq.view(x_og_shape)
xs = s.view((-1, xq.size(-1) // group_size))
return xq, xs
def silu_mul_quant(
gate: torch.Tensor, up: torch.Tensor, group_size: int, ceil_ue8m0: bool
) -> tuple[torch.Tensor, torch.Tensor]:
assert gate.size(-1) % group_size == 0
assert up.size(-1) % group_size == 0
assert gate.dtype == torch.bfloat16
assert up.dtype == torch.bfloat16
act_bf16 = silu(gate)
assert act_bf16.dtype == torch.bfloat16
# act & mul
a_m = act_bf16 * up
assert a_m.dtype == torch.bfloat16
q, s = do_quant(a_m, group_size, ceil_ue8m0)
return q, s
def pack_scales(x: torch.Tensor, tokens_per_expert: torch.Tensor) -> torch.Tensor:
"""
Pack float32 scales into an int32 tensor (4 ue8m0 scale bytes per int32 element).
"""
assert x.dtype == torch.float32
E, T, G = x.size()
# Add i32_padding here so we can view it as an i32 tensor later on.
i32_padding = round_up(G, 4) - G
ref_s_i8 = torch.empty((E, T, G + i32_padding), dtype=torch.uint8, device="cuda")
for e in range(E):
nt = tokens_per_expert[e].item()
ref_s_i8[e, :nt, :G] = x[e, :nt].view(torch.int32) >> 23
ref_s_i32 = ref_s_i8.view(torch.int32)
return ref_s_i32
def ref_with_scale_fmt(
E: int,
T: int,
H: int,
group_size: int,
tokens_per_expert: torch.Tensor,
gate: torch.Tensor,
up: torch.Tensor,
scale_fmt: DeepGemmQuantScaleFMT,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
The precision of the operations performed by this function closely
matches the kernel implementation, so the comparison is more
accurate.
"""
scale_dtype = (
torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
)
ceil_ue8m0 = scale_fmt in [
DeepGemmQuantScaleFMT.UE8M0,
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
]
ref_q = torch.empty((E, T, H), dtype=fp8_dtype, device="cuda")
ref_s_f32 = torch.empty(
(E, T, cdiv(H, group_size)), dtype=torch.float32, device="cuda"
)
for e in range(E):
nt = tokens_per_expert[e].item()
if nt == 0:
continue
ref_q[e, :nt], ref_s_f32[e, :nt] = silu_mul_quant(
gate[e, :nt], up[e, :nt], group_size, ceil_ue8m0=ceil_ue8m0
)
if scale_dtype == torch.float32:
return ref_q, ref_s_f32
assert scale_dtype == torch.int32
return ref_q, pack_scales(ref_s_f32, tokens_per_expert)
def token_random(E, T, H2, tokens_per_expert):
"""
Initialize each token in a random range so we test a range of
scale values.
"""
y = torch.empty((E, T, H2), dtype=torch.bfloat16, device="cuda")
for e in range(E):
for t in range(tokens_per_expert[e].item()):
exp = random.choice(range(1, 20))
y[e, t].uniform_(-(2**exp), 2**exp)
return y
@pytest.mark.parametrize("E,T,H,fp8_type", CASES)
@torch.inference_mode()
def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
group_size = 128
current_platform.seed_everything(42)
# Input tensor of shape (E, T, 2*H)
y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
tokens_per_expert = torch.randint(
low=0,
high=T,
@ -54,71 +205,83 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
device="cuda",
)
# Input tensor of shape (E, T, 2*H)
y = token_random(E, T, 2 * H, tokens_per_expert)
gate = y[..., :H].to(torch.bfloat16)
up = y[..., H:].to(torch.bfloat16)
scale_fmts = [
DeepGemmQuantScaleFMT.FLOAT32,
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
DeepGemmQuantScaleFMT.UE8M0,
]
# Run the SiLU V2 kernel
# TODO (varun): use_e8m0 is set to false as the reference impl does
# not handle that case.
y_q, y_s = persistent_masked_m_silu_mul_quant(
y, tokens_per_expert, group_size=group_size, use_ue8m0=False
)
torch.cuda.synchronize()
fp8_info = torch.finfo(fp8_dtype)
fp8_max = fp8_info.max
fp8_min = fp8_info.min
eps = 1e-10
y1 = y[..., :H].float()
y2 = y[..., H:]
silu_x = y1 * torch.sigmoid(y1)
merged = silu_x * y2
for e in range(E):
nt = tokens_per_expert[e].item()
ref_s = torch.empty(
(T, cdiv(H, group_size)), dtype=torch.float32, device="cuda"
)
ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda")
for t in range(nt):
data = merged[e, t].float()
ref_q_row = torch.empty_like(data)
# process full groups
n_full_groups = H // group_size
if n_full_groups > 0:
data_grp = data[: n_full_groups * group_size].view(
n_full_groups, group_size
)
amax = data_grp.abs().amax(dim=1).clamp(min=eps)
scale = amax / fp8_max
scaled = data[: n_full_groups * group_size] / scale.repeat_interleave(
group_size
)
ref_q_row[: n_full_groups * group_size] = scaled.clamp(
fp8_min, fp8_max
).to(fp8_dtype)
ref_s[t, :n_full_groups] = scale
# process remainder group
rem = H % group_size
if rem > 0:
data_rem = data[-rem:]
amax = data_rem.abs().amax().clamp(min=eps)
scale = amax / fp8_max
scaled = data_rem / scale
ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype)
ref_s[t, -1] = scale
ref_q[t] = ref_q_row
y_se = y_s[e].float()
y_qe = y_q[e].float()
torch.testing.assert_close(
y_qe[:nt].to(torch.float32),
ref_q[:nt].to(torch.float32),
atol=2,
rtol=2e-1,
for scale_fmt in scale_fmts:
y_q, y_s = persistent_masked_m_silu_mul_quant(
y,
tokens_per_expert,
group_size=group_size,
quant_scale_fmt=scale_fmt,
)
torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
ref_y_q, ref_y_s = ref_with_scale_fmt(
E, T, H, group_size, tokens_per_expert, gate, up, scale_fmt=scale_fmt
)
# deepgemm scales transform
dg_scales = None
if (
has_deep_gemm()
and current_platform.has_device_capability(100)
and scale_fmt == DeepGemmQuantScaleFMT.UE8M0
):
from deep_gemm import transform_sf_into_required_layout
_q, _s = ref_with_scale_fmt(
E,
T,
H,
group_size,
tokens_per_expert,
gate,
up,
scale_fmt=DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
)
dg_scales = transform_sf_into_required_layout(
sf=_s,
mn=_q.size(1),
k=_q.size(2),
recipe=(1, 128, 128),
num_groups=_q.size(0),
is_sfa=True,
)
expected_scale_dtype = (
torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
)
assert y_s.dtype == expected_scale_dtype
assert ref_y_s.dtype == expected_scale_dtype
for e in range(E):
nt = tokens_per_expert[e].item()
torch.testing.assert_close(
y_q[e, :nt].to(torch.float32),
ref_y_q[e, :nt].to(torch.float32),
)
if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
G = H // group_size
y_s_sliced = as_uint8(y_s[e])
ref_s_sliced = as_uint8(ref_y_s[e])
torch.testing.assert_close(y_s_sliced[:nt, :G], ref_s_sliced[:nt, :G])
if dg_scales is not None:
dg_sliced = as_uint8(dg_scales[e])
torch.testing.assert_close(y_s_sliced[:nt, :G], dg_sliced[:nt, :G])
else:
torch.testing.assert_close(
y_s[e, :nt],
ref_y_s[e, :nt],
)
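For debugging packed outputs, a hypothetical inverse of pack_scales (not part of this PR; the name unpack_scales is my own) can recover float32 scales from the int32-packed tensor:

import torch

def unpack_scales(packed: torch.Tensor, G: int) -> torch.Tensor:
    """Recover float32 scales from int32-packed ue8m0 exponent bytes."""
    assert packed.dtype == torch.int32
    E, T, _ = packed.size()
    # Each int32 pack holds 4 biased-exponent bytes; drop the padding beyond G.
    exp_bytes = packed.contiguous().view(torch.uint8).reshape(E, T, -1)[..., :G]
    # Scales are positive, so each byte is just the biased exponent: 2**(e - 127).
    return torch.exp2(exp_bytes.to(torch.float32) - 127.0)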

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
@ -13,14 +14,33 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import (
DeepGemmQuantScaleFMT,
fp8_m_grouped_gemm_nt_masked,
get_mk_alignment_for_contiguous_layout,
is_deep_gemm_e8m0_used,
)
from vllm.utils.math_utils import cdiv
logger = init_logger(__name__)
def scales_shape_stride_dtype(
E: int, T: int, G: int, quant_scale_fmt: DeepGemmQuantScaleFMT
) -> tuple[tuple[int, ...], tuple[int, ...], torch.dtype]:
"""Return the scales tensor (shape, strides, dtype) for the given quant scale format."""
shape = (E, T, G)
strides = (T * G, 1, T)
if quant_scale_fmt in [
DeepGemmQuantScaleFMT.FLOAT32,
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
]:
return shape, strides, torch.float32
assert quant_scale_fmt == DeepGemmQuantScaleFMT.UE8M0
shape = (E, T, cdiv(G, 4))
strides = (T * cdiv(G, 4), 1, T)
return shape, strides, torch.int32
@triton.jit
def _silu_mul_fp8_quant_deep_gemm(
# Pointers ------------------------------------------------------------
@ -49,7 +69,7 @@ def _silu_mul_fp8_quant_deep_gemm(
eps: tl.constexpr,
fp8_min: tl.constexpr,
fp8_max: tl.constexpr,
use_ue8m0: tl.constexpr,
ceil_ue8m0: tl.constexpr,
# Meta ---------------------------------------------------------------
BLOCK: tl.constexpr,
NUM_STAGES: tl.constexpr,
@ -86,7 +106,7 @@ def _silu_mul_fp8_quant_deep_gemm(
y = gate * up
y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
if use_ue8m0:
if ceil_ue8m0:
y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
@ -100,7 +120,7 @@ def persistent_masked_m_silu_mul_quant(
tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert
num_parallel_tokens=16,
group_size: int = 128,
use_ue8m0: bool | None = None,
quant_scale_fmt: DeepGemmQuantScaleFMT = DeepGemmQuantScaleFMT.FLOAT32,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
y has shape (E, T, 2*H). The first half of the last dimension is
@ -137,7 +157,13 @@ def persistent_masked_m_silu_mul_quant(
Returns `(y_q, y_s)` where
* `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
* `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
* `y_s` depends on quant_scale_fmt (with G = H // group_size):
- quant_scale_fmt == FLOAT32:
`y_s`: FP32 tensor, shape (E, T, G), strides (T*G, 1, T)
- quant_scale_fmt == FLOAT32_CEIL_UE8M0:
`y_s`: FP32 tensor, shape (E, T, G), strides (T*G, 1, T), scales rounded up to powers of two
- quant_scale_fmt == UE8M0:
`y_s`: Int32 tensor, shape (E, T, ceil(G / 4)), strides (T*ceil(G/4), 1, T)
Let NUM_WARPS be the number of warps in a single thread block and
`GROUP_SIZE = 128` be the size of the quantization group.
"""
@ -155,17 +181,18 @@ def persistent_masked_m_silu_mul_quant(
fp8_dtype = torch.float8_e4m3fn
y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
stride_ys_e = T * G
stride_ys_t = 1
stride_ys_g = T
ys_shape, ys_strides, ys_dtype = scales_shape_stride_dtype(E, T, G, quant_scale_fmt)
y_s = torch.empty_strided(
(E, T, G),
(stride_ys_e, stride_ys_t, stride_ys_g),
dtype=torch.float32,
ys_shape,
ys_strides,
dtype=ys_dtype,
device=y.device,
)
use_ue8m0 = use_ue8m0 if use_ue8m0 is not None else is_deep_gemm_e8m0_used()
ceil_ue8m0 = quant_scale_fmt in [
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
DeepGemmQuantScaleFMT.UE8M0,
]
cuda_arch = current_platform.get_device_capability(
device_id=y.device.index
@ -173,7 +200,7 @@ def persistent_masked_m_silu_mul_quant(
if cuda_arch >= 80:
torch.ops._C.persistent_masked_m_silu_mul_quant(
y, tokens_per_expert, y_q, y_s, use_ue8m0
y, tokens_per_expert, y_q, y_s, ceil_ue8m0
)
else:
stride_cnt_e = tokens_per_expert.stride()[0]
@ -189,6 +216,10 @@ def persistent_masked_m_silu_mul_quant(
fp8_max = f_info.max
fp8_min = f_info.min
eps: float = 1e-10
assert y_s.dtype == torch.float32, (
"_silu_mul_fp8_quant_deep_gemm does "
f"not support {y_s.dtype} scales. Only torch.float32 is supported."
)
_silu_mul_fp8_quant_deep_gemm[grid](
y,
y_q,
@ -202,14 +233,14 @@ def persistent_masked_m_silu_mul_quant(
stride_yq_e,
stride_yq_t,
stride_yq_h,
stride_ys_e,
stride_ys_t,
stride_ys_g,
ys_strides[0],
ys_strides[1],
ys_strides[2],
stride_cnt_e,
eps,
fp8_min,
fp8_max,
is_deep_gemm_e8m0_used(),
ceil_ue8m0,
BLOCK=group_size,
NUM_STAGES=4,
num_warps=1,
@ -255,7 +286,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""
DeepGemm supports the packed ue8m0 activation scales format on sm100 devices.
"""
return current_platform.is_device_capability(100)
return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100)
def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
# Let PrepareAndFinalize::finalize() decide the impl.
@ -329,10 +360,17 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
expected_m,
)
quant_scale_fmt = DeepGemmQuantScaleFMT.from_oracle()
a2q, a2q_scale = persistent_masked_m_silu_mul_quant(
workspace1, expert_num_tokens
workspace1,
expert_num_tokens,
quant_scale_fmt=quant_scale_fmt,
)
fp8_m_grouped_gemm_nt_masked(
(a2q, a2q_scale), (w2, self.w2_scale), output, expert_num_tokens, expected_m
(a2q, a2q_scale),
(w2, self.w2_scale),
output,
expert_num_tokens,
expected_m,
)

View File

@ -9,6 +9,7 @@ import functools
import importlib
import os
from collections.abc import Callable
from enum import Enum
from typing import Any, NoReturn
import torch
@ -20,6 +21,28 @@ from vllm.utils.import_utils import has_deep_gemm
from vllm.utils.math_utils import cdiv
class DeepGemmQuantScaleFMT(Enum):
# Float32 scales in a Float32 tensor
FLOAT32 = 0
# Compute float32 scales and ceil the scales to UE8M0.
# Keep the scales in a Float32 tensor.
FLOAT32_CEIL_UE8M0 = 1
# Compute float32 scales and ceil the scales to UE8M0.
# Pack the scales into an int32 tensor where each int32
# element contains 4 scale values.
UE8M0 = 2
@staticmethod
def from_oracle() -> "DeepGemmQuantScaleFMT":
if not is_deep_gemm_e8m0_used():
return DeepGemmQuantScaleFMT.FLOAT32
return (
DeepGemmQuantScaleFMT.UE8M0
if current_platform.is_device_capability(100)
else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
)
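As a quick numeric illustration of the three formats (my own example, not from this diff): for a raw group scale of 0.3, FLOAT32 stores 0.3 unchanged, FLOAT32_CEIL_UE8M0 rounds it up to the next power of two, and UE8M0 keeps only the biased exponent byte of that power of two, four bytes per int32:

import math

import torch

s = 0.3
s_ceil = 2.0 ** math.ceil(math.log2(s))  # FLOAT32_CEIL_UE8M0 -> 0.5
exp_byte = int(torch.tensor([s_ceil]).view(torch.int32)[0]) >> 23  # UE8M0 byte -> 126
assert (s_ceil, exp_byte) == (0.5, 126)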
@functools.cache
def is_deep_gemm_supported() -> bool:
"""Return `True` if DeepGEMM is supported on the current platform.