vllm/tests/kernels/test_fla_layernorm_guard.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import torch.nn.functional as F
from vllm.model_executor.layers.fla.ops.layernorm_guard import (
layer_norm_fwd,
layernorm_fn,
rms_norm_ref,
)
from vllm.platforms import current_platform
def layer_norm_ref(
x,
weight,
bias,
z=None,
eps=1e-6,
group_size=None,
norm_before_gate=True,
is_rms_norm=False,
):
"""Reference implementation for both layer norm and RMS norm."""
if is_rms_norm:
# Use the imported rms_norm_ref for RMS norm cases
return rms_norm_ref(
x,
weight,
bias,
z=z,
eps=eps,
group_size=group_size,
norm_before_gate=norm_before_gate,
upcast=True,
)
# Layer norm implementation
dtype = x.dtype
x = x.float()
weight = weight.float()
bias = bias.float() if bias is not None else None
z = z.float() if z is not None else None
if z is not None and not norm_before_gate:
x = x * F.silu(z)
if group_size is None:
# Layer norm: subtract mean
mean = x.mean(dim=-1, keepdim=True)
var = ((x - mean).square()).mean(dim=-1, keepdim=True)
rstd = 1 / torch.sqrt(var + eps)
out = (x - mean) * rstd * weight
if bias is not None:
out = out + bias
else:
# Group norm
from einops import rearrange
x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
mean = x_group.mean(dim=-1, keepdim=True)
var = ((x_group - mean).square()).mean(dim=-1, keepdim=True)
rstd = 1 / torch.sqrt(var + eps)
x_group = (x_group - mean) * rstd
out = rearrange(x_group, "... g d -> ... (g d)") * weight
if bias is not None:
out = out + bias
if z is not None and norm_before_gate:
out *= F.silu(z)
return out.to(dtype)
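
# Illustrative sanity check (a sketch added for clarity, not part of the
# original test matrix): for the ungated, non-grouped layer-norm path the
# reference above should agree with torch.nn.functional.layer_norm, which
# uses the same biased variance and eps-inside-sqrt convention.
@torch.inference_mode()
def test_layer_norm_ref_matches_torch() -> None:
    current_platform.seed_everything(0)
    device = torch.device("cuda:0")
    x = torch.randn(32, 256, dtype=torch.float32, device=device)
    weight = torch.randn(256, dtype=torch.float32, device=device)
    bias = torch.randn(256, dtype=torch.float32, device=device)
    ref = layer_norm_ref(x, weight, bias, z=None, eps=1e-6, is_rms_norm=False)
    torch_out = F.layer_norm(x, (256,), weight, bias, eps=1e-6)
    torch.testing.assert_close(ref, torch_out, atol=1e-4, rtol=1e-4)
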
DTYPES = [torch.bfloat16, torch.float32]
# Test various M sizes to ensure rows_per_block logic works correctly
NUM_TOKENS = [
1,
7,
16,
63,
128,
256,
512,
1024,
2048,
4096,
5789,
8189,
8191,
16383,
32767,
]
HIDDEN_SIZES = [64, 128, 256, 1024]
GROUP_SIZES = [None, 64, 128] # None means full hidden size
NORM_BEFORE_GATE = [True, False]
IS_RMS_NORM = [True, False]
SEEDS = [0, 42]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_basic(
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
seed: int,
is_rms_norm: bool,
) -> None:
"""Test basic layer norm forward pass without z (gate) tensor."""
current_platform.seed_everything(seed)
device = torch.device("cuda:0")
# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6
# Run the triton kernel
out, mean, rstd = layer_norm_fwd(
x, weight, bias, eps, z=None, is_rms_norm=is_rms_norm
)
# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=is_rms_norm)
# Check outputs
assert out.shape == x.shape
assert out.dtype == x.dtype
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
# Check mean and rstd shapes
if not is_rms_norm:
assert mean.shape == (num_tokens,)
assert rstd.shape == (num_tokens,)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", [128, 256, 1024])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("norm_before_gate", NORM_BEFORE_GATE)
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_with_gate(
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
norm_before_gate: bool,
is_rms_norm: bool,
) -> None:
"""Test layer norm forward pass with z (gate) tensor."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
z = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6
# Run the triton kernel
out, mean, rstd = layer_norm_fwd(
x,
weight,
bias,
eps,
z=z,
norm_before_gate=norm_before_gate,
is_rms_norm=is_rms_norm,
)
# Run reference implementation
ref_out = layer_norm_ref(
x,
weight,
bias,
z=z,
eps=eps,
norm_before_gate=norm_before_gate,
is_rms_norm=is_rms_norm,
)
# Check outputs
assert out.shape == x.shape
assert out.dtype == x.dtype
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
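
# Illustrative check of the gating order (a hedged sketch, not from the
# original file): with norm_before_gate=True and is_rms_norm=True, the gated
# kernel output should match "normalize first, then multiply by silu(z)"
# within the low-precision tolerances used throughout this file.
@torch.inference_mode()
def test_gate_applied_after_norm() -> None:
    current_platform.seed_everything(0)
    device = torch.device("cuda:0")
    dtype = torch.bfloat16
    x = torch.randn(64, 256, dtype=dtype, device=device)
    z = torch.randn(64, 256, dtype=dtype, device=device)
    weight = torch.randn(256, dtype=dtype, device=device)
    gated, _, _ = layer_norm_fwd(
        x, weight, None, 1e-6, z=z, norm_before_gate=True, is_rms_norm=True
    )
    ungated, _, _ = layer_norm_fwd(x, weight, None, 1e-6, z=None, is_rms_norm=True)
    expected = (ungated.float() * F.silu(z.float())).to(dtype)
    torch.testing.assert_close(gated, expected, atol=1e-2, rtol=1e-2)
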
@pytest.mark.parametrize("num_tokens", [128, 512])
@pytest.mark.parametrize("hidden_size", [512, 1024])
@pytest.mark.parametrize("group_size", [64, 128, 256])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_with_groups(
num_tokens: int,
hidden_size: int,
group_size: int,
dtype: torch.dtype,
is_rms_norm: bool,
) -> None:
"""Test layer norm forward pass with group normalization."""
if hidden_size % group_size != 0:
pytest.skip(
f"hidden_size {hidden_size} not divisible by group_size {group_size}"
)
current_platform.seed_everything(42)
device = torch.device("cuda:0")
# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6
ngroups = hidden_size // group_size
# Run the triton kernel
out, mean, rstd = layer_norm_fwd(
x, weight, bias, eps, z=None, group_size=group_size, is_rms_norm=is_rms_norm
)
# Run reference implementation
ref_out = layer_norm_ref(
x, weight, bias, z=None, eps=eps, group_size=group_size, is_rms_norm=is_rms_norm
)
# Check outputs
assert out.shape == x.shape
assert out.dtype == x.dtype
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
# Check mean and rstd shapes for groups
if not is_rms_norm:
assert mean.shape == (ngroups * num_tokens,)
assert rstd.shape == (ngroups * num_tokens,)
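
# Illustrative consistency check (a hedged sketch, not from the original
# file): when group_size spans the full hidden size, the grouped path should
# reduce to the ungrouped one (group_size=None, per the comment on
# GROUP_SIZES above).
@torch.inference_mode()
def test_group_size_full_width_matches_ungrouped() -> None:
    current_platform.seed_everything(0)
    device = torch.device("cuda:0")
    dtype = torch.bfloat16
    x = torch.randn(64, 256, dtype=dtype, device=device)
    weight = torch.randn(256, dtype=dtype, device=device)
    bias = torch.randn(256, dtype=dtype, device=device)
    grouped, _, _ = layer_norm_fwd(
        x, weight, bias, 1e-6, z=None, group_size=256, is_rms_norm=False
    )
    ungrouped, _, _ = layer_norm_fwd(
        x, weight, bias, 1e-6, z=None, group_size=None, is_rms_norm=False
    )
    torch.testing.assert_close(grouped, ungrouped, atol=1e-2, rtol=1e-2)
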
@pytest.mark.parametrize("num_tokens", [7, 63, 128, 513, 1024, 2049])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_layer_norm_rows_per_block(
num_tokens: int,
dtype: torch.dtype,
) -> None:
"""Test that rows_per_block logic works correctly for various M sizes."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
hidden_size = 1024
# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6
# Run the triton kernel
out, mean, rstd = layer_norm_fwd(x, weight, bias, eps, z=None, is_rms_norm=False)
# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
# Check outputs
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@torch.inference_mode()
def test_strided_input(dtype: torch.dtype) -> None:
"""Test that the kernel handles non-contiguous (strided)
inputs correctly."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
num_tokens = 128
hidden_size = 1024
# Create a larger tensor and take a strided slice
x_large = torch.randn(num_tokens, hidden_size * 2, dtype=dtype, device=device)
x = x_large[:, :hidden_size]
# Make it contiguous for the kernel
x_contiguous = x.contiguous()
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6
# Run the triton kernel with contiguous input
out, mean, rstd = layer_norm_fwd(
x_contiguous, weight, bias, eps, z=None, is_rms_norm=False
)
# Run reference implementation
ref_out = layer_norm_ref(
x_contiguous, weight, bias, z=None, eps=eps, is_rms_norm=False
)
# Check outputs
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
@pytest.mark.parametrize("num_tokens", [1, 128, 2048])
@pytest.mark.parametrize("hidden_size", [768, 4096])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_output_buffer_provided(
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
) -> None:
"""Test that the kernel works when an output buffer is provided."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6
# Pre-allocate output buffer
out_buffer = torch.empty_like(x)
# Run the triton kernel with provided output
out, mean, rstd = layer_norm_fwd(
x, weight, bias, eps, z=None, out=out_buffer, is_rms_norm=False
)
# Check that the provided buffer was used
assert out.data_ptr() == out_buffer.data_ptr()
# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
# Check outputs
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
@pytest.mark.parametrize(
"shape",
[
(4, 16, 1024), # 3D tensor
(2, 8, 512, 256), # 4D tensor
],
)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_multidimensional_input(
shape: tuple,
dtype: torch.dtype,
) -> None:
"""Test that the autograd function handles multidimensional inputs."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
hidden_size = shape[-1]
# Create inputs
x = torch.randn(*shape, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6
# Run through autograd function
out = layernorm_fn(x, weight, bias, z=None, eps=eps)
# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
# Check outputs
assert out.shape == x.shape
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
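
# Illustrative equivalence (a hedged sketch, not from the original file):
# the autograd wrapper applied to a 3D input should match running the 2D
# forward kernel on the flattened (-1, hidden_size) view and reshaping back.
@torch.inference_mode()
def test_multidimensional_matches_flattened_2d() -> None:
    current_platform.seed_everything(0)
    device = torch.device("cuda:0")
    dtype = torch.bfloat16
    hidden_size = 256
    x = torch.randn(4, 16, hidden_size, dtype=dtype, device=device)
    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6
    out_3d = layernorm_fn(x, weight, bias, z=None, eps=eps)
    out_2d, _, _ = layer_norm_fwd(
        x.reshape(-1, hidden_size), weight, bias, eps, z=None, is_rms_norm=False
    )
    torch.testing.assert_close(out_3d, out_2d.reshape(x.shape), atol=1e-2, rtol=1e-2)
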
if __name__ == "__main__":
    # Run a few quick smoke tests
test_layer_norm_fwd_basic(128, 1024, torch.float16, 42, False)
test_layer_norm_fwd_with_gate(128, 1024, torch.float16, True, False)
test_layer_norm_rows_per_block(513, torch.float16)
print("All smoke tests passed!")