From 63934543a0f05edfc6a5f2afa235b5e026b27b71 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Sun, 25 May 2025 01:02:59 -0400
Subject: [PATCH] Speed up the `kernels/quantization/` tests (#18669)

Signed-off-by: mgoin
---
 tests/kernels/quantization/test_block_fp8.py | 14 +++++++-------
 tests/kernels/quantization/test_gguf.py      |  4 ++--
 .../quantization/test_triton_scaled_mm.py    | 24 ++++++++----------------
 3 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index ef1d7e47ef81..ae05d61173f3 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192
 
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
-NUM_TOKENS = [7, 83, 2048]
+NUM_TOKENS = [7, 2050]
 D = [512, 4096, 5120, 13824]
-GROUP_SIZE = [64, 128, 256, 512]
-M = [1, 7, 8, 83, 84, 512, 2048, 4096]
-N = [128, 512, 1024, 4096, 7168, 7748, 13824]
-K = [256, 4096, 5120, 3884, 13824, 16384]
+GROUP_SIZE = [64, 128, 512]
+M = [1, 7, 8, 83, 84, 4096]
+N = [128, 512, 7168, 7748, 13824]
+K = [256, 3884, 4096, 13824, 16384]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
-M_moe = [1, 2, 7, 83, 128, 512, 2048]
-M_moe_dg = [128, 192, 512, 1335, 2048]
+M_moe = [1, 2, 7, 83, 128, 2048]
+M_moe_dg = [128, 192, 1335, 2048]
 N_moe = [128, 256, 1024, 4608]  # [13824]
 K_moe = [256, 512, 7168]  # [13824]
 BLOCK_SIZE = [[128, 128]]
diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py
index 6cf88604ec65..e520e99b071c 100644
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -35,11 +35,11 @@ def get_gguf_MoE_tensors(
     return GGUFReader(sample_file).tensors
 
 
-DTYPES = [torch.half, torch.bfloat16, torch.float32]
+DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 # Hidden_size for testing, must match the sample file in HF repo,
 # we have `hidden_size = 256, 1024` for test in HF repo currently.
 HIDDEN_SIZES = [256, 1024]
-NUM_TOKENS = [7, 83, 128, 2048]  # Arbitrary values for testing
+NUM_TOKENS = [7, 2050]  # Arbitrary values for testing
 SEEDS = [0]
 QUANT_TYPES = [
     # i-matrix
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index 45f10b0eb1d5..30e6eeb8d566 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -13,8 +13,13 @@ from vllm.platforms import current_platform
 
 device = "cuda"
 
+triton_scaled_mm_module = importlib.import_module(
+    "vllm.model_executor.layers.quantization.compressed_tensors."
+    "triton_scaled_mm")
+triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
 
-def scaled_mm_torch(a: torch.Tensor,
+
+def torch_scaled_mm(a: torch.Tensor,
                     b: torch.Tensor,
                     scale_a: torch.Tensor,
                     scale_b: torch.Tensor,
@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
     if use_bias:
         bias = torch.rand((N, ), device=device, dtype=out_dtype)
 
-    triton_scaled_mm_module = importlib.import_module(
-        "vllm.model_executor.layers.quantization.compressed_tensors."
-        "triton_scaled_mm")
-    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-
     c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    a_cpu = a.cpu()
-    b_cpu = b.cpu()
-    scale_a_cpu = scale_a.cpu()
-    scale_b_cpu = scale_b.cpu()
-    bias_cpu = None if bias is None else bias.cpu()
+    c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
-                               out_dtype, bias_cpu)
-
-    c_check_cpu = c_check.cpu()
-    torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
+    torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)