mirror of https://git.datalinker.icu/vllm-project/vllm.git
Speed up the kernels/quantization/ tests (#18669)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent 75f81750f3
commit 63934543a0
@@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
-NUM_TOKENS = [7, 83, 2048]
+NUM_TOKENS = [7, 2050]
 D = [512, 4096, 5120, 13824]
-GROUP_SIZE = [64, 128, 256, 512]
-M = [1, 7, 8, 83, 84, 512, 2048, 4096]
-N = [128, 512, 1024, 4096, 7168, 7748, 13824]
-K = [256, 4096, 5120, 3884, 13824, 16384]
+GROUP_SIZE = [64, 128, 512]
+M = [1, 7, 8, 83, 84, 4096]
+N = [128, 512, 7168, 7748, 13824]
+K = [256, 3884, 4096, 13824, 16384]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
-M_moe = [1, 2, 7, 83, 128, 512, 2048]
-M_moe_dg = [128, 192, 512, 1335, 2048]
+M_moe = [1, 2, 7, 83, 128, 2048]
+M_moe_dg = [128, 192, 1335, 2048]
 N_moe = [128, 256, 1024, 4608]  # [13824]
 K_moe = [256, 512, 7168]  # [13824]
 BLOCK_SIZE = [[128, 128]]
 
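Note: the lists above feed pytest parametrization, so the number of generated test cases scales with the product of the list lengths. As a rough sketch of the effect, assuming a single test were parametrized over M, N, K and GROUP_SIZE together (the real tests may split these across several functions), the list sizes taken from this hunk give:

import math

# Sizes of the lists before/after this change, counted from the diff above:
# M 8 -> 6, N 7 -> 5, K 6 -> 5, GROUP_SIZE 4 -> 3.
before = {"M": 8, "N": 7, "K": 6, "GROUP_SIZE": 4}
after = {"M": 6, "N": 5, "K": 5, "GROUP_SIZE": 3}

print(math.prod(before.values()))  # 1344 combinations
print(math.prod(after.values()))   # 450 combinations, roughly 3x fewer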
@@ -35,11 +35,11 @@ def get_gguf_MoE_tensors(
     return GGUFReader(sample_file).tensors
 
 
-DTYPES = [torch.half, torch.bfloat16, torch.float32]
+DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 # Hidden_size for testing, must match the sample file in HF repo,
 # we have `hidden_size = 256, 1024` for test in HF repo currently.
 HIDDEN_SIZES = [256, 1024]
-NUM_TOKENS = [7, 83, 128, 2048]  # Arbitrary values for testing
+NUM_TOKENS = [7, 2050]  # Arbitrary values for testing
 SEEDS = [0]
 QUANT_TYPES = [
     # i-matrix
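For context, a minimal sketch (not the actual GGUF test) of how module-level lists like DTYPES, HIDDEN_SIZES and NUM_TOKENS are typically wired into the test matrix with stacked pytest.mark.parametrize decorators; with the trimmed lists this yields 1 * 2 * 2 = 4 cases per test:

import pytest
import torch

DTYPES = [torch.bfloat16]
HIDDEN_SIZES = [256, 1024]
NUM_TOKENS = [7, 2050]


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
def test_shape_matrix(num_tokens: int, hidden_size: int, dtype: torch.dtype):
    # One test run per element of the cross product of the three lists.
    x = torch.zeros(num_tokens, hidden_size, dtype=dtype)
    assert x.shape == (num_tokens, hidden_size)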
@@ -13,8 +13,13 @@ from vllm.platforms import current_platform
 
 device = "cuda"
 
+triton_scaled_mm_module = importlib.import_module(
+    "vllm.model_executor.layers.quantization.compressed_tensors."
+    "triton_scaled_mm")
+triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+
 
-def scaled_mm_torch(a: torch.Tensor,
+def torch_scaled_mm(a: torch.Tensor,
                     b: torch.Tensor,
                     scale_a: torch.Tensor,
                     scale_b: torch.Tensor,
@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
     if use_bias:
         bias = torch.rand((N, ), device=device, dtype=out_dtype)
 
-    triton_scaled_mm_module = importlib.import_module(
-        "vllm.model_executor.layers.quantization.compressed_tensors."
-        "triton_scaled_mm")
-    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-
     c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    a_cpu = a.cpu()
-    b_cpu = b.cpu()
-    scale_a_cpu = scale_a.cpu()
-    scale_b_cpu = scale_b.cpu()
-    bias_cpu = None if bias is None else bias.cpu()
-
-    c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
-                               out_dtype, bias_cpu)
-
-    c_check_cpu = c_check.cpu()
-    torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
+    c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+    torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
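The key change in this hunk is that the reference result (from the function renamed scaled_mm_torch -> torch_scaled_mm) is now computed on the same device as the Triton kernel output, so the per-case copies of a, b, the scales and the bias to the CPU, and the reference matmul on CPU, are gone. For orientation only, a hedged sketch of what such a torch reference for a scaled matmul can look like, assuming per-row scales for a and per-column scales for b; the actual torch_scaled_mm used by the test may be implemented differently:

from typing import Optional

import torch


def torch_scaled_mm_sketch(a: torch.Tensor,
                           b: torch.Tensor,
                           scale_a: torch.Tensor,
                           scale_b: torch.Tensor,
                           out_dtype: torch.dtype,
                           bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Assumes a is (M, K), b is (K, N), scale_a is (M, 1) or (1, 1) and
    # scale_b is (N, 1) or (1, 1): do the matmul in fp32, apply the row and
    # column scales to the product, add the optional bias, then cast.
    out = (a.to(torch.float32) @ b.to(torch.float32)) * scale_a * scale_b.t()
    if bias is not None:
        out = out + bias
    return out.to(out_dtype)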