[CI Perf] Prune tests in tests/kernels/moe/ (#22939)
Signed-off-by: mgoin <mgoin64@gmail.com>
commit d2b0e97ea6
parent 590bddbfc5
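The pruning pattern applied throughout this diff is the same: collapse independently parametrized m/n/k axes, which expand into a full Cartesian product, into a single "m,n,k" axis over a short, hand-picked list of shape tuples (FUSED_MOE_MNK_FACTORS, FUSED_MOE_WN16_MNK_FACTORS, BATCHED_MOE_MNK_FACTORS), and trim redundant dtype and size values. A minimal sketch of the pattern follows; DEMO_MNK_FACTORS and test_demo_shapes are illustrative placeholders, not names from the vLLM test suite.

import pytest

# Before: three independent axes multiply out to 6 * 3 * 3 = 54 shape cases
# (times every other parametrized axis on the test).
#
#   @pytest.mark.parametrize("m", [1, 33, 64, 222, 32768, 40000])
#   @pytest.mark.parametrize("n", [128, 1024, 2048])
#   @pytest.mark.parametrize("k", [128, 511, 1024])

# After: one axis over a curated list keeps the interesting edge cases
# (tiny m, huge m, odd k) while generating only len(list) shape cases.
DEMO_MNK_FACTORS = [
    (1, 128, 128),
    (33, 2048, 128),
    (32768, 2048, 511),
    (40000, 1024, 1024),
]


@pytest.mark.parametrize("m,n,k", DEMO_MNK_FACTORS)
def test_demo_shapes(m: int, n: int, k: int):
    # Placeholder body; the real tests run the fused MoE kernels on (m, n, k).
    assert m > 0 and n > 0 and k > 0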
@@ -89,14 +89,11 @@ class BatchedMMTensors:
         return BatchedMMTensors(A, B, C, num_expert_tokens)
 
 
-@pytest.mark.parametrize("num_experts", [8, 16, 32])
-@pytest.mark.parametrize("max_tokens_per_expert",
-                         [32, 64, 128, 192, 224, 256, 512])
-@pytest.mark.parametrize("K", [128, 256, 1024])
-@pytest.mark.parametrize("N", [128, 256, 1024])
-@pytest.mark.parametrize(
-    "dtype",
-    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("num_experts", [8, 32])
+@pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512])
+@pytest.mark.parametrize("K", [128, 1024])
+@pytest.mark.parametrize("N", [128, 1024])
+@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 @pytest.mark.parametrize("per_act_token_quant", [False, True])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
@@ -113,8 +113,7 @@ def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
                                rtol=0)
 
 
-@pytest.mark.parametrize(
-    "num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317])
+@pytest.mark.parametrize("num_tokens", [1, 4, 8, 11, 127, 128, 3333, 7317])
 @pytest.mark.parametrize("num_topk", [2, 6, 8])
 @pytest.mark.parametrize("num_experts", [64])
 @pytest.mark.parametrize("ep_size", [1, 2, 4])
@@ -126,7 +125,7 @@ def test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
                                       ep_size, topk_ids_dtype)
 
 
-@pytest.mark.parametrize("numel", list(range(1, 8192, 11)))
+@pytest.mark.parametrize("numel", list(range(1, 8192, 111)))
 @pytest.mark.parametrize("num_experts", [32])
 @pytest.mark.parametrize("ep_size", [2])
 @pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
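For scale (an illustrative calculation, not part of the diff): widening the stride shrinks this one axis roughly 10x before it is multiplied by the other parametrize axes.

# Stride 11 -> 111 over the same range cuts the "numel" axis from 745 to 74 values.
assert len(range(1, 8192, 11)) == 745
assert len(range(1, 8192, 111)) == 74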
@@ -42,6 +42,24 @@ NUM_EXPERTS = [8, 64, 192]
 EP_SIZE = [1, 4]
 TOP_KS = [2, 6]
 
+FUSED_MOE_MNK_FACTORS = [
+    (1, 128, 128),
+    (1, 2048, 128),
+    (33, 2048, 128),
+    (222, 1024, 1024),
+    (32768, 128, 128),
+    (32768, 2048, 511),
+    (40000, 1024, 1024),
+]
+
+FUSED_MOE_WN16_MNK_FACTORS = [
+    (1, 128, 128),
+    (1, 1024, 1024),
+    (32, 2048, 128),
+    (32, 1024, 1024),
+    (222, 2048, 1024),
+]
+
 vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192
@@ -116,13 +134,11 @@ def run_moe_test(
     return baseline_output
 
 
-@pytest.mark.parametrize("m", [1, 33, 64, 222, 32768, 40000])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("m,n,k", FUSED_MOE_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("ep_size", EP_SIZE)
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("padding", [True, False])
 @pytest.mark.parametrize("chunk_size", [8192])
 def test_fused_moe(
@@ -235,13 +251,11 @@ def test_fused_moe(
                  use_cudagraph=use_cudagraph)
 
 
-@pytest.mark.parametrize("m", [1, 32, 222])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 1024])
+@pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("ep_size", EP_SIZE)
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("group_size", [64, 128])
 @pytest.mark.parametrize("has_zp", [True, False])
 @pytest.mark.parametrize("weight_bits", [4, 8])
@@ -352,8 +366,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
     torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
 
 
-@pytest.mark.parametrize("dtype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("padding", [True, False])
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
@@ -15,10 +15,10 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
 from vllm.platforms import current_platform
 from vllm.utils import round_up
 
-NUM_TOKENS = [1, 3, 7, 16, 256, 2256, 4096]
-NUM_EXPERTS = [32, 160, 256, 257, 512]
+NUM_TOKENS = [1, 3, 256, 2256, 4096]
+NUM_EXPERTS = [32, 160, 256, 257]
 TOP_KS = [1, 2, 16, 32]
-BLOCK_SIZES = [32, 64, 128, 256]
+BLOCK_SIZES = [32, 128]
 current_platform.seed_everything(0)
 
 
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 from vllm.platforms import current_platform
 
 NUM_EXPERTS = [16, 64, 256]
-TOP_KS = [2, 4, 6, 8]
+TOP_KS = [2, 6, 8]
 EP_SIZE = [1, 4, 16]
 current_platform.seed_everything(0)
 
@@ -177,11 +177,11 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
     return output
 
 
-@pytest.mark.parametrize("n_token", [1, 33, 64, 222, 1024, 2048, 3000, 5000])
-@pytest.mark.parametrize("n_hidden", [2048, 4096, 7168])
+@pytest.mark.parametrize("n_token", [1, 33, 1024, 5000])
+@pytest.mark.parametrize("n_hidden", [2048, 7168])
 @pytest.mark.parametrize("n_expert", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("ep_size", EP_SIZE)
 @pytest.mark.parametrize("align_block_size", [None, 128])
 def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
@@ -44,6 +44,14 @@ requires_pplx = pytest.mark.skipif(
     reason="Requires PPLX kernels",
 )
 
+BATCHED_MOE_MNK_FACTORS = [
+    (1, 128, 128),
+    (33, 2048, 128),
+    (64, 128, 2048),
+    (222, 128, 128),
+    (222, 2048, 1024),
+]
+
 PPLX_COMBOS = [
     # TODO: figure out why this fails, seems to be test problem
     #(1, 128, 128),
@@ -152,9 +160,7 @@ def torch_batched_moe(
     return torch_finalize(out, topk_weight, topk_ids)
 
 
-@pytest.mark.parametrize("m", [1, 33, 64, 222])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])
+@pytest.mark.parametrize("m,n,k", BATCHED_MOE_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
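As a rough measure of the overall effect (illustrative only, ignoring any runtime skips), counting the parametrize combinations collected for test_fused_moe from the value lists visible in the hunks above:

import math

# Combinations before the change: m(6) * n(3) * k(3) * e(3) * topk(2)
# * ep_size(2) * dtype(2) * padding(2) * chunk_size(1).
before = math.prod([6, 3, 3, 3, 2, 2, 2, 2, 1])

# Combinations after: (m, n, k) factors(7) * e(3) * topk(2) * ep_size(2)
# * dtype(1) * padding(2) * chunk_size(1).
after = math.prod([7, 3, 2, 2, 1, 2, 1])

print(before, after)  # 2592 -> 168, roughly a 15x reduction for this one test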