diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 69317405d48b..edf3e6189243 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -89,14 +89,11 @@ class BatchedMMTensors:
         return BatchedMMTensors(A, B, C, num_expert_tokens)
 
 
-@pytest.mark.parametrize("num_experts", [8, 16, 32])
-@pytest.mark.parametrize("max_tokens_per_expert",
-                         [32, 64, 128, 192, 224, 256, 512])
-@pytest.mark.parametrize("K", [128, 256, 1024])
-@pytest.mark.parametrize("N", [128, 256, 1024])
-@pytest.mark.parametrize(
-    "dtype",
-    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("num_experts", [8, 32])
+@pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512])
+@pytest.mark.parametrize("K", [128, 1024])
+@pytest.mark.parametrize("N", [128, 1024])
+@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 @pytest.mark.parametrize("per_act_token_quant", [False, True])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
diff --git a/tests/kernels/moe/test_count_expert_num_tokens.py b/tests/kernels/moe/test_count_expert_num_tokens.py
index 0872836b6064..1768baaf1ca7 100644
--- a/tests/kernels/moe/test_count_expert_num_tokens.py
+++ b/tests/kernels/moe/test_count_expert_num_tokens.py
@@ -113,8 +113,7 @@ def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
                                rtol=0)
 
 
-@pytest.mark.parametrize(
-    "num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317])
+@pytest.mark.parametrize("num_tokens", [1, 4, 8, 11, 127, 128, 3333, 7317])
 @pytest.mark.parametrize("num_topk", [2, 6, 8])
 @pytest.mark.parametrize("num_experts", [64])
 @pytest.mark.parametrize("ep_size", [1, 2, 4])
@@ -126,7 +125,7 @@ def test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
                                       ep_size, topk_ids_dtype)
 
 
-@pytest.mark.parametrize("numel", list(range(1, 8192, 11)))
+@pytest.mark.parametrize("numel", list(range(1, 8192, 111)))
 @pytest.mark.parametrize("num_experts", [32])
 @pytest.mark.parametrize("ep_size", [2])
 @pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index b82c74a42ab3..1951eb0c6180 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -42,6 +42,24 @@ NUM_EXPERTS = [8, 64, 192]
 EP_SIZE = [1, 4]
 TOP_KS = [2, 6]
 
+FUSED_MOE_MNK_FACTORS = [
+    (1, 128, 128),
+    (1, 2048, 128),
+    (33, 2048, 128),
+    (222, 1024, 1024),
+    (32768, 128, 128),
+    (32768, 2048, 511),
+    (40000, 1024, 1024),
+]
+
+FUSED_MOE_WN16_MNK_FACTORS = [
+    (1, 128, 128),
+    (1, 1024, 1024),
+    (32, 2048, 128),
+    (32, 1024, 1024),
+    (222, 2048, 1024),
+]
+
 vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192
@@ -116,13 +134,11 @@ def run_moe_test(
     return baseline_output
 
 
-@pytest.mark.parametrize("m", [1, 33, 64, 222, 32768, 40000])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("m,n,k", FUSED_MOE_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("ep_size", EP_SIZE)
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("padding", [True, False])
 @pytest.mark.parametrize("chunk_size", [8192])
 def test_fused_moe(
@@ -235,13 +251,11 @@ def test_fused_moe(
                  use_cudagraph=use_cudagraph)
 
 
-@pytest.mark.parametrize("m", [1, 32, 222])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 1024])
+@pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("ep_size", EP_SIZE)
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("group_size", [64, 128])
 @pytest.mark.parametrize("has_zp", [True, False])
 @pytest.mark.parametrize("weight_bits", [4, 8])
@@ -352,8 +366,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
     torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
 
 
-@pytest.mark.parametrize("dtype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("padding", [True, False])
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py
index 12ef9e776c3a..5dfc8d9fab32 100644
--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
@@ -15,10 +15,10 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
 from vllm.platforms import current_platform
 from vllm.utils import round_up
 
-NUM_TOKENS = [1, 3, 7, 16, 256, 2256, 4096]
-NUM_EXPERTS = [32, 160, 256, 257, 512]
+NUM_TOKENS = [1, 3, 256, 2256, 4096]
+NUM_EXPERTS = [32, 160, 256, 257]
 TOP_KS = [1, 2, 16, 32]
-BLOCK_SIZES = [32, 64, 128, 256]
+BLOCK_SIZES = [32, 128]
 
 current_platform.seed_everything(0)
 
diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py
index 8d215a0cbeed..6ca01f9271bb 100644
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 from vllm.platforms import current_platform
 
 NUM_EXPERTS = [16, 64, 256]
-TOP_KS = [2, 4, 6, 8]
+TOP_KS = [2, 6, 8]
 EP_SIZE = [1, 4, 16]
 
 current_platform.seed_everything(0)
@@ -177,11 +177,11 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
     return output
 
 
-@pytest.mark.parametrize("n_token", [1, 33, 64, 222, 1024, 2048, 3000, 5000])
-@pytest.mark.parametrize("n_hidden", [2048, 4096, 7168])
+@pytest.mark.parametrize("n_token", [1, 33, 1024, 5000])
+@pytest.mark.parametrize("n_hidden", [2048, 7168])
 @pytest.mark.parametrize("n_expert", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("ep_size", EP_SIZE)
 @pytest.mark.parametrize("align_block_size", [None, 128])
 def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index f7a661b4bc7b..fbef6706beaf 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -44,6 +44,14 @@ requires_pplx = pytest.mark.skipif(
     reason="Requires PPLX kernels",
 )
 
+BATCHED_MOE_MNK_FACTORS = [
+    (1, 128, 128),
+    (33, 2048, 128),
+    (64, 128, 2048),
+    (222, 128, 128),
+    (222, 2048, 1024),
+]
+
 PPLX_COMBOS = [
     # TODO: figure out why this fails, seems to be test problem
     #(1, 128, 128),
@@ -152,9 +160,7 @@ def torch_batched_moe(
     return torch_finalize(out, topk_weight, topk_ids)
 
 
-@pytest.mark.parametrize("m", [1, 33, 64, 222])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])
+@pytest.mark.parametrize("m,n,k", BATCHED_MOE_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])