diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 2dce099770f08..85b7abe817a04 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -24,23 +24,18 @@ from vllm.triton_utils import tl
 
 MNK_FACTORS = [
     (1, 128, 128),
-    (1, 128, 2048),
     (1, 512, 512),
-    (1, 1024, 128),
     (1, 1024, 2048),
     (32, 128, 128),
     (32, 512, 512),
     (32, 1024, 2048),
-    (45, 128, 128),
     (45, 128, 2048),
     (45, 512, 512),
     (45, 1024, 128),
-    (45, 1024, 2048),
     (64, 512, 512),
     (64, 1024, 2048),
     (222, 128, 128),
     (222, 128, 2048),
-    (222, 1024, 128),
     (222, 1024, 2048),
 ]
 NUM_EXPERTS = [8, 64]
@@ -102,7 +97,7 @@ class BatchedMMTensors:
 
 
 @pytest.mark.parametrize("num_experts", [8, 32])
-@pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512])
+@pytest.mark.parametrize("max_tokens_per_expert", [32, 512])
 @pytest.mark.parametrize("K", [128, 1024])
 @pytest.mark.parametrize("N", [128, 1024])
 @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
@@ -121,6 +116,13 @@ def test_batched_mm(
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
+    if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability(
+        89
+    ):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
     if (per_act_token_quant or block_shape is not None) and not use_fp8_w8a8:
         pytest.skip("Don't test blocking for non-quantized types.")
 
@@ -248,6 +250,13 @@ def test_fused_moe_batched_experts(
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
+    if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability(
+        89
+    ):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
     if topk > e:
         pytest.skip("topk > e")
 
diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py
index 11b1e2ff3c27e..b5b42ac03239c 100644
--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -42,18 +42,14 @@ DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 # and its hidden size is 7168.
 MNK_FACTORS = [
     (1, 128, 128),
-    (1, 512, 512),
     (1, 128, 7168),
     (1, 1024, 7168),
     (1, 4608, 128),
-    (1, 4608, 512),
     (1, 4608, 7168),
     (83, 128, 128),
     (83, 512, 512),
-    (83, 1024, 7168),
     (83, 4608, 512),
     (83, 4608, 7168),
-    (128, 128, 128),
     (128, 512, 512),
     (128, 1024, 7168),
     (128, 4608, 512),
@@ -63,22 +59,17 @@ MNK_FACTORS = [
     (2048, 4608, 512),
     (2048, 4608, 7168),
     (8192, 128, 128),
-    (8192, 512, 512),
     (8192, 128, 7168),
     (8192, 1024, 7168),
-    (8192, 4608, 512),
     (8192, 4608, 7168),
 ]
 
 MNK_FACTORS_DG = [
     (128, 128, 128),
-    (128, 512, 512),
     (128, 128, 7168),
     (128, 1024, 7168),
     (128, 4608, 128),
-    (128, 4608, 512),
     (128, 4608, 7168),
-    (192, 128, 128),
     (192, 512, 512),
     (192, 1024, 7168),
     (192, 4608, 512),
@@ -88,11 +79,8 @@ MNK_FACTORS_DG = [
     (1335, 4608, 512),
     (1335, 4608, 7168),
     (2048, 128, 128),
-    (2048, 512, 512),
     (2048, 128, 7168),
     (2048, 1024, 7168),
-    (2048, 4608, 128),
-    (2048, 4608, 512),
     (2048, 4608, 7168),
 ]
 
diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py
index 74cc943714dd9..408cb09d484d0 100644
--- a/tests/kernels/moe/test_block_int8.py
+++ b/tests/kernels/moe/test_block_int8.py
@@ -21,36 +21,29 @@ vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192
 
-DTYPES = [torch.half, torch.bfloat16]
+DTYPES = [torch.bfloat16]
 
 MNK_FACTORS = [
     (1, 128, 128),
-    (1, 512, 512),
     (1, 128, 7168),
     (1, 1024, 7168),
-    (1, 4096, 128),
     (1, 4096, 512),
     (1, 4096, 7168),
-    (33, 128, 128),
     (33, 512, 512),
     (33, 128, 7168),
     (33, 1024, 7168),
     (33, 4096, 128),
-    (33, 4096, 512),
     (33, 4096, 7168),
     (128, 128, 128),
-    (128, 512, 512),
     (128, 1024, 7168),
     (128, 4096, 512),
     (128, 4096, 7168),
-    (222, 128, 128),
     (222, 512, 512),
     (222, 1024, 7168),
     (222, 4096, 512),
     (222, 4096, 7168),
     (2048, 128, 128),
     (2048, 1024, 7168),
-    (2048, 4096, 512),
     (2048, 4096, 4096),
 ]
 
diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py
index 4330eda251f75..5512ccce47b05 100644
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -26,16 +26,13 @@ TOP_KS = [6, 8]
 
 MNK_FACTORS = [
     (2, 1024, 1024),
-    (2, 1024, 1536),
     (2, 3072, 1024),
     (2, 3072, 1536),
     (7, 3072, 1536),
     (64, 1024, 1024),
     (64, 1024, 1536),
     (64, 3072, 1024),
-    (64, 3072, 1536),
     (224, 1024, 1024),
-    (224, 1024, 1536),
     (224, 3072, 1024),
     (224, 3072, 1536),
     (32768, 1024, 1024),
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index 65cd3e110a0fa..97b74ced2e9aa 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -393,7 +393,6 @@ def _test_deepep_deepgemm_moe(
 MNKs = [
     (8, 128, 128),
     (8, 128, 512),
-    (8, 512, 512),
     (3, 1024, 2048),
     (32, 128, 1024),
     (45, 512, 2048),
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index cad0085d5ba6e..9b1054f7d0ab8 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -130,10 +130,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
 # Note: N <= 512 will disable the deepgemm path due to performance issues.
 MNKs = [
     (1024, 768, 128),
-    (1024, 768, 512),
     (2048, 768, 512),
     (512, 1024, 1024),
-    (512, 2048, 2048),
     (4096, 4096, 1024),
 ]
 
diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py
index 0780232a82640..f985f9ac7ca67 100644
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -34,8 +34,6 @@ TOP_KS = [1]
 
 MNK_FACTORS = [
     (256, 8192, 5120),
-    (256, 4096, 5120),
-    (127, 8192, 5120),
     (127, 4096, 5120),
     (10, 8192, 5120),
     (10, 4096, 5120),
diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py
index 18cfd4f79092d..be3e36865d1a4 100644
--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -34,10 +34,8 @@ if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_cap
 
 MNK_FACTORS = [
     (2, 1024, 1024),
-    (2, 1024, 1536),
     (2, 3072, 1024),
     (2, 3072, 1536),
-    (64, 1024, 1024),
     (64, 1024, 1536),
     (64, 3072, 1024),
     (64, 2048, 1536),
@@ -49,7 +47,7 @@ MNK_FACTORS = [
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("e", [40, 64, 256])
 @pytest.mark.parametrize("topk", [1, 6, 8])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @torch.inference_mode()
 def test_flashinfer_fp4_moe_no_graph(
     m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py
index 3f4f142be7674..662e0723b7583 100644
--- a/tests/kernels/moe/test_grouped_topk.py
+++ b/tests/kernels/moe/test_grouped_topk.py
@@ -27,7 +27,7 @@ from vllm.platforms import current_platform
 @pytest.mark.parametrize("topk_group", [2])
 @pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"])
 @pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
 def test_grouped_topk(
     monkeypatch: pytest.MonkeyPatch,
     n_token: int,
diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py
index a86185a2dc461..0617ffd142957 100644
--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
@@ -308,6 +308,16 @@ def test_modular_kernel_combinations_singlegpu(
         world_size=world_size,
     )
 
+    if (
+        (dtype == torch.float8_e4m3fn)
+        or (
+            quant_config is not None and quant_config.quant_dtype == torch.float8_e4m3fn
+        )
+    ) and not current_platform.has_device_capability(89):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
     verbosity = pytestconfig.getoption("verbose")
     run(config, verbosity > 0)
 
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index 2c802ff4e6bd6..014df1fa111f2 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -66,8 +66,6 @@ FUSED_MOE_MNK_FACTORS = [
     (1, 128, 128),
     (1, 2048, 128),
     (33, 2048, 128),
-    (222, 1024, 1024),
-    (32768, 128, 128),
     (32768, 2048, 511),
     (40000, 1024, 1024),
 ]
@@ -76,7 +74,6 @@ FUSED_MOE_WN16_MNK_FACTORS = [
     (1, 128, 128),
     (1, 1024, 1024),
     (32, 2048, 128),
-    (32, 1024, 1024),
     (222, 2048, 1024),
 ]
 
@@ -512,8 +509,8 @@ def marlin_moe_generate_valid_test_cases():
     e_list = [4, 12]
     topk_list = [2, 3]
     ep_size_list = [1, 4]
-    dtype_list = [torch.half, torch.bfloat16]
-    group_size_list = [-1, 16, 32, 128]
+    dtype_list = [torch.bfloat16]
+    group_size_list = [-1, 32, 128]
     act_order_list = [True, False]
     quant_type_list = [
         scalar_types.float4_e2m1f,
@@ -885,10 +882,10 @@ def test_batched_moe_align_block_size_opcheck():
     )
 
 
-@pytest.mark.parametrize("m", [1, 33, 64, 222])
+@pytest.mark.parametrize("m", [1, 33, 222])
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("k", [128, 511, 1024])
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
 def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
     input = torch.randn((m, topk, k), device="cuda", dtype=dtype)
diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py
index dae19c0b2b31b..aa544fe0e0f63 100644
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -26,9 +26,7 @@ MNK_FACTORS = [
     (2, 1024, 1024),
     (2, 1024, 1536),
     (2, 3072, 1024),
-    (2, 3072, 1536),
     (64, 1024, 1024),
-    (64, 1024, 1536),
     (64, 3072, 1024),
     (64, 2048, 1536),
     (224, 1024, 1024),
@@ -39,7 +37,7 @@ MNK_FACTORS = [
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("e", [40, 64, 256])
 @pytest.mark.parametrize("topk", [1, 6, 8])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @torch.inference_mode()
 def test_cutlass_fp4_moe_no_graph(
     m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
index 8b3bebb391f2f..be3a0e25516e5 100644
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@@ -19,20 +19,16 @@ CASES = [
     (32, 64, 256, fp8_dtype),
     (17, 31, 768, fp8_dtype),
     (1, 1, 128 * 1, fp8_dtype),
-    (1, 1, 128 * 2, fp8_dtype),
     (1, 1, 128 * 3, fp8_dtype),
     (1, 1, 128 * 4, fp8_dtype),
     (8, 16, 128 * 1, fp8_dtype),
     (8, 16, 128 * 2, fp8_dtype),
     (8, 16, 128 * 3, fp8_dtype),
-    (8, 16, 128 * 4, fp8_dtype),
     (8, 64, 7168, fp8_dtype),
     (8, 128, 7168, fp8_dtype),
-    (8, 256, 7168, fp8_dtype),
     (8, 512, 7168, fp8_dtype),
     (8, 1024, 7168, fp8_dtype),
     (256, 8, 7168, fp8_dtype),
-    (256, 16, 7168, fp8_dtype),
     (256, 32, 7168, fp8_dtype),
     (256, 64, 7168, fp8_dtype),
     # Only add a few fnuz tests to help with long CI times.
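For reference, below is a minimal standalone sketch of the device-capability guard that the diff inlines into several tests. It assumes only `current_platform` from `vllm.platforms`, as already imported in those files; the helper name `skip_if_fp8_unsupported` is hypothetical (the tests repeat the check inline rather than sharing a helper).

import pytest
import torch

from vllm.platforms import current_platform


def skip_if_fp8_unsupported(dtype: torch.dtype) -> None:
    # Triton's fp8e4nv type (torch.float8_e4m3fn) requires CUDA compute
    # capability >= 8.9, so fp8 cases are skipped on older architectures.
    if dtype == torch.float8_e4m3fn and not current_platform.has_device_capability(89):
        pytest.skip(
            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
        )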