diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 4ae23eff62f37..6015a83e82950 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -16,7 +16,7 @@ import numpy import pytest import yaml -RTOL = 0.05 +RTOL = 0.08 TEST_DATA_FILE = os.environ.get( "LM_EVAL_TEST_DATA_FILE", ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2420b2d5d71b5..ec00bc7f108df 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -317,15 +317,46 @@ steps: commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Test %N # 1h each - mirror_hardwares: [amd] +- label: Kernels Core Operation Test source_file_dependencies: - csrc/ - - vllm/attention - - tests/kernels + - tests/kernels/core commands: - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 + - pytest -v -s kernels/core + +- label: Kernels Attention Test %N + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test + source_file_dependencies: + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + commands: + - pytest -v -s kernels/moe + +- label: Kernels Mamba Test + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + commands: + - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min # mirror_hardwares: [amd] diff --git a/tests/kernels/conftest.py b/tests/kernels/attention/conftest.py similarity index 100% rename from tests/kernels/conftest.py rename to tests/kernels/attention/conftest.py diff --git a/tests/kernels/test_attention.py b/tests/kernels/attention/test_attention.py similarity index 99% rename from tests/kernels/test_attention.py rename to tests/kernels/attention/test_attention.py index 0d7898a900e48..e5650136f2584 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -6,13 +6,12 @@ from typing import Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py similarity index 95% rename from tests/kernels/test_attention_selector.py rename to tests/kernels/attention/test_attention_selector.py index 2b5e0a29ddc55..b0414244c2151 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -156,6 +156,15 @@ def test_env( expected = ("TRITON_MLA_VLLM_V1" if use_v1 else "TRITON_MLA") assert backend.get_name() == expected + elif name == "FLASHINFER": + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASHINFER_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected else: backend = get_attn_backend(16, torch.float16, diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py similarity index 99% rename from tests/kernels/test_blocksparse_attention.py rename to tests/kernels/attention/test_blocksparse_attention.py index 3025ae0f921a4..82d038257575c 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -6,14 +6,13 @@ from typing import Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/attention/test_cache.py similarity index 100% rename from tests/kernels/test_cache.py rename to tests/kernels/attention/test_cache.py diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py similarity index 100% rename from tests/kernels/test_cascade_flash_attn.py rename to tests/kernels/attention/test_cascade_flash_attn.py diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py similarity index 100% rename from tests/kernels/test_encoder_decoder_attn.py rename to tests/kernels/attention/test_encoder_decoder_attn.py diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py similarity index 100% rename from tests/kernels/test_flash_attn.py rename to tests/kernels/attention/test_flash_attn.py diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py similarity index 100% rename from tests/kernels/test_flashinfer.py rename to tests/kernels/attention/test_flashinfer.py diff --git a/tests/kernels/test_flashmla.py b/tests/kernels/attention/test_flashmla.py similarity index 100% rename from tests/kernels/test_flashmla.py rename to tests/kernels/attention/test_flashmla.py diff --git a/tests/kernels/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py similarity index 100% rename from tests/kernels/test_lightning_attn.py rename to tests/kernels/attention/test_lightning_attn.py diff --git a/tests/kernels/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py similarity index 100% rename from tests/kernels/test_merge_attn_states.py rename to tests/kernels/attention/test_merge_attn_states.py diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py similarity index 100% rename from tests/kernels/test_mha_attn.py rename to tests/kernels/attention/test_mha_attn.py diff --git a/tests/kernels/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py similarity index 100% rename from tests/kernels/test_mla_decode_cpu.py rename to tests/kernels/attention/test_mla_decode_cpu.py diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py similarity index 100% rename from tests/kernels/test_prefix_prefill.py rename to tests/kernels/attention/test_prefix_prefill.py diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py similarity index 100% rename from tests/kernels/test_rocm_attention_selector.py rename to tests/kernels/attention/test_rocm_attention_selector.py diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py similarity index 100% rename from tests/kernels/test_triton_decode_attention.py rename to tests/kernels/attention/test_triton_decode_attention.py diff --git a/tests/kernels/test_activation.py b/tests/kernels/core/test_activation.py similarity index 97% rename from tests/kernels/test_activation.py rename to tests/kernels/core/test_activation.py index cf0f21ce06514..79f838a954e70 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -5,6 +5,7 @@ import random import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, @@ -12,8 +13,6 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, SiluAndMul) from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py similarity index 100% rename from tests/kernels/test_fused_quant_layernorm.py rename to tests/kernels/core/test_fused_quant_layernorm.py diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/core/test_layernorm.py similarity index 100% rename from tests/kernels/test_layernorm.py rename to tests/kernels/core/test_layernorm.py diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py new file mode 100644 index 0000000000000..c9a9679c5d80f --- /dev/null +++ b/tests/kernels/core/test_opcheck.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Tests for miscellaneous utilities +""" + +import torch + +from tests.kernels.utils import opcheck + + +def test_convert_fp8_opcheck(): + data = torch.randn((256, 256), dtype=torch.float32, device="cuda") + result = torch.empty_like(data, dtype=torch.float8_e4m3fn) + opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) + + +# TODO: Add this back, currently fails with +# csrc/cuda_utils_kernels.cu:15 'invalid argument' +# @pytest.mark.skipif(not current_platform.is_cuda(), +# reason="Only supported for CUDA") +# def test_cuda_utils_opcheck(): +# opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) +# opcheck( +# torch.ops._C_cuda_utils. +# get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py similarity index 100% rename from tests/kernels/test_permute_cols.py rename to tests/kernels/core/test_permute_cols.py diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py similarity index 99% rename from tests/kernels/test_pos_encoding.py rename to tests/kernels/core/test_pos_encoding.py index eb83b4d612c22..2b7bf755ec22d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -6,11 +6,10 @@ from typing import Callable, Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 112, 120, 256] diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py similarity index 100% rename from tests/kernels/test_rotary_embedding.py rename to tests/kernels/core/test_rotary_embedding.py diff --git a/tests/kernels/test_uva.py b/tests/kernels/core/test_uva.py similarity index 100% rename from tests/kernels/test_uva.py rename to tests/kernels/core/test_uva.py diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py similarity index 100% rename from tests/kernels/test_causal_conv1d.py rename to tests/kernels/mamba/test_causal_conv1d.py diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py similarity index 100% rename from tests/kernels/test_mamba_mixer2.py rename to tests/kernels/mamba/test_mamba_mixer2.py diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py similarity index 100% rename from tests/kernels/test_mamba_ssm.py rename to tests/kernels/mamba/test_mamba_ssm.py diff --git a/tests/kernels/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py similarity index 100% rename from tests/kernels/test_mamba_ssm_ssd.py rename to tests/kernels/mamba/test_mamba_ssm_ssd.py diff --git a/tests/kernels/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py similarity index 100% rename from tests/kernels/test_cutlass_moe.py rename to tests/kernels/moe/test_cutlass_moe.py diff --git a/tests/kernels/test_moe.py b/tests/kernels/moe/test_moe.py similarity index 100% rename from tests/kernels/test_moe.py rename to tests/kernels/moe/test_moe.py diff --git a/tests/kernels/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py similarity index 100% rename from tests/kernels/test_triton_moe_ptpc_fp8.py rename to tests/kernels/moe/test_triton_moe_ptpc_fp8.py diff --git a/tests/kernels/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py similarity index 100% rename from tests/kernels/test_allspark_gemm.py rename to tests/kernels/quantization/test_allspark_gemm.py diff --git a/tests/kernels/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py similarity index 100% rename from tests/kernels/test_aqlm.py rename to tests/kernels/quantization/test_aqlm.py diff --git a/tests/kernels/test_awq.py b/tests/kernels/quantization/test_awq.py similarity index 100% rename from tests/kernels/test_awq.py rename to tests/kernels/quantization/test_awq.py diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/quantization/test_awq_marlin.py similarity index 100% rename from tests/kernels/test_awq_marlin.py rename to tests/kernels/quantization/test_awq_marlin.py diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py similarity index 100% rename from tests/kernels/test_awq_triton.py rename to tests/kernels/quantization/test_awq_triton.py diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py similarity index 99% rename from tests/kernels/test_block_fp8.py rename to tests/kernels/quantization/test_block_fp8.py index c450048bf6651..da594675e924a 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -6,6 +6,7 @@ import itertools import pytest import torch +from tests.kernels.utils_block import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe @@ -18,8 +19,6 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, w8a8_block_fp8_matmul) from vllm.platforms import current_platform -from .utils_block import native_w8a8_block_matmul - dg_available = False try: import deep_gemm diff --git a/tests/kernels/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py similarity index 99% rename from tests/kernels/test_block_int8.py rename to tests/kernels/quantization/test_block_int8.py index 9447f9d691650..943470ad113d1 100644 --- a/tests/kernels/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -6,6 +6,7 @@ import itertools import pytest import torch +from tests.kernels.utils_block import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe @@ -13,8 +14,6 @@ from vllm.model_executor.layers.quantization.utils.int8_utils import ( w8a8_block_int8_matmul) from vllm.platforms import current_platform -from .utils_block import native_w8a8_block_matmul - if current_platform.get_device_capability() < (7, 0): pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py similarity index 99% rename from tests/kernels/test_cutlass_2of4_sparse.py rename to tests/kernels/quantization/test_cutlass_2of4_sparse.py index 2890e15d6cbaf..d67d2dbb89981 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -7,13 +7,12 @@ Run `pytest tests/kernels/test_semi_structured.py`. import pytest import torch +from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( sparse_cutlass_supported) from vllm.platforms import current_platform -from .utils import baseline_scaled_mm, to_fp8, to_int8 - CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py similarity index 99% rename from tests/kernels/test_cutlass.py rename to tests/kernels/quantization/test_cutlass_scaled_mm.py index f11ce6f45a984..8084d9bf2c2da 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -8,13 +8,11 @@ import random import pytest import torch -from tests.kernels.utils import opcheck +from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import cdiv -from .utils import baseline_scaled_mm, to_fp8, to_int8 - MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py similarity index 100% rename from tests/kernels/test_fp8_quant.py rename to tests/kernels/quantization/test_fp8_quant.py diff --git a/tests/kernels/test_ggml.py b/tests/kernels/quantization/test_ggml.py similarity index 100% rename from tests/kernels/test_ggml.py rename to tests/kernels/quantization/test_ggml.py diff --git a/tests/kernels/test_gguf.py b/tests/kernels/quantization/test_gguf.py similarity index 100% rename from tests/kernels/test_gguf.py rename to tests/kernels/quantization/test_gguf.py diff --git a/tests/kernels/test_gptq.py b/tests/kernels/quantization/test_gptq.py similarity index 100% rename from tests/kernels/test_gptq.py rename to tests/kernels/quantization/test_gptq.py diff --git a/tests/kernels/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py similarity index 100% rename from tests/kernels/test_int8_kernel.py rename to tests/kernels/quantization/test_int8_kernel.py diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py similarity index 100% rename from tests/kernels/test_int8_quant.py rename to tests/kernels/quantization/test_int8_quant.py diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py similarity index 100% rename from tests/kernels/test_machete_mm.py rename to tests/kernels/quantization/test_machete_mm.py diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py similarity index 100% rename from tests/kernels/test_marlin_gemm.py rename to tests/kernels/quantization/test_marlin_gemm.py diff --git a/tests/kernels/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py similarity index 100% rename from tests/kernels/test_nvfp4_quant.py rename to tests/kernels/quantization/test_nvfp4_quant.py diff --git a/tests/kernels/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py similarity index 100% rename from tests/kernels/test_nvfp4_scaled_mm.py rename to tests/kernels/quantization/test_nvfp4_scaled_mm.py diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py similarity index 100% rename from tests/kernels/test_triton_scaled_mm.py rename to tests/kernels/quantization/test_triton_scaled_mm.py diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py deleted file mode 100644 index d3f0320026519..0000000000000 --- a/tests/kernels/test_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -Tests for miscellaneous utilities -""" - -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm.platforms import current_platform - - -def test_convert_fp8_opcheck(): - data = torch.randn((256, 256), dtype=torch.float32, device="cuda") - result = torch.empty_like(data, dtype=torch.float8_e4m3fn) - opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) - - -@pytest.mark.skipif(not current_platform.is_cuda(), - reason="Only supported for CUDA") -def test_cuda_utils_opcheck(): - opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) - opcheck( - torch.ops._C_cuda_utils. - get_max_shared_memory_per_block_device_attribute, (0, ))