diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index d0e4f6554a91f..32c77b9a01ece 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -54,6 +54,10 @@ def setup_cuda():
     torch.set_default_device("cuda")
 
 
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="This platform supports e4m3fnuz, not e4m3fn.",
+)
 @pytest.mark.parametrize(
     "num_tokens,d,dtype,group_size,seed",
     itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
@@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     torch.manual_seed(seed)
     factor_for_scale = 1e-2
-    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_info = torch.finfo(current_platform.fp8_dtype())
     fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
     A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
-    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
 
     B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
-    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
 
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
@@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     assert rel_diff < 0.001
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform."
+)
 @torch.inference_mode()
 def test_w8a8_block_fp8_cutlass_matmul():
     # Test simple case where weight.shape % 128 != 0,
@@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul():
     assert rel_diff < 0.001
 
 
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="This platform supports e4m3fnuz, not e4m3fn.",
+)
 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
     itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index de595b0a34e46..bc4744df7e69e 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -15,6 +15,9 @@ from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 
+if not current_platform.is_cuda():
+    pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
+
 MNK_FACTORS = [
     (1, 256, 128),
     (1, 16384, 1024),
diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py
index cccef28f5e931..8cfc993fe8e82 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
@@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
+if not current_platform.is_cuda():
+    pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
+
 # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
 # unit tests to a common utility function. Currently the use of
 # `is_quant_method_supported` conflates kernels with quantization methods