mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-01 07:57:04 +08:00
[CI/Build][AMD] Skip quantization kernels tests that require CUTLASS or e4m3fn when not supported by platform (#30020)
Signed-off-by: Randall Smith <ransmith@amd.com> Co-authored-by: Randall Smith <ransmith@amd.com>
This commit is contained in:
parent
c3487aca34
commit
b75f826fca
@ -54,6 +54,10 @@ def setup_cuda():
|
|||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
current_platform.is_fp8_fnuz(),
|
||||||
|
reason="This platform supports e4m3fnuz, not e4m3fn.",
|
||||||
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"num_tokens,d,dtype,group_size,seed",
|
"num_tokens,d,dtype,group_size,seed",
|
||||||
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
|
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
|
||||||
@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
|
|||||||
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
factor_for_scale = 1e-2
|
factor_for_scale = 1e-2
|
||||||
fp8_info = torch.finfo(torch.float8_e4m3fn)
|
fp8_info = torch.finfo(current_platform.fp8_dtype())
|
||||||
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
||||||
|
|
||||||
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
||||||
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
|
||||||
|
|
||||||
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
||||||
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
|
||||||
|
|
||||||
block_n, block_k = block_size[0], block_size[1]
|
block_n, block_k = block_size[0], block_size[1]
|
||||||
n_tiles = (N + block_n - 1) // block_n
|
n_tiles = (N + block_n - 1) // block_n
|
||||||
@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
|||||||
assert rel_diff < 0.001
|
assert rel_diff < 0.001
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform."
|
||||||
|
)
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_w8a8_block_fp8_cutlass_matmul():
|
def test_w8a8_block_fp8_cutlass_matmul():
|
||||||
# Test simple case where weight.shape % 128 != 0,
|
# Test simple case where weight.shape % 128 != 0,
|
||||||
@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul():
|
|||||||
assert rel_diff < 0.001
|
assert rel_diff < 0.001
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
current_platform.is_fp8_fnuz(),
|
||||||
|
reason="This platform supports e4m3fnuz, not e4m3fn.",
|
||||||
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"M,N,K,block_size,out_dtype,seed",
|
"M,N,K,block_size,out_dtype,seed",
|
||||||
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
|
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
|
||||||
|
|||||||
@ -15,6 +15,9 @@ from vllm import _custom_ops as ops
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.math_utils import cdiv
|
from vllm.utils.math_utils import cdiv
|
||||||
|
|
||||||
|
if not current_platform.is_cuda():
|
||||||
|
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
|
||||||
|
|
||||||
MNK_FACTORS = [
|
MNK_FACTORS = [
|
||||||
(1, 256, 128),
|
(1, 256, 128),
|
||||||
(1, 16384, 1024),
|
(1, 16384, 1024),
|
||||||
|
|||||||
@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.scalar_type import ScalarType, scalar_types
|
from vllm.scalar_type import ScalarType, scalar_types
|
||||||
|
|
||||||
|
if not current_platform.is_cuda():
|
||||||
|
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
|
||||||
|
|
||||||
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
|
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
|
||||||
# unit tests to a common utility function. Currently the use of
|
# unit tests to a common utility function. Currently the use of
|
||||||
# `is_quant_method_supported` conflates kernels with quantization methods
|
# `is_quant_method_supported` conflates kernels with quantization methods
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user