diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index 176c1825530e4..9c62595ad280b 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -60,6 +60,7 @@ def create_dummy_embeds(num_tokens: int = 5) -> str:
     return base64.b64encode(buffer.getvalue()).decode('utf-8')
 
 
+@pytest.mark.skip("This test is skipped because it is flaky.")
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_completions_with_prompt_embeds(
diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py
index 97dd4d6135ac4..bb8ae741b6149 100644
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -32,7 +32,7 @@ from ..utils import check_logprobs_close
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("enforce_eager", [True])
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
@@ -57,6 +57,9 @@ def test_models(
         pytest.skip(
             f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
 
+    if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
+        pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
+
     with monkeypatch.context() as m:
         m.setenv("TOKENIZERS_PARALLELISM", 'true')
         m.setenv(STR_BACKEND_ENV_VAR, backend)
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index cb30d77c4f0ea..9b376f2a260ac 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -63,6 +63,7 @@ def test_oot_registration_embedding(
 image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
 
 
+@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
 @create_new_process_for_each_test()
 def test_oot_registration_multimodal(
     monkeypatch: pytest.MonkeyPatch,
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index b7949a488ad05..c0ab3fbb10622 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -357,6 +357,9 @@ def test_compressed_tensors_fp8(vllm_runner):
     assert output
 
 
+@pytest.mark.skipif(
+    not current_platform.is_kv_cache_dtype_supported("fp8", None),
+    reason="FP8 KV cache is not supported on this device.")
 @pytest.mark.skipif(not current_platform.is_cuda(),
                     reason="This test is skipped on non-CUDA platform.")
 def test_compressed_tensors_kv_cache(vllm_runner):
@@ -738,4 +741,4 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
     with vllm_runner(model, enforce_eager=True) as llm:
         perplexity = llm.generate_prompt_perplexity([prompt])[0]
         print(perplexity)
-        assert perplexity <= exp_perplexity
\ No newline at end of file
+        assert perplexity <= exp_perplexity