mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-23 17:51:21 +08:00
Fixing Chunked Prefill Test. (#19762)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
This commit is contained in:
parent
466166dcfd
commit
4719460644
@ -89,7 +89,7 @@ steps:
|
|||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
- label: Chunked Prefill Test
|
- label: Chunked Prefill Test
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_chunked_prefill
|
- tests/basic_correctness/test_chunked_prefill
|
||||||
|
|||||||
@ -49,7 +49,13 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
|
|||||||
# NOTE: Increasing this in this suite will fail CI because we currently cannot
|
# NOTE: Increasing this in this suite will fail CI because we currently cannot
|
||||||
# reset distributed env properly. Use a value > 1 just when you test.
|
# reset distributed env properly. Use a value > 1 just when you test.
|
||||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||||
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
|
@pytest.mark.parametrize("attention_backend", [
|
||||||
|
pytest.param("FLASHINFER",
|
||||||
|
marks=pytest.mark.skipif(
|
||||||
|
current_platform.is_rocm(),
|
||||||
|
reason="FLASHINFER isn't supported on ROCm")),
|
||||||
|
"FLASH_ATTN"
|
||||||
|
])
|
||||||
def test_models(
|
def test_models(
|
||||||
hf_runner: HfRunner,
|
hf_runner: HfRunner,
|
||||||
vllm_runner: VllmRunner,
|
vllm_runner: VllmRunner,
|
||||||
@ -99,7 +105,13 @@ def test_models(
|
|||||||
@multi_gpu_test(num_gpus=2)
|
@multi_gpu_test(num_gpus=2)
|
||||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
|
@pytest.mark.parametrize("attention_backend", [
|
||||||
|
pytest.param("FLASHINFER",
|
||||||
|
marks=pytest.mark.skipif(
|
||||||
|
current_platform.is_rocm(),
|
||||||
|
reason="FLASHINFER isn't supported on ROCm")),
|
||||||
|
"FLASH_ATTN"
|
||||||
|
])
|
||||||
def test_models_distributed(
|
def test_models_distributed(
|
||||||
hf_runner: HfRunner,
|
hf_runner: HfRunner,
|
||||||
vllm_runner: VllmRunner,
|
vllm_runner: VllmRunner,
|
||||||
@ -172,6 +184,8 @@ def test_models_distributed(
|
|||||||
# Due to low-precision numerical divergence, this test is too sensitive to
|
# Due to low-precision numerical divergence, this test is too sensitive to
|
||||||
# the async postprocessor
|
# the async postprocessor
|
||||||
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
||||||
|
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||||
|
reason="machete_prepack_B isn't supported on ROCm")
|
||||||
def test_models_with_fp8_kv_cache(
|
def test_models_with_fp8_kv_cache(
|
||||||
vllm_runner: VllmRunner,
|
vllm_runner: VllmRunner,
|
||||||
example_prompts,
|
example_prompts,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user