From 660328873619f7a711b33c7a6f1c5669a1ccdea2 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Mon, 18 Aug 2025 20:39:01 -0400
Subject: [PATCH] [CI][V0 Deprecation] Removed V0 Only Chunked Prefill and
 Prefix Caching Tests (#22871)

Signed-off-by: Robert Shaw
Signed-off-by: Woosuk Kwon
Co-authored-by: Robert Shaw
Co-authored-by: Woosuk Kwon
---
 .buildkite/test-pipeline.yaml                 |  18 --
 .github/CODEOWNERS                            |   1 -
 .../basic_correctness/test_chunked_prefill.py | 296 ------------------
 tests/prefix_caching/__init__.py              |   0
 .../test_disable_sliding_window.py            |  49 ---
 tests/prefix_caching/test_prefix_caching.py   | 231 --------------
 6 files changed, 595 deletions(-)
 delete mode 100644 tests/basic_correctness/test_chunked_prefill.py
 delete mode 100644 tests/prefix_caching/__init__.py
 delete mode 100644 tests/prefix_caching/test_disable_sliding_window.py
 delete mode 100644 tests/prefix_caching/test_prefix_caching.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4fc8857854927..0912bc1fd94fe 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -88,15 +88,6 @@ steps:
   - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
-- label: Chunked Prefill Test
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/basic_correctness/test_chunked_prefill
-  commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
 - label: Core Test # 10min
   mirror_hardwares: [amdexperimental]
   fast_check: true
@@ -295,15 +286,6 @@ steps:
   - python3 offline_inference/basic/score.py
   - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
-- label: Prefix Caching Test # 9min
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/prefix_caching
-  commands:
-  - pytest -v -s prefix_caching
-
-
 - label: Platform Tests (CUDA)
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 7dce62fc9c7d7..ce9590f02ce71 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -31,7 +31,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
deleted file mode 100644
index 4816b76996fc8..0000000000000
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This module tests chunked prefill. Chunked prefill can be enabled with
-enable_chunked_prefill=True. If a prefill exceeds max_num_batched_tokens,
-the prefill request is chunked.
-
-Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
-""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close, check_outputs_equal -from ..utils import multi_gpu_test - -if TYPE_CHECKING: - from .conftest import HfRunner, VllmRunner - -MODELS = [ - "facebook/opt-125m", - "meta-llama/Llama-3.2-1B-Instruct", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch: pytest.MonkeyPatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the file. - """ - with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') - yield - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) -@pytest.mark.parametrize("enforce_eager", [False, True]) -# NOTE: Increasing this in this suite will fail CI because we currently cannot -# reset distributed env properly. Use a value > 1 just when you test. -@pytest.mark.parametrize("tensor_parallel_size", [1]) -@pytest.mark.parametrize("attention_backend", [ - pytest.param("FLASHINFER", - marks=pytest.mark.skipif( - current_platform.is_rocm(), - reason="FLASHINFER isn't supported on ROCm")), - "FLASH_ATTN" -]) -def test_models( - hf_runner: HfRunner, - vllm_runner: VllmRunner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - enforce_eager: bool, - tensor_parallel_size: int, - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Checks exact match decode between huggingface model and vllm runner with - chunked prefill. - """ - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("attention_backend", [ - pytest.param("FLASHINFER", - marks=pytest.mark.skipif( - current_platform.is_rocm(), - reason="FLASHINFER isn't supported on ROCm")), - "FLASH_ATTN" -]) -def test_models_distributed( - hf_runner: HfRunner, - vllm_runner: VllmRunner, - example_prompts, - model: str, - distributed_executor_backend: str, - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - if (model == "meta-llama/Llama-3.2-1B-Instruct" - and distributed_executor_backend == "ray"): - # test Ray Compiled Graph - m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") - m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - - dtype = "half" - max_tokens = 5 - chunked_prefill_token_size = 16 - - # Add a chunked prefill config. 
-        max_num_seqs = min(chunked_prefill_token_size, 256)
-        assert chunked_prefill_token_size != -1
-        enable_chunked_prefill = True
-        max_num_batched_tokens = chunked_prefill_token_size
-
-        # NOTE: Take care of the order: run vLLM first, and then run HF.
-        # vLLM needs a fresh process without CUDA initialization. If we
-        # run HF first, CUDA will already be initialized, which breaks the
-        # multiprocessing backend with the fork start method (the default
-        # method).
-
-        with vllm_runner(
-                model,
-                dtype=dtype,
-                tensor_parallel_size=2,
-                max_num_seqs=max_num_seqs,
-                enable_chunked_prefill=enable_chunked_prefill,
-                max_num_batched_tokens=max_num_batched_tokens,
-                distributed_executor_backend=distributed_executor_backend,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(
-                example_prompts,
-                max_tokens,
-            )
-
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize(
-    "kv_cache_dtype,model",
-    [("fp8_e4m3",
-      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
-# Due to low-precision numerical divergence, we only test logprob of 4 tokens
-@pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
-@pytest.mark.parametrize("enforce_eager", [False, True])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
-# Due to low-precision numerical divergence, this test is too sensitive to
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason="machete_prepack_B isn't supported on ROCm")
-def test_models_with_fp8_kv_cache(
-    vllm_runner: VllmRunner,
-    example_prompts,
-    kv_cache_dtype: str,
-    model: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
-    enforce_eager: bool,
-    tensor_parallel_size: int,
-    disable_async_output_proc: bool,
-) -> None:
-    """
-    Check output logprobs match between no_chunked_prefill and chunked_prefill
-    with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
-    so here we only check chunked prefill.
- """ - NUM_LOG_PROBS = 8 - - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - with vllm_runner( - model, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner( - model, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=no_chunked_prefill_outputs, - outputs_1_lst=chunked_prefill_outputs, - name_0="no_chunked_prefill", - name_1="chunked_prefill", - ) - - -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("enforce_eager", [False]) -@pytest.mark.parametrize("chunk_size", [30, 32]) -# NOTE: Increasing this in this suite will fail CI because we currently cannot -# reset distributed env properly. Use a value > 1 just when you test. -@pytest.mark.parametrize("tensor_parallel_size", [1]) -@pytest.mark.parametrize("dtype", ["half"]) -def test_with_prefix_caching( - vllm_runner: VllmRunner, - max_tokens: int, - enforce_eager: bool, - chunk_size: int, - tensor_parallel_size: int, - dtype: str, -) -> None: - """ - Checks exact match decode with and without prefix caching - with chunked prefill enabled. - """ - model = "meta-llama/Llama-3.2-1B-Instruct" - # The common prompt has 142 tokens with Llama-2 tokenizer. - common_prompt = "You are a helpful AI assistant " * 20 - unique_prompts = [ - "Question", # Warmup - "Question", # Fully cached - "Another question", # Partial cached - ] - full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] - - max_num_batched_tokens = max_num_seqs = chunk_size - outputs = {} # type: ignore - for enable in (True, False): - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - enable_prefix_caching=enable, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - outputs[enable] = [] - for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy( - [prompt], - max_tokens, - ) - - check_outputs_equal( - outputs_0_lst=outputs[False], - outputs_1_lst=outputs[True], - name_0="w/o prefix caching", - name_1="with prefix caching", - ) diff --git a/tests/prefix_caching/__init__.py b/tests/prefix_caching/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py deleted file mode 100644 index b940ab416e673..0000000000000 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the with and without prefix caching. - -Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
-""" -import pytest - -from vllm import LLM -from vllm.distributed import cleanup_dist_env_and_memory - -MODEL_LEN_LEN = [ - # Example models with sliding window. - ("bigcode/starcoder2-3b", 4096, 16384), - # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI - - # Confirm model with sliding window works. - # config has "use_sliding_window": false - ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768), - # config has no sliding window attribute. - ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048), -] - - -@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) -def test_disable_sliding_window(model_len_len, ): - model, sliding_len, full_len = model_len_len - disabled_llm = LLM(model, disable_sliding_window=True) - disabled_llm.generate("Hi my name is") - model_config = disabled_llm.llm_engine.model_config - assert model_config.max_model_len == sliding_len, ( - "Max len expected to equal sliding_len of %s, but got %s", sliding_len, - model_config.max_model_len) - - del disabled_llm - cleanup_dist_env_and_memory() - - enabled_llm = LLM(model, - enforce_eager=True, - disable_sliding_window=False, - enable_prefix_caching=False) - enabled_llm.generate("Hi my name is") - model_config = enabled_llm.llm_engine.model_config - assert model_config.max_model_len == full_len, ( - "Max len expected to equal full_len of %s, but got %s", full_len, - model_config.max_model_len) - - del enabled_llm - cleanup_dist_env_and_memory() diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py deleted file mode 100644 index 5bf6ed957c74e..0000000000000 --- a/tests/prefix_caching/test_prefix_caching.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the with and without prefix caching. - -Run `pytest tests/prefix_caching/test_prefix_caching.py`. -""" - -from __future__ import annotations - -import pytest - -from tests.conftest import VllmRunner -from tests.core.utils import SchedulerProxy, create_dummy_prompt -from vllm import SamplingParams, TokensPrompt -from vllm.core.scheduler import Scheduler -from vllm.engine.llm_engine import LLMEngine -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_outputs_equal - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch: pytest.MonkeyPatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. 
- """ - with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') - yield - - -MODELS = [ - "distilbert/distilgpt2", -] - -UNSTABLE_PROMPT_SEQUENCE = [ - ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1), - ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50), - ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95), - ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174), - ([0] * 588) + ([8] * 1539), -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("cached_position", [0, 1]) -@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) -@pytest.mark.parametrize("block_size", [16]) -def test_mixed_requests( - hf_runner, - vllm_runner, - example_prompts, - model: str, - backend: str, - dtype: str, - max_tokens: int, - cached_position: int, - enable_chunked_prefill: bool, - block_size: int, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Test the case when some sequences have the prefix cache hit - and the others don't. The cached position determines where - the sequence is at among the batch of prefills. - """ - if backend == "FLASHINFER" and current_platform.is_rocm(): - pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, backend) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - cached_prompt = example_prompts[cached_position] - with vllm_runner( - model, - dtype=dtype, - enable_prefix_caching=True, - enable_chunked_prefill=enable_chunked_prefill, - block_size=block_size, - ) as vllm_model: - # Run the first prompt so the cache is populated - vllm_outputs = vllm_model.generate_greedy([cached_prompt], - max_tokens) - - # Run all the promopts - greedy_params = SamplingParams(temperature=0.0, - max_tokens=max_tokens) - req_outputs = vllm_model.llm.generate(example_prompts, - greedy_params) - - # Verify number of cached tokens - for i in range(len(req_outputs)): - if i == cached_position: - expected_num_cached_tokens = ( - len(req_outputs[i].prompt_token_ids) // - block_size) * block_size - else: - expected_num_cached_tokens = 0 - assert (req_outputs[i].num_cached_tokens == - expected_num_cached_tokens) - - vllm_outputs = [( - output.prompt_token_ids + list(output.outputs[0].token_ids), - output.prompt + output.outputs[0].text, - ) for output in req_outputs] - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_unstable_prompt_sequence( - vllm_runner, - backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - - if backend == "FLASHINFER" and current_platform.is_rocm(): - pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, backend) - - with vllm_runner( - "Qwen/Qwen2.5-0.5B-Instruct", - enable_chunked_prefill=True, - enable_prefix_caching=True, - max_model_len=4096, - ) as vllm_model: - for prompt in UNSTABLE_PROMPT_SEQUENCE: - vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), - 
-                                    SamplingParams(max_tokens=1))
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_fully_cached_prefill_needs_uncached_token(model):
-    block_size = 16
-    max_num_batched_tokens = 16
-    num_output_tokens = 5
-    # Make a vLLM engine
-    runner = VllmRunner(
-        model_name=model,
-        gpu_memory_utilization=0.7,
-        enable_chunked_prefill=True,
-        enforce_eager=True,
-        enable_prefix_caching=True,
-        block_size=block_size,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_num_seqs=max_num_batched_tokens,
-    )
-    engine: LLMEngine = runner.llm.llm_engine
-
-    scheduler: Scheduler = SchedulerProxy(engine.scheduler[0])  # type: ignore
-    engine.scheduler[0] = scheduler
-
-    # seqA
-    seqA_tokens = list(range(2 * block_size))
-    seqA, seq_groupA = create_dummy_prompt(
-        request_id="0",
-        prompt_tokens=seqA_tokens,
-        max_tokens=num_output_tokens,
-        block_size=block_size,
-    )
-
-    scheduler.add_seq_group(seq_groupA)
-
-    assert seqA.data.get_num_computed_tokens() == 0
-
-    # Prefill seqA
-    while not seqA.is_finished():
-        engine.step()
-
-    # seqB
-    seqB_tokens = [t + 1 for t in seqA_tokens]  # shift by 1
-    seqB, seq_groupB = create_dummy_prompt(
-        request_id="1",
-        prompt_tokens=seqB_tokens,
-        max_tokens=num_output_tokens,
-        block_size=block_size,
-    )
-
-    # seqC is the same as seqA
-    seqC, seq_groupC = create_dummy_prompt(
-        request_id="2",
-        prompt_tokens=seqA_tokens,
-        max_tokens=num_output_tokens,
-        block_size=block_size,
-    )
-
-    scheduler.add_seq_group(seq_groupB)
-    scheduler.add_seq_group(seq_groupC)
-
-    # Even though seqC is fully cached, it should not be prefilled, since we
-    # require at least 1 uncached token.
-    engine.step()
-
-    sched_metas, sched_out, _ = scheduler.last_schedule_ret()
-    assert len(sched_out.scheduled_seq_groups) == 1
-    assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
-            seq_groupB.request_id)
-    assert (sched_out.scheduled_seq_groups[0].token_chunk_size ==
-            max_num_batched_tokens)
-
-    # Once seqB is finished, seqC can be prefilled.
-    while not seqB.is_finished():
-        engine.step()
-        sched_metas, sched_out, _ = scheduler.last_schedule_ret()
-        assert len(sched_out.scheduled_seq_groups) == 1
-        assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
-                seq_groupB.request_id)
-
-    engine.step()
-    sched_metas, sched_out, _ = scheduler.last_schedule_ret()
-    assert len(sched_out.scheduled_seq_groups) == 1
-    assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
-            seq_groupC.request_id)
-    assert sched_out.scheduled_seq_groups[0].token_chunk_size == len(
-        seqA_tokens)
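
Editor's note: the behavior these V0-only tests exercised remains reachable
through the public LLM API on the default V1 engine. The sketch below is
illustrative and not part of the patch; it reuses the engine arguments that
appear in the deleted tests (enable_chunked_prefill, max_num_batched_tokens,
max_num_seqs, enable_prefix_caching), while the model choice and the specific
sizes are assumptions.

    # Minimal sketch: chunked prefill plus prefix caching via the public API.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="facebook/opt-125m",      # small model also used by the deleted tests
        enable_chunked_prefill=True,    # prefills over the budget get chunked
        max_num_batched_tokens=64,      # per-step token budget (the chunk size)
        max_num_seqs=64,                # the deleted tests kept this equal to the budget
        enable_prefix_caching=True,     # reuse KV-cache blocks for shared prefixes
        enforce_eager=True,
    )

    # Two prompts sharing a long common prefix, as in test_with_prefix_caching.
    common_prompt = "You are a helpful AI assistant " * 20
    prompts = [f"{common_prompt}\n{q}" for q in ("Question", "Another question")]

    greedy = SamplingParams(temperature=0.0, max_tokens=16)
    for out in llm.generate(prompts, greedy):
        # num_cached_tokens reports how many prompt tokens hit the prefix
        # cache; it is the same field the deleted test_mixed_requests asserted on.
        print(out.num_cached_tokens, out.outputs[0].text)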
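
Likewise, the check from the deleted test_disable_sliding_window.py can be
reproduced directly with the LLM constructor. A sketch, assuming the
starcoder2-3b entry from MODEL_LEN_LEN above (sliding_len 4096) and enough
local GPU memory to load the model:

    from vllm import LLM

    llm = LLM("bigcode/starcoder2-3b", disable_sliding_window=True)
    # With the sliding window disabled, the usable context is capped at the
    # window size, so max_model_len collapses to the 4096 sliding_len.
    assert llm.llm_engine.model_config.max_model_len == 4096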