vllm/tests/v1/core/test_scheduler_e2e.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm import LLM

MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I"
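
# The fixture below deliberately uses a tiny scheduling budget
# (max_num_batched_tokens=6 together with long_prefill_token_threshold=2),
# so prompt prefills have to be split across multiple scheduler steps; the
# tests then exercise concurrent partial prefills and prefix-cache stats.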
@pytest.fixture(scope="module")
def llm() -> LLM:
    return LLM(
        MODEL,
        enforce_eager=True,
        enable_prefix_caching=True,
        long_prefill_token_threshold=2,
        max_num_batched_tokens=6,
        max_num_seqs=3,
        block_size=16,
    )


def test_concurrent_partial_prefill(llm):
    # With max_num_batched_tokens=6 the three prompts cannot be prefilled in
    # one step, so their prefills are scheduled as interleaved partial chunks.
    outputs = llm.generate([PROMPT] * 3)
    assert len(outputs) == 3
    for output in outputs:
        assert len(output.outputs) == 1


def test_prefix_cache_stats_is_recorded(llm):
    # A 17-token prompt ensures the first 16 tokens fill one complete block
    # (block_size=16), so that block is cached by the first generate call.
    input_tokens = {"prompt_token_ids": [101] * 17}
    _ = llm.generate([input_tokens])
    outputs = llm.generate([input_tokens])
    assert outputs[0].num_cached_tokens == 16
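

# A minimal convenience sketch (not part of the original suite) for running
# this file directly; it assumes pytest is importable and the model weights
# are available locally or via Hugging Face authentication.
if __name__ == "__main__":
    import sys

    raise SystemExit(pytest.main([__file__, "-v", *sys.argv[1:]]))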