diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 05197f44f93b..d864cb2af23e 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -8,7 +8,9 @@ import pytest
 
 from tests.v1.engine.utils import PLP_APC_UNSUPPORTED_MSG
 from vllm import SamplingParams
+from vllm.assets.image import ImageAsset
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.inputs import PromptType
 from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
@@ -17,13 +19,32 @@ if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
                 allow_module_level=True)
 
-ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
-                              enforce_eager=True,
-                              disable_log_requests=True)
+TEXT_ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
+                                   enforce_eager=True,
+                                   disable_log_requests=True)
+
+VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
+                                     enforce_eager=True,
+                                     disable_log_requests=True)
+
+TEXT_PROMPT = "Hello my name is Robert and"
+
+VISION_PROMPT_TEMPLATE = (
+    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
+    "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+    "What is in the image?<|im_end|>\n"
+    "<|im_start|>assistant\n")
+VISION_PROMPT = {
+    "prompt": VISION_PROMPT_TEMPLATE,
+    "multi_modal_data": {
+        "image": ImageAsset("stop_sign").pil_image
+    }
+}
 
 
 async def generate(engine: AsyncLLM,
                    request_id: str,
+                   prompt: PromptType,
                    output_kind: RequestOutputKind,
                    max_tokens: int,
                    prompt_logprobs: Optional[int] = None) -> Tuple[int, str]:
@@ -32,11 +53,12 @@ async def generate(engine: AsyncLLM,
 
     count = 0
     sampling_params = SamplingParams(max_tokens=max_tokens,
+                                     ignore_eos=True,
                                      output_kind=output_kind,
                                      temperature=0,
                                      prompt_logprobs=prompt_logprobs)
     async for out in engine.generate(request_id=request_id,
-                                     prompt="Hello my name is Robert and",
+                                     prompt=prompt,
                                      sampling_params=sampling_params):
 
         num_tokens = len(out.outputs[0].token_ids)
@@ -74,6 +96,7 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc(
         await asyncio.create_task(
             generate(engine,
                      "request-0",
+                     TEXT_PROMPT,
                      output_kind,
                      10,
                      prompt_logprobs=5))
@@ -86,18 +109,24 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc(
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+@pytest.mark.parametrize("engine_args_and_prompt",
+                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
+                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
 @pytest.mark.asyncio
-async def test_load(monkeypatch, output_kind: RequestOutputKind):
+async def test_load(monkeypatch, output_kind: RequestOutputKind,
+                    engine_args_and_prompt: Tuple[AsyncEngineArgs,
+                                                  PromptType]):
     # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
     # so that in the future when we switch, we don't have to change all the
     # tests.
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
+        engine_args, prompt = engine_args_and_prompt
 
-        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
-        NUM_REQUESTS = 10000
+        NUM_REQUESTS = 100
         NUM_EXPECTED_TOKENS = 10
 
         request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
@@ -107,7 +136,7 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind):
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, output_kind,
+                    generate(engine, request_id, prompt, output_kind,
                              NUM_EXPECTED_TOKENS)))
 
         # Confirm that we got all the EXPECTED tokens from the requests.
@@ -126,13 +155,19 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind):
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+@pytest.mark.parametrize("engine_args_and_prompt",
+                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
+                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
 @pytest.mark.asyncio
-async def test_abort(monkeypatch, output_kind: RequestOutputKind):
+async def test_abort(monkeypatch, output_kind: RequestOutputKind,
+                     engine_args_and_prompt: Tuple[AsyncEngineArgs,
+                                                   PromptType]):
 
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
+        engine_args, prompt = engine_args_and_prompt
 
-        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
         NUM_REQUESTS = 100
@@ -146,7 +181,7 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind):
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, output_kind,
+                    generate(engine, request_id, prompt, output_kind,
                              NUM_EXPECTED_TOKENS)))
 
         # API server cancels requests when they disconnect.
@@ -172,7 +207,8 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind):
         # Confirm we can do another generation.
         request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
         task = asyncio.create_task(
-            generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS))
+            generate(engine, request_id, prompt, output_kind,
+                     NUM_EXPECTED_TOKENS))
         num_generated_tokens, request_id = await task
         assert num_generated_tokens == NUM_EXPECTED_TOKENS
         assert not engine.output_processor.has_unfinished_requests()