[V1][Tests] Adding additional testing for multimodal models to V1 (#13308)

Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com>

parent 7b203b7694
commit a4d577b379
@@ -8,7 +8,9 @@ import pytest
 
 from tests.v1.engine.utils import PLP_APC_UNSUPPORTED_MSG
 from vllm import SamplingParams
+from vllm.assets.image import ImageAsset
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.inputs import PromptType
 from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
@@ -17,13 +19,32 @@ if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
                 allow_module_level=True)
 
-ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
-                              enforce_eager=True,
-                              disable_log_requests=True)
+TEXT_ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
+                                   enforce_eager=True,
+                                   disable_log_requests=True)
+
+VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
+                                     enforce_eager=True,
+                                     disable_log_requests=True)
+
+TEXT_PROMPT = "Hello my name is Robert and"
+
+VISION_PROMPT_TEMPLATE = (
+    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
+    "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+    "What is in the image?<|im_end|>\n"
+    "<|im_start|>assistant\n")
+VISION_PROMPT = {
+    "prompt": VISION_PROMPT_TEMPLATE,
+    "multi_modal_data": {
+        "image": ImageAsset("stop_sign").pil_image
+    }
+}
 
 
 async def generate(engine: AsyncLLM,
                    request_id: str,
+                   prompt: PromptType,
                    output_kind: RequestOutputKind,
                    max_tokens: int,
                    prompt_logprobs: Optional[int] = None) -> Tuple[int, str]:
@@ -32,11 +53,12 @@ async def generate(engine: AsyncLLM,
 
     count = 0
     sampling_params = SamplingParams(max_tokens=max_tokens,
                                      ignore_eos=True,
                                      output_kind=output_kind,
                                      temperature=0,
                                      prompt_logprobs=prompt_logprobs)
     async for out in engine.generate(request_id=request_id,
-                                     prompt="Hello my name is Robert and",
+                                     prompt=prompt,
                                      sampling_params=sampling_params):
 
         num_tokens = len(out.outputs[0].token_ids)
@@ -74,6 +96,7 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc(
         await asyncio.create_task(
             generate(engine,
                      "request-0",
+                     TEXT_PROMPT,
                      output_kind,
                      10,
                      prompt_logprobs=5))
@@ -86,18 +109,24 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc(
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+@pytest.mark.parametrize("engine_args_and_prompt",
+                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
+                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
 @pytest.mark.asyncio
-async def test_load(monkeypatch, output_kind: RequestOutputKind):
+async def test_load(monkeypatch, output_kind: RequestOutputKind,
+                    engine_args_and_prompt: Tuple[AsyncEngineArgs,
+                                                  PromptType]):
     # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
     # so that in the future when we switch, we don't have to change all the
     # tests.
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
+        engine_args, prompt = engine_args_and_prompt
 
-        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
-        NUM_REQUESTS = 10000
+        NUM_REQUESTS = 100
         NUM_EXPECTED_TOKENS = 10
 
         request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
@@ -107,7 +136,7 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind):
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, output_kind,
+                    generate(engine, request_id, prompt, output_kind,
                              NUM_EXPECTED_TOKENS)))
 
         # Confirm that we got all the EXPECTED tokens from the requests.
@@ -126,13 +155,19 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind):
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+@pytest.mark.parametrize("engine_args_and_prompt",
+                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
+                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
 @pytest.mark.asyncio
-async def test_abort(monkeypatch, output_kind: RequestOutputKind):
+async def test_abort(monkeypatch, output_kind: RequestOutputKind,
+                     engine_args_and_prompt: Tuple[AsyncEngineArgs,
+                                                   PromptType]):
 
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
+        engine_args, prompt = engine_args_and_prompt
 
-        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
         NUM_REQUESTS = 100
@@ -146,7 +181,7 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind):
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, output_kind,
+                    generate(engine, request_id, prompt, output_kind,
                              NUM_EXPECTED_TOKENS)))
 
         # API server cancels requests when they disconnect.
@@ -172,7 +207,8 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind):
         # Confirm we can do another generation.
        request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
         task = asyncio.create_task(
-            generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS))
+            generate(engine, request_id, prompt, output_kind,
+                     NUM_EXPECTED_TOKENS))
         num_generated_tokens, request_id = await task
         assert num_generated_tokens == NUM_EXPECTED_TOKENS
         assert not engine.output_processor.has_unfinished_requests()
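Outside of pytest, the effect of threading prompt through the generate helper can be sketched as a small asyncio driver. This is an illustrative sketch only, not part of the patch: it assumes the constants and the generate helper defined in the patched test module are in scope, that a CUDA GPU is available, and that the two model checkpoints can be loaded.

# Illustrative sketch only: reuses TEXT_ENGINE_ARGS, VISION_ENGINE_ARGS,
# TEXT_PROMPT, VISION_PROMPT and generate() as defined in the patched test
# module above; the pytest/monkeypatch plumbing is replaced by a plain
# environment variable.
import asyncio
import os

from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM


async def main() -> None:
    os.environ["VLLM_USE_V1"] = "1"  # the tests set this via monkeypatch

    for engine_args, prompt in [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
                                (VISION_ENGINE_ARGS, VISION_PROMPT)]:
        engine = AsyncLLM.from_engine_args(engine_args)
        try:
            # generate() returns (num_generated_tokens, request_id).
            num_tokens, request_id = await generate(
                engine, "request-0", prompt,
                RequestOutputKind.FINAL_ONLY, max_tokens=10)
            print(f"{request_id}: generated {num_tokens} tokens")
        finally:
            engine.shutdown()


if __name__ == "__main__":
    asyncio.run(main())

Each loop iteration builds an engine from one of the two AsyncEngineArgs, sends a single request with the matching prompt form (a bare string or a dict carrying multi_modal_data), and tears the engine down, which mirrors what a single parametrized test case does.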