diff --git a/tests/conftest.py b/tests/conftest.py index 27db5422ceac2..1052aeb35bac7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import http.server import json import math +import mimetypes import os +import socket import tempfile +import threading +from collections.abc import Generator from enum import Enum from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast @@ -32,6 +37,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger +from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.sequence import Logprob @@ -1253,3 +1259,119 @@ def cli_config_file(): def cli_config_file_with_model(): """Return the path to the CLI config file with model.""" return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml") + + +class AssetHandler(http.server.BaseHTTPRequestHandler): + # _IMAGE_CACHE : Dict[str, bytes] = {} + + def log_message(self, *args, **kwargs): + pass + + def do_GET(self): + # Accepts paths like: /1280px-Venn_diagram_rgb.jpg + filename = self.path.lstrip("/") + if not filename or "." 
not in filename: + self.send_error(404, "Missing filename (expected /.)") + return + + base, ext = filename.rsplit(".", 1) + ext = ext.lower() + + if ext not in ["jpg", "png"]: + self.send_error(404, f"Unsupported extension: .{ext}") + return + + try: + data = ImageAsset(base).read_bytes(ext=ext) + except Exception as e: + self.send_error(500, f"Failed to load asset: {ext} {base} {e} ") + return + + ctype, _ = mimetypes.guess_type(filename) + if ctype is None: + ctype = {"jpg": "image/jpg", "png": "image/png"}[ext] + self.send_response(200) + self.send_header("Content-Type", ctype) + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + + +def _find_free_port() -> int: + with socket.socket() as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +class LocalAssetServer: + + address: str + port: int + server: Optional[http.server.ThreadingHTTPServer] + thread: Optional[threading.Thread] + + def __init__(self, address: str = "127.0.0.1") -> None: + self.address = address + self.port = -1 + self.server = None + self.thread = None + + def __enter__(self): + self.port = _find_free_port() + self.server = http.server.ThreadingHTTPServer( + (self.address, self.port), AssetHandler) + self.thread = threading.Thread(target=self.server.serve_forever, + daemon=True) + self.thread.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.server: + self.server.shutdown() + del self.server + + if self.thread: + self.thread.join() + del self.thread + + if exc_type is None: + return None + + return False + + @property + def base_url(self) -> str: + assert self.port is not None + return f"http://{self.address}:{self.port}" + + def url_for(self, name: str) -> str: + """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'""" + return f"{self.base_url}/{name}" + + def get_image_asset(self, name: str) -> Image.Image: + return fetch_image(self.url_for(name)) + + 
+@pytest.fixture(scope="session") +def local_asset_server() -> Generator[LocalAssetServer, None, None]: + """ + Starts a thread-based HTTP server bound to 127.0.0.1 on a random free port. + The server currently serves images at: + http://127.0.0.1:<port>/<name>.<ext> + """ + with LocalAssetServer() as srv: + yield srv + + +@pytest.fixture +def image_url(request, local_asset_server) -> str: + # request.param is one of the IMAGE_ASSETS filenames + name = request.param + return local_asset_server.url_for(name) + + +@pytest.fixture +def image_urls(request, local_asset_server) -> list[str]: + """Indirect fixture: takes a list of names, returns list of full URLs.""" + names: list[str] = request.param + return [local_asset_server.url_for(name) for name in names] diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 2cbfed98a577a..bf460d0fb25d3 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -7,7 +7,7 @@ import pytest from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory -from ..openai.test_vision import TEST_IMAGE_URLS +from ..openai.test_vision import TEST_IMAGE_ASSETS @pytest.fixture(scope="function") @@ -95,7 +95,8 @@ def vision_llm(): @pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) + [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], + indirect=True) def test_chat_multi_image(vision_llm, image_urls: list[str]): messages = [{ "role": diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 106ec121a422e..9d61754059e2f 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -16,11 +16,11 @@ MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - 
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] EXPECTED_MM_BEAM_SEARCH_RES = [ @@ -69,10 +69,11 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_asset: + encode_image_base64(local_asset_server.get_image_asset(image_asset)) + for image_asset in TEST_IMAGE_ASSETS } @@ -97,7 +98,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" 
@@ -157,7 +158,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, model_name: str, image_url: str): @@ -187,7 +188,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, model_name: str, image_url: str): @@ -223,10 +224,11 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_base64encoded( - client: openai.AsyncOpenAI, model_name: str, image_url: str, - base64_encoded_image: dict[str, str]): + client: openai.AsyncOpenAI, model_name: str, raw_image_url: str, + image_url: str, base64_encoded_image: dict[str, str]): content_text = "What's in this image?" 
messages = [{ @@ -237,7 +239,7 @@ async def test_single_chat_session_image_base64encoded( "type": "image_url", "image_url": { "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" } }, { @@ -287,12 +289,12 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS)))) +@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS)))) async def test_single_chat_session_image_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_idx: int, base64_encoded_image: dict[str, str]): # NOTE: This test also validates that we pass MM data through beam search - image_url = TEST_IMAGE_URLS[image_idx] + raw_image_url = TEST_IMAGE_ASSETS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] messages = [{ @@ -303,7 +305,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( "type": "image_url", "image_url": { "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" } }, { @@ -326,7 +328,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_chat_streaming_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): messages = [{ @@ -385,7 +387,8 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", - [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) async def 
test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]): diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index d3cc2fac6af57..dbd403fb7a7b5 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -19,11 +19,11 @@ vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" assert vlm2vec_jinja_path.exists() # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] @@ -49,10 +49,11 @@ def server(): @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_url: + 
encode_image_base64(local_asset_server.get_image_asset(image_url)) + for image_url in TEST_IMAGE_ASSETS } @@ -70,7 +71,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, image_url: str): content_text = "Represent the given image." diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index d39cf706786e2..f95dbc7547ecc 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID] IMG_URLS = [ - "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg", - "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg", - "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg", - "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg", + "237-400x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg", + "231-200x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg", + "27-500x500.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg", + "17-150x600.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg", ] PROMPT = "Describe each image in one short sentence." 
@@ -105,12 +105,6 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt: return engine_inputs -MSGS = [ - _create_msg_format(IMG_URLS[:1]), - _create_msg_format(IMG_URLS[:2]), - _create_msg_format(IMG_URLS), -] - SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) LIMIT_MM_PER_PROMPT = dict(image=4) @@ -156,12 +150,8 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_chat( - vllm_runner, - max_model_len: int, - model: str, - dtype: str, -) -> None: +def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str, + local_asset_server) -> None: EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs( FIXTURE_LOGPROBS_CHAT[model]) with vllm_runner( @@ -174,7 +164,14 @@ def test_chat( limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: outputs = [] - for msg in MSGS: + + urls_all = [local_asset_server.url_for(u) for u in IMG_URLS] + msgs = [ + _create_msg_format(urls_all[:1]), + _create_msg_format(urls_all[:2]), + _create_msg_format(urls_all), + ] + for msg in msgs: output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS) outputs.extend(output) @@ -190,14 +187,24 @@ def test_chat( name_1="output") -@pytest.mark.parametrize("prompt,expected_ranges", - [(_create_engine_inputs_hf(IMG_URLS[:1]), - [PlaceholderRange(offset=11, length=494)]), - (_create_engine_inputs_hf(IMG_URLS[1:4]), [ - PlaceholderRange(offset=11, length=266), - PlaceholderRange(offset=277, length=1056), - PlaceholderRange(offset=1333, length=418) - ])]) +@pytest.fixture +def prompt(request, local_asset_server) -> TextPrompt: + names = request.param + urls = [local_asset_server.url_for(n) for n in names] + return _create_engine_inputs_hf(urls) + + +@pytest.mark.parametrize( + "prompt,expected_ranges", + [ + pytest.param(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]), + 
pytest.param(IMG_URLS[1:4], [ + PlaceholderRange(offset=11, length=266), + PlaceholderRange(offset=277, length=1056), + PlaceholderRange(offset=1333, length=418) + ]) + ], +) def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt, expected_ranges: list[PlaceholderRange], monkeypatch) -> None: diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 0f82e1f3e343e..886582a516409 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -31,11 +31,11 @@ if TYPE_CHECKING: from vllm.multimodal.inputs import MultiModalPlaceholderDict # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] TEST_VIDEO_URLS = [ @@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [ @pytest.fixture(scope="module") -def url_images() -> dict[str, Image.Image]: - connector = MediaConnector() +def url_images(local_asset_server) -> dict[str, 
Image.Image]: return { - image_url: connector.fetch_image(image_url) - for image_url in TEST_IMAGE_URLS + image_url: local_asset_server.get_image_asset(image_url) + for image_url in TEST_IMAGE_ASSETS } @@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool: @pytest.mark.asyncio -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_fetch_image_http(image_url: str): connector = MediaConnector() @@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.asyncio -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) @pytest.mark.parametrize("suffix", get_supported_suffixes()) async def test_fetch_image_base64(url_images: dict[str, Image.Image], - image_url: str, suffix: str): + raw_image_url: str, suffix: str): connector = MediaConnector() - url_image = url_images[image_url] + url_image = url_images[raw_image_url] try: mime_type = Image.MIME[Image.registered_extensions()[suffix]] @@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image], @pytest.mark.asyncio -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_fetch_image_local_files(image_url: str): connector = MediaConnector() @@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str): @pytest.mark.asyncio -async def test_fetch_image_local_files_with_space_in_name(): - image_url = TEST_IMAGE_URLS[0] +@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True) +async def test_fetch_image_local_files_with_space_in_name(image_url: str): connector = MediaConnector() with TemporaryDirectory() as temp_dir: diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/responses/test_image.py index c8d09fd39fb13..3ed36ca678c0c 100644 --- 
a/tests/v1/entrypoints/openai/responses/test_image.py +++ b/tests/v1/entrypoints/openai/responses/test_image.py @@ -8,17 +8,17 @@ import pytest import pytest_asyncio from tests.utils import RemoteOpenAIServer -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import encode_image_base64 # Use a small vision model for testing MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] @@ -52,16 +52,17 @@ async def client(image_server): @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_url: + encode_image_base64(local_asset_server.get_image_asset(image_url)) + for 
image_url in TEST_IMAGE_ASSETS } @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" @@ -91,11 +92,11 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, - image_url: str, + raw_image_url: str, base64_encoded_image: dict[str, str], ): content_text = "What's in this image?" @@ -106,7 +107,7 @@ async def test_single_chat_session_image_base64encoded( { "type": "input_image", "image_url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}", + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", "detail": "auto", }, { @@ -127,7 +128,8 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", - [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]): messages = [{ diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index bcc2993028dd6..9947fcbe73135 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -4,18 +4,19 @@ import openai import pytest -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import encode_image_base64 from vllm.platforms import current_platform -from 
...entrypoints.openai.test_vision import TEST_IMAGE_URLS +from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS from ...utils import RemoteOpenAIServer @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_asset: + encode_image_base64(local_asset_server.get_image_asset(image_asset)) + for image_asset in TEST_IMAGE_ASSETS } @@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, client: openai.AsyncOpenAI = remote_server.get_async_client() # Other requests now should be much faster - for image_url in TEST_IMAGE_URLS: + for image_url in TEST_IMAGE_ASSETS: image_base64 = base64_encoded_image[image_url] chat_completion_from_base64 = await client.chat.completions\ .create( diff --git a/vllm/assets/image.py b/vllm/assets/image.py index c8f8d43a98355..4639a11187d03 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass +from pathlib import Path from typing import Literal import torch @@ -11,17 +12,29 @@ from .base import get_vllm_public_assets VLM_IMAGES_DIR = "vision_model_images" -ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato"] +ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk", + "Grayscale_8bits_palette_sample_image", + "1280px-Venn_diagram_rgb", "RGBA_comp", "237-400x300", + "231-200x300", "27-500x500", "17-150x600", + "handelsblatt-preview", "paper-11"] @dataclass(frozen=True) class ImageAsset: name: ImageAssetName + def get_path(self, ext: str) -> Path: + """ + Return s3 path for given image. 
+ """ + return get_vllm_public_assets(filename=f"{self.name}.{ext}", + s3_prefix=VLM_IMAGES_DIR) + @property - def pil_image(self) -> Image.Image: - image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", - s3_prefix=VLM_IMAGES_DIR) + def pil_image(self, ext="jpg") -> Image.Image: + + image_path = self.get_path(ext) return Image.open(image_path) @property @@ -29,6 +42,9 @@ class ImageAsset: """ Image embeddings, only used for testing purposes with llava 1.5. """ - image_path = get_vllm_public_assets(filename=f"{self.name}.pt", - s3_prefix=VLM_IMAGES_DIR) + image_path = self.get_path('pt') return torch.load(image_path, map_location="cpu", weights_only=True) + + def read_bytes(self, ext: str) -> bytes: + p = Path(self.get_path(ext)) + return p.read_bytes()