[CI/Build] Serve images used by multimodal tests through local HTTP Server (#23907)

Signed-off-by: Divyansh Singhvi <divyanshsinghvi@gmail.com>
Signed-off-by: dsinghvi <divyanshsinghvi@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
dsinghvi 2025-09-03 13:43:11 +05:30 committed by GitHub
parent f0c503f66e
commit 70549c1245
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 250 additions and 98 deletions

View File

@ -1,9 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import http.server
import json
import math
import mimetypes
import os
import socket
import tempfile
import threading
from collections.abc import Generator
from enum import Enum
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
@ -32,6 +37,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.sequence import Logprob
@ -1253,3 +1259,119 @@ def cli_config_file():
def cli_config_file_with_model():
"""Return the path to the CLI config file with model."""
return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml")
class AssetHandler(http.server.BaseHTTPRequestHandler):
    """Serve vLLM image assets over HTTP for multimodal tests.

    Handles GET requests of the form ``/<name>.<ext>`` (e.g.
    ``/1280px-Venn_diagram_rgb.jpg``) by loading the matching
    :class:`ImageAsset` and streaming its raw bytes back.
    """

    def log_message(self, *args, **kwargs):
        # Silence per-request access logging so test output stays clean.
        pass

    def do_GET(self):
        # Accepts paths like: /1280px-Venn_diagram_rgb.jpg
        filename = self.path.lstrip("/")
        if not filename or "." not in filename:
            self.send_error(404, "Missing filename (expected /<name>.<ext>)")
            return

        base, ext = filename.rsplit(".", 1)
        ext = ext.lower()
        if ext not in ["jpg", "png"]:
            self.send_error(404, f"Unsupported extension: .{ext}")
            return

        try:
            data = ImageAsset(base).read_bytes(ext=ext)
        except Exception as e:
            self.send_error(500, f"Failed to load asset: {ext} {base} {e} ")
            return

        ctype, _ = mimetypes.guess_type(filename)
        if ctype is None:
            # Fallback when guess_type fails. NOTE: the registered MIME
            # type for JPEG is "image/jpeg", not "image/jpg".
            ctype = {"jpg": "image/jpeg", "png": "image/png"}[ext]

        self.send_response(200)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)
def _find_free_port() -> int:
with socket.socket() as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
class LocalAssetServer:
    """Context manager that serves image assets over local HTTP.

    ``__enter__`` binds a ``ThreadingHTTPServer`` (using ``AssetHandler``)
    to a random free port on ``address`` and runs it in a daemon thread;
    ``__exit__`` shuts it down and releases the listening socket.
    """

    address: str
    # -1 until the server has been started via __enter__.
    port: int
    server: Optional[http.server.ThreadingHTTPServer]
    thread: Optional[threading.Thread]

    def __init__(self, address: str = "127.0.0.1") -> None:
        self.address = address
        self.port = -1
        self.server = None
        self.thread = None

    def __enter__(self):
        self.port = _find_free_port()
        self.server = http.server.ThreadingHTTPServer(
            (self.address, self.port), AssetHandler)
        self.thread = threading.Thread(target=self.server.serve_forever,
                                       daemon=True)
        self.thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.server:
            self.server.shutdown()
            # shutdown() only stops serve_forever(); server_close() is
            # needed to release the listening socket fd.
            self.server.server_close()
            self.server = None
        if self.thread:
            self.thread.join()
            self.thread = None
        # Never suppress exceptions raised inside the ``with`` block.
        return False

    @property
    def base_url(self) -> str:
        # port is an int (-1 before start), so guard on the value rather
        # than on ``is not None``, which would always hold.
        assert self.port > 0, "server not started (use as a context manager)"
        return f"http://{self.address}:{self.port}"

    def url_for(self, name: str) -> str:
        """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
        return f"{self.base_url}/{name}"

    def get_image_asset(self, name: str) -> "Image.Image":
        """Fetch the named asset from this server as a PIL image."""
        return fetch_image(self.url_for(name))
@pytest.fixture(scope="session")
def local_asset_server() -> Generator[LocalAssetServer, None, None]:
    """
    Starts a thread based HTTP server bound to 127.0.0.1 on a random free port.
    The server currently serves images at:
        http://127.0.0.1:<port>/<name>.<ext>
    """
    with LocalAssetServer() as srv:
        yield srv
@pytest.fixture
def image_url(request, local_asset_server) -> str:
    """Indirect fixture: maps one IMAGE_ASSETS filename to its served URL."""
    asset_name = request.param
    return local_asset_server.url_for(asset_name)
@pytest.fixture
def image_urls(request, local_asset_server) -> list[str]:
    """Indirect fixture: takes a list of names, returns list of full URLs."""
    return [local_asset_server.url_for(asset_name)
            for asset_name in request.param]

View File

@ -7,7 +7,7 @@ import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from ..openai.test_vision import TEST_IMAGE_URLS
from ..openai.test_vision import TEST_IMAGE_ASSETS
@pytest.fixture(scope="function")
@ -95,7 +95,8 @@ def vision_llm():
@pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
[[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
indirect=True)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
messages = [{
"role":

View File

@ -16,11 +16,11 @@ MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
EXPECTED_MM_BEAM_SEARCH_RES = [
@ -69,10 +69,11 @@ async def client(server):
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
image_asset:
encode_image_base64(local_asset_server.get_image_asset(image_asset))
for image_asset in TEST_IMAGE_ASSETS
}
@ -97,7 +98,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str):
content_text = "What's in this image?"
@ -157,7 +158,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
model_name: str,
image_url: str):
@ -187,7 +188,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
model_name: str,
image_url: str):
@ -223,10 +224,11 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: dict[str, str]):
client: openai.AsyncOpenAI, model_name: str, raw_image_url: str,
image_url: str, base64_encoded_image: dict[str, str]):
content_text = "What's in this image?"
messages = [{
@ -237,7 +239,7 @@ async def test_single_chat_session_image_base64encoded(
"type": "image_url",
"image_url": {
"url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
}
},
{
@ -287,12 +289,12 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS))))
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, image_idx: int,
base64_encoded_image: dict[str, str]):
# NOTE: This test also validates that we pass MM data through beam search
image_url = TEST_IMAGE_URLS[image_idx]
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = [{
@ -303,7 +305,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
"type": "image_url",
"image_url": {
"url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
}
},
{
@ -326,7 +328,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_chat_streaming_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
@ -385,7 +387,8 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
"image_urls",
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
indirect=True)
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
image_urls: list[str]):

View File

@ -19,11 +19,11 @@ vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
@ -49,10 +49,11 @@ def server():
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
image_url:
encode_image_base64(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
@ -70,7 +71,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
image_url: str):
content_text = "Represent the given image."

View File

@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
IMG_URLS = [
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
"237-400x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"231-200x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
"27-500x500.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
"17-150x600.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
]
PROMPT = "Describe each image in one short sentence."
@ -105,12 +105,6 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
return engine_inputs
MSGS = [
_create_msg_format(IMG_URLS[:1]),
_create_msg_format(IMG_URLS[:2]),
_create_msg_format(IMG_URLS),
]
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
@ -156,12 +150,8 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
vllm_runner,
max_model_len: int,
model: str,
dtype: str,
) -> None:
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
local_asset_server) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
@ -174,7 +164,14 @@ def test_chat(
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
for msg in MSGS:
urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
msgs = [
_create_msg_format(urls_all[:1]),
_create_msg_format(urls_all[:2]),
_create_msg_format(urls_all),
]
for msg in msgs:
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
@ -190,14 +187,24 @@ def test_chat(
name_1="output")
@pytest.mark.parametrize("prompt,expected_ranges",
[(_create_engine_inputs_hf(IMG_URLS[:1]),
[PlaceholderRange(offset=11, length=494)]),
(_create_engine_inputs_hf(IMG_URLS[1:4]), [
PlaceholderRange(offset=11, length=266),
PlaceholderRange(offset=277, length=1056),
PlaceholderRange(offset=1333, length=418)
])])
@pytest.fixture
def prompt(request, local_asset_server) -> TextPrompt:
names = request.param
urls = [local_asset_server.url_for(n) for n in names]
return _create_engine_inputs_hf(urls)
@pytest.mark.parametrize(
"prompt,expected_ranges",
[
pytest.param(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
pytest.param(IMG_URLS[1:4], [
PlaceholderRange(offset=11, length=266),
PlaceholderRange(offset=277, length=1056),
PlaceholderRange(offset=1333, length=418)
])
],
)
def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt,
expected_ranges: list[PlaceholderRange],
monkeypatch) -> None:

View File

@ -31,11 +31,11 @@ if TYPE_CHECKING:
from vllm.multimodal.inputs import MultiModalPlaceholderDict
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
TEST_VIDEO_URLS = [
@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [
@pytest.fixture(scope="module")
def url_images() -> dict[str, Image.Image]:
connector = MediaConnector()
def url_images(local_asset_server) -> dict[str, Image.Image]:
return {
image_url: connector.fetch_image(image_url)
for image_url in TEST_IMAGE_URLS
image_url: local_asset_server.get_image_asset(image_url)
for image_url in TEST_IMAGE_ASSETS
}
@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
connector = MediaConnector()
@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
image_url: str, suffix: str):
raw_image_url: str, suffix: str):
connector = MediaConnector()
url_image = url_images[image_url]
url_image = url_images[raw_image_url]
try:
mime_type = Image.MIME[Image.registered_extensions()[suffix]]
@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image],
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
connector = MediaConnector()
@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str):
@pytest.mark.asyncio
async def test_fetch_image_local_files_with_space_in_name():
image_url = TEST_IMAGE_URLS[0]
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
connector = MediaConnector()
with TemporaryDirectory() as temp_dir:

View File

@ -8,17 +8,17 @@ import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64, fetch_image
from vllm.multimodal.utils import encode_image_base64
# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
@ -52,16 +52,17 @@ async def client(image_server):
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
image_url:
encode_image_base64(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str):
content_text = "What's in this image?"
@ -91,11 +92,11 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI,
model_name: str,
image_url: str,
raw_image_url: str,
base64_encoded_image: dict[str, str],
):
content_text = "What's in this image?"
@ -106,7 +107,7 @@ async def test_single_chat_session_image_base64encoded(
{
"type": "input_image",
"image_url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
"detail": "auto",
},
{
@ -127,7 +128,8 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
"image_urls",
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
indirect=True)
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
image_urls: list[str]):
messages = [{

View File

@ -4,18 +4,19 @@
import openai
import pytest
from vllm.multimodal.utils import encode_image_base64, fetch_image
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform
from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
image_asset:
encode_image_base64(local_asset_server.get_image_asset(image_asset))
for image_asset in TEST_IMAGE_ASSETS
}
@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
client: openai.AsyncOpenAI = remote_server.get_async_client()
# Other requests now should be much faster
for image_url in TEST_IMAGE_URLS:
for image_url in TEST_IMAGE_ASSETS:
image_base64 = base64_encoded_image[image_url]
chat_completion_from_base64 = await client.chat.completions\
.create(

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from pathlib import Path
from typing import Literal
import torch
@ -11,17 +12,29 @@ from .base import get_vllm_public_assets
VLM_IMAGES_DIR = "vision_model_images"
ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato"]
ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato",
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk",
"Grayscale_8bits_palette_sample_image",
"1280px-Venn_diagram_rgb", "RGBA_comp", "237-400x300",
"231-200x300", "27-500x500", "17-150x600",
"handelsblatt-preview", "paper-11"]
@dataclass(frozen=True)
class ImageAsset:
name: ImageAssetName
def get_path(self, ext: str) -> Path:
"""
Return s3 path for given image.
"""
return get_vllm_public_assets(filename=f"{self.name}.{ext}",
s3_prefix=VLM_IMAGES_DIR)
@property
def pil_image(self) -> Image.Image:
image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
s3_prefix=VLM_IMAGES_DIR)
def pil_image(self, ext="jpg") -> Image.Image:
image_path = self.get_path(ext)
return Image.open(image_path)
@property
@ -29,6 +42,9 @@ class ImageAsset:
"""
Image embeddings, only used for testing purposes with llava 1.5.
"""
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR)
image_path = self.get_path('pt')
return torch.load(image_path, map_location="cpu", weights_only=True)
def read_bytes(self, ext: str) -> bytes:
p = Path(self.get_path(ext))
return p.read_bytes()