mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-23 04:39:08 +08:00
[CI/Build] Serve images used by multimodal tests through local HTTP Server (#23907)
Signed-off-by: Divyansh Singhvi <divyanshsinghvi@gmail.com> Signed-off-by: dsinghvi <divyanshsinghvi@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
parent
f0c503f66e
commit
70549c1245
@ -1,9 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import http.server
|
||||
import json
|
||||
import math
|
||||
import mimetypes
|
||||
import os
|
||||
import socket
|
||||
import tempfile
|
||||
import threading
|
||||
from collections.abc import Generator
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
|
||||
|
||||
@ -32,6 +37,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
|
||||
to_enc_dec_tuple_list, zip_enc_dec_prompts)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.sequence import Logprob
|
||||
@ -1253,3 +1259,119 @@ def cli_config_file():
|
||||
def cli_config_file_with_model():
|
||||
"""Return the path to the CLI config file with model."""
|
||||
return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml")
|
||||
|
||||
|
||||
class AssetHandler(http.server.BaseHTTPRequestHandler):
    """Serve test image assets over HTTP.

    Handles GET requests of the form ``/<name>.<ext>`` by loading the
    corresponding ``ImageAsset`` and returning its raw bytes with an
    appropriate Content-Type header.
    """

    def log_message(self, *args, **kwargs):
        # Silence per-request logging to keep test output clean.
        pass

    def do_GET(self):
        # Accepts paths like: /1280px-Venn_diagram_rgb.jpg
        filename = self.path.lstrip("/")
        if not filename or "." not in filename:
            self.send_error(404, "Missing filename (expected /<name>.<ext>)")
            return

        base, ext = filename.rsplit(".", 1)
        ext = ext.lower()

        if ext not in ("jpg", "png"):
            self.send_error(404, f"Unsupported extension: .{ext}")
            return

        try:
            data = ImageAsset(base).read_bytes(ext=ext)
        except Exception as e:
            self.send_error(500, f"Failed to load asset: {ext} {base} {e} ")
            return

        ctype, _ = mimetypes.guess_type(filename)
        if ctype is None:
            # Fallback mapping; "image/jpeg" is the registered MIME type
            # ("image/jpg" is not a valid media type).
            ctype = {"jpg": "image/jpeg", "png": "image/png"}[ext]
        self.send_response(200)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)
|
||||
|
||||
|
||||
def _find_free_port() -> int:
|
||||
with socket.socket() as s:
|
||||
s.bind(("127.0.0.1", 0))
|
||||
return s.getsockname()[1]
|
||||
|
||||
|
||||
class LocalAssetServer:
    """Context-managed HTTP server that serves image assets for tests.

    On entry, binds ``AssetHandler`` to a random free port on ``address``
    and serves requests from a background daemon thread; on exit, stops
    the server, closes its listening socket, and joins the thread.
    """

    address: str
    port: int
    server: Optional[http.server.ThreadingHTTPServer]
    thread: Optional[threading.Thread]

    def __init__(self, address: str = "127.0.0.1") -> None:
        self.address = address
        # -1 marks "not started"; a real port is assigned in __enter__.
        self.port = -1
        self.server = None
        self.thread = None

    def __enter__(self):
        self.port = _find_free_port()
        self.server = http.server.ThreadingHTTPServer(
            (self.address, self.port), AssetHandler)
        self.thread = threading.Thread(target=self.server.serve_forever,
                                       daemon=True)
        self.thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.server:
            self.server.shutdown()
            # shutdown() only stops the serve_forever() loop; the listening
            # socket must be closed explicitly or it leaks.
            self.server.server_close()
            self.server = None

        if self.thread:
            self.thread.join()
            self.thread = None

        # Never suppress exceptions raised inside the with-block.
        return False

    @property
    def base_url(self) -> str:
        # port is always an int; -1 means the server was never started.
        assert self.port > 0, "server not started"
        return f"http://{self.address}:{self.port}"

    def url_for(self, name: str) -> str:
        """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
        return f"{self.base_url}/{name}"

    def get_image_asset(self, name: str) -> "Image.Image":
        """Fetch the named asset through the local server as a PIL image."""
        return fetch_image(self.url_for(name))
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def local_asset_server() -> Generator[LocalAssetServer, None, None]:
    """
    Starts a thread based HTTP server bound to 127.0.0.1 on a random free port.
    The server currently serves images at:
        http://127.0.0.1:<port>/<name>.<ext>
    """
    server = LocalAssetServer()
    with server:
        yield server
||||
|
||||
|
||||
@pytest.fixture
def image_url(request, local_asset_server) -> str:
    """Indirect fixture: maps an IMAGE_ASSETS filename to its local URL."""
    # request.param is one of the IMAGE_ASSETS filenames
    return local_asset_server.url_for(request.param)
|
||||
|
||||
|
||||
@pytest.fixture
def image_urls(request, local_asset_server) -> list[str]:
    """Indirect fixture: takes a list of names, returns list of full URLs."""
    asset_names: list[str] = request.param
    return list(map(local_asset_server.url_for, asset_names))
|
||||
|
||||
@ -7,7 +7,7 @@ import pytest
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
from ..openai.test_vision import TEST_IMAGE_URLS
|
||||
from ..openai.test_vision import TEST_IMAGE_ASSETS
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@ -95,7 +95,8 @@ def vision_llm():
|
||||
|
||||
|
||||
@pytest.mark.parametrize("image_urls",
|
||||
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
|
||||
[[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
|
||||
indirect=True)
|
||||
def test_chat_multi_image(vision_llm, image_urls: list[str]):
|
||||
messages = [{
|
||||
"role":
|
||||
|
||||
@ -16,11 +16,11 @@ MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
|
||||
MAXIMUM_IMAGES = 2
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
TEST_IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
TEST_IMAGE_ASSETS = [
|
||||
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
]
|
||||
|
||||
EXPECTED_MM_BEAM_SEARCH_RES = [
|
||||
@ -69,10 +69,11 @@ async def client(server):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image() -> dict[str, str]:
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(fetch_image(image_url))
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
image_asset:
|
||||
encode_image_base64(local_asset_server.get_image_asset(image_asset))
|
||||
for image_asset in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
|
||||
@ -97,7 +98,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
|
||||
model_name: str, image_url: str):
|
||||
content_text = "What's in this image?"
|
||||
@ -157,7 +158,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_url: str):
|
||||
@ -187,7 +188,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_url: str):
|
||||
@ -223,10 +224,11 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_single_chat_session_image_base64encoded(
|
||||
client: openai.AsyncOpenAI, model_name: str, image_url: str,
|
||||
base64_encoded_image: dict[str, str]):
|
||||
client: openai.AsyncOpenAI, model_name: str, raw_image_url: str,
|
||||
image_url: str, base64_encoded_image: dict[str, str]):
|
||||
|
||||
content_text = "What's in this image?"
|
||||
messages = [{
|
||||
@ -237,7 +239,7 @@ async def test_single_chat_session_image_base64encoded(
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url":
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -287,12 +289,12 @@ async def test_single_chat_session_image_base64encoded(
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS))))
|
||||
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
|
||||
async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
client: openai.AsyncOpenAI, model_name: str, image_idx: int,
|
||||
base64_encoded_image: dict[str, str]):
|
||||
# NOTE: This test also validates that we pass MM data through beam search
|
||||
image_url = TEST_IMAGE_URLS[image_idx]
|
||||
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||
|
||||
messages = [{
|
||||
@ -303,7 +305,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url":
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -326,7 +328,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_chat_streaming_image(client: openai.AsyncOpenAI,
|
||||
model_name: str, image_url: str):
|
||||
messages = [{
|
||||
@ -385,7 +387,8 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls",
|
||||
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
|
||||
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
|
||||
indirect=True)
|
||||
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
|
||||
image_urls: list[str]):
|
||||
|
||||
|
||||
@ -19,11 +19,11 @@ vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
|
||||
assert vlm2vec_jinja_path.exists()
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
TEST_IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
TEST_IMAGE_ASSETS = [
|
||||
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
]
|
||||
|
||||
|
||||
@ -49,10 +49,11 @@ def server():
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image() -> dict[str, str]:
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(fetch_image(image_url))
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
image_url:
|
||||
encode_image_base64(local_asset_server.get_image_asset(image_url))
|
||||
for image_url in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
|
||||
@ -70,7 +71,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
|
||||
image_url: str):
|
||||
content_text = "Represent the given image."
|
||||
|
||||
@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
|
||||
|
||||
IMG_URLS = [
|
||||
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
|
||||
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
|
||||
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
|
||||
"237-400x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
"231-200x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
"27-500x500.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
"17-150x600.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
|
||||
]
|
||||
PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
@ -105,12 +105,6 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
|
||||
return engine_inputs
|
||||
|
||||
|
||||
MSGS = [
|
||||
_create_msg_format(IMG_URLS[:1]),
|
||||
_create_msg_format(IMG_URLS[:2]),
|
||||
_create_msg_format(IMG_URLS),
|
||||
]
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
LIMIT_MM_PER_PROMPT = dict(image=4)
|
||||
|
||||
@ -156,12 +150,8 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_chat(
|
||||
vllm_runner,
|
||||
max_model_len: int,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
local_asset_server) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
|
||||
FIXTURE_LOGPROBS_CHAT[model])
|
||||
with vllm_runner(
|
||||
@ -174,7 +164,14 @@ def test_chat(
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
for msg in MSGS:
|
||||
|
||||
urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
|
||||
msgs = [
|
||||
_create_msg_format(urls_all[:1]),
|
||||
_create_msg_format(urls_all[:2]),
|
||||
_create_msg_format(urls_all),
|
||||
]
|
||||
for msg in msgs:
|
||||
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
outputs.extend(output)
|
||||
@ -190,14 +187,24 @@ def test_chat(
|
||||
name_1="output")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("prompt,expected_ranges",
|
||||
[(_create_engine_inputs_hf(IMG_URLS[:1]),
|
||||
[PlaceholderRange(offset=11, length=494)]),
|
||||
(_create_engine_inputs_hf(IMG_URLS[1:4]), [
|
||||
PlaceholderRange(offset=11, length=266),
|
||||
PlaceholderRange(offset=277, length=1056),
|
||||
PlaceholderRange(offset=1333, length=418)
|
||||
])])
|
||||
@pytest.fixture
|
||||
def prompt(request, local_asset_server) -> TextPrompt:
|
||||
names = request.param
|
||||
urls = [local_asset_server.url_for(n) for n in names]
|
||||
return _create_engine_inputs_hf(urls)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prompt,expected_ranges",
|
||||
[
|
||||
pytest.param(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
|
||||
pytest.param(IMG_URLS[1:4], [
|
||||
PlaceholderRange(offset=11, length=266),
|
||||
PlaceholderRange(offset=277, length=1056),
|
||||
PlaceholderRange(offset=1333, length=418)
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt,
|
||||
expected_ranges: list[PlaceholderRange],
|
||||
monkeypatch) -> None:
|
||||
|
||||
@ -31,11 +31,11 @@ if TYPE_CHECKING:
|
||||
from vllm.multimodal.inputs import MultiModalPlaceholderDict
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
TEST_IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
TEST_IMAGE_ASSETS = [
|
||||
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
]
|
||||
|
||||
TEST_VIDEO_URLS = [
|
||||
@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def url_images() -> dict[str, Image.Image]:
|
||||
connector = MediaConnector()
|
||||
def url_images(local_asset_server) -> dict[str, Image.Image]:
|
||||
|
||||
return {
|
||||
image_url: connector.fetch_image(image_url)
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
image_url: local_asset_server.get_image_asset(image_url)
|
||||
for image_url in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
|
||||
@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_fetch_image_http(image_url: str):
|
||||
connector = MediaConnector()
|
||||
|
||||
@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
|
||||
@pytest.mark.parametrize("suffix", get_supported_suffixes())
|
||||
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
|
||||
image_url: str, suffix: str):
|
||||
raw_image_url: str, suffix: str):
|
||||
connector = MediaConnector()
|
||||
url_image = url_images[image_url]
|
||||
url_image = url_images[raw_image_url]
|
||||
|
||||
try:
|
||||
mime_type = Image.MIME[Image.registered_extensions()[suffix]]
|
||||
@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image],
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_fetch_image_local_files(image_url: str):
|
||||
connector = MediaConnector()
|
||||
|
||||
@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fetch_image_local_files_with_space_in_name():
|
||||
image_url = TEST_IMAGE_URLS[0]
|
||||
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
|
||||
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
|
||||
connector = MediaConnector()
|
||||
|
||||
with TemporaryDirectory() as temp_dir:
|
||||
|
||||
@ -8,17 +8,17 @@ import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
|
||||
# Use a small vision model for testing
|
||||
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
MAXIMUM_IMAGES = 2
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
TEST_IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
TEST_IMAGE_ASSETS = [
|
||||
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
"Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
]
|
||||
|
||||
|
||||
@ -52,16 +52,17 @@ async def client(image_server):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image() -> dict[str, str]:
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(fetch_image(image_url))
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
image_url:
|
||||
encode_image_base64(local_asset_server.get_image_asset(image_url))
|
||||
for image_url in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
|
||||
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
|
||||
model_name: str, image_url: str):
|
||||
content_text = "What's in this image?"
|
||||
@ -91,11 +92,11 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
|
||||
async def test_single_chat_session_image_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_url: str,
|
||||
raw_image_url: str,
|
||||
base64_encoded_image: dict[str, str],
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
@ -106,7 +107,7 @@ async def test_single_chat_session_image_base64encoded(
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url":
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
||||
"detail": "auto",
|
||||
},
|
||||
{
|
||||
@ -127,7 +128,8 @@ async def test_single_chat_session_image_base64encoded(
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls",
|
||||
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
|
||||
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
|
||||
indirect=True)
|
||||
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
|
||||
image_urls: list[str]):
|
||||
messages = [{
|
||||
|
||||
@ -4,18 +4,19 @@
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS
|
||||
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image() -> dict[str, str]:
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(fetch_image(image_url))
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
image_asset:
|
||||
encode_image_base64(local_asset_server.get_image_asset(image_asset))
|
||||
for image_asset in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
|
||||
@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
|
||||
client: openai.AsyncOpenAI = remote_server.get_async_client()
|
||||
|
||||
# Other requests now should be much faster
|
||||
for image_url in TEST_IMAGE_URLS:
|
||||
for image_url in TEST_IMAGE_ASSETS:
|
||||
image_base64 = base64_encoded_image[image_url]
|
||||
chat_completion_from_base64 = await client.chat.completions\
|
||||
.create(
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
import torch
|
||||
@ -11,17 +12,29 @@ from .base import get_vllm_public_assets
|
||||
|
||||
VLM_IMAGES_DIR = "vision_model_images"
|
||||
|
||||
ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato"]
|
||||
ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato",
|
||||
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk",
|
||||
"Grayscale_8bits_palette_sample_image",
|
||||
"1280px-Venn_diagram_rgb", "RGBA_comp", "237-400x300",
|
||||
"231-200x300", "27-500x500", "17-150x600",
|
||||
"handelsblatt-preview", "paper-11"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ImageAsset:
|
||||
name: ImageAssetName
|
||||
|
||||
def get_path(self, ext: str) -> Path:
|
||||
"""
|
||||
Return s3 path for given image.
|
||||
"""
|
||||
return get_vllm_public_assets(filename=f"{self.name}.{ext}",
|
||||
s3_prefix=VLM_IMAGES_DIR)
|
||||
|
||||
@property
|
||||
def pil_image(self) -> Image.Image:
|
||||
image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
|
||||
s3_prefix=VLM_IMAGES_DIR)
|
||||
def pil_image(self, ext="jpg") -> Image.Image:
|
||||
|
||||
image_path = self.get_path(ext)
|
||||
return Image.open(image_path)
|
||||
|
||||
@property
|
||||
@ -29,6 +42,9 @@ class ImageAsset:
|
||||
"""
|
||||
Image embeddings, only used for testing purposes with llava 1.5.
|
||||
"""
|
||||
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
|
||||
s3_prefix=VLM_IMAGES_DIR)
|
||||
image_path = self.get_path('pt')
|
||||
return torch.load(image_path, map_location="cpu", weights_only=True)
|
||||
|
||||
def read_bytes(self, ext: str) -> bytes:
|
||||
p = Path(self.get_path(ext))
|
||||
return p.read_bytes()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user