Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>

parent 03dd652c16
commit 01dfb5e982
@@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
```python
from vllm import LLM

# Default white background (no configuration needed)
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Custom black background for dark theme
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
)

# Custom brand color background (e.g., blue)
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
)
```
@@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd

## Online Serving

-Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
+Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support optional user-provided UUIDs that uniquely identify each media item; these UUIDs are used to cache media processing results across requests.

!!! important
    A chat template is **required** to use Chat Completions API.
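
To make the caching behavior concrete, here is a minimal sketch (not part of this commit) of two requests that reuse the same media UUID so the server can serve the second request's image processing from cache. The endpoint, model name, image URL, and `my-image-uuid-1` identifier are illustrative assumptions:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
image_url = "https://example.com/duck.jpg"  # hypothetical image URL

# Both requests tag the image with the same UUID, so the server can reuse
# the cached processing result when handling the second request.
for question in ["What animal is this?", "What color is it?"]:
    chat = client.chat.completions.create(
        model="llava-hf/llava-1.5-7b-hf",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                    "uuid": "my-image-uuid-1",  # optional, user-provided
                },
            ],
        }],
    )
    print(chat.choices[0].message.content)
```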
@@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
            # NOTE: The prompt formatting with the image token `<image>` is not needed
            # since the prompt will be processed automatically by the API server.
            {"type": "text", "text": "What's in this image?"},
-           {"type": "image_url", "image_url": {"url": image_url}},
+           {
+               "type": "image_url",
+               "image_url": {
+                   "url": image_url
+               },
+               "uuid": image_url  # Optional
+           },
        ],
    }],
)
@@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
        "role": "user",
        "content": [
            {"type": "text", "text": "What are the animals in these images?"},
-           {"type": "image_url", "image_url": {"url": image_url_duck}},
-           {"type": "image_url", "image_url": {"url": image_url_lion}},
+           {
+               "type": "image_url",
+               "image_url": {
+                   "url": image_url_duck
+               },
+               "uuid": image_url_duck  # Optional
+           },
+           {
+               "type": "image_url",
+               "image_url": {
+                   "url": image_url_lion
+               },
+               "uuid": image_url_lion  # Optional
+           },
        ],
    }],
)
@@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
                "video_url": {
                    "url": video_url
                },
+               "uuid": video_url  # Optional
            },
        ],
    }],
@@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
                    "data": audio_base64,
                    "format": "wav"
                },
+               "uuid": audio_url  # Optional
            },
        ],
    }],
@@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
                "audio_url": {
                    "url": audio_url
                },
+               "uuid": audio_url  # Optional
            },
        ],
    }],
@@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
    "type": "image_embeds",
-   "image_embeds": f"{base64_image_embedding}"
+   "image_embeds": f"{base64_image_embedding}",
+   "uuid": image_url  # Optional
}

# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
@@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
    },
+   "uuid": image_url  # Optional
}
model = "openbmb/MiniCPM-V-2_6"
embeds = {
@@ -714,6 +737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
    },
+   "uuid": image_url  # Optional
}
chat_completion = client.chat.completions.create(
    messages=[
@@ -436,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True)
async def test_completions_with_image(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    for image_url in image_urls:
        chat_completion = await client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant."
                },
                {
                    "role":
                    "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                            }
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True)
async def test_completions_with_image_with_uuid(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    for image_url in image_urls:
        chat_completion = await client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant."
                },
                {
                    "role":
                    "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                            },
                            "uuid": image_url
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True)
async def test_completions_with_image_with_incorrect_uuid_format(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    for image_url in image_urls:
        chat_completion = await client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant."
                },
                {
                    "role":
                    "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                                "incorrect_uuid_key": image_url,
                            },
                            "also_incorrect_uuid_key": image_url,
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0
@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
                                         resolve_chat_template_content_format,
                                         resolve_hf_chat_template)
from vllm.entrypoints.llm import apply_hf_chat_template
-from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
                                   encode_video_base64)
from vllm.transformers_utils.tokenizer_group import TokenizerGroup

@@ -179,6 +179,27 @@ def _assert_mm_data_is_image_input(
    assert isinstance(image_data, list) and len(image_data) == image_count


def _assert_mm_uuids(
    mm_uuids: Optional[MultiModalUUIDDict],
    media_count: int,
    expected_uuids: list[Optional[str]],
    modality: str = "image",
) -> None:
    if len(expected_uuids) > 0:
        assert mm_uuids is not None
        assert modality in mm_uuids

        image_uuids = mm_uuids.get(modality)
        assert image_uuids is not None

        assert isinstance(image_uuids,
                          list) and len(image_uuids) == media_count

        assert image_uuids == expected_uuids
    else:
        assert mm_uuids is None


ModalityType = Literal["image", "video", "audio"]
MultiModalDataCounts = Mapping[ModalityType, int]

@@ -201,7 +222,7 @@ def test_parse_chat_messages_single_image(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -228,6 +249,260 @@ def test_parse_chat_messages_single_image(
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(mm_data, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_single_image_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                    },
                    "uuid": image_uuid,
                },
                {
                    "type": "text",
                    "text": "What's in the image?"
                },
            ],
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


def test_parse_chat_messages_single_image_with_bad_uuid_format(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                        "uuid": image_uuid,
                    },
                    "bad_uuid_key": image_uuid,
                },
                {
                    "type": "text",
                    "text": "What's in the image?"
                },
            ],
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_multiple_images_with_uuids(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                    },
                    "uuid": image_uuid1,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                    },
                    "uuid": image_uuid2,
                },
                {
                    "type": "text",
                    "text": "What's in the image?"
                },
            ],
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [{
        "role":
        "user",
        "content":
        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_with_uuid_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    image_uuid = str(hash(image_url))
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                    "uuid": image_uuid,
                },
                {
                    "type": "text",
                    "text": "What's in the image?"
                },
            ],
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(await mm_future, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                    "uuid": image_uuid1,
                },
                {
                    "type": "image_pil",
                    "image_pil": ImageAsset("cherry_blossom").pil_image,
                    "uuid": image_uuid2,
                },
                {
                    "type": "text",
                    "text": "What's in these images?"
                },
            ],
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [{
        "role":
        "user",
        "content":
        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                },
                {
                    "type": "image_pil",
                    "image_pil": ImageAsset("cherry_blossom").pil_image,
                    "uuid": image_uuid2,
                },
                {
                    "type": "text",
                    "text": "What's in these images?"
                },
            ],
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [{
        "role":
        "user",
        "content":
        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2])


def test_parse_chat_messages_empty_system(
@@ -235,7 +510,7 @@ def test_parse_chat_messages_empty_system(
    mistral_tokenizer,
):
    # Test string format
-    conversation, _ = parse_chat_messages(
+    conversation, _, _ = parse_chat_messages(
        [
            {
                "role": "system",
@@ -265,7 +540,7 @@ def test_parse_chat_messages_empty_system(
    ]

    # Test openai format
-    conversation, _ = parse_chat_messages(
+    conversation, _, _ = parse_chat_messages(
        [
            {
                "role": "system",
@@ -307,7 +582,7 @@ async def test_parse_chat_messages_single_image_async(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_future = parse_chat_messages_futures(
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [{
            "role":
            "user",
@@ -334,6 +609,7 @@ async def test_parse_chat_messages_single_image_async(
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(await mm_future, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_multiple_images(
@@ -341,7 +617,7 @@ def test_parse_chat_messages_multiple_images(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -374,6 +650,7 @@ def test_parse_chat_messages_multiple_images(
        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


@pytest.mark.asyncio
@@ -382,7 +659,7 @@ async def test_parse_chat_messages_multiple_images_async(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_future = parse_chat_messages_futures(
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [{
            "role":
            "user",
@@ -415,6 +692,7 @@ async def test_parse_chat_messages_multiple_images_async(
        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    _assert_mm_data_is_image_input(await mm_future, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_placeholder_already_in_prompt(
@@ -422,7 +700,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -458,6 +736,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
        "What's in <|image_1|> and how does it compare to <|image_2|>?",
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_placeholder_one_already_in_prompt(
@@ -465,7 +744,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -503,6 +782,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
        "other one?",
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_across_messages(
@@ -510,7 +790,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role":
@@ -569,13 +849,84 @@ def test_parse_chat_messages_multiple_images_across_messages(
        },
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                        "uuid": image_uuid,
                    },
                    {
                        "type": "text",
                        "text": "What's in this image?"
                    },
                ],
            },
            {
                "role": "assistant",
                "content": "Some stuff."
            },
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                        "uuid": image_uuid,
                    },
                    {
                        "type": "text",
                        "text": "What about this one?"
                    },
                ],
            },
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?"
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role": "user",
            "content": "<|image_2|>\nWhat about this one?"
        },
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])


def test_parse_chat_messages_context_text_format(
    phi3v_model_config,
    phi3v_tokenizer,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
@@ -621,6 +972,8 @@ def test_parse_chat_messages_context_text_format(
            }],
        },
    ]
    assert mm_data is None
+    assert mm_uuids is None


def test_parse_chat_messages_rejects_too_many_images_in_one_message(
@@ -736,7 +1089,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -762,6 +1115,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_interleave(
@@ -769,7 +1123,7 @@ def test_parse_chat_messages_multiple_images_interleave(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -813,6 +1167,7 @@ def test_parse_chat_messages_multiple_images_interleave(
        "Do they have differences?",
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


@pytest.mark.asyncio
@@ -821,7 +1176,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages_futures(
+    conversation, mm_data, mm_uuids = parse_chat_messages_futures(
        [{
            "role":
            "user",
@@ -865,6 +1220,63 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
        "Do they have differences?",
    }]
    _assert_mm_data_is_image_input(await mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages_futures(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "text",
                    "text": "I need you to compare this image",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                    "uuid": image_uuid,
                },
                {
                    "type": "text",
                    "text": "and this one"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                    "uuid": image_uuid,
                },
                {
                    "type": "text",
                    "text": "Do they have differences?"
                },
            ],
        }],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [{
        "role":
        "user",
        "content":
        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
        "Do they have differences?",
    }]
    _assert_mm_data_is_image_input(await mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])


def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
@@ -872,7 +1284,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
    phi3v_tokenizer,
    image_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role":
@@ -935,6 +1347,81 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
        },
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(  # noqa: E501
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's on this image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                        "uuid": image_uuid,
                    },
                    {
                        "type": "text",
                        "text": "Be accurate."
                    },
                ],
            },
            {
                "role": "assistant",
                "content": "Some stuff."
            },
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's on this image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                        "uuid": image_uuid,
                    },
                ],
            },
        ],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role": "user",
            "content": "What's on this image?\n<|image_2|>"
        },
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])


def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
@@ -944,7 +1431,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
    video_url,
    audio_url,
):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role":
@@ -1030,6 +1517,229 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     2,
+                     modality="image",
+                     expected_uuids=[None, None])
+    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's on this image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                        "uuid": "image_123",
                    },
                    {
                        "type": "text",
                        "text": "Now listen to this audio"
                    },
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": audio_url
                        },
                        "uuid": "audio_123",
                    },
                ],
            },
            {
                "role": "assistant",
                "content": "Some stuff."
            },
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's on this image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                        "uuid": "image_123",
                    },
                    {
                        "type": "text",
                        "text": "And what's in the video?"
                    },
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": video_url
                        },
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids,
                     2,
                     modality="image",
                     expected_uuids=["image_123", "image_123"])
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="video",
                     expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="audio",
                     expected_uuids=["audio_123"])


def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's on this image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                        "uuid": "image_123",
                    },
                    {
                        "type": "text",
                        "text": "Now listen to this audio"
                    },
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": audio_url
                        }
                    },
                ],
            },
            {
                "role": "assistant",
                "content": "Some stuff."
            },
            {
                "role":
                "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's on this image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        }
                    },
                    {
                        "type": "text",
                        "text": "And what's in the video?"
                    },
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": video_url
                        },
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids,
                     2,
                     modality="image",
                     expected_uuids=["image_123", None])
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="video",
                     expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
@@ -1081,7 +1791,7 @@ def test_mllama_single_image(
    image_url,
):
    """Ensures that a single image is parsed correctly for mllama."""
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -1100,6 +1810,7 @@ def test_mllama_single_image(
        content_format="openai",
    )
    _assert_mm_data_is_image_input(mm_data, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
    assert conversation == [{
        "role":
        "user",
@@ -1121,7 +1832,7 @@ def test_mllama_interleaved_images(
    image_url,
):
    """Ensures that multiple images are parsed as interleaved dicts."""
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role":
            "user",
@@ -1147,6 +1858,7 @@ def test_mllama_interleaved_images(
        content_format="openai",
    )
    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
    assert conversation == [{
        "role":
        "user",
@@ -1227,7 +1939,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):

    # Now parse with vLLM's chat utils & apply the template
    vllm_conversation = get_conversation(is_hf=False)
-    conversation, _ = parse_chat_messages(
+    conversation, _, _ = parse_chat_messages(
        vllm_conversation,
        model_config,
        tokenizer_group,
@@ -1518,7 +2230,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
        }],
    }]

-    conversation_with_thinking, _ = parse_chat_messages(
+    conversation_with_thinking, _, _ = parse_chat_messages(
        messages,
        mistral_model_config,
        mistral_tokenizer,
@@ -41,7 +41,8 @@ from typing_extensions import Required, TypeAlias, TypedDict
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
+                             MultiModalUUIDDict)
from vllm.multimodal.utils import MediaConnector
# yapf: disable
from vllm.transformers_utils.chat_templates import (
@@ -72,6 +73,11 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):

    type: Required[Literal["audio_url"]]
    """The type of the content part."""
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media item. The user must guarantee that it is
+    properly generated and unique across different media items.
+    """


class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
@@ -83,6 +89,11 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
    """
    type: Required[Literal["image_embeds"]]
    """The type of the content part."""
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media item. The user must guarantee that it is
+    properly generated and unique across different media items.
+    """


class VideoURL(TypedDict, total=False):
@@ -97,6 +108,11 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):

    type: Required[Literal["video_url"]]
    """The type of the content part."""
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media item. The user must guarantee that it is
+    properly generated and unique across different media items.
+    """


class PILImage(BaseModel):
@@ -118,6 +134,11 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
    """

    image_pil: Required[PILImage]
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media item. The user must guarantee that it is
+    properly generated and unique across different media items.
+    """


class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
@@ -131,6 +152,11 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
    """

    image_url: Required[str]
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media item. The user must guarantee that it is
+    properly generated and unique across different media items.
+    """


class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
@@ -155,6 +181,11 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
    """

    video_url: Required[str]
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media item. The user must guarantee that it is
+    properly generated and unique across different media items.
+    """


class CustomThinkCompletionContentParam(TypedDict, total=False):
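
Taken together, a content part carrying a UUID would look roughly like the dict below; this is an illustrative sketch, and the URL and UUID values are made up:

```python
part = {
    "type": "video_url",
    "video_url": {"url": "https://example.com/clip.mp4"},  # hypothetical URL
    "uuid": "video-clip-0001",  # optional user-provided identifier
}
```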
@@ -567,6 +598,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        self._tokenizer = tokenizer

        self._items_by_modality = defaultdict[str, list[_T]](list)
+        self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)

    @property
    def model_config(self) -> ModelConfig:
@@ -591,10 +623,15 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
    def mm_processor(self):
        return self.mm_registry.create_processor(self.model_config)

-    def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
+    def add(
+        self, modality: ModalityStr, item: _T, uuid: Optional[str] = None
+    ) -> Optional[str]:
        """
        Add a multi-modal item to the current prompt and return the
        placeholder string to use, if any.

+        An optional UUID can be provided to serve as a unique identifier
+        of the media item.
        """
        input_modality = modality.replace("_embeds", "")
        num_items = len(self._items_by_modality[modality]) + 1
@@ -602,9 +639,35 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        self.mm_processor.validate_num_items(input_modality, num_items)

        self._items_by_modality[modality].append(item)
+        self._uuids_by_modality[modality].append(uuid)

        return self.model_cls.get_placeholder_str(modality, num_items)

    def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]:
        if not self._items_by_modality:
            return None
        mm_uuids = {}
        uuids_by_modality = dict(self._uuids_by_modality)
        if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality:
            raise ValueError(
                "Mixing raw image and embedding inputs is not allowed")

        if "image_embeds" in uuids_by_modality:
            image_embeds_uuids = uuids_by_modality["image_embeds"]
            if len(image_embeds_uuids) > 1:
                raise ValueError(
                    "Only one message can have {'type': 'image_embeds'}")
            mm_uuids["image"] = uuids_by_modality["image_embeds"]
        if "image" in uuids_by_modality:
            mm_uuids["image"] = uuids_by_modality["image"]  # UUIDs of images
        if "audio" in uuids_by_modality:
            mm_uuids["audio"] = uuids_by_modality["audio"]  # UUIDs of audio items
        if "video" in uuids_by_modality:
            mm_uuids["video"] = uuids_by_modality["video"]  # UUIDs of videos
        return mm_uuids

    @abstractmethod
    def create_parser(self) -> "BaseMultiModalContentParser":
        raise NotImplementedError
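
For reference, a sketch (with made-up UUID strings) of the shape `all_mm_uuids()` produces — one list per modality, aligned with item order, and `None` marking items the user did not tag:

```python
# Hypothetical result for a tracker holding two images and one audio clip,
# where only the first image and the audio clip were given UUIDs:
mm_uuids = {
    "image": ["img-uuid-a", None],  # second image had no user-provided UUID
    "audio": ["aud-uuid-1"],
}
```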
@@ -697,29 +760,35 @@ class BaseMultiModalContentParser(ABC):
        return dict(self._placeholder_storage)

    @abstractmethod
-    def parse_image(self, image_url: str) -> None:
+    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_image_embeds(
-        self, image_embeds: Union[str, dict[str, str]]
+        self,
+        image_embeds: Union[str, dict[str, str]],
+        uuid: Optional[str] = None,
    ) -> None:
        raise NotImplementedError

    @abstractmethod
-    def parse_image_pil(self, image_pil: Image.Image) -> None:
+    def parse_image_pil(
+        self, image_pil: Image.Image, uuid: Optional[str] = None
+    ) -> None:
        raise NotImplementedError

    @abstractmethod
-    def parse_audio(self, audio_url: str) -> None:
+    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
        raise NotImplementedError

    @abstractmethod
-    def parse_input_audio(self, input_audio: InputAudio) -> None:
+    def parse_input_audio(
+        self, input_audio: InputAudio, uuid: Optional[str] = None
+    ) -> None:
        raise NotImplementedError

    @abstractmethod
-    def parse_video(self, video_url: str) -> None:
+    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
        raise NotImplementedError


@@ -734,49 +803,55 @@ class MultiModalContentParser(BaseMultiModalContentParser):
            allowed_local_media_path=tracker.allowed_local_media_path,
        )

-    def parse_image(self, image_url: str) -> None:
+    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
        image = self._connector.fetch_image(image_url)

-        placeholder = self._tracker.add("image", image)
+        placeholder = self._tracker.add("image", image, uuid)
        self._add_placeholder("image", placeholder)

    def parse_image_embeds(
-        self, image_embeds: Union[str, dict[str, str]]
+        self,
+        image_embeds: Union[str, dict[str, str]],
+        uuid: Optional[str] = None,
    ) -> None:
        if isinstance(image_embeds, dict):
            embeds = {
                k: self._connector.fetch_image_embedding(v)
                for k, v in image_embeds.items()
            }
-            placeholder = self._tracker.add("image_embeds", embeds)
+            placeholder = self._tracker.add("image_embeds", embeds, uuid)

        if isinstance(image_embeds, str):
            embedding = self._connector.fetch_image_embedding(image_embeds)
-            placeholder = self._tracker.add("image_embeds", embedding)
+            placeholder = self._tracker.add("image_embeds", embedding, uuid)

        self._add_placeholder("image", placeholder)

-    def parse_image_pil(self, image_pil: Image.Image) -> None:
-        placeholder = self._tracker.add("image", image_pil)
+    def parse_image_pil(
+        self, image_pil: Image.Image, uuid: Optional[str] = None
+    ) -> None:
+        placeholder = self._tracker.add("image", image_pil, uuid)
        self._add_placeholder("image", placeholder)

-    def parse_audio(self, audio_url: str) -> None:
+    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
        audio = self._connector.fetch_audio(audio_url)

-        placeholder = self._tracker.add("audio", audio)
+        placeholder = self._tracker.add("audio", audio, uuid)
        self._add_placeholder("audio", placeholder)

-    def parse_input_audio(self, input_audio: InputAudio) -> None:
+    def parse_input_audio(
+        self, input_audio: InputAudio, uuid: Optional[str] = None
+    ) -> None:
        audio_data = input_audio.get("data", "")
        audio_format = input_audio.get("format", "")
        audio_url = f"data:audio/{audio_format};base64,{audio_data}"

-        return self.parse_audio(audio_url)
+        return self.parse_audio(audio_url, uuid)

-    def parse_video(self, video_url: str) -> None:
+    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
        video = self._connector.fetch_video(video_url=video_url)

-        placeholder = self._tracker.add("video", video)
+        placeholder = self._tracker.add("video", video, uuid)
        self._add_placeholder("video", placeholder)


@@ -790,14 +865,16 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
            allowed_local_media_path=tracker.allowed_local_media_path,
        )

-    def parse_image(self, image_url: str) -> None:
+    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
        image_coro = self._connector.fetch_image_async(image_url)

-        placeholder = self._tracker.add("image", image_coro)
+        placeholder = self._tracker.add("image", image_coro, uuid)
        self._add_placeholder("image", placeholder)

    def parse_image_embeds(
-        self, image_embeds: Union[str, dict[str, str]]
+        self,
+        image_embeds: Union[str, dict[str, str]],
+        uuid: Optional[str] = None,
    ) -> None:
        future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()

@@ -812,33 +889,37 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
            embedding = self._connector.fetch_image_embedding(image_embeds)
            future.set_result(embedding)

-        placeholder = self._tracker.add("image_embeds", future)
+        placeholder = self._tracker.add("image_embeds", future, uuid)
        self._add_placeholder("image", placeholder)

-    def parse_image_pil(self, image_pil: Image.Image) -> None:
+    def parse_image_pil(
+        self, image_pil: Image.Image, uuid: Optional[str] = None
+    ) -> None:
        future: asyncio.Future[Image.Image] = asyncio.Future()
        future.set_result(image_pil)

-        placeholder = self._tracker.add("image", future)
+        placeholder = self._tracker.add("image", future, uuid)
        self._add_placeholder("image", placeholder)

-    def parse_audio(self, audio_url: str) -> None:
+    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
        audio_coro = self._connector.fetch_audio_async(audio_url)

-        placeholder = self._tracker.add("audio", audio_coro)
+        placeholder = self._tracker.add("audio", audio_coro, uuid)
        self._add_placeholder("audio", placeholder)

-    def parse_input_audio(self, input_audio: InputAudio) -> None:
+    def parse_input_audio(
+        self, input_audio: InputAudio, uuid: Optional[str] = None
+    ) -> None:
        audio_data = input_audio.get("data", "")
        audio_format = input_audio.get("format", "")
        audio_url = f"data:audio/{audio_format};base64,{audio_data}"

-        return self.parse_audio(audio_url)
+        return self.parse_audio(audio_url, uuid)

-    def parse_video(self, video_url: str) -> None:
+    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
        video = self._connector.fetch_video_async(video_url=video_url)

-        placeholder = self._tracker.add("video", video)
+        placeholder = self._tracker.add("video", video, uuid)
        self._add_placeholder("video", placeholder)
@@ -1177,30 +1258,36 @@ def _parse_chat_message_content_part(
    else:
        return str_content

+    # For media items, use the user-provided UUID if one was given.
+    # Otherwise the UUID stays None as a placeholder.
+    uuid = part.get("uuid", None)
+    if uuid is not None:
+        uuid = str(uuid)
+
    modality = None
    if part_type == "image_pil":
        image_content = cast(Image.Image, content)
-        mm_parser.parse_image_pil(image_content)
+        mm_parser.parse_image_pil(image_content, uuid)
        modality = "image"
    elif part_type in ("image_url", "input_image"):
        str_content = cast(str, content)
-        mm_parser.parse_image(str_content)
+        mm_parser.parse_image(str_content, uuid)
        modality = "image"
    elif part_type == "image_embeds":
        content = cast(Union[str, dict[str, str]], content)
-        mm_parser.parse_image_embeds(content)
+        mm_parser.parse_image_embeds(content, uuid)
        modality = "image"
    elif part_type == "audio_url":
        str_content = cast(str, content)
-        mm_parser.parse_audio(str_content)
+        mm_parser.parse_audio(str_content, uuid)
        modality = "audio"
    elif part_type == "input_audio":
        dict_content = cast(InputAudio, content)
-        mm_parser.parse_input_audio(dict_content)
+        mm_parser.parse_input_audio(dict_content, uuid)
        modality = "audio"
    elif part_type == "video_url":
        str_content = cast(str, content)
-        mm_parser.parse_video(str_content)
+        mm_parser.parse_video(str_content, uuid)
        modality = "video"
    else:
        raise NotImplementedError(f"Unknown part type: {part_type}")
@@ -1288,7 +1375,11 @@ def parse_chat_messages(
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
    content_format: _ChatTemplateContentFormat,
-) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]:
+) -> tuple[
+    list[ConversationMessage],
+    Optional[MultiModalDataDict],
+    Optional[MultiModalUUIDDict],
+]:
    conversation: list[ConversationMessage] = []
    mm_tracker = MultiModalItemTracker(model_config, tokenizer)

@@ -1308,7 +1399,7 @@ def parse_chat_messages(

    _postprocess_messages(conversation)

-    return conversation, mm_tracker.all_mm_data()
+    return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids()


def parse_chat_messages_futures(
@@ -1316,7 +1407,11 @@ def parse_chat_messages_futures(
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
    content_format: _ChatTemplateContentFormat,
-) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]:
+) -> tuple[
+    list[ConversationMessage],
+    Awaitable[Optional[MultiModalDataDict]],
+    Optional[MultiModalUUIDDict],
+]:
    conversation: list[ConversationMessage] = []
    mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)

@@ -1336,7 +1431,7 @@ def parse_chat_messages_futures(

    _postprocess_messages(conversation)

-    return conversation, mm_tracker.all_mm_data()
+    return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids()


def apply_hf_chat_template(
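
Callers of either function now unpack a 3-tuple instead of a 2-tuple; a minimal sketch, assuming `messages`, `model_config`, and `tokenizer` are already in scope:

```python
conversation, mm_data, mm_uuids = parse_chat_messages(
    messages,
    model_config,
    tokenizer,
    content_format="string",
)
```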
@@ -796,7 +796,7 @@ class LLM:
            # NOTE: _parse_chat_message_content_parts() currently doesn't
            # handle mm_processor_kwargs, since there is no implementation in
            # the chat message parsing for it.
-            conversation, mm_data = parse_chat_messages(
+            conversation, mm_data, mm_uuids = parse_chat_messages(
                msgs,
                model_config,
                tokenizer,
@@ -826,6 +826,9 @@ class LLM:
            if mm_data is not None:
                prompt["multi_modal_data"] = mm_data

+            if mm_uuids is not None:
+                prompt["multi_modal_uuids"] = mm_uuids
+
            if mm_processor_kwargs is not None:
                prompt["mm_processor_kwargs"] = mm_processor_kwargs
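
The resulting prompt then carries the UUIDs alongside the fetched media data; roughly (a sketch with illustrative field values, not output from this commit):

```python
prompt = {
    "prompt": "<|image_1|>\nWhat's in the image?",
    "multi_modal_data": {"image": [image]},          # the fetched media items
    "multi_modal_uuids": {"image": ["img-uuid-a"]},  # user-provided identifiers
}
```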
@@ -929,7 +929,7 @@ class OpenAIServing:
            tokenizer,
            model_config=model_config,
        )
-        conversation, mm_data_future = parse_chat_messages_futures(
+        conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
            messages,
            model_config,
            tokenizer,
@@ -1006,6 +1006,10 @@ class OpenAIServing:
                prompt_token_ids=prompt_inputs["prompt_token_ids"])
            if mm_data is not None:
                engine_prompt["multi_modal_data"] = mm_data

+            if mm_uuids is not None:
+                engine_prompt["multi_modal_uuids"] = mm_uuids
+
            if request.mm_processor_kwargs is not None:
                engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs


@@ -276,13 +276,23 @@ class InputPreprocessor:
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

-        return mm_processor.apply(
+        mm_input = mm_processor.apply(
            prompt,
            mm_data,
            hf_processor_mm_kwargs=mm_processor_kwargs,
            tokenization_kwargs=tokenization_kwargs,
            mm_hash_overrides=mm_hash_overrides,
        )
        mm_hashes = mm_input["mm_hashes"]

        # Validate that all mm items have a string as their hash
        if not contains_only_strings(mm_hashes):
            raise ValueError(
                f"mm_hashes must contain only strings, got: {mm_hashes}. "
                "This is likely due to an incorrect custom implementation of "
                "MultiModalProcessor.apply method.")

        return mm_input

    async def _process_multimodal_async(
        self,
@@ -310,13 +320,23 @@ class InputPreprocessor:
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

-        return mm_processor.apply(
+        mm_input = mm_processor.apply(
            prompt,
            mm_data,
            hf_processor_mm_kwargs=mm_processor_kwargs,
            tokenization_kwargs=tokenization_kwargs,
            mm_hash_overrides=mm_hash_overrides,
        )
        mm_hashes = mm_input["mm_hashes"]

        # Validate that all mm items have a string as their hash
        if not contains_only_strings(mm_hashes):
            raise ValueError(
                f"mm_hashes must contain only strings, got: {mm_hashes}. "
                "This is likely due to an incorrect custom implementation of "
                "MultiModalProcessor.apply method.")

        return mm_input

    def _process_embeds(
        self,
@@ -953,3 +973,15 @@ class InputPreprocessor:
    def clear_cache(self) -> None:
        if self.mm_processor_cache is not None:
            self.mm_processor_cache.clear_cache()


# Helper function to validate that a nested dictionary contains
# only strings or lists of strings as the leaf values.
def contains_only_strings(obj: object):
    if isinstance(obj, str):
        return True
    if isinstance(obj, list):
        return all(isinstance(x, str) for x in obj)
    if isinstance(obj, dict):
        return all(contains_only_strings(v) for v in obj.values())
    return False
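
A quick check of the helper's behavior on nested hash dicts (the inputs below are made up):

```python
assert contains_only_strings({"image": ["h1", "h2"], "audio": ["h3"]})
assert not contains_only_strings({"image": ["h1", None]})  # non-string leaf fails
```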
@@ -174,9 +174,10 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):

        mm_items = self._to_mm_items(mm_data)
        tokenization_kwargs = tokenization_kwargs or {}
-        mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else
-                     self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
-                                         tokenization_kwargs))
+        mm_hashes = self._hash_mm_items(mm_items,
+                                        hf_processor_mm_kwargs,
+                                        tokenization_kwargs,
+                                        mm_hash_overrides=mm_hash_overrides)
        mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}

        mm_processed_data = BatchFeature(image_data)