[Frontend][Multimodal] Allow skipping media data when UUIDs are provided. (#23950)

Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
This commit is contained in:
Chenheli Hua 2025-09-12 19:16:06 -07:00 committed by GitHub
parent 4fdd6f5cbf
commit 7f2ea7074e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 970 additions and 96 deletions

View File

@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t
print(o.outputs[0].text) print(o.outputs[0].text)
``` ```
Using UUIDs, you can also skip sending media data entirely if you expect cache hits for the respective items. Note that the request will fail if a skipped media item doesn't have a corresponding UUID, or if its UUID fails to hit the cache.
??? code
```python
from vllm import LLM
from PIL import Image
# Qwen2.5-VL example with two images
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
img_b = Image.open("/path/to/b.jpg")
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": [None, img_b]},
# Since img_a is expected to be cached, we can skip sending the actual
# image entirely.
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
})
for o in outputs:
print(o.outputs[0].text)
```
!!! warning !!! warning
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored. If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
) )
``` ```
For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this:
```python
# Image/video/audio URL:
{
"type": "image_url",
"image_url": None,
"uuid": image_uuid,
},
# image_embeds
{
"type": "image_embeds",
"image_embeds": None,
"uuid": image_uuid
},
# input_audio:
{
"type": "input_audio",
"input_audio": None,
"uuid": audio_uuid
},
# PIL Image:
{
"type": "image_pil",
"image_pil": None,
"uuid": image_uuid
}
```
!!! note !!! note
Only one message can contain `{"type": "image_embeds"}`. Only one message can contain `{"type": "image_embeds"}`.
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.

View File

@ -1764,6 +1764,7 @@ def apply_image_repeat(
probs = [1.0 - image_repeat_prob, image_repeat_prob] probs = [1.0 - image_repeat_prob, image_repeat_prob]
inputs = [] inputs = []
inputs_with_empty_media = []
cur_image = data cur_image = data
for i in range(num_prompts): for i in range(num_prompts):
if image_repeat_prob is not None: if image_repeat_prob is not None:
@ -1774,14 +1775,25 @@ def apply_image_repeat(
new_val = (i // 256 // 256, i // 256, i % 256) new_val = (i // 256 // 256, i // 256, i % 256)
cur_image.putpixel((0, 0), new_val) cur_image.putpixel((0, 0), new_val)
uuid = "uuid_{}".format(i)
inputs.append( inputs.append(
{ {
"prompt": prompts[i % len(prompts)], "prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: cur_image}, "multi_modal_data": {modality: cur_image},
"multi_modal_uuids": {modality: uuid},
} }
) )
return inputs inputs_with_empty_media.append(
{
"prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: None},
"multi_modal_uuids": {modality: uuid},
}
)
return inputs, inputs_with_empty_media
@contextmanager @contextmanager
@ -1860,6 +1872,13 @@ def parse_args():
help="If True, then use different prompt (with the same multi-modal " help="If True, then use different prompt (with the same multi-modal "
"data) for each request.", "data) for each request.",
) )
parser.add_argument(
"--verify-mm-cache-hit-with-uuids",
action="store_true",
help="If True, will send all requests in a second batch with empty mm "
"data to verify cache hits with UUIDs.",
)
return parser.parse_args() return parser.parse_args()
@ -1903,26 +1922,48 @@ def main(args):
assert args.num_prompts > 0 assert args.num_prompts > 0
if args.num_prompts == 1: if args.num_prompts == 1:
# Single inference # Single inference
uuid = "uuid_0"
inputs = { inputs = {
"prompt": prompts[0], "prompt": prompts[0],
"multi_modal_data": {modality: data}, "multi_modal_data": {modality: data},
"multi_modal_uuids": {modality: uuid},
}
inputs_with_empty_media = {
"prompt": prompts[0],
"multi_modal_data": {modality: None},
"multi_modal_uuids": {modality: uuid},
} }
else: else:
# Batch inference # Batch inference
if args.image_repeat_prob is not None: if args.image_repeat_prob is not None:
# Repeat images with specified probability of "image_repeat_prob" # Repeat images with specified probability of "image_repeat_prob"
inputs = apply_image_repeat( inputs, inputs_with_empty_media = apply_image_repeat(
args.image_repeat_prob, args.num_prompts, data, prompts, modality args.image_repeat_prob,
args.num_prompts,
data,
prompts,
modality,
) )
else: else:
# Use the same image for all prompts # Use the same image for all prompts
inputs = [ inputs = []
{ inputs_with_empty_media = []
"prompt": prompts[i % len(prompts)], for i in range(args.num_prompts):
"multi_modal_data": {modality: data}, uuid = "uuid_{}".format(i)
} inputs.append(
for i in range(args.num_prompts) {
] "prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: data},
"multi_modal_uuids": {modality: uuid},
}
)
inputs_with_empty_media.append(
{
"prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: None},
"multi_modal_uuids": {modality: uuid},
}
)
# Add LoRA request if applicable # Add LoRA request if applicable
lora_request = ( lora_request = (
@ -1942,6 +1983,26 @@ def main(args):
print(generated_text) print(generated_text)
print("-" * 50) print("-" * 50)
if args.verify_mm_cache_hit_with_uuids:
try:
# Verify cache hits with UUIDs
print(
"Sending a second batch of requests with empty media"
" and matching UUIDs."
)
outputs = llm.generate(
inputs_with_empty_media,
sampling_params=sampling_params,
lora_request=lora_request,
)
print("-" * 50)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
print("-" * 50)
except Exception as e:
print(f"Failed to verify cache hits with UUIDs. Error: {e}")
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()

View File

@ -522,6 +522,71 @@ async def test_completions_with_image_with_uuid(
assert isinstance(chat_completion.choices[0].message.content, str) assert isinstance(chat_completion.choices[0].message.content, str)
assert len(chat_completion.choices[0].message.content) > 0 assert len(chat_completion.choices[0].message.content) > 0
# Second request, with empty image but the same uuid.
chat_completion_with_empty_image = await client.chat.completions.create(
messages=[
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role":
"user",
"content": [
{
"type": "text",
"text": "Describe this image.",
},
{
"type": "image_url",
"image_url": {},
"uuid": image_url
},
],
},
],
model=model_name,
)
assert chat_completion_with_empty_image.choices[
0].message.content is not None
assert isinstance(
chat_completion_with_empty_image.choices[0].message.content, str)
assert len(
chat_completion_with_empty_image.choices[0].message.content) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_empty_image_with_uuid_without_cache_hit(
    client: openai.AsyncOpenAI,
    model_name: str,
):
    """An empty image part whose UUID was never sent before must be rejected.

    The request is expected to raise ``openai.BadRequestError``.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant.",
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe this image.",
            },
            # No image payload at all: only the UUID identifies the media.
            {
                "type": "image_url",
                "image_url": {},
                "uuid": "uuid_not_previously_seen",
            },
        ],
    }

    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            messages=[system_message, user_message],
            model=model_name,
        )
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])

View File

@ -79,6 +79,28 @@ def phi3v_tokenizer():
) )
@pytest.fixture(scope="function")
def qwen2_audio_model_config():
    """Return a ``ModelConfig`` for ``QWEN2AUDIO_MODEL_ID`` capped at one audio per prompt."""
    per_prompt_limits = {"audio": 1}
    return ModelConfig(
        QWEN2AUDIO_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt=per_prompt_limits,
    )
@pytest.fixture(scope="module")
def qwen2_audio_tokenizer():
    """Return the module-scoped ``TokenizerGroup`` for ``QWEN2AUDIO_MODEL_ID``."""
    tokenizer_options = dict(
        tokenizer_id=QWEN2AUDIO_MODEL_ID,
        enable_lora=False,
        max_num_seqs=5,
        max_input_length=None,
    )
    return TokenizerGroup(**tokenizer_options)
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def qwen25omni_model_config_mm_interleaved(): def qwen25omni_model_config_mm_interleaved():
return ModelConfig( return ModelConfig(
@ -169,6 +191,7 @@ def audio_url():
def _assert_mm_data_is_image_input( def _assert_mm_data_is_image_input(
mm_data: Optional[MultiModalDataDict], mm_data: Optional[MultiModalDataDict],
image_count: int, image_count: int,
skipped_image_indices: Optional[list] = None,
) -> None: ) -> None:
assert mm_data is not None assert mm_data is not None
assert set(mm_data.keys()) == {"image"} assert set(mm_data.keys()) == {"image"}
@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input(
assert image_data is not None assert image_data is not None
assert isinstance(image_data, list) and len(image_data) == image_count assert isinstance(image_data, list) and len(image_data) == image_count
if skipped_image_indices is not None:
for i in skipped_image_indices:
assert image_data[i] is None
def _assert_mm_uuids( def _assert_mm_uuids(
@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int]
def _assert_mm_data_inputs( def _assert_mm_data_inputs(
mm_data: Optional[MultiModalDataDict], mm_data: Optional[MultiModalDataDict],
data_count: MultiModalDataCounts, data_count: MultiModalDataCounts,
skipped_media_indices: Optional[dict[
str, list]] = None, # modality -> list[int]
) -> None: ) -> None:
assert mm_data is not None assert mm_data is not None
assert set(data_count.keys()) == (set(mm_data.keys())) assert set(data_count.keys()) == (set(mm_data.keys()))
@ -216,6 +244,13 @@ def _assert_mm_data_inputs(
assert modality_data is not None assert modality_data is not None
assert isinstance(modality_data, list) and len(modality_data) == n assert isinstance(modality_data, list) and len(modality_data) == n
if skipped_media_indices is not None:
skipped_media_indices_for_modality = skipped_media_indices.get(
modality)
assert skipped_media_indices_for_modality is not None
for i in skipped_media_indices_for_modality:
assert modality_data[i] is None
def test_parse_chat_messages_single_image( def test_parse_chat_messages_single_image(
phi3v_model_config, phi3v_model_config,
@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid(
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
def test_parse_chat_messages_single_empty_image_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A null ``image_url`` carrying a UUID keeps its placeholder and is stored as ``None``."""
    image_uuid = str(hash(image_url))
    empty_image_part = {
        "type": "image_url",
        "image_url": None,
        "uuid": image_uuid,
    }
    question_part = {"type": "text", "text": "What's in the image?"}
    messages = [{"role": "user", "content": [empty_image_part, question_part]}]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?",
    }]
    assert conversation == expected_conversation
    # Index 0 must be a skipped (None) image while its UUID is preserved.
    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
def test_parse_chat_messages_single_image_with_bad_uuid_format( def test_parse_chat_messages_single_image_with_bad_uuid_format(
phi3v_model_config, phi3v_model_config,
phi3v_tokenizer, phi3v_tokenizer,
@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids(
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
def test_parse_chat_messages_multiple_empty_images_with_uuids(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Two null ``image_url`` parts are both stored as ``None`` with their UUIDs kept.

    NOTE(review): the ``image_url`` fixture is requested but never read here.
    """
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    user_content = [
        {"type": "image_url", "image_url": None, "uuid": item_uuid}
        for item_uuid in (image_uuid1, image_uuid2)
    ]
    user_content.append({"type": "text", "text": "What's in the image?"})

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_content}],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
def test_parse_chat_messages_mixed_empty_images_with_uuids(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """One real image followed by one null image: only index 1 is stored as ``None``."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    real_image_part = {
        "type": "image_url",
        "image_url": {"url": image_url},
        "uuid": image_uuid1,
    }
    empty_image_part = {
        "type": "image_url",
        "image_url": None,
        "uuid": image_uuid2,
    }
    question_part = {"type": "text", "text": "What's in the image?"}
    messages = [{
        "role": "user",
        "content": [real_image_part, empty_image_part, question_part],
    }]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_parse_chat_messages_single_image_with_uuid_async( async def test_parse_chat_messages_single_image_with_uuid_async(
phi3v_model_config, phi3v_model_config,
@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_with_uuid_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: a null ``image_url`` with a UUID resolves to a ``None`` image."""
    image_uuid = str(hash(image_url))
    empty_image_part = {
        "type": "image_url",
        "image_url": None,
        "uuid": image_uuid,
    }
    question_part = {"type": "text", "text": "What's in the image?"}
    messages = [{"role": "user", "content": [empty_image_part, question_part]}]

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?",
    }]
    assert conversation == expected_conversation

    # The multimodal data is only available once the future is awaited.
    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_async( async def test_parse_chat_messages_multiple_images_with_uuids_async(
phi3v_model_config, phi3v_model_config,
@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: a null ``image_url`` and a null ``image_pil`` both resolve to ``None``.

    NOTE(review): the ``image_url`` fixture is requested but never read here.
    """
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    empty_url_part = {
        "type": "image_url",
        "image_url": None,
        "uuid": image_uuid1,
    }
    empty_pil_part = {
        "type": "image_pil",
        "image_pil": None,
        "uuid": image_uuid2,
    }
    question_part = {"type": "text", "text": "What's in these images?"}
    messages = [{
        "role": "user",
        "content": [empty_url_part, empty_pil_part, question_part],
    }]

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    assert conversation == expected_conversation

    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
phi3v_model_config, phi3v_model_config,
@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images(
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
def test_parse_chat_messages_empty_pil_image_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
):
    """A null ``image_pil`` part with a UUID is stored as a ``None`` image."""
    uuid = "abcd"
    empty_pil_part = {"type": "image_pil", "image_pil": None, "uuid": uuid}
    question_part = {"type": "text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [empty_pil_part, question_part]}]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in this image?",
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
def test_parse_chat_messages_empty_image_embeds_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
):
    """A null ``image_embeds`` part with a UUID leaves ``mm_data["image"]`` as ``None``."""
    uuid = "abcd"
    empty_embeds_part = {
        "type": "image_embeds",
        "image_embeds": None,
        "uuid": uuid,
    }
    question_part = {"type": "text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [empty_embeds_part, question_part]}]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in this image?",
    }]
    assert conversation == expected_conversation

    # Checked directly: the value under "image" is None itself, not a list.
    assert mm_data is not None
    assert "image" in mm_data
    assert mm_data["image"] is None
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
    phi3v_model_config,
    phi3v_tokenizer,
):
    """Futures variant: a null ``image_embeds`` part resolves to ``mm_data["image"] is None``."""
    uuid = "abcd"
    empty_embeds_part = {
        "type": "image_embeds",
        "image_embeds": None,
        "uuid": uuid,
    }
    question_part = {"type": "text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [empty_embeds_part, question_part]}]

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in this image?",
    }]
    assert conversation == expected_conversation

    mm_data = await mm_future
    # Checked directly: the value under "image" is None itself, not a list.
    assert mm_data is not None
    assert "image" in mm_data
    assert mm_data["image"] is None
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async( async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config, phi3v_model_config,
@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
expected_uuids=["audio_123"]) expected_uuids=["audio_123"])
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    """Interleaved image/audio/video parts with null payloads across two user turns.

    Every media part carries only a UUID; all of them must be stored as
    ``None`` while placeholders and UUIDs are preserved per modality.

    NOTE(review): the ``image_url``/``video_url``/``audio_url`` fixtures are
    requested but never read here.
    """

    def text_part(text):
        return {"type": "text", "text": text}

    def empty_media_part(part_type, uuid):
        # The payload key has the same name as the part type.
        return {"type": part_type, part_type: None, "uuid": uuid}

    first_user_turn = {
        "role": "user",
        "content": [
            text_part("What's on this image?"),
            empty_media_part("image_url", "image_123"),
            text_part("Now listen to this audio"),
            empty_media_part("audio_url", "audio_123"),
        ],
    }
    assistant_turn = {"role": "assistant", "content": "Some stuff."}
    second_user_turn = {
        "role": "user",
        "content": [
            text_part("What's on this image?"),
            empty_media_part("image_url", "image_123"),
            text_part("And what's in the video?"),
            empty_media_part("video_url", "video_123"),
        ],
    }

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [first_user_turn, assistant_turn, second_user_turn],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role": "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]
    assert conversation == expected_conversation

    expected_counts = {"image": 2, "video": 1, "audio": 1}
    expected_skipped = {"image": [0, 1], "video": [0], "audio": [0]}
    _assert_mm_data_inputs(
        mm_data,
        expected_counts,
        skipped_media_indices=expected_skipped,
    )

    _assert_mm_uuids(
        mm_uuids,
        2,
        modality="image",
        expected_uuids=["image_123", "image_123"],
    )
    _assert_mm_uuids(
        mm_uuids,
        1,
        modality="video",
        expected_uuids=["video_123"],
    )
    _assert_mm_uuids(
        mm_uuids,
        1,
        modality="audio",
        expected_uuids=["audio_123"],
    )
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501
qwen25omni_model_config_mm_interleaved, qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer, qwen25omni_tokenizer,
@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk():
r"[INST]Thanks, what is 3+3?[/INST]") r"[INST]Thanks, what is 3+3?[/INST]")
assert string_tokens == expected_tokens assert string_tokens == expected_tokens
def test_parse_chat_messages_single_empty_audio_with_uuid(
    qwen2_audio_model_config,
    qwen2_audio_tokenizer,
):
    """An empty ``input_audio`` payload with a UUID is stored as a skipped item.

    Besides the placeholder and the UUID, this also checks that the tracked
    audio entry is ``None``, i.e. that the media was really skipped rather
    than decoded from an empty payload.
    """
    audio_uuid = "abcd"
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {},
                    "uuid": audio_uuid,
                },
                {
                    "type": "text",
                    "text": "What does the audio say?"
                },
            ],
        }],
        qwen2_audio_model_config,
        qwen2_audio_tokenizer,
        content_format="string",
    )
    assert conversation == [{
        "role": "user",
        "content":
        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
    }]
    # Assert the audio slot is None so the test fails if an empty payload is
    # ever turned into real (bogus) audio data instead of being skipped.
    _assert_mm_data_inputs(
        mm_data,
        {"audio": 1},
        skipped_media_indices={"audio": [0]},
    )
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="audio",
                     expected_uuids=[audio_uuid])
@pytest.mark.asyncio
async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
    qwen2_audio_model_config,
    qwen2_audio_tokenizer,
):
    """Futures variant: an empty ``input_audio`` with a UUID resolves to a skipped item.

    Besides the placeholder and the UUID, this also checks that the awaited
    audio entry is ``None``, i.e. that the media was really skipped.
    """
    audio_uuid = "abcd"
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [{
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {},
                    "uuid": audio_uuid,
                },
                {
                    "type": "text",
                    "text": "What does the audio say?"
                },
            ],
        }],
        qwen2_audio_model_config,
        qwen2_audio_tokenizer,
        content_format="string",
    )
    assert conversation == [{
        "role": "user",
        "content":
        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
    }]
    # NOTE(review): assumes the async audio parser stores None for an empty
    # payload like the sync one does — confirm against
    # AsyncMultiModalContentParser.parse_audio.
    _assert_mm_data_inputs(
        await mm_future,
        {"audio": 1},
        skipped_media_indices={"audio": [0]},
    )
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="audio",
                     expected_uuids=[audio_uuid])

View File

@ -73,15 +73,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
type: Required[Literal["audio_url"]] type: Required[Literal["audio_url"]]
"""The type of the content part.""" """The type of the content part."""
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
image_embeds: Required[Union[str, dict[str, str]]] image_embeds: Optional[Union[str, dict[str, str]]]
""" """
The image embeddings. It can be either: The image embeddings. It can be either:
- A single base64 string. - A single base64 string.
@ -108,11 +103,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
type: Required[Literal["video_url"]] type: Required[Literal["video_url"]]
"""The type of the content part.""" """The type of the content part."""
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class PILImage(BaseModel): class PILImage(BaseModel):
@ -133,7 +123,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
} }
""" """
image_pil: Required[PILImage] image_pil: Optional[PILImage]
uuid: Optional[str] uuid: Optional[str]
""" """
User-provided UUID of a media. User must guarantee that it is properly User-provided UUID of a media. User must guarantee that it is properly
@ -151,7 +141,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
} }
""" """
image_url: Required[str] image_url: Optional[str]
uuid: Optional[str] uuid: Optional[str]
""" """
User-provided UUID of a media. User must guarantee that it is properly User-provided UUID of a media. User must guarantee that it is properly
@ -168,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
} }
""" """
audio_url: Required[str] audio_url: Optional[str]
class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
@ -180,7 +170,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
} }
""" """
video_url: Required[str] video_url: Optional[str]
uuid: Optional[str] uuid: Optional[str]
""" """
User-provided UUID of a media. User must guarantee that it is properly User-provided UUID of a media. User must guarantee that it is properly
@ -597,7 +587,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
self._model_config = model_config self._model_config = model_config
self._tokenizer = tokenizer self._tokenizer = tokenizer
self._items_by_modality = defaultdict[str, list[_T]](list) self._items_by_modality = defaultdict[str, list[Optional[_T]]](list)
self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)
@property @property
@ -624,7 +614,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return self.mm_registry.create_processor(self.model_config) return self.mm_registry.create_processor(self.model_config)
def add( def add(
self, modality: ModalityStr, item: _T, uuid: Optional[str] = None self,
modality: ModalityStr,
item: Optional[_T],
uuid: Optional[str] = None,
) -> Optional[str]: ) -> Optional[str]:
""" """
Add a multi-modal item to the current prompt and returns the Add a multi-modal item to the current prompt and returns the
@ -708,10 +701,15 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
if not self._items_by_modality: if not self._items_by_modality:
return None return None
mm_inputs = {} mm_inputs = {}
items_by_modality = { items_by_modality = {}
modality: await asyncio.gather(*items) for modality, items in self._items_by_modality.items():
for modality, items in self._items_by_modality.items() coros = []
} for item in items:
if item is not None:
coros.append(item)
else:
coros.append(asyncio.sleep(0))
items_by_modality[modality] = await asyncio.gather(*coros)
if "image" in items_by_modality and "image_embeds" in items_by_modality: if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError( raise ValueError(
@ -760,35 +758,40 @@ class BaseMultiModalContentParser(ABC):
return dict(self._placeholder_storage) return dict(self._placeholder_storage)
@abstractmethod @abstractmethod
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: def parse_image(
self, image_url: Optional[str], uuid: Optional[str] = None) -> None:
raise NotImplementedError raise NotImplementedError
@abstractmethod @abstractmethod
def parse_image_embeds( def parse_image_embeds(
self, self,
image_embeds: Union[str, dict[str, str]], image_embeds: Union[str, dict[str, str], None],
uuid: Optional[str] = None, uuid: Optional[str] = None,
) -> None: ) -> None:
raise NotImplementedError raise NotImplementedError
@abstractmethod @abstractmethod
def parse_image_pil( def parse_image_pil(
self, image_pil: Image.Image, uuid: Optional[str] = None self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
) -> None: ) -> None:
raise NotImplementedError raise NotImplementedError
@abstractmethod @abstractmethod
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: def parse_audio(
self, audio_url: Optional[str], uuid: Optional[str] = None
) -> None:
raise NotImplementedError raise NotImplementedError
@abstractmethod @abstractmethod
def parse_input_audio( def parse_input_audio(
self, input_audio: InputAudio, uuid: Optional[str] = None self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
) -> None: ) -> None:
raise NotImplementedError raise NotImplementedError
@abstractmethod @abstractmethod
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: def parse_video(
self, video_url: Optional[str], uuid: Optional[str] = None
) -> None:
raise NotImplementedError raise NotImplementedError
@ -803,15 +806,17 @@ class MultiModalContentParser(BaseMultiModalContentParser):
allowed_local_media_path=tracker.allowed_local_media_path, allowed_local_media_path=tracker.allowed_local_media_path,
) )
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: def parse_image(
image = self._connector.fetch_image(image_url) self, image_url: Optional[str], uuid: Optional[str] = None
) -> None:
image = self._connector.fetch_image(image_url) if image_url else None
placeholder = self._tracker.add("image", image, uuid) placeholder = self._tracker.add("image", image, uuid)
self._add_placeholder("image", placeholder) self._add_placeholder("image", placeholder)
def parse_image_embeds( def parse_image_embeds(
self, self,
image_embeds: Union[str, dict[str, str]], image_embeds: Union[str, dict[str, str], None],
uuid: Optional[str] = None, uuid: Optional[str] = None,
) -> None: ) -> None:
if isinstance(image_embeds, dict): if isinstance(image_embeds, dict):
@ -825,31 +830,49 @@ class MultiModalContentParser(BaseMultiModalContentParser):
embedding = self._connector.fetch_image_embedding(image_embeds) embedding = self._connector.fetch_image_embedding(image_embeds)
placeholder = self._tracker.add("image_embeds", embedding, uuid) placeholder = self._tracker.add("image_embeds", embedding, uuid)
if image_embeds is None:
placeholder = self._tracker.add("image_embeds", None, uuid)
self._add_placeholder("image", placeholder) self._add_placeholder("image", placeholder)
def parse_image_pil( def parse_image_pil(
self, image_pil: Image.Image, uuid: Optional[str] = None self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
) -> None: ) -> None:
placeholder = self._tracker.add("image", image_pil, uuid) placeholder = self._tracker.add("image", image_pil, uuid)
self._add_placeholder("image", placeholder) self._add_placeholder("image", placeholder)
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: def parse_audio(
audio = self._connector.fetch_audio(audio_url) self, audio_url: Optional[str], uuid: Optional[str] = None
) -> None:
audio = self._connector.fetch_audio(audio_url) if audio_url else None
placeholder = self._tracker.add("audio", audio, uuid) placeholder = self._tracker.add("audio", audio, uuid)
self._add_placeholder("audio", placeholder) self._add_placeholder("audio", placeholder)
def parse_input_audio( def parse_input_audio(
self, input_audio: InputAudio, uuid: Optional[str] = None self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
) -> None: ) -> None:
audio_data = input_audio.get("data", "") if input_audio:
audio_format = input_audio.get("format", "") audio_data = input_audio.get("data", "")
audio_url = f"data:audio/{audio_format};base64,{audio_data}" audio_format = input_audio.get("format", "")
if audio_data:
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
else:
# If a UUID is provided, audio data may be empty.
audio_url = None
else:
audio_url = None
return self.parse_audio(audio_url, uuid) return self.parse_audio(audio_url, uuid)
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: def parse_video(
video = self._connector.fetch_video(video_url=video_url) self, video_url: Optional[str], uuid: Optional[str] = None
) -> None:
video = (
self._connector.fetch_video(video_url=video_url)
if video_url
else None
)
placeholder = self._tracker.add("video", video, uuid) placeholder = self._tracker.add("video", video, uuid)
self._add_placeholder("video", placeholder) self._add_placeholder("video", placeholder)
@ -865,18 +888,24 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
allowed_local_media_path=tracker.allowed_local_media_path, allowed_local_media_path=tracker.allowed_local_media_path,
) )
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: def parse_image(
image_coro = self._connector.fetch_image_async(image_url) self, image_url: Optional[str], uuid: Optional[str] = None
) -> None:
image_coro = (
self._connector.fetch_image_async(image_url) if image_url else None
)
placeholder = self._tracker.add("image", image_coro, uuid) placeholder = self._tracker.add("image", image_coro, uuid)
self._add_placeholder("image", placeholder) self._add_placeholder("image", placeholder)
def parse_image_embeds( def parse_image_embeds(
self, self,
image_embeds: Union[str, dict[str, str]], image_embeds: Union[str, dict[str, str], None],
uuid: Optional[str] = None, uuid: Optional[str] = None,
) -> None: ) -> None:
future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() future: asyncio.Future[Union[str, dict[str, str], None]] = (
asyncio.Future()
)
if isinstance(image_embeds, dict): if isinstance(image_embeds, dict):
embeds = { embeds = {
@ -889,35 +918,58 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
embedding = self._connector.fetch_image_embedding(image_embeds) embedding = self._connector.fetch_image_embedding(image_embeds)
future.set_result(embedding) future.set_result(embedding)
if image_embeds is None:
future.set_result(None)
placeholder = self._tracker.add("image_embeds", future, uuid) placeholder = self._tracker.add("image_embeds", future, uuid)
self._add_placeholder("image", placeholder) self._add_placeholder("image", placeholder)
def parse_image_pil( def parse_image_pil(
self, image_pil: Image.Image, uuid: Optional[str] = None self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
) -> None: ) -> None:
future: asyncio.Future[Image.Image] = asyncio.Future() future: asyncio.Future[Optional[Image.Image]] = asyncio.Future()
future.set_result(image_pil) if image_pil:
future.set_result(image_pil)
else:
future.set_result(None)
placeholder = self._tracker.add("image", future, uuid) placeholder = self._tracker.add("image", future, uuid)
self._add_placeholder("image", placeholder) self._add_placeholder("image", placeholder)
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: def parse_audio(
audio_coro = self._connector.fetch_audio_async(audio_url) self, audio_url: Optional[str], uuid: Optional[str] = None
) -> None:
audio_coro = (
self._connector.fetch_audio_async(audio_url) if audio_url else None
)
placeholder = self._tracker.add("audio", audio_coro, uuid) placeholder = self._tracker.add("audio", audio_coro, uuid)
self._add_placeholder("audio", placeholder) self._add_placeholder("audio", placeholder)
def parse_input_audio( def parse_input_audio(
self, input_audio: InputAudio, uuid: Optional[str] = None self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
) -> None: ) -> None:
audio_data = input_audio.get("data", "") if input_audio:
audio_format = input_audio.get("format", "") audio_data = input_audio.get("data", "")
audio_url = f"data:audio/{audio_format};base64,{audio_data}" audio_format = input_audio.get("format", "")
if audio_data:
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
else:
# If a UUID is provided, audio data may be empty.
audio_url = None
else:
audio_url = None
return self.parse_audio(audio_url, uuid) return self.parse_audio(audio_url, uuid)
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: def parse_video(
video = self._connector.fetch_video_async(video_url=video_url) self, video_url: Optional[str], uuid: Optional[str] = None
) -> None:
video = (
self._connector.fetch_video_async(video_url=video_url)
if video_url
else None
)
placeholder = self._tracker.add("video", video, uuid) placeholder = self._tracker.add("video", video, uuid)
self._add_placeholder("video", placeholder) self._add_placeholder("video", placeholder)
@ -1130,8 +1182,9 @@ def _parse_chat_message_content_mm_part(
part, dict part, dict
) # This is needed to avoid mypy errors: part.get() from str ) # This is needed to avoid mypy errors: part.get() from str
part_type = part.get("type", None) part_type = part.get("type", None)
uuid = part.get("uuid", None)
if isinstance(part_type, str) and part_type in MM_PARSER_MAP: if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501
content = MM_PARSER_MAP[part_type](part) content = MM_PARSER_MAP[part_type](part)
# Special case for 'image_url.detail' # Special case for 'image_url.detail'
@ -1146,25 +1199,54 @@ def _parse_chat_message_content_mm_part(
# Handle missing 'type' but provided direct URL fields. # Handle missing 'type' but provided direct URL fields.
# 'type' is required field by pydantic # 'type' is required field by pydantic
if part_type is None: if part_type is None or uuid is not None:
if part.get("image_url") is not None: if "image_url" in part:
image_params = cast( image_params = cast(
CustomChatCompletionContentSimpleImageParam, part CustomChatCompletionContentSimpleImageParam, part
) )
return "image_url", image_params.get("image_url", "") image_url = image_params.get("image_url", None)
if part.get("audio_url") is not None: if isinstance(image_url, dict):
# Can potentially happen if user provides a uuid
# with url as a dict of {"url": url}
image_url = image_url.get("url", None)
return "image_url", image_url
if "image_pil" in part:
# "image_pil" could be None if UUID is provided.
image_params = cast( # type: ignore
CustomChatCompletionContentPILImageParam, part
)
image_pil = image_params.get("image_pil", None)
return "image_pil", image_pil
if "image_embeds" in part:
# "image_embeds" could be None if UUID is provided.
image_params = cast( # type: ignore
ChatCompletionContentPartImageEmbedsParam, part
)
image_embeds = image_params.get("image_embeds", None)
return "image_embeds", image_embeds
if "audio_url" in part:
audio_params = cast( audio_params = cast(
CustomChatCompletionContentSimpleAudioParam, part CustomChatCompletionContentSimpleAudioParam, part
) )
return "audio_url", audio_params.get("audio_url", "") audio_url = audio_params.get("audio_url", None)
if isinstance(audio_url, dict):
# Can potentially happen if user provides a uuid
# with url as a dict of {"url": url}
audio_url = audio_url.get("url", None)
return "audio_url", audio_url
if part.get("input_audio") is not None: if part.get("input_audio") is not None:
input_audio_params = cast(dict[str, str], part) input_audio_params = cast(dict[str, str], part)
return "input_audio", input_audio_params return "input_audio", input_audio_params
if part.get("video_url") is not None: if "video_url" in part:
video_params = cast( video_params = cast(
CustomChatCompletionContentSimpleVideoParam, part CustomChatCompletionContentSimpleVideoParam, part
) )
return "video_url", video_params.get("video_url", "") video_url = video_params.get("video_url", None)
if isinstance(video_url, dict):
# Can potentially happen if user provides a uuid
# with url as a dict of {"url": url}
video_url = video_url.get("url", None)
return "video_url", video_url
# Raise an error if no 'type' or direct URL is found. # Raise an error if no 'type' or direct URL is found.
raise ValueError("Missing 'type' field in multimodal part.") raise ValueError("Missing 'type' field in multimodal part.")
@ -1173,15 +1255,9 @@ def _parse_chat_message_content_mm_part(
return part_type, "unknown part_type content" return part_type, "unknown part_type content"
VALID_MESSAGE_CONTENT_MM_PART_TYPES = ( PART_TYPES_TO_SKIP_NONE_CONTENT = (
"text", "text",
"refusal", "refusal",
"image_url",
"image_embeds",
"image_pil",
"audio_url",
"input_audio",
"video_url",
) )
@ -1242,7 +1318,7 @@ def _parse_chat_message_content_part(
part_type, content = _parse_chat_message_content_mm_part(part) part_type, content = _parse_chat_message_content_mm_part(part)
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
# content is None, log a warning and skip # content is None, log a warning and skip
if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None: if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None:
logger.warning( logger.warning(
"Skipping multimodal part '%s' (type: '%s') " "Skipping multimodal part '%s' (type: '%s') "
"with empty / unparsable content.", "with empty / unparsable content.",
@ -1266,7 +1342,10 @@ def _parse_chat_message_content_part(
modality = None modality = None
if part_type == "image_pil": if part_type == "image_pil":
image_content = cast(Image.Image, content) if content is not None:
image_content = cast(Image.Image, content)
else:
image_content = None
mm_parser.parse_image_pil(image_content, uuid) mm_parser.parse_image_pil(image_content, uuid)
modality = "image" modality = "image"
elif part_type in ("image_url", "input_image"): elif part_type in ("image_url", "input_image"):
@ -1274,7 +1353,10 @@ def _parse_chat_message_content_part(
mm_parser.parse_image(str_content, uuid) mm_parser.parse_image(str_content, uuid)
modality = "image" modality = "image"
elif part_type == "image_embeds": elif part_type == "image_embeds":
content = cast(Union[str, dict[str, str]], content) if content is not None:
content = cast(Union[str, dict[str, str]], content)
else:
content = None
mm_parser.parse_image_embeds(content, uuid) mm_parser.parse_image_embeds(content, uuid)
modality = "image" modality = "image"
elif part_type == "audio_url": elif part_type == "audio_url":

View File

@ -1491,6 +1491,11 @@ class LLM:
for i, prompt in enumerate(it): for i, prompt in enumerate(it):
if isinstance(prompt, dict):
self._validate_mm_data_and_uuids(
prompt.get("multi_modal_data"),
prompt.get("multi_modal_uuids"))
param = params[i] if isinstance(params, Sequence) else params param = params[i] if isinstance(params, Sequence) else params
tokenization_kwargs: dict[str, Any] = {} tokenization_kwargs: dict[str, Any] = {}
@ -1507,6 +1512,41 @@ class LLM:
priority=priority[i] if priority else 0, priority=priority[i] if priority else 0,
) )
def _validate_mm_data_and_uuids(
self,
multi_modal_data: Optional[Any], # MultiModalDataDict
multi_modal_uuids: Optional[Any], # MultiModalUUIDDict
):
"""
Validate that if any multi-modal data is skipped (i.e. None),
then its corresponding UUID must be set.
"""
if multi_modal_data is None:
return
for modality, data in multi_modal_data.items():
if isinstance(data, list):
for i, d in enumerate(data):
if d is None:
if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[ # noqa: E501
modality] is None:
raise ValueError(
f"Multi-modal data for {modality} is None "
f"but UUID is not provided")
else:
if len(
multi_modal_uuids[modality]
) <= i or multi_modal_uuids[modality][i] is None:
raise ValueError(
f"Multi-modal data for {modality} is None "
f"but UUID is not provided")
else:
if data is None and (multi_modal_uuids is None
or modality not in multi_modal_uuids
or multi_modal_uuids[modality] is None):
raise ValueError(f"Multi-modal data for {modality} is None"
f" but UUID is not provided")
def _add_request( def _add_request(
self, self,
prompt: PromptType, prompt: PromptType,

View File

@ -85,9 +85,10 @@ which are treated as audio embeddings;
these are directly passed to the model without HF processing. these are directly passed to the model without HF processing.
""" """
ModalityData: TypeAlias = Union[_T, list[_T]] ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None]
""" """
Either a single data item, or a list of data items. Either a single data item, or a list of data items. Can only be None if UUID
is provided.
The number of data items allowed per modality is restricted by The number of data items allowed per modality is restricted by
`--limit-mm-per-prompt`. `--limit-mm-per-prompt`.

View File

@ -36,7 +36,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
def __init__(self, data: _T, modality: str) -> None: def __init__(self, data: _T, modality: str) -> None:
super().__init__() super().__init__()
self.data = data self.data: _T = data
self.modality = modality self.modality = modality
def __repr__(self) -> str: def __repr__(self) -> str:
@ -177,7 +177,9 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],
class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
def __init__(self, data: Sequence[HfAudioItem]) -> None: def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None:
if data is None:
data = [None]
super().__init__(data, "audio") super().__init__(data, "audio")
def get_audio_length(self, item_idx: int) -> int: def get_audio_length(self, item_idx: int) -> int:
@ -198,7 +200,9 @@ class ImageSize(NamedTuple):
class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
def __init__(self, data: Sequence[HfImageItem]) -> None: def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None:
if data is None:
data = [None]
super().__init__(data, "image") super().__init__(data, "image")
def get_image_size(self, item_idx: int) -> ImageSize: def get_image_size(self, item_idx: int) -> ImageSize:
@ -223,10 +227,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
def __init__( def __init__(
self, self,
data: Sequence[HfVideoItem], data: Optional[Sequence[HfVideoItem]],
metadata: Optional[Union[dict[str, Any], metadata: Optional[Union[dict[str, Any],
list[Optional[dict[str, Any]]]]] = None, list[Optional[dict[str, Any]]]]] = None,
) -> None: ) -> None:
if data is None:
data = [None]
super().__init__(data, "video") super().__init__(data, "video")
self.metadata = metadata self.metadata = metadata
@ -385,6 +391,9 @@ class MultiModalDataParser:
self, self,
data: ModalityData[AudioItem], data: ModalityData[AudioItem],
) -> Optional[ModalityDataItems[Any, Any]]: ) -> Optional[ModalityDataItems[Any, Any]]:
if data is None:
return AudioProcessorItems(None)
# also check single audio item with sampling rate # also check single audio item with sampling rate
if self._is_empty(data) or (isinstance(data, tuple) if self._is_empty(data) or (isinstance(data, tuple)
and self._is_empty(data[0])): and self._is_empty(data[0])):
@ -420,6 +429,9 @@ class MultiModalDataParser:
self, self,
data: ModalityData[ImageItem], data: ModalityData[ImageItem],
) -> Optional[ModalityDataItems[Any, Any]]: ) -> Optional[ModalityDataItems[Any, Any]]:
if data is None:
return ImageProcessorItems(None)
if self._is_empty(data): if self._is_empty(data):
return None return None
@ -441,6 +453,9 @@ class MultiModalDataParser:
self, self,
data: ModalityData[VideoItem], data: ModalityData[VideoItem],
) -> Optional[ModalityDataItems[Any, Any]]: ) -> Optional[ModalityDataItems[Any, Any]]:
if data is None:
return VideoProcessorItems(None)
if self._is_empty(data): if self._is_empty(data):
return None return None

View File

@ -1075,7 +1075,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
""" """
mm_items = self.data_parser.parse_mm_data(mm_data) mm_items = self.data_parser.parse_mm_data(mm_data)
for modality, items in mm_items.items(): for modality, items in mm_items.items():
self.validate_num_items(modality, len(items)) self.validate_num_items(modality, len(items))
@ -1436,10 +1435,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
] ]
for modality, items_is_cached in mm_is_cached.items() for modality, items_is_cached in mm_is_cached.items()
} }
mm_missing_data = { mm_missing_data = {}
modality: [mm_data_items[modality][idx] for idx in idxs] for modality, idxs in mm_missing_idxs.items():
for modality, idxs in mm_missing_idxs.items() missing_modality_data = []
} for idx in idxs:
data = mm_data_items[modality][idx]
if data is None:
raise ValueError(
f"Cache miss for {modality} at index {idx} "
f"but data is not provided.")
else:
missing_modality_data.append(data)
mm_missing_data[modality] = missing_modality_data
return self._to_mm_items(mm_missing_data) return self._to_mm_items(mm_missing_data)