diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 77baa27c7a958..7fb0337235005 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t print(o.outputs[0].text) ``` +Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache. + +??? code + + ```python + from vllm import LLM + from PIL import Image + + # Qwen2.5-VL example with two images + llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct") + + prompt = "USER: \nDescribe the differences.\nASSISTANT:" + img_b = Image.open("/path/to/b.jpg") + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": [None, img_b]}, + # Since img_a is expected to be cached, we can skip sending the actual + # image entirely. + "multi_modal_uuids": {"image": ["sku-1234-a", None]}, + }) + + for o in outputs: + print(o.outputs[0].text) + ``` + !!! warning If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored. @@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se ) ``` +For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this: + + ```python + # Image/video/audio URL: + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + + # image_embeds + { + "type": "image_embeds", + "image_embeds": None, + "uuid": image_uuid + }, + + # input_audio: + { + "type": "input_audio", + "input_audio": None, + "uuid": audio_uuid + }, + + # PIL Image: + { + "type": "image_pil", + "image_pil": None, + "uuid": image_uuid + } + + ``` + !!! note Only one message can contain `{"type": "image_embeds"}`. 
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index b104113b88213..4b75eb19fcf94 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1764,6 +1764,7 @@ def apply_image_repeat( probs = [1.0 - image_repeat_prob, image_repeat_prob] inputs = [] + inputs_with_empty_media = [] cur_image = data for i in range(num_prompts): if image_repeat_prob is not None: @@ -1774,14 +1775,25 @@ def apply_image_repeat( new_val = (i // 256 // 256, i // 256, i % 256) cur_image.putpixel((0, 0), new_val) + uuid = "uuid_{}".format(i) + inputs.append( { "prompt": prompts[i % len(prompts)], "multi_modal_data": {modality: cur_image}, + "multi_modal_uuids": {modality: uuid}, } ) - return inputs + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) + + return inputs, inputs_with_empty_media @contextmanager @@ -1860,6 +1872,13 @@ def parse_args(): help="If True, then use different prompt (with the same multi-modal " "data) for each request.", ) + + parser.add_argument( + "--verify-mm-cache-hit-with-uuids", + action="store_true", + help="If True, will send all requests in a second batch with empty mm " + "data to verify cache hits with UUIDs.", + ) return parser.parse_args() @@ -1903,26 +1922,48 @@ def main(args): assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference + uuid = "uuid_0" inputs = { "prompt": prompts[0], "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + inputs_with_empty_media = { + "prompt": prompts[0], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, } else: # Batch inference if args.image_repeat_prob is not None: # Repeat 
images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat( - args.image_repeat_prob, args.num_prompts, data, prompts, modality + inputs, inputs_with_empty_media = apply_image_repeat( + args.image_repeat_prob, + args.num_prompts, + data, + prompts, + modality, ) else: # Use the same image for all prompts - inputs = [ - { - "prompt": prompts[i % len(prompts)], - "multi_modal_data": {modality: data}, - } - for i in range(args.num_prompts) - ] + inputs = [] + inputs_with_empty_media = [] + for i in range(args.num_prompts): + uuid = "uuid_{}".format(i) + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + ) + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) # Add LoRA request if applicable lora_request = ( @@ -1942,6 +1983,26 @@ def main(args): print(generated_text) print("-" * 50) + if args.verify_mm_cache_hit_with_uuids: + try: + # Verify cache hits with UUIDs + print( + "Sending a second batch of requests with empty media" + " and matching UUIDs." + ) + outputs = llm.generate( + inputs_with_empty_media, + sampling_params=sampling_params, + lora_request=lora_request, + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + except Exception as e: + print(f"Failed to verify cache hits with UUIDs. 
Error: {e}") + if __name__ == "__main__": args = parse_args() diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 72819f31de206..a324e86666055 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -522,6 +522,71 @@ async def test_completions_with_image_with_uuid( assert isinstance(chat_completion.choices[0].message.content, str) assert len(chat_completion.choices[0].message.content) > 0 + # Second request, with empty image but the same uuid. + chat_completion_with_empty_image = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": image_url + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion_with_empty_image.choices[ + 0].message.content is not None + assert isinstance( + chat_completion_with_empty_image.choices[0].message.content, str) + assert len( + chat_completion_with_empty_image.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_completions_with_empty_image_with_uuid_without_cache_hit( + client: openai.AsyncOpenAI, + model_name: str, +): + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": "uuid_not_previously_seen" + }, + ], + }, + ], + model=model_name, + ) + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5149ca346050e..dd33f5c8c1d8e 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -79,6 +79,28 @@ def phi3v_tokenizer(): ) +@pytest.fixture(scope="function") +def qwen2_audio_model_config(): + return ModelConfig( + QWEN2AUDIO_MODEL_ID, + runner="generate", + trust_remote_code=True, + limit_mm_per_prompt={ + "audio": 1, + }, + ) + + +@pytest.fixture(scope="module") +def qwen2_audio_tokenizer(): + return TokenizerGroup( + tokenizer_id=QWEN2AUDIO_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): return ModelConfig( @@ -169,6 +191,7 @@ def audio_url(): def _assert_mm_data_is_image_input( mm_data: Optional[MultiModalDataDict], image_count: int, + skipped_image_indices: Optional[list] = None, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"image"} @@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input( assert image_data is not None assert isinstance(image_data, list) and len(image_data) == image_count + if skipped_image_indices is not None: + for i in skipped_image_indices: + assert image_data[i] is None def _assert_mm_uuids( @@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int] def _assert_mm_data_inputs( - mm_data: Optional[MultiModalDataDict], - data_count: MultiModalDataCounts, + mm_data: Optional[MultiModalDataDict], + data_count: MultiModalDataCounts, + skipped_media_indices: Optional[dict[ + str, list]] = None, # modality -> list[int] ) -> None: assert mm_data is not None assert 
set(data_count.keys()) == (set(mm_data.keys())) @@ -216,6 +244,13 @@ def _assert_mm_data_inputs( assert modality_data is not None assert isinstance(modality_data, list) and len(modality_data) == n + if skipped_media_indices is not None: + skipped_media_indices_for_modality = skipped_media_indices.get( + modality) + assert skipped_media_indices_for_modality is not None + for i in skipped_media_indices_for_modality: + assert modality_data[i] is None + def test_parse_chat_messages_single_image( phi3v_model_config, @@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +def test_parse_chat_messages_single_empty_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" 
+ }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + def test_parse_chat_messages_single_image_with_bad_uuid_format( phi3v_model_config, phi3v_tokenizer, @@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +def test_parse_chat_messages_multiple_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +def test_parse_chat_messages_mixed_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_single_image_with_uuid_async( phi3v_model_config, @@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" 
+ }] + _assert_mm_data_is_image_input(await mm_future, + 1, + skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_async( phi3v_model_config, @@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_pil", + "image_pil": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, + 2, + skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( phi3v_model_config, @@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) +def test_parse_chat_messages_empty_pil_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_pil", + "image_pil": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +def test_parse_chat_messages_empty_image_embeds_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + mm_data = await mm_future + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_async( phi3v_model_config, @@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl expected_uuids=["audio_123"]) +def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": None, + "uuid": "audio_123", + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": None, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." 
+ }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, { + "image": 2, + "video": 1, + "audio": 1 + }, + skipped_media_indices={ + "image": [0, 1], + "video": [0], + "audio": [0] + }) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", "image_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=["audio_123"]) + + def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, @@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk(): r"[INST]Thanks, what is 3+3?[/INST]") assert string_tokens == expected_tokens + + +def test_parse_chat_messages_single_empty_audio_with_uuid( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" 
+ }] + _assert_mm_data_inputs(mm_data, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_single_empty_audio_with_uuid_async( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" + }] + _assert_mm_data_inputs(await mm_future, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index b53dbfb3a26a2..aa231de93c0c3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -73,15 +73,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): type: Required[Literal["audio_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. User must guarantee that it is properly - generated and unique for different medias. - """ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): - image_embeds: Required[Union[str, dict[str, str]]] + image_embeds: Optional[Union[str, dict[str, str]]] """ The image embeddings. It can be either: - A single base64 string. @@ -108,11 +103,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. 
User must guarantee that it is properly - generated and unique for different medias. - """ class PILImage(BaseModel): @@ -133,7 +123,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): } """ - image_pil: Required[PILImage] + image_pil: Optional[PILImage] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -151,7 +141,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): } """ - image_url: Required[str] + image_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -168,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): } """ - audio_url: Required[str] + audio_url: Optional[str] class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): @@ -180,7 +170,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): } """ - video_url: Required[str] + video_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -597,7 +587,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): self._model_config = model_config self._tokenizer = tokenizer - self._items_by_modality = defaultdict[str, list[_T]](list) + self._items_by_modality = defaultdict[str, list[Optional[_T]]](list) self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) @property @@ -624,14 +614,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return self.mm_registry.create_processor(self.model_config) def add( - self, modality: ModalityStr, item: _T, uuid: Optional[str] = None + self, + modality: ModalityStr, + item: Optional[_T], + uuid: Optional[str] = None, ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. An optional uuid can be added which serves as a unique identifier of the - media. + media. 
""" input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 @@ -708,10 +701,15 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): if not self._items_by_modality: return None mm_inputs = {} - items_by_modality = { - modality: await asyncio.gather(*items) - for modality, items in self._items_by_modality.items() - } + items_by_modality = {} + for modality, items in self._items_by_modality.items(): + coros = [] + for item in items: + if item is not None: + coros.append(item) + else: + coros.append(asyncio.sleep(0)) + items_by_modality[modality] = await asyncio.gather(*coros) if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError( @@ -760,35 +758,40 @@ class BaseMultiModalContentParser(ABC): return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: + def parse_video( + 
self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @@ -803,15 +806,17 @@ class MultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image = self._connector.fetch_image(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image = self._connector.fetch_image(image_url) if image_url else None placeholder = self._tracker.add("image", image, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: if isinstance(image_embeds, dict): @@ -825,31 +830,49 @@ class MultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) placeholder = self._tracker.add("image_embeds", embedding, uuid) + if image_embeds is None: + placeholder = self._tracker.add("image_embeds", None, uuid) + self._add_placeholder("image", placeholder) def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: placeholder = self._tracker.add("image", image_pil, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio = self._connector.fetch_audio(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio = self._connector.fetch_audio(audio_url) if audio_url else None placeholder = self._tracker.add("audio", audio, uuid) self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - 
audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. + audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -865,18 +888,24 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image_coro = self._connector.fetch_image_async(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image_coro = ( + self._connector.fetch_image_async(image_url) if image_url else None + ) placeholder = self._tracker.add("image", image_coro, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: - future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() + future: asyncio.Future[Union[str, dict[str, str], None]] = ( + asyncio.Future() + ) if isinstance(image_embeds, dict): embeds = { @@ -889,35 +918,58 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) 
future.set_result(embedding) + if image_embeds is None: + future.set_result(None) + placeholder = self._tracker.add("image_embeds", future, uuid) self._add_placeholder("image", placeholder) def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: - future: asyncio.Future[Image.Image] = asyncio.Future() - future.set_result(image_pil) + future: asyncio.Future[Optional[Image.Image]] = asyncio.Future() + if image_pil: + future.set_result(image_pil) + else: + future.set_result(None) placeholder = self._tracker.add("image", future, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio_coro = self._connector.fetch_audio_async(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio_coro = ( + self._connector.fetch_audio_async(audio_url) if audio_url else None + ) placeholder = self._tracker.add("audio", audio_coro, uuid) self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. 
+ audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video_async(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video_async(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -1130,8 +1182,9 @@ def _parse_chat_message_content_mm_part( part, dict ) # This is needed to avoid mypy errors: part.get() from str part_type = part.get("type", None) + uuid = part.get("uuid", None) - if isinstance(part_type, str) and part_type in MM_PARSER_MAP: + if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501 content = MM_PARSER_MAP[part_type](part) # Special case for 'image_url.detail' @@ -1146,25 +1199,54 @@ def _parse_chat_message_content_mm_part( # Handle missing 'type' but provided direct URL fields. # 'type' is required field by pydantic - if part_type is None: - if part.get("image_url") is not None: + if part_type is None or uuid is not None: + if "image_url" in part: image_params = cast( CustomChatCompletionContentSimpleImageParam, part ) - return "image_url", image_params.get("image_url", "") - if part.get("audio_url") is not None: + image_url = image_params.get("image_url", None) + if isinstance(image_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + image_url = image_url.get("url", None) + return "image_url", image_url + if "image_pil" in part: + # "image_pil" could be None if UUID is provided. 
+ image_params = cast( # type: ignore + CustomChatCompletionContentPILImageParam, part + ) + image_pil = image_params.get("image_pil", None) + return "image_pil", image_pil + if "image_embeds" in part: + # "image_embeds" could be None if UUID is provided. + image_params = cast( # type: ignore + ChatCompletionContentPartImageEmbedsParam, part + ) + image_embeds = image_params.get("image_embeds", None) + return "image_embeds", image_embeds + if "audio_url" in part: audio_params = cast( CustomChatCompletionContentSimpleAudioParam, part ) - return "audio_url", audio_params.get("audio_url", "") + audio_url = audio_params.get("audio_url", None) + if isinstance(audio_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + audio_url = audio_url.get("url", None) + return "audio_url", audio_url if part.get("input_audio") is not None: input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params - if part.get("video_url") is not None: + if "video_url" in part: video_params = cast( CustomChatCompletionContentSimpleVideoParam, part ) - return "video_url", video_params.get("video_url", "") + video_url = video_params.get("video_url", None) + if isinstance(video_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + video_url = video_url.get("url", None) + return "video_url", video_url # Raise an error if no 'type' or direct URL is found. 
raise ValueError("Missing 'type' field in multimodal part.") @@ -1173,15 +1255,9 @@ def _parse_chat_message_content_mm_part( return part_type, "unknown part_type content" -VALID_MESSAGE_CONTENT_MM_PART_TYPES = ( +PART_TYPES_TO_SKIP_NONE_CONTENT = ( "text", "refusal", - "image_url", - "image_embeds", - "image_pil", - "audio_url", - "input_audio", - "video_url", ) @@ -1242,7 +1318,7 @@ def _parse_chat_message_content_part( part_type, content = _parse_chat_message_content_mm_part(part) # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but # content is None, log a warning and skip - if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None: + if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None: logger.warning( "Skipping multimodal part '%s' (type: '%s') " "with empty / unparsable content.", @@ -1266,7 +1342,10 @@ def _parse_chat_message_content_part( modality = None if part_type == "image_pil": - image_content = cast(Image.Image, content) + if content is not None: + image_content = cast(Image.Image, content) + else: + image_content = None mm_parser.parse_image_pil(image_content, uuid) modality = "image" elif part_type in ("image_url", "input_image"): @@ -1274,7 +1353,10 @@ def _parse_chat_message_content_part( mm_parser.parse_image(str_content, uuid) modality = "image" elif part_type == "image_embeds": - content = cast(Union[str, dict[str, str]], content) + if content is not None: + content = cast(Union[str, dict[str, str]], content) + else: + content = None mm_parser.parse_image_embeds(content, uuid) modality = "image" elif part_type == "audio_url": diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 94749eb884512..4b51dbcd8acb9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1491,6 +1491,11 @@ class LLM: for i, prompt in enumerate(it): + if isinstance(prompt, dict): + self._validate_mm_data_and_uuids( + prompt.get("multi_modal_data"), + prompt.get("multi_modal_uuids")) + 
param = params[i] if isinstance(params, Sequence) else params tokenization_kwargs: dict[str, Any] = {} @@ -1507,6 +1512,41 @@ class LLM: priority=priority[i] if priority else 0, ) + def _validate_mm_data_and_uuids( + self, + multi_modal_data: Optional[Any], # MultiModalDataDict + multi_modal_uuids: Optional[Any], # MultiModalUUIDDict + ): + """ + Validate that if any multi-modal data is skipped (i.e. None), + then its corresponding UUID must be set. + """ + if multi_modal_data is None: + return + + for modality, data in multi_modal_data.items(): + if isinstance(data, list): + for i, d in enumerate(data): + if d is None: + if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[ # noqa: E501 + modality] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if len( + multi_modal_uuids[modality] + ) <= i or multi_modal_uuids[modality][i] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if data is None and (multi_modal_uuids is None + or modality not in multi_modal_uuids + or multi_modal_uuids[modality] is None): + raise ValueError(f"Multi-modal data for {modality} is None" + f" but UUID is not provided") + def _add_request( self, prompt: PromptType, diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index f8ea3835f049d..240e34e139cfe 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -85,9 +85,10 @@ which are treated as audio embeddings; these are directly passed to the model without HF processing. """ -ModalityData: TypeAlias = Union[_T, list[_T]] +ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None] """ -Either a single data item, or a list of data items. +Either a single data item, or a list of data items. Can only be None if UUID +is provided. The number of data items allowed per modality is restricted by `--limit-mm-per-prompt`. 
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 88bb99529f200..493dd3560a516 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -36,7 +36,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]): def __init__(self, data: _T, modality: str) -> None: super().__init__() - self.data = data + self.data: _T = data self.modality = modality def __repr__(self) -> str: @@ -177,7 +177,9 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor], class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): - def __init__(self, data: Sequence[HfAudioItem]) -> None: + def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "audio") def get_audio_length(self, item_idx: int) -> int: @@ -198,7 +200,9 @@ class ImageSize(NamedTuple): class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): - def __init__(self, data: Sequence[HfImageItem]) -> None: + def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "image") def get_image_size(self, item_idx: int) -> ImageSize: @@ -223,10 +227,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): def __init__( self, - data: Sequence[HfVideoItem], + data: Optional[Sequence[HfVideoItem]], metadata: Optional[Union[dict[str, Any], list[Optional[dict[str, Any]]]]] = None, ) -> None: + if data is None: + data = [None] super().__init__(data, "video") self.metadata = metadata @@ -385,6 +391,9 @@ class MultiModalDataParser: self, data: ModalityData[AudioItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return AudioProcessorItems(None) + # also check single audio item with sampling rate if self._is_empty(data) or (isinstance(data, tuple) and self._is_empty(data[0])): @@ -420,6 +429,9 @@ class MultiModalDataParser: self, data: ModalityData[ImageItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return 
ImageProcessorItems(None) + if self._is_empty(data): return None @@ -441,6 +453,9 @@ class MultiModalDataParser: self, data: ModalityData[VideoItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return VideoProcessorItems(None) + if self._is_empty(data): return None diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index e5db356b635f3..7471bfcb4d508 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1075,7 +1075,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. """ mm_items = self.data_parser.parse_mm_data(mm_data) - for modality, items in mm_items.items(): self.validate_num_items(modality, len(items)) @@ -1436,10 +1435,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ] for modality, items_is_cached in mm_is_cached.items() } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } + mm_missing_data = {} + for modality, idxs in mm_missing_idxs.items(): + missing_modality_data = [] + for idx in idxs: + data = mm_data_items[modality][idx] + if data is None: + raise ValueError( + f"Cache miss for {modality} at index {idx} " + f"but data is not provided.") + else: + missing_modality_data.append(data) + mm_missing_data[modality] = missing_modality_data return self._to_mm_items(mm_missing_data)