diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 77baa27c7a958..7fb0337235005 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t print(o.outputs[0].text) ``` +Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache. + +??? code + + ```python + from vllm import LLM + from PIL import Image + + # Qwen2.5-VL example with two images + llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct") + + prompt = "USER: \nDescribe the differences.\nASSISTANT:" + img_b = Image.open("/path/to/b.jpg") + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": [None, img_b]}, + # Since img_a is expected to be cached, we can skip sending the actual + # image entirely. + "multi_modal_uuids": {"image": ["sku-1234-a", None]}, + }) + + for o in outputs: + print(o.outputs[0].text) + ``` + !!! warning If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored. @@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se ) ``` +For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this: + + ```python + # Image/video/audio URL: + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + + # image_embeds + { + "type": "image_embeds", + "image_embeds": None, + "uuid": image_uuid + }, + + # input_audio: + { + "type": "input_audio", + "input_audio": None, + "uuid": audio_uuid + }, + + # PIL Image: + { + "type": "image_pil", + "image_pil": None, + "uuid": image_uuid + } + + ``` + !!! note Only one message can contain `{"type": "image_embeds"}`. 
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index b104113b88213..4b75eb19fcf94 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1764,6 +1764,7 @@ def apply_image_repeat( probs = [1.0 - image_repeat_prob, image_repeat_prob] inputs = [] + inputs_with_empty_media = [] cur_image = data for i in range(num_prompts): if image_repeat_prob is not None: @@ -1774,14 +1775,25 @@ def apply_image_repeat( new_val = (i // 256 // 256, i // 256, i % 256) cur_image.putpixel((0, 0), new_val) + uuid = "uuid_{}".format(i) + inputs.append( { "prompt": prompts[i % len(prompts)], "multi_modal_data": {modality: cur_image}, + "multi_modal_uuids": {modality: uuid}, } ) - return inputs + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) + + return inputs, inputs_with_empty_media @contextmanager @@ -1860,6 +1872,13 @@ def parse_args(): help="If True, then use different prompt (with the same multi-modal " "data) for each request.", ) + + parser.add_argument( + "--verify-mm-cache-hit-with-uuids", + action="store_true", + help="If True, will send all requests in a second batch with empty mm " + "data to verify cache hits with UUIDs.", + ) return parser.parse_args() @@ -1903,26 +1922,48 @@ def main(args): assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference + uuid = "uuid_0" inputs = { "prompt": prompts[0], "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + inputs_with_empty_media = { + "prompt": prompts[0], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, } else: # Batch inference if args.image_repeat_prob is not None: # Repeat 
images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat( - args.image_repeat_prob, args.num_prompts, data, prompts, modality + inputs, inputs_with_empty_media = apply_image_repeat( + args.image_repeat_prob, + args.num_prompts, + data, + prompts, + modality, ) else: # Use the same image for all prompts - inputs = [ - { - "prompt": prompts[i % len(prompts)], - "multi_modal_data": {modality: data}, - } - for i in range(args.num_prompts) - ] + inputs = [] + inputs_with_empty_media = [] + for i in range(args.num_prompts): + uuid = "uuid_{}".format(i) + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + ) + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) # Add LoRA request if applicable lora_request = ( @@ -1942,6 +1983,26 @@ def main(args): print(generated_text) print("-" * 50) + if args.verify_mm_cache_hit_with_uuids: + try: + # Verify cache hits with UUIDs + print( + "Sending a second batch of requests with empty media" + " and matching UUIDs." + ) + outputs = llm.generate( + inputs_with_empty_media, + sampling_params=sampling_params, + lora_request=lora_request, + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + except Exception as e: + print(f"Failed to verify cache hits with UUIDs. 
Error: {e}") + if __name__ == "__main__": args = parse_args() diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 72819f31de206..a324e86666055 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -522,6 +522,71 @@ async def test_completions_with_image_with_uuid( assert isinstance(chat_completion.choices[0].message.content, str) assert len(chat_completion.choices[0].message.content) > 0 + # Second request, with empty image but the same uuid. + chat_completion_with_empty_image = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": image_url + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion_with_empty_image.choices[ + 0].message.content is not None + assert isinstance( + chat_completion_with_empty_image.choices[0].message.content, str) + assert len( + chat_completion_with_empty_image.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_completions_with_empty_image_with_uuid_without_cache_hit( + client: openai.AsyncOpenAI, + model_name: str, +): + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": "uuid_not_previously_seen" + }, + ], + }, + ], + model=model_name, + ) + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5149ca346050e..dd33f5c8c1d8e 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -79,6 +79,28 @@ def phi3v_tokenizer(): ) +@pytest.fixture(scope="function") +def qwen2_audio_model_config(): + return ModelConfig( + QWEN2AUDIO_MODEL_ID, + runner="generate", + trust_remote_code=True, + limit_mm_per_prompt={ + "audio": 1, + }, + ) + + +@pytest.fixture(scope="module") +def qwen2_audio_tokenizer(): + return TokenizerGroup( + tokenizer_id=QWEN2AUDIO_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): return ModelConfig( @@ -169,6 +191,7 @@ def audio_url(): def _assert_mm_data_is_image_input( mm_data: Optional[MultiModalDataDict], image_count: int, + skipped_image_indices: Optional[list] = None, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"image"} @@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input( assert image_data is not None assert isinstance(image_data, list) and len(image_data) == image_count + if skipped_image_indices is not None: + for i in skipped_image_indices: + assert image_data[i] is None def _assert_mm_uuids( @@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int] def _assert_mm_data_inputs( - mm_data: Optional[MultiModalDataDict], - data_count: MultiModalDataCounts, + mm_data: Optional[MultiModalDataDict], + data_count: MultiModalDataCounts, + skipped_media_indices: Optional[dict[ + str, list]] = None, # modality -> list[int] ) -> None: assert mm_data is not None assert 
set(data_count.keys()) == (set(mm_data.keys())) @@ -216,6 +244,13 @@ def _assert_mm_data_inputs( assert modality_data is not None assert isinstance(modality_data, list) and len(modality_data) == n + if skipped_media_indices is not None: + skipped_media_indices_for_modality = skipped_media_indices.get( + modality) + assert skipped_media_indices_for_modality is not None + for i in skipped_media_indices_for_modality: + assert modality_data[i] is None + def test_parse_chat_messages_single_image( phi3v_model_config, @@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +def test_parse_chat_messages_single_empty_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" 
+ }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + def test_parse_chat_messages_single_image_with_bad_uuid_format( phi3v_model_config, phi3v_tokenizer, @@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +def test_parse_chat_messages_multiple_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +def test_parse_chat_messages_mixed_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_single_image_with_uuid_async( phi3v_model_config, @@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" 
+ }] + _assert_mm_data_is_image_input(await mm_future, + 1, + skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_async( phi3v_model_config, @@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_pil", + "image_pil": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, + 2, + skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( phi3v_model_config, @@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) +def test_parse_chat_messages_empty_pil_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_pil", + "image_pil": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +def test_parse_chat_messages_empty_image_embeds_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + mm_data = await mm_future + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_async( phi3v_model_config, @@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl expected_uuids=["audio_123"]) +def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": None, + "uuid": "audio_123", + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": None, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." 
+ }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, { + "image": 2, + "video": 1, + "audio": 1 + }, + skipped_media_indices={ + "image": [0, 1], + "video": [0], + "audio": [0] + }) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", "image_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=["audio_123"]) + + def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, @@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk(): r"[INST]Thanks, what is 3+3?[/INST]") assert string_tokens == expected_tokens + + +def test_parse_chat_messages_single_empty_audio_with_uuid( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" 
+ }] + _assert_mm_data_inputs(mm_data, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_single_empty_audio_with_uuid_async( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" + }] + _assert_mm_data_inputs(await mm_future, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index b53dbfb3a26a2..aa231de93c0c3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -73,15 +73,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): type: Required[Literal["audio_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. User must guarantee that it is properly - generated and unique for different medias. - """ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): - image_embeds: Required[Union[str, dict[str, str]]] + image_embeds: Optional[Union[str, dict[str, str]]] """ The image embeddings. It can be either: - A single base64 string. @@ -108,11 +103,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. 
User must guarantee that it is properly - generated and unique for different medias. - """ class PILImage(BaseModel): @@ -133,7 +123,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): } """ - image_pil: Required[PILImage] + image_pil: Optional[PILImage] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -151,7 +141,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): } """ - image_url: Required[str] + image_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -168,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): } """ - audio_url: Required[str] + audio_url: Optional[str] class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): @@ -180,7 +170,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): } """ - video_url: Required[str] + video_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -597,7 +587,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): self._model_config = model_config self._tokenizer = tokenizer - self._items_by_modality = defaultdict[str, list[_T]](list) + self._items_by_modality = defaultdict[str, list[Optional[_T]]](list) self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) @property @@ -624,14 +614,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return self.mm_registry.create_processor(self.model_config) def add( - self, modality: ModalityStr, item: _T, uuid: Optional[str] = None + self, + modality: ModalityStr, + item: Optional[_T], + uuid: Optional[str] = None, ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. An optional uuid can be added which serves as a unique identifier of the - media. + media. 
""" input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 @@ -708,10 +701,15 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): if not self._items_by_modality: return None mm_inputs = {} - items_by_modality = { - modality: await asyncio.gather(*items) - for modality, items in self._items_by_modality.items() - } + items_by_modality = {} + for modality, items in self._items_by_modality.items(): + coros = [] + for item in items: + if item is not None: + coros.append(item) + else: + coros.append(asyncio.sleep(0)) + items_by_modality[modality] = await asyncio.gather(*coros) if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError( @@ -760,35 +758,40 @@ class BaseMultiModalContentParser(ABC): return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: + def parse_video( + 
self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @@ -803,15 +806,17 @@ class MultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image = self._connector.fetch_image(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image = self._connector.fetch_image(image_url) if image_url else None placeholder = self._tracker.add("image", image, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: if isinstance(image_embeds, dict): @@ -825,31 +830,49 @@ class MultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) placeholder = self._tracker.add("image_embeds", embedding, uuid) + if image_embeds is None: + placeholder = self._tracker.add("image_embeds", None, uuid) + self._add_placeholder("image", placeholder) def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: placeholder = self._tracker.add("image", image_pil, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio = self._connector.fetch_audio(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio = self._connector.fetch_audio(audio_url) if audio_url else None placeholder = self._tracker.add("audio", audio, uuid) self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - 
audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. + audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -865,18 +888,24 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image_coro = self._connector.fetch_image_async(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image_coro = ( + self._connector.fetch_image_async(image_url) if image_url else None + ) placeholder = self._tracker.add("image", image_coro, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: - future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() + future: asyncio.Future[Union[str, dict[str, str], None]] = ( + asyncio.Future() + ) if isinstance(image_embeds, dict): embeds = { @@ -889,35 +918,58 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) 
future.set_result(embedding) + if image_embeds is None: + future.set_result(None) + placeholder = self._tracker.add("image_embeds", future, uuid) self._add_placeholder("image", placeholder) def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: - future: asyncio.Future[Image.Image] = asyncio.Future() - future.set_result(image_pil) + future: asyncio.Future[Optional[Image.Image]] = asyncio.Future() + if image_pil: + future.set_result(image_pil) + else: + future.set_result(None) placeholder = self._tracker.add("image", future, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio_coro = self._connector.fetch_audio_async(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio_coro = ( + self._connector.fetch_audio_async(audio_url) if audio_url else None + ) placeholder = self._tracker.add("audio", audio_coro, uuid) self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. 
+ audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video_async(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video_async(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -1130,8 +1182,9 @@ def _parse_chat_message_content_mm_part( part, dict ) # This is needed to avoid mypy errors: part.get() from str part_type = part.get("type", None) + uuid = part.get("uuid", None) - if isinstance(part_type, str) and part_type in MM_PARSER_MAP: + if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501 content = MM_PARSER_MAP[part_type](part) # Special case for 'image_url.detail' @@ -1146,25 +1199,54 @@ def _parse_chat_message_content_mm_part( # Handle missing 'type' but provided direct URL fields. # 'type' is required field by pydantic - if part_type is None: - if part.get("image_url") is not None: + if part_type is None or uuid is not None: + if "image_url" in part: image_params = cast( CustomChatCompletionContentSimpleImageParam, part ) - return "image_url", image_params.get("image_url", "") - if part.get("audio_url") is not None: + image_url = image_params.get("image_url", None) + if isinstance(image_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + image_url = image_url.get("url", None) + return "image_url", image_url + if "image_pil" in part: + # "image_pil" could be None if UUID is provided. 
+ image_params = cast( # type: ignore + CustomChatCompletionContentPILImageParam, part + ) + image_pil = image_params.get("image_pil", None) + return "image_pil", image_pil + if "image_embeds" in part: + # "image_embeds" could be None if UUID is provided. + image_params = cast( # type: ignore + ChatCompletionContentPartImageEmbedsParam, part + ) + image_embeds = image_params.get("image_embeds", None) + return "image_embeds", image_embeds + if "audio_url" in part: audio_params = cast( CustomChatCompletionContentSimpleAudioParam, part ) - return "audio_url", audio_params.get("audio_url", "") + audio_url = audio_params.get("audio_url", None) + if isinstance(audio_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + audio_url = audio_url.get("url", None) + return "audio_url", audio_url if part.get("input_audio") is not None: input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params - if part.get("video_url") is not None: + if "video_url" in part: video_params = cast( CustomChatCompletionContentSimpleVideoParam, part ) - return "video_url", video_params.get("video_url", "") + video_url = video_params.get("video_url", None) + if isinstance(video_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + video_url = video_url.get("url", None) + return "video_url", video_url # Raise an error if no 'type' or direct URL is found. 
raise ValueError("Missing 'type' field in multimodal part.") @@ -1173,15 +1255,9 @@ def _parse_chat_message_content_mm_part( return part_type, "unknown part_type content" -VALID_MESSAGE_CONTENT_MM_PART_TYPES = ( +PART_TYPES_TO_SKIP_NONE_CONTENT = ( "text", "refusal", - "image_url", - "image_embeds", - "image_pil", - "audio_url", - "input_audio", - "video_url", ) @@ -1242,7 +1318,7 @@ def _parse_chat_message_content_part( part_type, content = _parse_chat_message_content_mm_part(part) # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but # content is None, log a warning and skip - if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None: + if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None: logger.warning( "Skipping multimodal part '%s' (type: '%s') " "with empty / unparsable content.", @@ -1266,7 +1342,10 @@ def _parse_chat_message_content_part( modality = None if part_type == "image_pil": - image_content = cast(Image.Image, content) + if content is not None: + image_content = cast(Image.Image, content) + else: + image_content = None mm_parser.parse_image_pil(image_content, uuid) modality = "image" elif part_type in ("image_url", "input_image"): @@ -1274,7 +1353,10 @@ def _parse_chat_message_content_part( mm_parser.parse_image(str_content, uuid) modality = "image" elif part_type == "image_embeds": - content = cast(Union[str, dict[str, str]], content) + if content is not None: + content = cast(Union[str, dict[str, str]], content) + else: + content = None mm_parser.parse_image_embeds(content, uuid) modality = "image" elif part_type == "audio_url": diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 94749eb884512..4b51dbcd8acb9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1491,6 +1491,11 @@ class LLM: for i, prompt in enumerate(it): + if isinstance(prompt, dict): + self._validate_mm_data_and_uuids( + prompt.get("multi_modal_data"), + prompt.get("multi_modal_uuids")) + 
param = params[i] if isinstance(params, Sequence) else params tokenization_kwargs: dict[str, Any] = {} @@ -1507,6 +1512,41 @@ class LLM: priority=priority[i] if priority else 0, ) + def _validate_mm_data_and_uuids( + self, + multi_modal_data: Optional[Any], # MultiModalDataDict + multi_modal_uuids: Optional[Any], # MultiModalUUIDDict + ): + """ + Validate that if any multi-modal data is skipped (i.e. None), + then its corresponding UUID must be set. + """ + if multi_modal_data is None: + return + + for modality, data in multi_modal_data.items(): + if isinstance(data, list): + for i, d in enumerate(data): + if d is None: + if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[ # noqa: E501 + modality] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if len( + multi_modal_uuids[modality] + ) <= i or multi_modal_uuids[modality][i] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if data is None and (multi_modal_uuids is None + or modality not in multi_modal_uuids + or multi_modal_uuids[modality] is None): + raise ValueError(f"Multi-modal data for {modality} is None" + f" but UUID is not provided") + def _add_request( self, prompt: PromptType, diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index f8ea3835f049d..240e34e139cfe 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -85,9 +85,10 @@ which are treated as audio embeddings; these are directly passed to the model without HF processing. """ -ModalityData: TypeAlias = Union[_T, list[_T]] +ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None] """ -Either a single data item, or a list of data items. +Either a single data item, or a list of data items. Can only be None if UUID +is provided. The number of data items allowed per modality is restricted by `--limit-mm-per-prompt`. 
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 88bb99529f200..493dd3560a516 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -36,7 +36,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]): def __init__(self, data: _T, modality: str) -> None: super().__init__() - self.data = data + self.data: _T = data self.modality = modality def __repr__(self) -> str: @@ -177,7 +177,9 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor], class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): - def __init__(self, data: Sequence[HfAudioItem]) -> None: + def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "audio") def get_audio_length(self, item_idx: int) -> int: @@ -198,7 +200,9 @@ class ImageSize(NamedTuple): class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): - def __init__(self, data: Sequence[HfImageItem]) -> None: + def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "image") def get_image_size(self, item_idx: int) -> ImageSize: @@ -223,10 +227,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): def __init__( self, - data: Sequence[HfVideoItem], + data: Optional[Sequence[HfVideoItem]], metadata: Optional[Union[dict[str, Any], list[Optional[dict[str, Any]]]]] = None, ) -> None: + if data is None: + data = [None] super().__init__(data, "video") self.metadata = metadata @@ -385,6 +391,9 @@ class MultiModalDataParser: self, data: ModalityData[AudioItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return AudioProcessorItems(None) + # also check single audio item with sampling rate if self._is_empty(data) or (isinstance(data, tuple) and self._is_empty(data[0])): @@ -420,6 +429,9 @@ class MultiModalDataParser: self, data: ModalityData[ImageItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return 
ImageProcessorItems(None) + if self._is_empty(data): return None @@ -441,6 +453,9 @@ class MultiModalDataParser: self, data: ModalityData[VideoItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return VideoProcessorItems(None) + if self._is_empty(data): return None diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index e5db356b635f3..7471bfcb4d508 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1075,7 +1075,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. """ mm_items = self.data_parser.parse_mm_data(mm_data) - for modality, items in mm_items.items(): self.validate_num_items(modality, len(items)) @@ -1436,10 +1435,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ] for modality, items_is_cached in mm_is_cached.items() } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } + mm_missing_data = {} + for modality, idxs in mm_missing_idxs.items(): + missing_modality_data = [] + for idx in idxs: + data = mm_data_items[modality][idx] + if data is None: + raise ValueError( + f"Cache miss for {modality} at index {idx} " + f"but data is not provided.") + else: + missing_modality_data.append(data) + mm_missing_data[modality] = missing_modality_data return self._to_mm_items(mm_missing_data)