From 993d3d122b114cf93bf423fe0b4410ac493d9c45 Mon Sep 17 00:00:00 2001
From: Chenheli Hua
Date: Fri, 15 Aug 2025 11:23:06 -0700
Subject: [PATCH] [Benchmarks] Include image data when ShareGPT4V dataset is used. (#22955)

Signed-off-by: Chenheli Hua
---
 benchmarks/README.md            | 49 +++++++++++++++++++++++++++++++++
 benchmarks/benchmark_dataset.py |  8 +++++-
 vllm/benchmarks/datasets.py     |  8 +++++-
 3 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index d6442a4fc3872..caff8f0342141 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -22,6 +22,17 @@ become available.
       <td>✅</td>
       <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
     </tr>
+    <tr>
+      <td>ShareGPT4V (Image)</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>
+        <code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
+        <br>
+        Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
+        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
+      </td>
+    </tr>
     <tr>
       <td>BurstGPT</td>
       <td>✅</td>
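The dataset-loader changes later in this patch read each entry's optional `image` field, so the extracted COCO images have to sit at the relative paths recorded in the downloaded JSON. Below is a small illustrative check (not part of the patch) for that layout; the JSON file name and the image root are assumptions to adjust to wherever the files were downloaded and extracted.

```python
# Illustrative sanity check (not part of this patch): confirm that every
# entry's optional "image" path in the ShareGPT4V JSON resolves to a file
# under the directory where train2017.zip was extracted. Paths are assumed.
import json
from pathlib import Path

dataset_json = Path("sharegpt4v_instruct_gpt4-vision_cap100k.json")
image_root = Path(".")  # directory expected to contain e.g. coco/train2017/

entries = json.loads(dataset_json.read_text())
missing = [
    entry["image"]
    for entry in entries
    if "image" in entry and not (image_root / entry["image"]).is_file()
]
print(f"{len(entries)} entries checked, {len(missing)} missing image files")
if missing:
    print("first few missing:", missing[:5])
```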
@@ -616,3 +627,41 @@ python3 benchmarks/benchmark_prioritization.py \
 ```
+
+## 👁️ Example - Multi-Modal Benchmark
+
+<details>
+<summary>Show more</summary>
+
+Benchmark the performance of multi-modal requests in vLLM.
+
+### Images (ShareGPT4V)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"image": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4v/images
+```
+
+Send requests with images:
+
+```bash
+python benchmarks/benchmark_serving.py \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completions
+```
+
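Before running the full benchmark sweep, it can help to confirm that the server accepts a single image request on `/v1/chat/completions`. The snippet below is an illustrative smoke test, not part of the patch; it assumes the server started above is listening on the default `localhost:8000` and that a JPEG exists at the path shown (the COCO file name is made up).

```python
# Minimal smoke test (not part of this patch): send one image chat request to
# the vLLM OpenAI-compatible server started above. Host, port, and the image
# path are assumptions; adjust them to your setup.
import base64
from pathlib import Path

import requests

image_path = Path("/path/to/sharegpt4v/images/coco/train2017/000000000009.jpg")
image_b64 = base64.b64encode(image_path.read_bytes()).decode()

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen2.5-VL-7B-Instruct",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ],
        }],
        "max_tokens": 64,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```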
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index ea684f18a7421..572292a5aca46 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -430,14 +430,20 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None, ): continue + # TODO: Also support ShareGPT4Video. + if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation(prompt, None) + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, ) ) self.maybe_oversample_requests(samples, num_requests) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 4e8ac5162542f..5299dcf54b395 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -454,15 +454,21 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue + # TODO: Also support ShareGPT4Video. + if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: prompt = self.apply_multimodal_chat_transformation( - prompt, None) + prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, )) self.maybe_oversample_requests(samples, num_requests) return samples