[Benchmarks] Include image data when ShareGPT4V dataset is used. (#22955)

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Chenheli Hua 2025-08-15 11:23:06 -07:00, committed by GitHub
parent 68af77e51c
commit 993d3d122b
3 changed files with 63 additions and 2 deletions


@@ -22,6 +22,17 @@ become available.
<td style="text-align: center;"></td>
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
</tr>
<tr>
<td><strong>ShareGPT4V (Image)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>
<code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
<br>
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
</td>
</tr>
<tr>
<td><strong>BurstGPT</strong></td>
<td style="text-align: center;"></td>
@@ -616,3 +627,41 @@ python3 benchmarks/benchmark_prioritization.py \
```
</details>
## 👁️ Example - Multi-Modal Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of multi-modal requests in vLLM.
### Images (ShareGPT4V)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
    --model Qwen/Qwen2.5-VL-7B-Instruct \
    --dtype bfloat16 \
    --limit-mm-per-prompt '{"image": 1}' \
    --allowed-local-media-path /path/to/sharegpt4v/images
```
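Before benchmarking, it can be worth smoke-testing the server with a single image request. Below is a minimal sketch using the OpenAI Python client, assuming the server above is listening on the default port 8000; the image path is a placeholder and must sit under the `--allowed-local-media-path` root:
```python
# Smoke test (sketch): one image chat request against the server started above.
# The image path is a placeholder; file:// URLs are only accepted for files
# under the directory passed to --allowed-local-media-path.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-7B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": "file:///path/to/sharegpt4v/images/train2017/000000000009.jpg"
                },
            },
        ],
    }],
)
print(response.choices[0].message.content)
```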
Send requests with images:
```bash
python benchmarks/benchmark_serving.py \
    --backend openai-chat \
    --model Qwen/Qwen2.5-VL-7B-Instruct \
    --dataset-name sharegpt \
    --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
    --num-prompts 100 \
    --save-result \
    --result-dir ~/vllm_benchmark_results \
    --save-detailed \
    --endpoint /v1/chat/completions
```
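The `sharegpt` dataset loader decides per entry whether to attach an image by checking the record's `image` field (see the dataset changes below). For reference, a ShareGPT4V record has roughly this shape (a sketch; the id and values are illustrative, and the `image` path is relative to wherever the image archives were unzipped):
```python
# Approximate shape of one ShareGPT4V record (illustrative values only).
entry = {
    "id": "000000000009",  # hypothetical id
    "image": "coco/train2017/000000000009.jpg",  # relative to the image root
    "conversations": [
        {"from": "human", "value": "<image>\nDescribe this image in detail."},
        {"from": "gpt", "value": "A white plate of food sits on a table ..."},
    ],
}
```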
</details>


@@ -430,14 +430,20 @@ class ShareGPTDataset(BenchmarkDataset):
                 skip_min_output_len_check=output_len is not None,
             ):
                 continue
+            # TODO: Also support ShareGPT4Video.
+            if image_path := entry.get("image"):
+                mm_content = process_image(image_path)
+            else:
+                mm_content = None
             if enable_multimodal_chat:
-                prompt = self.apply_multimodal_chat_transformation(prompt, None)
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
             samples.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=new_output_len,
                     lora_request=lora_request,
+                    multi_modal_data=mm_content,
                 )
             )
         self.maybe_oversample_requests(samples, num_requests)
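`process_image` is defined elsewhere in the file and is unchanged by this commit; roughly, it normalizes an image reference into OpenAI-style image content. A sketch of the behavior the hunk above relies on (an approximation, not the file's actual code):
```python
# Sketch of what process_image is relied on to return (approximate):
# in-memory images are inlined as base64 data URLs, remote URLs pass
# through, and bare local paths become file:// URLs.
import base64
from io import BytesIO

from PIL import Image


def process_image_sketch(image) -> dict:
    if isinstance(image, Image.Image):
        buf = BytesIO()
        image.convert("RGB").save(buf, format="JPEG")
        encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }
    if isinstance(image, str):
        url = (image if image.startswith(("http://", "https://", "file://"))
               else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": url}}
    raise ValueError(f"Unsupported image input type: {type(image)}")
```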


@ -454,15 +454,21 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len
is not None):
continue
# TODO: Also support ShareGPT4Video.
if image_path := entry.get("image"):
mm_content = process_image(image_path)
else:
mm_content = None
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
prompt, mm_content)
samples.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=new_output_len,
lora_request=lora_request,
multi_modal_data=mm_content,
))
self.maybe_oversample_requests(samples, num_requests)
return samples
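To check the change end to end without a server, one can sample a few requests directly and confirm they now carry image content. A sketch, assuming `ShareGPTDataset` accepts a `dataset_path` keyword and that `sample()` takes a tokenizer and a request count (signatures inferred from the hunks above, not verified against the full files):
```python
# Quick offline check (sketch): sampled requests should now carry image data
# for entries that have an "image" field. Signatures are assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
dataset = ShareGPTDataset(
    dataset_path="sharegpt4v_instruct_gpt4-vision_cap100k.json")
for req in dataset.sample(tokenizer=tokenizer, num_requests=4):
    print(req.prompt_len, req.multi_modal_data is not None)
```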