From 1630cc8d0f5e0ff19d4c5736a4b531dd27a3f4d8 Mon Sep 17 00:00:00 2001
From: Chenheli Hua <huachenheli@outlook.com>
Date: Tue, 19 Aug 2025 16:42:31 -0700
Subject: [PATCH] [Benchmarks] Add video inputs to ShareGPTDataset.  (#23199)

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
---
 benchmarks/README.md            | 41 ++++++++++++++++++++++++++++++---
 benchmarks/benchmark_dataset.py | 38 +++++++++++++++++++++++++++++-
 vllm/benchmarks/datasets.py     | 40 ++++++++++++++++++++++++++++++--
 3 files changed, 113 insertions(+), 6 deletions(-)
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 69d32e222819b..176b40212978f 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -32,6 +32,14 @@ become available.
         <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
         <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
       </td>
+    </tr>
+        <tr>
+      <td><strong>ShareGPT4Video (Video)</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>
+        <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
+      </td>
     </tr>
     <tr>
       <td><strong>BurstGPT</strong></td>
@@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \  
+  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -246,7 +254,7 @@ vllm bench serve \
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \  
+  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -612,7 +620,7 @@ vllm bench serve \
   --prefix-repetition-prefix-len 512 \
   --prefix-repetition-suffix-len 128 \
   --prefix-repetition-num-prefixes 5 \
-  --prefix-repetition-output-len 128 
+  --prefix-repetition-output-len 128
 ```
 
 </details>
@@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \
   --endpoint /v1/chat/completion
 ```
 
+### Videos (ShareGPT4Video)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"video": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4video/videos
+```
+
+Send requests with videos:
+
+```bash
+python benchmarks/benchmark_serving.py \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completions
+```
+
 </details>
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index c62934ed94cb5..e1a856026c4ae 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
     )
 
 
+def process_video(video: Any) -> Mapping[str, Any]:
+    """
+    Convert a single video input into a ``video_url`` content dictionary.
+
+    Supported input types:
+
+    1. Dict with a 'bytes' key holding raw video data: the value is
+       base64-encoded into a "data:video/mp4;base64,..." URL.
+    2. String: treated as a URL or local file path. "file://" is prepended
+       unless it starts with "http://" or "file://" (NOTE: "https://" is
+       not in that list, so such strings are prefixed too). The result is
+       used as the video URL.
+
+    Raises:
+        ValueError: If the input is not a supported type.
+    """
+    if isinstance(video, dict) and "bytes" in video:
+        video_bytes = video["bytes"]
+        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        return {
+            "type": "video_url",
+            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+        }
+
+    if isinstance(video, str):
+        video_url = (
+            video if video.startswith(("http://", "file://")) else f"file://{video}"
+        )
+        return {"type": "video_url", "video_url": {"url": video_url}}
+
+    raise ValueError(
+        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
+    )
+
+
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset):
                 skip_min_output_len_check=output_len is not None,
             ):
                 continue
-            # TODO: Also support ShareGPT4Video.
             if image_path := entry.get("image"):
                 mm_content = process_image(image_path)
+            elif video_path := entry.get("video"):
+                mm_content = process_video(video_path)
             else:
                 mm_content = None
             if enable_multimodal_chat:
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 3532a083fb4a1..f4fbfad2d1d5d 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     """
     Process a single image input and return a multimedia content dictionary.
 
-    Supports three input types:
+    Supports the following input types:
 
     1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
        containing raw image data.  - Loads the bytes as a PIL.Image.Image.
@@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
                      " or str or dictionary with raw image bytes.")
 
 
+def process_video(video: Any) -> Mapping[str, Any]:
+    """
+    Convert a single video input into a ``video_url`` content dictionary.
+
+    Supported input types:
+
+    1. Dict with a 'bytes' key holding raw video data: the value is
+       base64-encoded into a "data:video/mp4;base64,..." URL.
+    2. String: treated as a URL or local file path. "file://" is prepended
+       unless it starts with "http://" or "file://" (NOTE: "https://" is
+       not in that list, so such strings are prefixed too). The result is
+       used as the video URL.
+
+    Raises:
+        ValueError: If the input is not a supported type.
+    """
+    if isinstance(video, dict) and 'bytes' in video:
+        video_bytes = video['bytes']
+        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        return {
+            "type": "video_url",
+            "video_url": {
+                "url": f"data:video/mp4;base64,{video_base64}"
+            },
+        }
+
+    if isinstance(video, str):
+        video_url = (video if video.startswith(
+            ("http://", "file://")) else f"file://{video}")
+        return {"type": "video_url", "video_url": {"url": video_url}}
+
+    raise ValueError(
+        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
+    )
+
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset):
                                      skip_min_output_len_check=output_len
                                      is not None):
                 continue
-            # TODO: Also support ShareGPT4Video.
             if image_path := entry.get("image"): 
                 mm_content = process_image(image_path) 
+            elif video_path := entry.get("video"): 
+                mm_content = process_video(video_path)
             else: 
                 mm_content = None
             if enable_multimodal_chat: