[Benchmarks] Add video inputs to ShareGPTDataset. (#23199)

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2026-07-21 15:07:13 +08:00 · 2025-08-19 16:42:31 -07:00 · 2025-08-19 16:42:31 -07:00 · 1630cc8d0f
commit 1630cc8d0f
parent 14e2b0730b
3 changed files with 113 additions and 6 deletions
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -32,6 +32,14 @@ become available.
        <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
      </td>
    </tr>
        <tr>
      <td><strong>ShareGPT4Video (Video)</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td>
        <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
      </td>
    </tr>
    <tr>
      <td><strong>BurstGPT</strong></td>
@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \  
+  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -246,7 +254,7 @@ vllm bench serve \
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \  
+  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -612,7 +620,7 @@ vllm bench serve \
  --prefix-repetition-prefix-len 512 \
  --prefix-repetition-suffix-len 128 \
  --prefix-repetition-num-prefixes 5 \
-  --prefix-repetition-output-len 128 
+  --prefix-repetition-output-len 128
 ```
 </details>
@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \
  --endpoint /v1/chat/completions
 ```
 ### Videos (ShareGPT4Video)
 Start vLLM:
 ```bash
 python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dtype bfloat16 \
  --limit-mm-per-prompt '{"video": 1}' \
  --allowed-local-media-path /path/to/sharegpt4video/videos
 ```
 Send requests with videos:
 ```bash
 python benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
  --num-prompts 100 \
  --save-result \
  --result-dir ~/vllm_benchmark_results \
  --save-detailed \
  --endpoint /v1/chat/completions
 ```
 </details>
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
    )
 def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
       containing raw video data.  - The bytes are base64-encoded and embedded
       in a "data:video/mp4;base64,..." URL.

    2. String input: - Treats the string as a URL or local file path.  -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://".  - Returns a dictionary with the video URL.

    Returns:
        A mapping of the form
        ``{"type": "video_url", "video_url": {"url": <url>}}``.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and "bytes" in video:
        video_bytes = video["bytes"]
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        return {
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
        }

    if isinstance(video, str):
        # "https://" must be recognised as a remote scheme too; otherwise an
        # HTTPS URL would be mangled into "file://https://...".
        video_url = (
            video
            if video.startswith(("http://", "https://", "file://"))
            else f"file://{video}"
        )
        return {"type": "video_url", "video_url": {"url": video_url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset):
                skip_min_output_len_check=output_len is not None,
            ):
                continue
            # Use the entry's image if present, else its video (ShareGPT4Video).
            if image_path := entry.get("image"):
                mm_content = process_image(image_path)
            elif video_path := entry.get("video"):
                mm_content = process_video(video_path)
            else:
                mm_content = None
            if enable_multimodal_chat:
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.
-    Supports three input types:
+    Supports the following input types:
    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
       containing raw image data.  - Loads the bytes as a PIL.Image.Image.
@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
                     " or str or dictionary with raw image bytes.")
 def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
       containing raw video data.  - The bytes are base64-encoded and embedded
       in a "data:video/mp4;base64,..." URL.

    2. String input: - Treats the string as a URL or local file path.  -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://".  - Returns a dictionary with the video URL.

    Returns:
        A mapping of the form
        ``{"type": "video_url", "video_url": {"url": <url>}}``.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and 'bytes' in video:
        video_bytes = video['bytes']
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        return {
            "type": "video_url",
            "video_url": {
                "url": f"data:video/mp4;base64,{video_base64}"
            },
        }

    if isinstance(video, str):
        # "https://" must be recognised as a remote scheme too; otherwise an
        # HTTPS URL would be mangled into "file://https://...".
        video_url = (video if video.startswith(
            ("http://", "https://", "file://")) else f"file://{video}")
        return {"type": "video_url", "video_url": {"url": video_url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset):
                                     skip_min_output_len_check=output_len
                                     is not None):
                continue
            # Use the entry's image if present, else its video (ShareGPT4Video).
            if image_path := entry.get("image"): 
                mm_content = process_image(image_path) 
            elif video_path := entry.get("video"): 
                mm_content = process_video(video_path)
            else: 
                mm_content = None
            if enable_multimodal_chat: