diff --git a/benchmarks/README.md b/benchmarks/README.md
index 69d32e222819b..176b40212978f 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -32,6 +32,14 @@ become available.
Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
wget http://images.cocodataset.org/zips/train2017.zip
+
+
+| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
| BurstGPT |
@@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```bash
vllm bench serve \
--backend openai-chat \
- --endpoint-type openai-chat \
+ --endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@@ -246,7 +254,7 @@ vllm bench serve \
```bash
vllm bench serve \
--backend openai-chat \
- --endpoint-type openai-chat \
+ --endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@@ -612,7 +620,7 @@ vllm bench serve \
--prefix-repetition-prefix-len 512 \
--prefix-repetition-suffix-len 128 \
--prefix-repetition-num-prefixes 5 \
- --prefix-repetition-output-len 128
+ --prefix-repetition-output-len 128
```
@@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \
--endpoint /v1/chat/completion
```
+### Videos (ShareGPT4Video)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
+ --dtype bfloat16 \
+ --limit-mm-per-prompt '{"video": 1}' \
+ --allowed-local-media-path /path/to/sharegpt4video/videos
+```
+
+Send requests with videos:
+
+```bash
+python benchmarks/benchmark_serving.py \
+ --backend openai-chat \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
+ --dataset-name sharegpt \
+ --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
+ --num-prompts 100 \
+ --save-result \
+ --result-dir ~/vllm_benchmark_results \
+ --save-detailed \
+    --endpoint /v1/chat/completions
+```
+
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index c62934ed94cb5..e1a856026c4ae 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
)
+def process_video(video: Any) -> Mapping[str, Any]:
+ """
+ Process a single video input and return a multimedia content dictionary.
+
+ Supports the following input types:
+
+ 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
+ containing raw video data.
+
+ 2. String input: - Treats the string as a URL or local file path. -
+ Prepends "file://" if the string doesn't start with "http://" or
+       "file://". - Returns a dictionary with the video URL.
+
+ Raises:
+ ValueError: If the input is not a supported type.
+ """
+ if isinstance(video, dict) and "bytes" in video:
+ video_bytes = video["bytes"]
+ video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+ return {
+ "type": "video_url",
+ "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+ }
+
+ if isinstance(video, str):
+ video_url = (
+ video if video.startswith(("http://", "file://")) else f"file://{video}"
+ )
+ return {"type": "video_url", "video_url": {"url": video_url}}
+
+ raise ValueError(
+ f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
+ )
+
+
# -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data)
# -----------------------------------------------------------------------------
@@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len is not None,
):
continue
- # TODO: Also support ShareGPT4Video.
if image_path := entry.get("image"):
mm_content = process_image(image_path)
+ elif video_path := entry.get("video"):
+ mm_content = process_video(video_path)
else:
mm_content = None
if enable_multimodal_chat:
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 3532a083fb4a1..f4fbfad2d1d5d 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
- Supports three input types:
+ Supports the following input types:
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
@@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
" or str or dictionary with raw image bytes.")
+def process_video(video: Any) -> Mapping[str, Any]:
+ """
+ Process a single video input and return a multimedia content dictionary.
+
+ Supports the following input types:
+
+ 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
+ containing raw video data.
+
+ 2. String input: - Treats the string as a URL or local file path. -
+ Prepends "file://" if the string doesn't start with "http://" or
+       "file://". - Returns a dictionary with the video URL.
+
+ Raises:
+ ValueError: If the input is not a supported type.
+ """
+ if isinstance(video, dict) and 'bytes' in video:
+ video_bytes = video['bytes']
+ video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+ return {
+ "type": "video_url",
+ "video_url": {
+ "url": f"data:video/mp4;base64,{video_base64}"
+ },
+ }
+
+ if isinstance(video, str):
+ video_url = (video if video.startswith(
+ ("http://", "file://")) else f"file://{video}")
+ return {"type": "video_url", "video_url": {"url": video_url}}
+
+ raise ValueError(
+ f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
+ )
+
# -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data)
# -----------------------------------------------------------------------------
@@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len
is not None):
continue
- # TODO: Also support ShareGPT4Video.
if image_path := entry.get("image"):
mm_content = process_image(image_path)
+ elif video_path := entry.get("video"):
+ mm_content = process_video(video_path)
else:
mm_content = None
if enable_multimodal_chat: