mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-06 04:49:09 +08:00
[Benchmarks] Add video inputs to ShareGPTDataset. (#23199)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
This commit is contained in:
parent
14e2b0730b
commit
1630cc8d0f
@ -32,6 +32,14 @@ become available.
|
|||||||
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
|
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
|
||||||
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
|
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
|
||||||
</td>
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>ShareGPT4Video (Video)</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td>
|
||||||
|
<code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><strong>BurstGPT</strong></td>
|
<td><strong>BurstGPT</strong></td>
|
||||||
@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
|
|||||||
```bash
|
```bash
|
||||||
vllm bench serve \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--endpoint-type openai-chat \
|
--endpoint-type openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -246,7 +254,7 @@ vllm bench serve \
|
|||||||
```bash
|
```bash
|
||||||
vllm bench serve \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--endpoint-type openai-chat \
|
--endpoint-type openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -612,7 +620,7 @@ vllm bench serve \
|
|||||||
--prefix-repetition-prefix-len 512 \
|
--prefix-repetition-prefix-len 512 \
|
||||||
--prefix-repetition-suffix-len 128 \
|
--prefix-repetition-suffix-len 128 \
|
||||||
--prefix-repetition-num-prefixes 5 \
|
--prefix-repetition-num-prefixes 5 \
|
||||||
--prefix-repetition-output-len 128
|
--prefix-repetition-output-len 128
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \
|
|||||||
--endpoint /v1/chat/completions
|
--endpoint /v1/chat/completions
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Videos (ShareGPT4Video)
|
||||||
|
|
||||||
|
Start vLLM:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
|
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||||
|
--dtype bfloat16 \
|
||||||
|
--limit-mm-per-prompt '{"video": 1}' \
|
||||||
|
--allowed-local-media-path /path/to/sharegpt4video/videos
|
||||||
|
```
|
||||||
|
|
||||||
|
Send requests with videos:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai-chat \
|
||||||
|
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||||
|
--dataset-name sharegpt \
|
||||||
|
--dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--save-result \
|
||||||
|
--result-dir ~/vllm_benchmark_results \
|
||||||
|
--save-detailed \
|
||||||
|
--endpoint /v1/chat/completions
|
||||||
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|||||||
@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
       containing raw video data. - The bytes are base64-encoded and embedded
       in a ``data:video/mp4;base64,...`` URL. The ``video/mp4`` label is
       fixed; the actual container format is not inspected.

    2. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://". - Returns a dictionary with the video URL.

    Returns:
        A mapping of the form
        ``{"type": "video_url", "video_url": {"url": <url>}}``.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and "bytes" in video:
        video_bytes = video["bytes"]
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        return {
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
        }

    if isinstance(video, str):
        # Strings that already carry a URL scheme are passed through as-is.
        # "https://" must be listed explicitly: without it an HTTPS URL would
        # be mistaken for a local path and turned into "file://https://...".
        video_url = (
            video
            if video.startswith(("http://", "https://", "file://"))
            else f"file://{video}"
        )
        return {"type": "video_url", "video_url": {"url": video_url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Random Dataset Implementation (Synthetic Data)
|
# Random Dataset Implementation (Synthetic Data)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset):
|
|||||||
skip_min_output_len_check=output_len is not None,
|
skip_min_output_len_check=output_len is not None,
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
# TODO: Also support ShareGPT4Video.
|
|
||||||
if image_path := entry.get("image"):
|
if image_path := entry.get("image"):
|
||||||
mm_content = process_image(image_path)
|
mm_content = process_image(image_path)
|
||||||
|
elif video_path := entry.get("video"):
|
||||||
|
mm_content = process_video(video_path)
|
||||||
else:
|
else:
|
||||||
mm_content = None
|
mm_content = None
|
||||||
if enable_multimodal_chat:
|
if enable_multimodal_chat:
|
||||||
|
|||||||
@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
|||||||
"""
|
"""
|
||||||
Process a single image input and return a multimedia content dictionary.
|
Process a single image input and return a multimedia content dictionary.
|
||||||
|
|
||||||
Supports three input types:
|
Supports the following input types:
|
||||||
|
|
||||||
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
|
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
|
||||||
containing raw image data. - Loads the bytes as a PIL.Image.Image.
|
containing raw image data. - Loads the bytes as a PIL.Image.Image.
|
||||||
@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
|||||||
" or str or dictionary with raw image bytes.")
|
" or str or dictionary with raw image bytes.")
|
||||||
|
|
||||||
|
|
||||||
|
def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
       containing raw video data. - The bytes are base64-encoded and embedded
       in a ``data:video/mp4;base64,...`` URL. The ``video/mp4`` label is
       fixed; the actual container format is not inspected.

    2. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://". - Returns a dictionary with the video URL.

    Returns:
        A mapping of the form
        ``{"type": "video_url", "video_url": {"url": <url>}}``.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and 'bytes' in video:
        video_bytes = video['bytes']
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        return {
            "type": "video_url",
            "video_url": {
                "url": f"data:video/mp4;base64,{video_base64}"
            },
        }

    if isinstance(video, str):
        # Strings that already carry a URL scheme are passed through as-is.
        # "https://" must be listed explicitly: without it an HTTPS URL would
        # be mistaken for a local path and turned into "file://https://...".
        video_url = (video if video.startswith(
            ("http://", "https://", "file://")) else f"file://{video}")
        return {"type": "video_url", "video_url": {"url": video_url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Random Dataset Implementation (Synthetic Data)
|
# Random Dataset Implementation (Synthetic Data)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset):
|
|||||||
skip_min_output_len_check=output_len
|
skip_min_output_len_check=output_len
|
||||||
is not None):
|
is not None):
|
||||||
continue
|
continue
|
||||||
# TODO: Also support ShareGPT4Video.
|
|
||||||
if image_path := entry.get("image"):
|
if image_path := entry.get("image"):
|
||||||
mm_content = process_image(image_path)
|
mm_content = process_image(image_path)
|
||||||
|
elif video_path := entry.get("video"):
|
||||||
|
mm_content = process_video(video_path)
|
||||||
else:
|
else:
|
||||||
mm_content = None
|
mm_content = None
|
||||||
if enable_multimodal_chat:
|
if enable_multimodal_chat:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user