From 993d3d122b114cf93bf423fe0b4410ac493d9c45 Mon Sep 17 00:00:00 2001
From: Chenheli Hua
Date: Fri, 15 Aug 2025 11:23:06 -0700
Subject: [PATCH] [Benchmarks] Include image data when ShareGPT4V dataset is used. (#22955)

Signed-off-by: Chenheli Hua
---
 benchmarks/README.md            | 49 +++++++++++++++++++++++++++++++++
 benchmarks/benchmark_dataset.py |  8 +++++-
 vllm/benchmarks/datasets.py     |  8 +++++-
 3 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index d6442a4fc3872..caff8f0342141 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -22,6 +22,17 @@ become available.
       <td>✅</td>
       <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
     </tr>
+    <tr>
+      <td>ShareGPT4V (Image)</td>
+      <td>✅</td>
+      <td>✅</td>
+      <td>
+        <code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
+        <br>
+        Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
+        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
+      </td>
+    </tr>
     <tr>
       <td>BurstGPT</td>
       <td>✅</td>
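The dataset-loader changes later in this patch read each entry's optional `image` field, so the extracted COCO images have to sit at the relative paths recorded in the downloaded JSON. Below is a small illustrative check (not part of the patch) for that layout; the JSON file name and the image root are assumptions to adjust to wherever the files were downloaded and extracted.

```python
# Illustrative sanity check (not part of this patch): confirm that every
# entry's optional "image" path in the ShareGPT4V JSON resolves to a file
# under the directory where train2017.zip was extracted. Paths are assumed.
import json
from pathlib import Path

dataset_json = Path("sharegpt4v_instruct_gpt4-vision_cap100k.json")
image_root = Path(".")  # directory expected to contain e.g. coco/train2017/

entries = json.loads(dataset_json.read_text())
missing = [
    entry["image"]
    for entry in entries
    if "image" in entry and not (image_root / entry["image"]).is_file()
]
print(f"{len(entries)} entries checked, {len(missing)} missing image files")
if missing:
    print("first few missing:", missing[:5])
```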
@@ -616,3 +627,41 @@ python3 benchmarks/benchmark_prioritization.py \
 ```
+
+## 👁️ Example - Multi-Modal Benchmark
+
+<details>
+<summary>Show more</summary>
+
+Benchmark the performance of multi-modal requests in vLLM.
+
+### Images (ShareGPT4V)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"image": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4v/images
+```
+
+Send requests with images:
+
+```bash
+python benchmarks/benchmark_serving.py \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completions
+```
+
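Before running the full benchmark sweep, it can help to confirm that the server accepts a single image request on `/v1/chat/completions`. The snippet below is an illustrative smoke test, not part of the patch; it assumes the server started above is listening on the default `localhost:8000` and that a JPEG exists at the path shown (the COCO file name is made up).

```python
# Minimal smoke test (not part of this patch): send one image chat request to
# the vLLM OpenAI-compatible server started above. Host, port, and the image
# path are assumptions; adjust them to your setup.
import base64
from pathlib import Path

import requests

image_path = Path("/path/to/sharegpt4v/images/coco/train2017/000000000009.jpg")
image_b64 = base64.b64encode(image_path.read_bytes()).decode()

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen2.5-VL-7B-Instruct",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ],
        }],
        "max_tokens": 64,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```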
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index ea684f18a7421..572292a5aca46 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -430,14 +430,20 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None, ): continue + # TODO: Also support ShareGPT4Video. + if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation(prompt, None) + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, ) ) self.maybe_oversample_requests(samples, num_requests) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 4e8ac5162542f..5299dcf54b395 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -454,15 +454,21 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue + # TODO: Also support ShareGPT4Video. + if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: prompt = self.apply_multimodal_chat_transformation( - prompt, None) + prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, )) self.maybe_oversample_requests(samples, num_requests) return samples