[Benchmarks] Add MMVU video dataset support and clean up deprecated datasets (#24719)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py 2025-09-17 11:29:43 +08:00 committed by GitHub
parent eeb135eb87
commit 5a411ef6c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 65 additions and 1290 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -37,6 +37,7 @@ th {
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |

View File

@@ -335,7 +335,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
("http://", "https://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
@@ -370,7 +370,7 @@ def process_video(video: Any) -> Mapping[str, Any]:
if isinstance(video, str):
video_url = (video if video.startswith(
("http://", "file://")) else f"file://{video}")
("http://", "https://", "file://")) else f"file://{video}")
return {"type": "video_url", "video_url": {"url": video_url}}
raise ValueError(
@@ -1405,6 +1405,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
dataset_class = VisionArenaDataset
args.hf_split = "train"
args.hf_subset = None
elif (
args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = MMVUDataset
args.hf_split = "validation"
args.hf_subset = None
elif (
args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
@@ -2053,6 +2060,61 @@ class VisionArenaDataset(HuggingFaceDataset):
return sampled_requests
class MMVUDataset(HuggingFaceDataset):
    """
    MMVU video question-answering dataset.

    https://huggingface.co/datasets/yale-nlp/MMVU

    Each item supplies a ``question``, a ``choices`` mapping (label -> answer
    text), and a ``video``; the prompt is the question followed by the
    flattened choices, and the video becomes the multimodal content.
    """

    # Default generation length when the caller does not specify one.
    DEFAULT_OUTPUT_LEN = 128
    # Maps a supported HF dataset path to a prompt-builder over one item.
    SUPPORTED_DATASET_PATHS = {
        "yale-nlp/MMVU":
        lambda x: x["question"] + " " + (
            " ".join(f"{k}.{v}" for k, v in x["choices"].items())
        ),
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Build up to ``num_requests`` video QA sample requests.

        Args:
            tokenizer: Tokenizer used only to measure the prompt length.
            num_requests: Number of requests to produce (oversampled if the
                dataset is smaller, unless ``no_oversample`` is set).
            output_len: Expected output length; defaults to
                ``DEFAULT_OUTPUT_LEN`` when ``None``.
            enable_multimodal_chat: Wrap the prompt in a multimodal chat
                message; ``prompt_len`` then no longer reflects the final
                prompt (counted from the request output instead).
            request_id_prefix: Prefix for each request's id.
            no_oversample: Disable oversampling of small datasets.

        Returns:
            A list of ``SampleRequest`` objects.

        Raises:
            ValueError: If ``self.hf_name`` is not a supported dataset path.
        """
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        # Hoisted out of the loop: the parser depends only on self.hf_name,
        # which is loop-invariant, so validate it once and fail fast instead
        # of re-resolving (and potentially re-raising) per item.
        parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
        if parser_fn is None:
            raise ValueError(f"Unsupported dataset path: {self.hf_name}")
        sampled_requests = []
        for i, item in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break
            prompt = parser_fn(item)
            mm_content = process_video(item["video"])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no
                # longer accurate and we will be using request output to
                # count the actual prompt len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(i),
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)
        return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------