[Benchmarks] Add MMVU video dataset support and clean up deprecated datasets (#24719)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py 2025-09-17 11:29:43 +08:00 committed by GitHub
parent eeb135eb87
commit 5a411ef6c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 65 additions and 1290 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -37,6 +37,7 @@ th {
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |

View File

@@ -335,7 +335,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
("http://", "https://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
@@ -370,7 +370,7 @@ def process_video(video: Any) -> Mapping[str, Any]:
if isinstance(video, str):
video_url = (video if video.startswith(
("http://", "file://")) else f"file://{video}")
("http://", "https://", "file://")) else f"file://{video}")
return {"type": "video_url", "video_url": {"url": video_url}}
raise ValueError(
@@ -1405,6 +1405,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
dataset_class = VisionArenaDataset
args.hf_split = "train"
args.hf_subset = None
elif (
args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = MMVUDataset
args.hf_split = "validation"
args.hf_subset = None
elif (
args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
@@ -2053,6 +2060,61 @@ class VisionArenaDataset(HuggingFaceDataset):
return sampled_requests
class MMVUDataset(HuggingFaceDataset):
    """
    MMVU video question-answering dataset.

    https://huggingface.co/datasets/yale-nlp/MMVU

    Each item supplies a ``question``, a ``choices`` mapping (label -> answer
    text), and a ``video``; the prompt is the question followed by the
    flattened choices, and the video becomes the multimodal content.
    """

    # Default generation length when the caller does not specify one.
    DEFAULT_OUTPUT_LEN = 128
    # Maps a supported HF dataset path to a prompt-builder over one item.
    SUPPORTED_DATASET_PATHS = {
        "yale-nlp/MMVU":
        lambda x: x["question"] + " " + (
            " ".join(f"{k}.{v}" for k, v in x["choices"].items())
        ),
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Build up to ``num_requests`` video QA sample requests.

        Args:
            tokenizer: Tokenizer used only to measure the prompt length.
            num_requests: Number of requests to produce (oversampled if the
                dataset is smaller, unless ``no_oversample`` is set).
            output_len: Expected output length; defaults to
                ``DEFAULT_OUTPUT_LEN`` when ``None``.
            enable_multimodal_chat: Wrap the prompt in a multimodal chat
                message; ``prompt_len`` then no longer reflects the final
                prompt (counted from the request output instead).
            request_id_prefix: Prefix for each request's id.
            no_oversample: Disable oversampling of small datasets.

        Returns:
            A list of ``SampleRequest`` objects.

        Raises:
            ValueError: If ``self.hf_name`` is not a supported dataset path.
        """
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        # Hoisted out of the loop: the parser depends only on self.hf_name,
        # which is loop-invariant, so validate it once and fail fast instead
        # of re-resolving (and potentially re-raising) per item.
        parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
        if parser_fn is None:
            raise ValueError(f"Unsupported dataset path: {self.hf_name}")
        sampled_requests = []
        for i, item in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break
            prompt = parser_fn(item)
            mm_content = process_video(item["video"])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no
                # longer accurate and we will be using request output to
                # count the actual prompt len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(i),
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)
        return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------