mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-22 03:08:07 +08:00
[Benchmarks] Add MMVU video dataset support and clean up deprecated datasets (#24719)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent
eeb135eb87
commit
5a411ef6c4
File diff suppressed because it is too large
Load Diff
@ -37,6 +37,7 @@ th {
|
||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
|
||||
| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
|
||||
| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
|
||||
| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
|
||||
|
||||
@ -335,7 +335,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
||||
|
||||
if isinstance(image, str):
|
||||
image_url = (image if image.startswith(
|
||||
("http://", "file://")) else f"file://{image}")
|
||||
("http://", "https://", "file://")) else f"file://{image}")
|
||||
return {"type": "image_url", "image_url": {"url": image_url}}
|
||||
|
||||
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
|
||||
@ -370,7 +370,7 @@ def process_video(video: Any) -> Mapping[str, Any]:
|
||||
|
||||
if isinstance(video, str):
|
||||
video_url = (video if video.startswith(
|
||||
("http://", "file://")) else f"file://{video}")
|
||||
("http://", "https://", "file://")) else f"file://{video}")
|
||||
return {"type": "video_url", "video_url": {"url": video_url}}
|
||||
|
||||
raise ValueError(
|
||||
@ -1405,6 +1405,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
||||
dataset_class = VisionArenaDataset
|
||||
args.hf_split = "train"
|
||||
args.hf_subset = None
|
||||
elif (
|
||||
args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
|
||||
or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
|
||||
):
|
||||
dataset_class = MMVUDataset
|
||||
args.hf_split = "validation"
|
||||
args.hf_subset = None
|
||||
elif (
|
||||
args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
|
||||
or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
|
||||
@ -2053,6 +2060,61 @@ class VisionArenaDataset(HuggingFaceDataset):
|
||||
return sampled_requests
|
||||
|
||||
|
||||
class MMVUDataset(HuggingFaceDataset):
|
||||
"""
|
||||
MMVU Dataset.
|
||||
https://huggingface.co/datasets/yale-nlp/MMVU
|
||||
"""
|
||||
|
||||
DEFAULT_OUTPUT_LEN = 128
|
||||
SUPPORTED_DATASET_PATHS = {
|
||||
"yale-nlp/MMVU":
|
||||
lambda x: x["question"] + " " + (
|
||||
" ".join(f"{k}.{v}" for k, v in x["choices"].items())
|
||||
),
|
||||
}
|
||||
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
request_id_prefix: str = "",
|
||||
no_oversample: bool = False,
|
||||
**kwargs,
|
||||
) -> list:
|
||||
output_len = (output_len
|
||||
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
||||
sampled_requests = []
|
||||
for i, item in enumerate(self.data):
|
||||
if len(sampled_requests) >= num_requests:
|
||||
break
|
||||
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
|
||||
if parser_fn is None:
|
||||
raise ValueError(f"Unsupported dataset path: {self.hf_name}")
|
||||
prompt = parser_fn(item)
|
||||
mm_content = process_video(item["video"])
|
||||
prompt_len = len(tokenizer(prompt).input_ids)
|
||||
if enable_multimodal_chat:
|
||||
# Note: when chat is enabled the request prompt_len is no longer
|
||||
# accurate and we will be using request output to count the
|
||||
# actual prompt len
|
||||
prompt = self.apply_multimodal_chat_transformation(
|
||||
prompt, mm_content)
|
||||
sampled_requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
multi_modal_data=mm_content,
|
||||
request_id=request_id_prefix + str(i),
|
||||
))
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests,
|
||||
request_id_prefix, no_oversample)
|
||||
return sampled_requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Instruct Coder Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user