diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index c7229dbb8e90..1559ca2d9284 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -31,7 +31,7 @@ class RequestFuncInput:
     model_name: Optional[str] = None
     logprobs: Optional[int] = None
     extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict] = None
+    multi_modal_content: Optional[dict | list[dict]] = None
     ignore_eos: bool = False
     language: Optional[str] = None
 
@@ -364,7 +364,15 @@ async def async_request_openai_chat_completions(
     ) as session:
         content = [{"type": "text", "text": request_func_input.prompt}]
         if request_func_input.multi_modal_content:
-            content.append(request_func_input.multi_modal_content)
+            mm_content = request_func_input.multi_modal_content
+            if isinstance(mm_content, list):
+                content.extend(mm_content)
+            elif isinstance(mm_content, dict):
+                content.append(mm_content)
+            else:
+                raise TypeError(
+                    "multi_modal_content must be a dict or list[dict] for openai-chat"
+                )
         payload = {
             "model": request_func_input.model_name
             if request_func_input.model_name
@@ -491,7 +499,10 @@ async def async_request_openai_audio(
             buffer.seek(0)
             return buffer
 
-        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
+        mm_audio = request_func_input.multi_modal_content
+        if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
+            raise TypeError("multi_modal_content must be a dict containing 'audio'")
+        with to_bytes(*mm_audio["audio"]) as f:
             form = aiohttp.FormData()
             form.add_field("file", f, content_type="audio/wav")
             for key, value in payload.items():
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 1ad6cef7a9db..ea684f18a742 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -52,7 +52,7 @@ class SampleRequest:
     prompt: Union[str, Any]
     prompt_len: int
     expected_output_len: int
-    multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
+    multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
     lora_request: Optional[LoRARequest] = None
 
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 93b72211eb33..ae38caf7290b 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -263,7 +263,14 @@ async def benchmark(
         input_requests[0].multi_modal_data,
     )
 
-    assert test_mm_content is None or isinstance(test_mm_content, dict)
+    assert (
+        test_mm_content is None
+        or isinstance(test_mm_content, dict)
+        or (
+            isinstance(test_mm_content, list)
+            and all(isinstance(item, dict) for item in test_mm_content)
+        )
+    ), "multi_modal_data must be a dict or list[dict]"
     test_input = RequestFuncInput(
         model=model_id,
         model_name=model_name,
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 45b58035ebe3..4e8ac5162542 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -71,7 +71,9 @@ class SampleRequest:
     prompt: Union[str, Any]
     prompt_len: int
    expected_output_len: int
-    multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
+    multi_modal_data: Optional[
+        Union[MultiModalDataDict, dict, list[dict]]
+    ] = None
     lora_request: Optional[LoRARequest] = None
 
 
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 2d64cc115f00..47bc28877450 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -28,7 +28,7 @@ class RequestFuncInput:
     model_name: Optional[str] = None
     logprobs: Optional[int] = None
     extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict] = None
+    multi_modal_content: Optional[dict | list[dict]] = None
     ignore_eos: bool = False
     language: Optional[str] = None
 
@@ -172,7 +172,16 @@ async def async_request_openai_chat_completions(
 
         content = [{"type": "text", "text": request_func_input.prompt}]
         if request_func_input.multi_modal_content:
-            content.append(request_func_input.multi_modal_content)
+            mm_content = request_func_input.multi_modal_content
+            if isinstance(mm_content, list):
+                content.extend(mm_content)
+            elif isinstance(mm_content, dict):
+                content.append(mm_content)
+            else:
+                raise TypeError(
+                    "multi_modal_content must be a dict or list[dict] "
+                    "for openai-chat"
+                )
 
         payload = {
             "model": request_func_input.model_name
@@ -310,7 +319,10 @@ async def async_request_openai_audio(
             buffer.seek(0)
             return buffer
 
-        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
+        mm_audio = request_func_input.multi_modal_content
+        if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
+            raise TypeError("multi_modal_content must be a dict containing 'audio'")
+        with to_bytes(*mm_audio["audio"]) as f:
             form = aiohttp.FormData()
             form.add_field("file", f, content_type="audio/wav")
             for key, value in payload.items():
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 7cdf87cb4c3b..7bf04c753241 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -365,7 +365,14 @@ async def benchmark(
         input_requests[0].multi_modal_data,
     )
 
-    assert test_mm_content is None or isinstance(test_mm_content, dict)
+    assert (
+        test_mm_content is None
+        or isinstance(test_mm_content, dict)
+        or (
+            isinstance(test_mm_content, list)
+            and all(isinstance(item, dict) for item in test_mm_content)
+        )
+    ), "multi_modal_data must be a dict or list[dict]"
     test_input = RequestFuncInput(
         model=model_id,
         model_name=model_name,
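
For reference, a minimal standalone sketch of the content-merging behaviour this patch introduces for the openai-chat backend: a single dict keeps the old append path, while a list of dicts is extended onto the OpenAI-style chat "content" array, enabling multi-image requests. The helper name build_chat_content and the example image URLs are illustrative only, not part of the patch:

def build_chat_content(prompt, mm_content=None):
    # Start with the text part, as in async_request_openai_chat_completions.
    content = [{"type": "text", "text": prompt}]
    if mm_content:
        if isinstance(mm_content, list):
            content.extend(mm_content)   # new path: several multimodal items
        elif isinstance(mm_content, dict):
            content.append(mm_content)   # old path: a single multimodal item
        else:
            raise TypeError("multi_modal_content must be a dict or list[dict]")
    return content

# Both shapes produce a valid chat "content" array:
single = {"type": "image_url", "image_url": {"url": "http://example.com/a.png"}}
multi = [
    {"type": "image_url", "image_url": {"url": "http://example.com/a.png"}},
    {"type": "image_url", "image_url": {"url": "http://example.com/b.png"}},
]
assert len(build_chat_content("hi", single)) == 2  # text + one image
assert len(build_chat_content("hi", multi)) == 3   # text + two images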