diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 7ffc21905924..8e71a7bfb129 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1584,7 +1584,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
     if dataset_class.IS_MULTIMODAL and not (
         args.backend in ("openai-chat", "openai-audio")
-        or "openai-embeddings-" in args.backend
+        or "embeddings-" in args.backend
     ):
         # multi-modal benchmark is only available on OpenAI Chat
         # endpoint-type.
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 34dce5edb0c7..28146ce6200d 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -581,29 +581,6 @@ async def async_request_openai_embeddings_chat(
     )
 
 
-async def async_request_openai_embeddings_clip(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    if request_func_input.multi_modal_content:
-        # Image input
-        request_func_input.prompt = ""
-
-    # max_model_len=77 is too short for most datasets,
-    # so by default we truncate the prompt to max_model_len
-    if request_func_input.extra_body is None:
-        request_func_input.extra_body = {}
-    if "truncate_prompt_tokens" not in request_func_input.extra_body:
-        request_func_input.extra_body["truncate_prompt_tokens"] = -1
-
-    return await async_request_openai_embeddings_chat(
-        request_func_input,
-        session,
-        pbar=pbar,
-    )
-
-
 def _try_extract_request_idx(request_func_input: RequestFuncInput):
     if request_func_input.request_id:
         match = re.search(r"(\d+)$", request_func_input.request_id)
@@ -616,11 +593,20 @@ def _try_extract_request_idx(request_func_input: RequestFuncInput):
     return None
 
 
-async def async_request_openai_embeddings_vlm2vec(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
+def _preprocess_clip(request_func_input: RequestFuncInput):
+    if request_func_input.multi_modal_content:
+        # Image input
+        request_func_input.prompt = ""
+
+    # max_model_len=77 is too short for most datasets,
+    # so by default we truncate the prompt to max_model_len
+    if request_func_input.extra_body is None:
+        request_func_input.extra_body = {}
+    if "truncate_prompt_tokens" not in request_func_input.extra_body:
+        request_func_input.extra_body["truncate_prompt_tokens"] = -1
+
+
+def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
     if request_func_input.multi_modal_content:
         request_idx = _try_extract_request_idx(request_func_input)
 
@@ -637,6 +623,28 @@ async def async_request_openai_embeddings_vlm2vec(
             f"{request_func_input.prompt}"
         )
 
+
+async def async_request_openai_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_openai_embeddings_chat(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
+async def async_request_openai_embeddings_vlm2vec(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_vlm2vec(request_func_input)
+
     return await async_request_openai_embeddings_chat(
         request_func_input,
         session,
@@ -645,6 +653,61 @@ async def async_request_openai_embeddings_vlm2vec(
     )
 
 
+async def async_request_infinity_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "Infinity Embeddings API", "embeddings")
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+    }
+
+    if request_func_input.prompt:
+        payload["input"] = request_func_input.prompt
+    else:
+        mm_content = request_func_input.multi_modal_content
+        assert isinstance(mm_content, dict)
+
+        mm_type = mm_content["type"]
+        payload["input"] = mm_content[mm_type]["url"]
+        payload["modality"] = mm_type.split("_", 1)[0]
+
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_openai_embeddings(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+async def async_request_infinity_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_infinity_embeddings(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "vllm": async_request_openai_completions,
@@ -655,6 +718,10 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "openai-embeddings-chat": async_request_openai_embeddings_chat,
     "openai-embeddings-clip": async_request_openai_embeddings_clip,
     "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec,
+    # Infinity embedding server: https://github.com/michaelfeil/infinity
+    "infinity-embeddings": async_request_infinity_embeddings,
+    "infinity-embeddings-clip": async_request_infinity_embeddings_clip,
+    # (Infinity embedding server does not support vlm2vec)
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
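
Usage sketch (not part of the patch): the new "infinity-embeddings" key is resolved through ASYNC_REQUEST_FUNCS like any other backend. The snippet below drives a single request directly, assuming an Infinity server on its default port 7997; the model name is a placeholder, and the prompt_len/output_len fields are taken from the RequestFuncInput dataclass as it exists elsewhere in endpoint_request_func.py (only prompt, api_url, model, model_name, multi_modal_content, extra_body, and request_id are visible in this diff).

    import asyncio
    import aiohttp
    from vllm.benchmarks.lib.endpoint_request_func import (
        ASYNC_REQUEST_FUNCS,
        RequestFuncInput,
    )

    async def main() -> None:
        # Dispatch through the registry, exactly as the benchmark harness does.
        request_func = ASYNC_REQUEST_FUNCS["infinity-embeddings"]
        async with aiohttp.ClientSession() as session:
            output = await request_func(
                RequestFuncInput(
                    prompt="hello world",
                    # _validate_api_url() requires the URL to end in "embeddings";
                    # 7997 is Infinity's default port (assumption).
                    api_url="http://localhost:7997/embeddings",
                    prompt_len=2,
                    output_len=0,
                    model="BAAI/bge-small-en-v1.5",  # placeholder model name
                ),
                session,
            )
        print(output.success)

    asyncio.run(main())

Note that text-only requests send the prompt as "input", while multimodal requests send the content URL plus a "modality" field derived from the content type, matching Infinity's embeddings API.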