[Benchmark] Support Infinity API (#26641)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent f0a30a067b
commit 5be7ca1b99
@@ -1584,7 +1584,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
     if dataset_class.IS_MULTIMODAL and not (
         args.backend in ("openai-chat", "openai-audio")
-        or "openai-embeddings-" in args.backend
+        or "embeddings-" in args.backend
     ):
         # multi-modal benchmark is only available on OpenAI Chat
         # endpoint-type.
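The broadened substring check lets the new Infinity CLIP backend pass the multi-modal gate alongside the existing OpenAI-style embedding backends. A minimal, self-contained sketch of the gating logic as changed above (the backend names below are just examples taken from the registry later in this diff):

# Sketch of the multi-modal backend gate in get_samples; not the benchmark's actual helper.
MULTIMODAL_CHAT_BACKENDS = ("openai-chat", "openai-audio")

def supports_multimodal(backend: str) -> bool:
    # Chat/audio backends, plus any backend whose name contains "embeddings-"
    # (e.g. "openai-embeddings-clip", "infinity-embeddings-clip").
    return backend in MULTIMODAL_CHAT_BACKENDS or "embeddings-" in backend

assert supports_multimodal("openai-embeddings-clip")
assert supports_multimodal("infinity-embeddings-clip")
assert not supports_multimodal("vllm")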
@@ -581,29 +581,6 @@ async def async_request_openai_embeddings_chat(
     )
 
 
-async def async_request_openai_embeddings_clip(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    if request_func_input.multi_modal_content:
-        # Image input
-        request_func_input.prompt = ""
-
-    # max_model_len=77 is too short for most datasets,
-    # so by default we truncate the prompt to max_model_len
-    if request_func_input.extra_body is None:
-        request_func_input.extra_body = {}
-    if "truncate_prompt_tokens" not in request_func_input.extra_body:
-        request_func_input.extra_body["truncate_prompt_tokens"] = -1
-
-    return await async_request_openai_embeddings_chat(
-        request_func_input,
-        session,
-        pbar=pbar,
-    )
-
-
 def _try_extract_request_idx(request_func_input: RequestFuncInput):
     if request_func_input.request_id:
         match = re.search(r"(\d+)$", request_func_input.request_id)
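For reference, the trailing-digit extraction used in the _try_extract_request_idx context above can be exercised on its own. A small sketch; the request id string is hypothetical, and the int conversion is assumed since the rest of the function body lies outside this hunk:

import re

# The helper looks for digits at the end of the request id,
# e.g. "benchmark-request-42" -> 42; no trailing digits means no match.
match = re.search(r"(\d+)$", "benchmark-request-42")
print(int(match.group(1)) if match else None)  # 42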
@@ -616,11 +593,20 @@ def _try_extract_request_idx(request_func_input: RequestFuncInput):
     return None
 
 
-async def async_request_openai_embeddings_vlm2vec(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
+def _preprocess_clip(request_func_input: RequestFuncInput):
+    if request_func_input.multi_modal_content:
+        # Image input
+        request_func_input.prompt = ""
+
+    # max_model_len=77 is too short for most datasets,
+    # so by default we truncate the prompt to max_model_len
+    if request_func_input.extra_body is None:
+        request_func_input.extra_body = {}
+    if "truncate_prompt_tokens" not in request_func_input.extra_body:
+        request_func_input.extra_body["truncate_prompt_tokens"] = -1
+
+
+def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
     if request_func_input.multi_modal_content:
         request_idx = _try_extract_request_idx(request_func_input)
 
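The CLIP- and VLM2Vec-specific preprocessing is now factored into plain helpers so it can be shared between the OpenAI-compatible and Infinity request functions. A hedged sketch of what _preprocess_clip does to a request; a SimpleNamespace stands in for RequestFuncInput and carries only the fields the helper touches:

from types import SimpleNamespace

# Stand-in for RequestFuncInput; the real dataclass has more fields.
req = SimpleNamespace(
    prompt="a photo of a cat",
    multi_modal_content={"type": "image_url", "image_url": {"url": "http://example.com/cat.png"}},
    extra_body=None,
)

_preprocess_clip(req)
assert req.prompt == ""  # image input: the text prompt is cleared
assert req.extra_body == {"truncate_prompt_tokens": -1}  # work around CLIP's short max_model_len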
@@ -637,6 +623,28 @@ async def async_request_openai_embeddings_vlm2vec(
         f"{request_func_input.prompt}"
     )
 
+
+async def async_request_openai_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_openai_embeddings_chat(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
+async def async_request_openai_embeddings_vlm2vec(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_vlm2vec(request_func_input)
+
     return await async_request_openai_embeddings_chat(
         request_func_input,
         session,
@@ -645,6 +653,61 @@ async def async_request_openai_embeddings_vlm2vec(
     )
 
 
+async def async_request_infinity_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "Infinity Embeddings API", "embeddings")
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+    }
+
+    if request_func_input.prompt:
+        payload["input"] = request_func_input.prompt
+    else:
+        mm_content = request_func_input.multi_modal_content
+        assert isinstance(mm_content, dict)
+
+        mm_type = mm_content["type"]
+        payload["input"] = mm_content[mm_type]["url"]
+        payload["modality"] = mm_type.split("_", 1)[0]
+
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_openai_embeddings(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+async def async_request_infinity_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_infinity_embeddings(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "vllm": async_request_openai_completions,
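For a concrete picture of the request body built above: with a text prompt the Infinity /embeddings payload stays OpenAI-compatible, while for multi-modal content the media URL becomes the input and a modality field is added. A hedged sketch of the two shapes; model names and the image URL are placeholders, and any extras merged by _update_payload_common are omitted:

# Text request (prompt present):
text_payload = {
    "model": "BAAI/bge-small-en-v1.5",  # placeholder model name
    "input": "The quick brown fox",
}

# Image request (multi_modal_content is a dict like the one below):
mm_content = {"type": "image_url", "image_url": {"url": "http://example.com/cat.png"}}
mm_type = mm_content["type"]
image_payload = {
    "model": "clip-model-name",  # placeholder model name
    "input": mm_content[mm_type]["url"],
    "modality": mm_type.split("_", 1)[0],  # "image_url" -> "image"
}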
@@ -655,6 +718,10 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "openai-embeddings-chat": async_request_openai_embeddings_chat,
     "openai-embeddings-clip": async_request_openai_embeddings_clip,
     "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec,
+    # Infinity embedding server: https://github.com/michaelfeil/infinity
+    "infinity-embeddings": async_request_infinity_embeddings,
+    "infinity-embeddings-clip": async_request_infinity_embeddings_clip,
+    # (Infinity embedding server does not support vlm2vec)
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
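With these registry entries in place, the benchmark resolves the new backends by name exactly like the existing OpenAI-compatible ones. A hedged sketch of the lookup; the actual CLI flag wiring that produces args.backend lives elsewhere in the benchmark code:

# Assumes the names registered above; a real run resolves args.backend the same way.
backend = "infinity-embeddings-clip"
request_func = ASYNC_REQUEST_FUNCS[backend]
# request_func is async_request_infinity_embeddings_clip: it preprocesses the request
# for CLIP and then POSTs to the Infinity /embeddings endpoint. The name also contains
# "embeddings-", so it passes the multi-modal gate in get_samples shown earlier.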