diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 4f427a31b9ee1..6e09c722bec71 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -527,6 +527,9 @@ async def async_request_openai_embeddings(
         if request_func_input.model_name
         else request_func_input.model,
         "input": request_func_input.prompt,
+        # Many embedding models have a short context length,
+        # this is to avoid dropping some of the requests.
+        "truncate_prompt_tokens": -1,
     }
 
     _update_payload_common(payload, request_func_input)
@@ -564,6 +567,9 @@ async def async_request_vllm_rerank(
         else request_func_input.model,
         "query": request_func_input.prompt[0],
         "documents": request_func_input.prompt[1:],
+        # Many reranker models have a short context length,
+        # this is to avoid dropping some of the requests.
+        "truncate_prompt_tokens": -1,
     }
 
     headers = {
@@ -599,6 +605,9 @@ async def async_request_openai_embeddings_chat(
         "messages": [
             {"role": "user", "content": content},
         ],
+        # Many embedding models have a short context length,
+        # this is to avoid dropping some of the requests.
+        "truncate_prompt_tokens": -1,
     }
 
     _update_payload_common(payload, request_func_input)
@@ -634,13 +643,6 @@ def _preprocess_clip(request_func_input: RequestFuncInput):
         # Image input
         request_func_input.prompt = ""
 
-    # max_model_len=77 is too short for most datasets,
-    # so by default we truncate the prompt to max_model_len
-    if request_func_input.extra_body is None:
-        request_func_input.extra_body = {}
-    if "truncate_prompt_tokens" not in request_func_input.extra_body:
-        request_func_input.extra_body["truncate_prompt_tokens"] = -1
-
 
 def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
     if request_func_input.multi_modal_content:
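
For context, this change moves the truncation default out of the CLIP-specific preprocessing and into the request payloads themselves: with "truncate_prompt_tokens": -1, vLLM truncates the prompt to the model's maximum length instead of rejecting over-long inputs. A minimal illustration of the resulting payload shape, not part of the diff; the server URL and model name below are placeholders:

# Illustrative sketch only: shows the payload shape the benchmark sends
# after this change. Base URL and model name are assumed placeholders.
import requests

payload = {
    "model": "my-embedding-model",  # placeholder model name
    "input": "a prompt that may exceed the model's max context length",
    # -1 asks the server to truncate the prompt to the model's max length
    # rather than return an error for over-long inputs.
    "truncate_prompt_tokens": -1,
}

resp = requests.post(
    "http://localhost:8000/v1/embeddings",  # assumed local vLLM server
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(len(resp.json()["data"][0]["embedding"]))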