From 1501a4070ec1f5e3f8f1ffd0099dc32d85a9ad98 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sat, 20 Dec 2025 02:29:31 -0800 Subject: [PATCH] [Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013) Signed-off-by: Jeffrey Wang --- vllm/engine/protocol.py | 6 +++++- vllm/v1/engine/async_llm.py | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index d94951a0cffc8..bf656cf23de65 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -71,7 +71,11 @@ class EngineClient(ABC): truncate_prompt_tokens: int | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model.""" + """Generate outputs for a request from a pooling model. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove this argument in v0.15. + """ ... @abstractmethod diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a6ee241c41151..1cbe4718f2e5c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -4,6 +4,7 @@ import asyncio import os import socket import time +import warnings from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, cast @@ -627,6 +628,9 @@ class AsyncLLM(EngineClient): The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove truncate_prompt_tokens in v0.15. """ try: @@ -641,9 +645,19 @@ class AsyncLLM(EngineClient): if tokenization_kwargs is None: tokenization_kwargs = {} + + if truncate_prompt_tokens is not None: + warnings.warn( + "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` " + "is deprecated and will be removed in v0.15. " + "Please use `pooling_params.truncate_prompt_tokens` instead.", + DeprecationWarning, + stacklevel=2, + ) + _validate_truncation_size( self.model_config.max_model_len, - truncate_prompt_tokens, + pooling_params.truncate_prompt_tokens, tokenization_kwargs, )