diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index d94951a0cffc8..bf656cf23de65 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -71,7 +71,11 @@ class EngineClient(ABC):
         truncate_prompt_tokens: int | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model."""
+        """Generate outputs for a request from a pooling model.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove this argument in v0.15.
+        """
         ...
 
     @abstractmethod
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a6ee241c41151..1cbe4718f2e5c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,6 +4,7 @@
 import asyncio
 import os
 import socket
 import time
+import warnings
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
 from typing import Any, cast
@@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
 
         The caller of generate() iterates the returned AsyncGenerator,
         returning the RequestOutput back to the caller.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove truncate_prompt_tokens in v0.15.
         """
 
         try:
@@ -641,9 +645,23 @@ class AsyncLLM(EngineClient):
 
         if tokenization_kwargs is None:
             tokenization_kwargs = {}
+
+        if truncate_prompt_tokens is not None:
+            warnings.warn(
+                "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
+                "is deprecated and will be removed in v0.15. "
+                "Please use `pooling_params.truncate_prompt_tokens` instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            # Honor the deprecated argument for the whole deprecation window:
+            # callers passing only `truncate_prompt_tokens` must keep truncating.
+            if pooling_params.truncate_prompt_tokens is None:
+                pooling_params.truncate_prompt_tokens = truncate_prompt_tokens
+
         _validate_truncation_size(
             self.model_config.max_model_len,
-            truncate_prompt_tokens,
+            pooling_params.truncate_prompt_tokens,
             tokenization_kwargs,
         )
 