[Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
This commit is contained in:
Jeffrey Wang 2025-12-20 02:29:31 -08:00 committed by GitHub
parent ff2168bca3
commit 1501a4070e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 2 deletions

View File

@ -71,7 +71,11 @@ class EngineClient(ABC):
truncate_prompt_tokens: int | None = None, truncate_prompt_tokens: int | None = None,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]: ) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from a pooling model.""" """Generate outputs for a request from a pooling model.
NOTE: truncate_prompt_tokens is deprecated in v0.14; use
`pooling_params.truncate_prompt_tokens` instead.
TODO: Remove this argument in v0.15.
"""
... ...
@abstractmethod @abstractmethod

View File

@ -4,6 +4,7 @@ import asyncio
import os import os
import socket import socket
import time import time
import warnings
from collections.abc import AsyncGenerator, Iterable, Mapping from collections.abc import AsyncGenerator, Iterable, Mapping
from copy import copy from copy import copy
from typing import Any, cast from typing import Any, cast
@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
The caller of generate() iterates the returned AsyncGenerator, The caller of generate() iterates the returned AsyncGenerator,
returning the RequestOutput back to the caller. returning the RequestOutput back to the caller.
NOTE: truncate_prompt_tokens is deprecated in v0.14; use
`pooling_params.truncate_prompt_tokens` instead.
TODO: Remove truncate_prompt_tokens in v0.15.
""" """
try: try:
@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):
if tokenization_kwargs is None: if tokenization_kwargs is None:
tokenization_kwargs = {} tokenization_kwargs = {}
if truncate_prompt_tokens is not None:
warnings.warn(
"The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
"is deprecated and will be removed in v0.15. "
"Please use `pooling_params.truncate_prompt_tokens` instead.",
DeprecationWarning,
stacklevel=2,
)
_validate_truncation_size( _validate_truncation_size(
self.model_config.max_model_len, self.model_config.max_model_len,
truncate_prompt_tokens, pooling_params.truncate_prompt_tokens,
tokenization_kwargs, tokenization_kwargs,
) )