[Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
This commit is contained in:
Jeffrey Wang 2025-12-20 02:29:31 -08:00 committed by GitHub
parent ff2168bca3
commit 1501a4070e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 2 deletions

View File

@@ -71,7 +71,11 @@ class EngineClient(ABC):
truncate_prompt_tokens: int | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from a pooling model."""
"""Generate outputs for a request from a pooling model.
NOTE: truncate_prompt_tokens is deprecated in v0.14.
TODO: Remove this argument in v0.15.
"""
...
@abstractmethod

View File

@@ -4,6 +4,7 @@ import asyncio
import os
import socket
import time
import warnings
from collections.abc import AsyncGenerator, Iterable, Mapping
from copy import copy
from typing import Any, cast
@@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
The caller of generate() iterates the returned AsyncGenerator,
returning the RequestOutput back to the caller.
NOTE: truncate_prompt_tokens is deprecated in v0.14.
TODO: Remove truncate_prompt_tokens in v0.15.
"""
try:
@@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):
if tokenization_kwargs is None:
tokenization_kwargs = {}
if truncate_prompt_tokens is not None:
warnings.warn(
"The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
"is deprecated and will be removed in v0.15. "
"Please use `pooling_params.truncate_prompt_tokens` instead.",
DeprecationWarning,
stacklevel=2,
)
_validate_truncation_size(
self.model_config.max_model_len,
truncate_prompt_tokens,
pooling_params.truncate_prompt_tokens,
tokenization_kwargs,
)