[Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
2026-07-19 07:27:09 +08:00 · 2025-12-20 02:29:31 -08:00 · 2025-12-20 02:29:31 -08:00 · 1501a4070e
commit 1501a4070e
parent ff2168bca3
2 changed files with 20 additions and 2 deletions
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -71,7 +71,11 @@ class EngineClient(ABC):
        truncate_prompt_tokens: int | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model."""
+        """Generate outputs for a request from a pooling model.
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove this argument in v0.15.
+        """
        ...
    @abstractmethod
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,6 +4,7 @@ import asyncio
 import os
 import socket
 import time
+import warnings
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
 from typing import Any, cast
@@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
        The caller of generate() iterates the returned AsyncGenerator,
        returning the RequestOutput back to the caller.
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove truncate_prompt_tokens in v0.15.
        """
        try:
@@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):
            if tokenization_kwargs is None:
                tokenization_kwargs = {}
+            if truncate_prompt_tokens is not None:
+                warnings.warn(
+                    "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
+                    "is deprecated and will be removed in v0.15. "
+                    "Please use `pooling_params.truncate_prompt_tokens` instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
            _validate_truncation_size(
                self.model_config.max_model_len,
-                truncate_prompt_tokens,
+                pooling_params.truncate_prompt_tokens,
                tokenization_kwargs,
            )