From 1501a4070ec1f5e3f8f1ffd0099dc32d85a9ad98 Mon Sep 17 00:00:00 2001
From: Jeffrey Wang <jeffreywang@anyscale.com>
Date: Sat, 20 Dec 2025 02:29:31 -0800
Subject: [PATCH] [Bugfix] Read truncate_prompt_tokens from pooling_params in
 AsyncLLM.encode() (#31013)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
---
 vllm/engine/protocol.py     |  6 +++++-
 vllm/v1/engine/async_llm.py | 16 +++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index d94951a0cffc8..bf656cf23de65 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -71,7 +71,11 @@ class EngineClient(ABC):
         truncate_prompt_tokens: int | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model."""
+        """Generate outputs for a request from a pooling model.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove this argument in v0.15.
+        """
         ...
 
     @abstractmethod
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a6ee241c41151..1cbe4718f2e5c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,6 +4,7 @@ import asyncio
 import os
 import socket
 import time
+import warnings
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
 from typing import Any, cast
@@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
 
         The caller of generate() iterates the returned AsyncGenerator,
         returning the RequestOutput back to the caller.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove truncate_prompt_tokens in v0.15.
         """
 
         try:
@@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):
 
             if tokenization_kwargs is None:
                 tokenization_kwargs = {}
+
+            if truncate_prompt_tokens is not None:
+                warnings.warn(
+                    "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
+                    "is deprecated and will be removed in v0.15. "
+                    "Please use `pooling_params.truncate_prompt_tokens` instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
             _validate_truncation_size(
                 self.model_config.max_model_len,
-                truncate_prompt_tokens,
+                pooling_params.truncate_prompt_tokens,
                 tokenization_kwargs,
             )