feat(api): Eager chat template warmup to eliminate first-request latency (#30700)
Signed-off-by: Nathan Price <nathan@abridge.com>
parent e3fc374a9a
commit 05a83dc6ee
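A quick way to observe the latency this commit targets is to time the first versus the second chat completion against a running server. A minimal sketch, assuming a local vLLM OpenAI-compatible server on the default port and a placeholder model name "my-model" (both are deployment-specific assumptions, not part of this commit):

# Probe first- vs. second-request latency. URL and model name are
# placeholders for your deployment.
import time

import requests

URL = "http://localhost:8000/v1/chat/completions"
BODY = {
    "model": "my-model",
    "messages": [{"role": "user", "content": "hi"}],
    "max_completion_tokens": 1,
}

for i in range(2):
    t0 = time.perf_counter()
    requests.post(URL, json=BODY, timeout=60).raise_for_status()
    print(f"request {i + 1}: {(time.perf_counter() - t0) * 1000:.1f}ms")

Before this change, the first request pays the template-compilation cost; after it, the two timings should be much closer.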
@@ -1082,6 +1082,9 @@ async def init_app_state(
         if "generate" in supported_tasks
         else None
     )
+    # Warm up chat template processing to avoid first-request latency
+    if state.openai_serving_chat is not None:
+        await state.openai_serving_chat.warmup()
     state.openai_serving_completion = (
         OpenAIServingCompletion(
             engine_client,
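The shape of this hunk in isolation: run a one-time async warmup during app initialization, guarded by the same None check, so the cost is paid before any request arrives. A minimal sketch with stand-in names (ChatHandler, AppState, chat_enabled are all hypothetical, not vLLM's actual classes):

import asyncio


class ChatHandler:
    async def warmup(self) -> None:
        # Stand-in for template compilation, tokenizer fetch, etc.
        await asyncio.sleep(0)


class AppState:
    def __init__(self, chat_enabled: bool) -> None:
        # Mirrors the surrounding code: the chat handler is None when
        # "generate" is not among the supported tasks.
        self.openai_serving_chat = ChatHandler() if chat_enabled else None


async def init_app_state(state: AppState) -> None:
    # Warm up chat template processing to avoid first-request latency
    if state.openai_serving_chat is not None:
        await state.openai_serving_chat.warmup()


asyncio.run(init_app_state(AppState(chat_enabled=True)))

The guard matters because the chat handler is only constructed when "generate" is among the supported tasks; warmup must be skipped otherwise.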
@@ -162,6 +162,55 @@ class OpenAIServingChat(OpenAIServing):
         self.supports_code_interpreter = False
         self.python_tool = None
 
+    async def warmup(self) -> None:
+        """
+        Warm up chat template processing to avoid first-request latency.
+
+        This method triggers the Jinja2 template compilation and content
+        format detection that would otherwise happen on the first real
+        request, adding latency to it.
+        """
+        logger.info("Warming up chat template processing...")
+        start_time = time.perf_counter()
+
+        try:
+            # Get the tokenizer from the engine
+            tokenizer = await self.engine_client.get_tokenizer()
+
+            # Create a minimal dummy request
+            dummy_request = ChatCompletionRequest(
+                messages=[{"role": "user", "content": "warmup"}],
+                model=None,
+                max_completion_tokens=1,
+            )
+
+            # Call _preprocess_chat to trigger template compilation.
+            # This forces:
+            # 1. Chat template content format detection
+            # 2. Jinja2 template compilation
+            # 3. Tokenizer initialization for chat
+            await self._preprocess_chat(
+                dummy_request,
+                tokenizer,
+                dummy_request.messages,
+                chat_template=self.chat_template,
+                chat_template_content_format=self.chat_template_content_format,
+                add_generation_prompt=True,
+                continue_final_message=False,
+                tool_dicts=None,
+                documents=None,
+                chat_template_kwargs=None,
+                tool_parser=None,
+                add_special_tokens=False,
+            )
+
+            elapsed = (time.perf_counter() - start_time) * 1000
+            logger.info("Chat template warmup completed in %.1fms", elapsed)
+
+        except Exception:
+            # Log but don't fail server startup if warmup fails
+            logger.exception("Chat template warmup failed")
+
     async def create_chat_completion(
         self,
         request: ChatCompletionRequest,
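The cost the warmup pays down is mostly Jinja2 parse/compile work: once a template is compiled, per-request rendering is cheap. A minimal standalone sketch of that compile-once/render-many split, using a toy chat template as a stand-in (real chat templates ship with the model's tokenizer, not this string):

import time

import jinja2

TEMPLATE = (
    "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}\n{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>{% endif %}"
)

env = jinja2.Environment()

# "Warmup": parse and compile the template before any request arrives.
t0 = time.perf_counter()
compiled = env.from_string(TEMPLATE)
print(f"compile: {(time.perf_counter() - t0) * 1000:.2f}ms")

# Per-request work is now only rendering the compiled template.
t0 = time.perf_counter()
prompt = compiled.render(
    messages=[{"role": "user", "content": "warmup"}],
    add_generation_prompt=True,
)
print(f"render:  {(time.perf_counter() - t0) * 1000:.2f}ms")
print(prompt)

The except-and-log choice in the hunk above keeps this strictly best-effort: a broken template should surface as a logged warmup failure, not as a server that refuses to start.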