feat(api): Eager chat template warmup to eliminate first-request latency (#30700)
Signed-off-by: Nathan Price <nathan@abridge.com>
parent e3fc374a9a
commit 05a83dc6ee
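For context on where the first-request latency comes from: chat template rendering compiles a Jinja2 template lazily on first use. A minimal illustration of the effect outside vLLM, using Hugging Face transformers (the model id is an arbitrary example, and the second-call speedup assumes transformers' compiled-template cache):

# Illustrative only, not part of this commit: the first
# apply_chat_template call compiles the Jinja chat template, so
# subsequent calls are much cheaper.
import time
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")
messages = [{"role": "user", "content": "warmup"}]

for label in ("first call (compiles template)", "second call (cached)"):
    t0 = time.perf_counter()
    tokenizer.apply_chat_template(messages, add_generation_prompt=True,
                                  tokenize=False)
    print(f"{label}: {(time.perf_counter() - t0) * 1000:.1f}ms")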
@@ -1082,6 +1082,9 @@ async def init_app_state(
         if "generate" in supported_tasks
         else None
     )
+    # Warm up chat template processing to avoid first-request latency
+    if state.openai_serving_chat is not None:
+        await state.openai_serving_chat.warmup()
     state.openai_serving_completion = (
         OpenAIServingCompletion(
             engine_client,
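Two details of the wiring above worth noting: warmup is awaited inline, so it extends startup rather than running in the background, and the None check skips it when "generate" is not a supported task and no chat handler exists. The same pattern in isolation, with stand-in classes in place of vLLM's:

# Stand-ins only; AppStateStub/ChatHandlerStub are not vLLM classes.
import asyncio

class ChatHandlerStub:
    async def warmup(self) -> None:
        print("compiling chat template once, at startup")

class AppStateStub:
    # None when the chat handler was not constructed
    openai_serving_chat: ChatHandlerStub | None = ChatHandlerStub()

async def init_app_state_like(state: AppStateStub) -> None:
    # Blocking here trades a little startup time for a fast first request
    if state.openai_serving_chat is not None:
        await state.openai_serving_chat.warmup()

asyncio.run(init_app_state_like(AppStateStub()))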
@@ -162,6 +162,55 @@ class OpenAIServingChat(OpenAIServing):
         self.supports_code_interpreter = False
         self.python_tool = None
 
+    async def warmup(self) -> None:
+        """
+        Warm up chat template processing to avoid first-request latency.
+
+        This method triggers the Jinja2 template compilation and content
+        format detection that would otherwise happen on the first real
+        request, adding latency to it.
+        """
+        logger.info("Warming up chat template processing...")
+        start_time = time.perf_counter()
+
+        try:
+            # Get the tokenizer from the engine
+            tokenizer = await self.engine_client.get_tokenizer()
+
+            # Create a minimal dummy request
+            dummy_request = ChatCompletionRequest(
+                messages=[{"role": "user", "content": "warmup"}],
+                model=None,
+                max_completion_tokens=1,
+            )
+
+            # Call _preprocess_chat to trigger template compilation.
+            # This forces:
+            # 1. Chat template content format detection
+            # 2. Jinja2 template compilation
+            # 3. Tokenizer initialization for chat
+            await self._preprocess_chat(
+                dummy_request,
+                tokenizer,
+                dummy_request.messages,
+                chat_template=self.chat_template,
+                chat_template_content_format=self.chat_template_content_format,
+                add_generation_prompt=True,
+                continue_final_message=False,
+                tool_dicts=None,
+                documents=None,
+                chat_template_kwargs=None,
+                tool_parser=None,
+                add_special_tokens=False,
+            )
+
+            elapsed = (time.perf_counter() - start_time) * 1000
+            logger.info("Chat template warmup completed in %.1fms", elapsed)
+
+        except Exception:
+            # Log but don't fail server startup if warmup fails
+            logger.exception("Chat template warmup failed")
+
     async def create_chat_completion(
         self,
         request: ChatCompletionRequest,
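Since warmup swallows all exceptions, a broken or missing chat template only costs the first real request its compile time instead of failing startup. A self-contained sketch of that failure path, using a stand-in class rather than OpenAIServingChat:

# Sketch only: ChatStub mimics warmup's try/except, not vLLM's API.
import asyncio
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("warmup_sketch")

class ChatStub:
    async def _preprocess_chat(self) -> None:
        raise RuntimeError("chat template failed to compile")

    async def warmup(self) -> None:
        start_time = time.perf_counter()
        try:
            await self._preprocess_chat()
            elapsed = (time.perf_counter() - start_time) * 1000
            logger.info("warmup completed in %.1fms", elapsed)
        except Exception:
            # Mirrors the commit: log the failure, keep the server booting
            logger.exception("warmup failed")

asyncio.run(ChatStub().warmup())  # logs the traceback, exits cleanly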
||||
Loading…
x
Reference in New Issue
Block a user