feat(api): Eager chat template warmup to eliminate first-request latency (#30700)

Signed-off-by: Nathan Price <nathan@abridge.com>
This commit is contained in:
Nathan Price 2025-12-17 18:01:29 -06:00 committed by GitHub
parent e3fc374a9a
commit 05a83dc6ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 52 additions and 0 deletions

View File

@ -1082,6 +1082,9 @@ async def init_app_state(
if "generate" in supported_tasks
else None
)
# Warm up chat template processing to avoid first-request latency
if state.openai_serving_chat is not None:
await state.openai_serving_chat.warmup()
state.openai_serving_completion = (
    OpenAIServingCompletion(
        engine_client,

View File

@ -162,6 +162,55 @@ class OpenAIServingChat(OpenAIServing):
self.supports_code_interpreter = False
self.python_tool = None
async def warmup(self) -> None:
    """
    Pre-exercise the chat-template pipeline so the first real request
    does not pay for it.

    Running one dummy request through ``_preprocess_chat`` forces the
    work that would otherwise happen lazily on the first call:
    chat-template content-format detection, Jinja2 template
    compilation, and tokenizer initialization for chat. Any failure is
    logged and swallowed — warmup must never prevent server startup.
    """
    logger.info("Warming up chat template processing...")
    t0 = time.perf_counter()
    try:
        # Tokenizer comes from the engine; awaiting it here also warms
        # that lookup path.
        tok = await self.engine_client.get_tokenizer()

        # Smallest request that exercises the template machinery.
        probe = ChatCompletionRequest(
            messages=[{"role": "user", "content": "warmup"}],
            model=None,
            max_completion_tokens=1,
        )

        # Drive the full preprocessing path once; the side effects
        # (compiled template, detected content format) are cached for
        # subsequent real requests.
        await self._preprocess_chat(
            probe,
            tok,
            probe.messages,
            chat_template=self.chat_template,
            chat_template_content_format=self.chat_template_content_format,
            add_generation_prompt=True,
            continue_final_message=False,
            tool_dicts=None,
            documents=None,
            chat_template_kwargs=None,
            tool_parser=None,
            add_special_tokens=False,
        )
        took_ms = (time.perf_counter() - t0) * 1000
        logger.info("Chat template warmup completed in %.1fms", took_ms)
    except Exception:
        # Best-effort: report the failure but keep the server booting.
        logger.exception("Chat template warmup failed")
async def create_chat_completion(
    self,
    request: ChatCompletionRequest,