mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-21 19:15:02 +08:00
[Optimization] Cache chat template result when processor fails to be loaded (#25341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
80cad257da
commit
a31d353b71
@ -421,6 +421,51 @@ def resolve_mistral_chat_template(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Cache keyed by (tokenizer.name_or_path, trust_remote_code); a value of
# None records a load failure so it is not retried.
_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]()
"""
Used in `_try_get_processor_chat_template` to avoid calling
`cached_get_processor` again if the processor fails to be loaded.

This is needed because `lru_cache` does not cache when an exception happens.
"""
|
def _try_get_processor_chat_template(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    model_config: ModelConfig,
) -> Optional[str]:
    """Return the chat template exposed by the model's processor, if any.

    The outcome — the template string, or ``None`` when the processor has no
    template or fails to load — is memoized in ``_PROCESSOR_CHAT_TEMPLATES``
    so that a failing ``cached_get_processor`` call is not repeated
    (``lru_cache`` does not cache results when an exception is raised).
    """
    cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
    if cache_key in _PROCESSOR_CHAT_TEMPLATES:
        # Either a previously resolved template or a recorded failure (None).
        return _PROCESSOR_CHAT_TEMPLATES[cache_key]

    resolved: Optional[str] = None
    try:
        processor = cached_get_processor(
            tokenizer.name_or_path,
            processor_cls=(
                PreTrainedTokenizer,
                PreTrainedTokenizerFast,
                ProcessorMixin,
            ),
            trust_remote_code=model_config.trust_remote_code,
        )
        # Only a true ProcessorMixin carries a usable AutoProcessor template;
        # plain tokenizers returned here are handled by the caller instead.
        if isinstance(processor, ProcessorMixin) and hasattr(
                processor, "chat_template"):
            resolved = processor.chat_template
    except Exception:
        logger.debug(
            "Failed to load AutoProcessor chat template for %s",
            tokenizer.name_or_path,
            exc_info=True,
        )

    # Cache both successes and failures; a failure is stored as None.
    _PROCESSOR_CHAT_TEMPLATES[cache_key] = resolved
    return resolved
||||||
def resolve_hf_chat_template(
|
def resolve_hf_chat_template(
|
||||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||||
chat_template: Optional[str],
|
chat_template: Optional[str],
|
||||||
@ -434,28 +479,10 @@ def resolve_hf_chat_template(
|
|||||||
|
|
||||||
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
||||||
if tools is None:
|
if tools is None:
|
||||||
try:
|
chat_template = _try_get_processor_chat_template(tokenizer,
|
||||||
processor = cached_get_processor(
|
model_config)
|
||||||
tokenizer.name_or_path,
|
if chat_template is not None:
|
||||||
processor_cls=(
|
return chat_template
|
||||||
PreTrainedTokenizer,
|
|
||||||
PreTrainedTokenizerFast,
|
|
||||||
ProcessorMixin,
|
|
||||||
),
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
isinstance(processor, ProcessorMixin)
|
|
||||||
and hasattr(processor, "chat_template")
|
|
||||||
and processor.chat_template is not None
|
|
||||||
):
|
|
||||||
return processor.chat_template
|
|
||||||
except Exception:
|
|
||||||
logger.debug(
|
|
||||||
"Failed to load AutoProcessor chat template for %s",
|
|
||||||
tokenizer.name_or_path,
|
|
||||||
exc_info=True,
|
|
||||||
) # noqa: E501
|
|
||||||
|
|
||||||
# 3rd priority: AutoTokenizer chat template
|
# 3rd priority: AutoTokenizer chat template
|
||||||
try:
|
try:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user