mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-21 19:15:02 +08:00
[Optimization] Cache chat template result when processor fails to be loaded (#25341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
80cad257da
commit
a31d353b71
@ -421,6 +421,51 @@ def resolve_mistral_chat_template(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Cache keyed by (tokenizer.name_or_path, trust_remote_code); a value of
# None records a load failure so it is not retried.
_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]()
"""
Used in `_try_get_processor_chat_template` to avoid calling
`cached_get_processor` again if the processor fails to be loaded.

This is needed because `lru_cache` does not cache when an exception happens.
"""
|
def _try_get_processor_chat_template(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    model_config: ModelConfig,
) -> Optional[str]:
    """Return the chat template exposed by the model's processor, if any.

    The outcome — the template string, or ``None`` when the processor has no
    template or fails to load — is memoized in ``_PROCESSOR_CHAT_TEMPLATES``
    so that a failing ``cached_get_processor`` call is not repeated
    (``lru_cache`` does not cache results when an exception is raised).
    """
    cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
    if cache_key in _PROCESSOR_CHAT_TEMPLATES:
        # Either a previously resolved template or a recorded failure (None).
        return _PROCESSOR_CHAT_TEMPLATES[cache_key]

    resolved: Optional[str] = None
    try:
        processor = cached_get_processor(
            tokenizer.name_or_path,
            processor_cls=(
                PreTrainedTokenizer,
                PreTrainedTokenizerFast,
                ProcessorMixin,
            ),
            trust_remote_code=model_config.trust_remote_code,
        )
        # Only a true ProcessorMixin carries a usable AutoProcessor template;
        # plain tokenizers returned here are handled by the caller instead.
        if isinstance(processor, ProcessorMixin) and hasattr(
                processor, "chat_template"):
            resolved = processor.chat_template
    except Exception:
        logger.debug(
            "Failed to load AutoProcessor chat template for %s",
            tokenizer.name_or_path,
            exc_info=True,
        )

    # Cache both successes and failures; a failure is stored as None.
    _PROCESSOR_CHAT_TEMPLATES[cache_key] = resolved
    return resolved
||||||
def resolve_hf_chat_template(
|
def resolve_hf_chat_template(
|
||||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||||
chat_template: Optional[str],
|
chat_template: Optional[str],
|
||||||
@ -434,28 +479,10 @@ def resolve_hf_chat_template(
|
|||||||
|
|
||||||
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
||||||
if tools is None:
|
if tools is None:
|
||||||
try:
|
chat_template = _try_get_processor_chat_template(tokenizer,
|
||||||
processor = cached_get_processor(
|
model_config)
|
||||||
tokenizer.name_or_path,
|
if chat_template is not None:
|
||||||
processor_cls=(
|
return chat_template
|
||||||
PreTrainedTokenizer,
|
|
||||||
PreTrainedTokenizerFast,
|
|
||||||
ProcessorMixin,
|
|
||||||
),
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
isinstance(processor, ProcessorMixin)
|
|
||||||
and hasattr(processor, "chat_template")
|
|
||||||
and processor.chat_template is not None
|
|
||||||
):
|
|
||||||
return processor.chat_template
|
|
||||||
except Exception:
|
|
||||||
logger.debug(
|
|
||||||
"Failed to load AutoProcessor chat template for %s",
|
|
||||||
tokenizer.name_or_path,
|
|
||||||
exc_info=True,
|
|
||||||
) # noqa: E501
|
|
||||||
|
|
||||||
# 3rd priority: AutoTokenizer chat template
|
# 3rd priority: AutoTokenizer chat template
|
||||||
try:
|
try:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user