mirror of https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 06:55:01 +08:00

[Frontend] Automatic detection of chat content format from AST (#9919)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

commit 32e46e000f
parent 4f168f69a3
@@ -172,12 +172,20 @@ completion = client.chat.completions.create(
     ]
 )
 ```
-Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like
-`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which
-format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify
-between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match
-this, unless explicitly specified.
+Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like
+`meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the
+request. vLLM provides best-effort support to detect this automatically, which is logged as a string like
+*"Detected the chat template content format to be..."*, and internally converts incoming requests to match
+the detected format, which can be one of:
+
+- `"string"`: A string.
+  - Example: `"Hello world"`
+- `"openai"`: A list of dictionaries, similar to OpenAI schema.
+  - Example: `[{"type": "text", "text": "Hello world!"}]`
+
+If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument
+to override which format to use.

 ## Command line arguments for the server
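The two content formats described in the documentation above differ only in how `messages[].content` is written. Below is a minimal illustrative sketch (not part of this commit) assuming an OpenAI-compatible vLLM server is already running locally; the base URL and API key are placeholders, and the model name simply echoes the Llama Guard example from the docs.

```python
from openai import OpenAI

# Placeholder endpoint; adjust to your own deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# "string" content format: `content` is a plain string.
client.chat.completions.create(
    model="meta-llama/Llama-Guard-3-1B",
    messages=[{"role": "user", "content": "Hello world!"}],
)

# "openai" content format: `content` is a list of typed parts.
client.chat.completions.create(
    model="meta-llama/Llama-Guard-3-1B",
    messages=[{
        "role": "user",
        "content": [{"type": "text", "text": "Hello world!"}],
    }],
)
```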
@@ -26,7 +26,6 @@ class MockModelConfig:
     tokenizer = MODEL_NAME
     trust_remote_code = False
     tokenizer_mode = "auto"
-    chat_template_text_format = "string"
     max_model_len = 100
     tokenizer_revision = None
     multimodal_config = MultiModalConfig()
@@ -49,6 +48,7 @@ async def _async_serving_chat_init():
                                     BASE_MODEL_PATHS,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
+                                    chat_template_content_format="auto",
                                     lora_modules=None,
                                     prompt_adapters=None,
                                     request_logger=None)
@@ -70,6 +70,7 @@ def test_serving_chat_should_set_correct_max_tokens():
                                     BASE_MODEL_PATHS,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
+                                    chat_template_content_format="auto",
                                     lora_modules=None,
                                     prompt_adapters=None,
                                     request_logger=None)
@@ -6,15 +6,24 @@ from PIL import Image

 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import (parse_chat_messages,
-                                         parse_chat_messages_futures)
+from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
+                                         parse_chat_messages,
+                                         parse_chat_messages_futures,
+                                         resolve_chat_template_content_format)
 from vllm.entrypoints.llm import apply_hf_chat_template
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup

+from ..utils import VLLM_PATH
+
+EXAMPLES_DIR = VLLM_PATH / "examples"
+
 PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
+ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_3"
+QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
 MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"


 @pytest.fixture(scope="function")
@@ -26,7 +35,6 @@ def phi3v_model_config():
         trust_remote_code=True,
         dtype="bfloat16",
         seed=0,
-        chat_template_text_format="string",
         limit_mm_per_prompt={
             "image": 2,
         })
@@ -94,7 +102,8 @@ def test_parse_chat_messages_single_image(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
         "content": [{
@@ -106,7 +115,11 @@ def test_parse_chat_messages_single_image(
             "type": "text",
             "text": "What's in the image?"
         }]
-    }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )

     assert conversation == [{
         "role": "user",
@@ -121,7 +134,8 @@ async def test_parse_chat_messages_single_image_async(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_future = parse_chat_messages_futures([{
+    conversation, mm_future = parse_chat_messages_futures(
+        [{
         "role":
         "user",
         "content": [{
@@ -133,7 +147,11 @@ async def test_parse_chat_messages_single_image_async(
             "type": "text",
             "text": "What's in the image?"
         }]
-    }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )

     assert conversation == [{
         "role": "user",
@@ -147,7 +165,8 @@ def test_parse_chat_messages_multiple_images(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
         "content": [{
@@ -164,7 +183,11 @@ def test_parse_chat_messages_multiple_images(
             "type": "text",
             "text": "What's in these images?"
         }]
-    }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )

     assert conversation == [{
         "role":
@@ -181,7 +204,8 @@ async def test_parse_chat_messages_multiple_images_async(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_future = parse_chat_messages_futures([{
+    conversation, mm_future = parse_chat_messages_futures(
+        [{
         "role":
         "user",
         "content": [{
@@ -198,7 +222,11 @@ async def test_parse_chat_messages_multiple_images_async(
             "type": "text",
             "text": "What's in these images?"
         }]
-    }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )

     assert conversation == [{
         "role":
@@ -214,7 +242,8 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
         "content": [{
@@ -233,8 +262,11 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
             "text":
             "What's in <|image_1|> and how does it compare to <|image_2|>?"
         }]
-    }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
     assert conversation == [{
         "role":
         "user",
@@ -249,26 +281,35 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
-        "content": [{
+        "content": [
+            {
             "type": "image_url",
             "image_url": {
                 "url": image_url
             }
-        }, {
+            },
+            {
             "type": "image_url",
             "image_url": {
                 "url": image_url
             }
-        }, {
+            },
+            {
             "type":
             "text",
             "text":
-            "What's in <|image_1|> and how does it compare to the other one?"
-        }]
-    }], phi3v_model_config, phi3v_tokenizer)
+            "What's in <|image_1|> and how does it compare to the other one?"  # noqa: E501
+            }
+        ]
+    }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )

     assert conversation == [{
         "role":
@@ -285,7 +326,8 @@ def test_parse_chat_messages_multiple_images_across_messages(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
         "content": [{
@@ -312,7 +354,11 @@ def test_parse_chat_messages_multiple_images_across_messages(
             "type": "text",
             "text": "What about this one?"
         }]
-    }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )

     assert conversation == [
         {
@@ -335,7 +381,6 @@ def test_parse_chat_messages_context_text_format(
     phi3v_model_config,
     phi3v_tokenizer,
 ):
-    phi3v_model_config.chat_template_text_format = "openai"
     conversation, mm_data = parse_chat_messages(
         [{
             "role": "user",
@@ -349,7 +394,11 @@ def test_parse_chat_messages_context_text_format(
         }, {
             "role": "user",
             "content": "What about this one?"
-        }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="openai",
+    )

     assert conversation == [
         {
@@ -389,7 +438,8 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
             ValueError,
             match="At most 2 image\\(s\\) may be provided in one request\\."
     ):
-        parse_chat_messages([{
+        parse_chat_messages(
+            [{
             "role":
             "user",
             "content": [{
@@ -411,7 +461,11 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
                 "type": "text",
                 "text": "What's in these images?"
             }]
-        }], phi3v_model_config, phi3v_tokenizer)
+            }],
+            phi3v_model_config,
+            phi3v_tokenizer,
+            content_format="string",
+        )


 def test_parse_chat_messages_rejects_too_many_images_across_messages(
@@ -427,7 +481,8 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
             ValueError,
             match="At most 2 image\\(s\\) may be provided in one request\\."
     ):
-        parse_chat_messages([{
+        parse_chat_messages(
+            [{
             "role":
             "user",
             "content": [{
@@ -459,7 +514,11 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
                 "type": "text",
                 "text": "What about these two?"
             }]
-        }], phi3v_model_config, phi3v_tokenizer)
+            }],
+            phi3v_model_config,
+            phi3v_tokenizer,
+            content_format="string",
+        )


 def test_parse_chat_messages_multiple_images_uncommon_input(
@@ -467,7 +526,8 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
         "content": [
@@ -477,7 +537,11 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
                 "image_url": image_url
             }
         ]
-    }], phi3v_model_config, phi3v_tokenizer)
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )

     assert conversation == [{
         "role":
@@ -495,7 +559,8 @@ def test_mllama_single_image(
     image_url,
 ):
     """Ensures that a single image is parsed correctly mllama."""
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
         "content": [{
@@ -504,7 +569,11 @@ def test_mllama_single_image(
         }, {
             "image_url": image_url
         }]
-    }], mllama_model_config, mllama_tokenizer)
+        }],
+        mllama_model_config,
+        mllama_tokenizer,
+        content_format="openai",
+    )
     _assert_mm_data_is_image_input(mm_data, 1)
     assert conversation == [{
         'role':
@@ -524,7 +593,8 @@ def test_mllama_interleaved_images(
     image_url,
 ):
     """Ensures that multiple image are parsed as interleaved dicts."""
-    conversation, mm_data = parse_chat_messages([{
+    conversation, mm_data = parse_chat_messages(
+        [{
         "role":
         "user",
         "content": [
@@ -543,7 +613,11 @@ def test_mllama_interleaved_images(
             "image_url": image_url
            },
         ]
-    }], mllama_model_config, mllama_tokenizer)
+        }],
+        mllama_model_config,
+        mllama_tokenizer,
+        content_format="openai",
+    )
     _assert_mm_data_is_image_input(mm_data, 2)
     assert conversation == [{
         'role':
@@ -626,6 +700,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
         vllm_conversation,
         model_config,
         tokenizer_group,
+        content_format="openai",
     )

     vllm_result = apply_hf_chat_template(
@@ -636,3 +711,89 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
     )

     assert hf_result == vllm_result
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("model", "expected_format"),
+    [(PHI3V_MODEL_ID, "string"),
+     (QWEN2VL_MODEL_ID, "openai"),
+     (ULTRAVOX_MODEL_ID, "string"),
+     (MLLAMA_MODEL_ID, "openai"),
+     (LLAMA_GUARD_MODEL_ID, "openai")],
+)
+# yapf: enable
+def test_resolve_content_format_hf_defined(model, expected_format):
+    tokenizer_group = TokenizerGroup(
+        model,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+    tokenizer = tokenizer_group.tokenizer
+
+    chat_template = tokenizer.chat_template
+    assert isinstance(chat_template, str)
+
+    print("[TEXT]")
+    print(chat_template)
+    print("[AST]")
+    print(_try_extract_ast(chat_template))
+
+    resolved_format = resolve_chat_template_content_format(
+        None,  # Test detecting the tokenizer's chat_template
+        "auto",
+        tokenizer,
+    )
+
+    assert resolved_format == expected_format
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("template_path", "expected_format"),
+    [("template_alpaca.jinja", "string"),
+     ("template_baichuan.jinja", "string"),
+     ("template_blip2.jinja", "string"),
+     ("template_chatglm.jinja", "string"),
+     ("template_chatglm2.jinja", "string"),
+     ("template_chatml.jinja", "string"),
+     ("template_falcon_180b.jinja", "string"),
+     ("template_falcon.jinja", "string"),
+     ("template_inkbot.jinja", "string"),
+     ("template_llava.jinja", "string"),
+     ("template_vlm2vec.jinja", "openai"),
+     ("tool_chat_template_granite_20b_fc.jinja", "string"),
+     ("tool_chat_template_hermes.jinja", "string"),
+     ("tool_chat_template_internlm2_tool.jinja", "string"),
+     ("tool_chat_template_llama3.1_json.jinja", "string"),
+     ("tool_chat_template_llama3.2_json.jinja", "string"),
+     ("tool_chat_template_mistral_parallel.jinja", "string"),
+     ("tool_chat_template_mistral.jinja", "string")],
+)
+# yapf: enable
+def test_resolve_content_format_examples(template_path, expected_format):
+    tokenizer_group = TokenizerGroup(
+        PHI3V_MODEL_ID,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+    dummy_tokenizer = tokenizer_group.tokenizer
+    dummy_tokenizer.chat_template = None
+
+    chat_template = load_chat_template(EXAMPLES_DIR / template_path)
+    assert isinstance(chat_template, str)
+
+    print("[TEXT]")
+    print(chat_template)
+    print("[AST]")
+    print(_try_extract_ast(chat_template))
+
+    resolved_format = resolve_chat_template_content_format(
+        chat_template,
+        "auto",
+        dummy_tokenizer,
+    )
+
+    assert resolved_format == expected_format
@@ -155,7 +155,6 @@ class ModelConfig:
         limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
         use_async_output_proc: bool = True,
         config_format: ConfigFormat = ConfigFormat.AUTO,
-        chat_template_text_format: str = "string",
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
         override_neuron_config: Optional[Dict[str, Any]] = None,
@@ -216,7 +215,6 @@ class ModelConfig:
             self.model, revision)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.use_async_output_proc = use_async_output_proc
-        self.chat_template_text_format = chat_template_text_format
         self.mm_processor_kwargs = mm_processor_kwargs

         # Set enforce_eager to False if the value is unset.
@@ -90,7 +90,6 @@ class EngineArgs:
     task: TaskOption = "auto"
     skip_tokenizer_init: bool = False
     tokenizer_mode: str = 'auto'
-    chat_template_text_format: str = 'string'
     trust_remote_code: bool = False
     allowed_local_media_path: str = ""
     download_dir: Optional[str] = None
@@ -258,14 +257,6 @@ class EngineArgs:
             'fast tokenizer if available.\n* "slow" will '
             'always use the slow tokenizer. \n* '
             '"mistral" will always use the `mistral_common` tokenizer.')
-        parser.add_argument(
-            '--chat-template-text-format',
-            type=str,
-            default=EngineArgs.chat_template_text_format,
-            choices=['string', 'openai'],
-            help='The format to render text content within a chat template. '
-            '"string" will keep the content field as a string whereas '
-            '"openai" will parse content in the current OpenAI format.')
         parser.add_argument('--trust-remote-code',
                             action='store_true',
                             help='Trust remote code from huggingface.')
@@ -894,7 +885,6 @@ class EngineArgs:
             # We know this is not None because we set it in __post_init__
             tokenizer=cast(str, self.tokenizer),
             tokenizer_mode=self.tokenizer_mode,
-            chat_template_text_format=self.chat_template_text_format,
             trust_remote_code=self.trust_remote_code,
             allowed_local_media_path=self.allowed_local_media_path,
             dtype=self.dtype,
@@ -262,8 +262,7 @@ class LLMEngine:
             "num_scheduler_steps=%d, chunked_prefill_enabled=%s "
             "multi_step_stream_outputs=%s, enable_prefix_caching=%s, "
             "use_async_output_proc=%s, use_cached_outputs=%s, "
-            "chat_template_text_format=%s, mm_processor_kwargs=%s, "
-            "pooler_config=%r)",
+            "mm_processor_kwargs=%s, pooler_config=%r)",
             VLLM_VERSION,
             model_config.model,
             speculative_config,
@@ -296,7 +295,6 @@ class LLMEngine:
             cache_config.enable_prefix_caching,
             model_config.use_async_output_proc,
             use_cached_outputs,
-            model_config.chat_template_text_format,
             model_config.mm_processor_kwargs,
             model_config.pooler_config,
         )
@@ -2,12 +2,14 @@ import asyncio
 import codecs
 import json
 from abc import ABC, abstractmethod
-from collections import defaultdict
+from collections import defaultdict, deque
 from functools import lru_cache, partial
 from pathlib import Path
 from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
                     Literal, Mapping, Optional, Tuple, TypeVar, Union, cast)

+import jinja2.nodes
+import transformers.utils.chat_template_utils as hf_chat_utils
 # yapf conflicts with isort for this block
 # yapf: disable
 from openai.types.chat import (ChatCompletionAssistantMessageParam,
@@ -153,6 +155,199 @@ class ConversationMessage(TypedDict, total=False):
     """The tool calls generated by the model, such as function calls."""


+# Passed in by user
+ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
+
+# Used internally
+_ChatTemplateContentFormat = Literal["string", "openai"]
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (_is_var_access(node.node, varname)
+                and isinstance(node.arg, jinja2.nodes.Const)
+                and node.arg.value == key)
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: Optional[str] = None,
+) -> bool:
+    if isinstance(node, jinja2.nodes.Filter):
+        return (node.node is not None
+                and _is_var_or_elems_access(node.node, varname, key))
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if (isinstance(node, jinja2.nodes.Getitem)
+            and isinstance(node.arg, jinja2.nodes.Slice)):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    # yapf: disable
+    return (
+        _is_attr_access(node, varname, key) if key
+        else _is_var_access(node, varname)
+    )  # yapf: enable
+
+
+def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
+    # Global variable that is implicitly defined at the root
+    yield root, varname
+
+    # Iterative BFS
+    related_varnames = deque([varname])
+    while related_varnames:
+        related_varname = related_varnames.popleft()
+
+        for assign_ast in root.find_all(jinja2.nodes.Assign):
+            lhs = assign_ast.target
+            rhs = assign_ast.node
+
+            if _is_var_or_elems_access(rhs, related_varname):
+                assert isinstance(lhs, jinja2.nodes.Name)
+                yield assign_ast, lhs.name
+
+                # Avoid infinite looping for self-assignment
+                if lhs.name != related_varname:
+                    related_varnames.append(lhs.name)
+
+
+# NOTE: The proper way to handle this is to build a CFG so that we can handle
+# the scope in which each variable is defined, but that is too complicated
+def _iter_nodes_assign_messages_item(root: jinja2.nodes.Node):
+    messages_varnames = [
+        varname
+        for _, varname in _iter_nodes_assign_var_or_elems(root, "messages")
+    ]
+
+    # Search for {%- for message in messages -%} loops
+    for loop_ast in root.find_all(jinja2.nodes.For):
+        loop_iter = loop_ast.iter
+        loop_target = loop_ast.target
+
+        for varname in messages_varnames:
+            if _is_var_or_elems_access(loop_iter, varname):
+                assert isinstance(loop_target, jinja2.nodes.Name)
+                yield loop_ast, loop_target.name
+                break
+
+
+def _iter_nodes_assign_content_item(root: jinja2.nodes.Node):
+    message_varnames = [
+        varname for _, varname in _iter_nodes_assign_messages_item(root)
+    ]
+
+    # Search for {%- for content in message['content'] -%} loops
+    for loop_ast in root.find_all(jinja2.nodes.For):
+        loop_iter = loop_ast.iter
+        loop_target = loop_ast.target
+
+        for varname in message_varnames:
+            if _is_var_or_elems_access(loop_iter, varname, "content"):
+                assert isinstance(loop_target, jinja2.nodes.Name)
+                yield loop_ast, loop_target.name
+                break
+
+
+def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]:
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception:
+        logger.exception("Error when compiling Jinja template")
+        return None
+
+
+def _detect_content_format(
+    chat_template: str,
+    *,
+    default: _ChatTemplateContentFormat,
+) -> _ChatTemplateContentFormat:
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return default
+
+    try:
+        next(_iter_nodes_assign_content_item(jinja_ast))
+    except StopIteration:
+        return "string"
+    except Exception:
+        logger.exception("Error when parsing AST of Jinja template")
+        return default
+    else:
+        return "openai"
+
+
+def _resolve_chat_template_content_format(
+    chat_template: Optional[str],
+    given_format: ChatTemplateContentFormatOption,
+    tokenizer: AnyTokenizer,
+) -> _ChatTemplateContentFormat:
+    if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
+        tokenizer_chat_template = tokenizer.chat_template
+    else:
+        tokenizer_chat_template = None
+
+    jinja_text: Optional[str]
+    if isinstance(tokenizer_chat_template, str) and chat_template is None:
+        jinja_text = tokenizer_chat_template
+    elif (isinstance(tokenizer_chat_template, dict)
+          and chat_template in tokenizer_chat_template):
+        jinja_text = tokenizer_chat_template[chat_template]
+    else:
+        jinja_text = load_chat_template(chat_template, is_literal=True)
+
+    detected_format = ("string" if jinja_text is None else
+                       _detect_content_format(jinja_text, default="string"))
+
+    return detected_format if given_format == "auto" else given_format
+
+
+@lru_cache
+def resolve_chat_template_content_format(
+    chat_template: Optional[str],
+    given_format: ChatTemplateContentFormatOption,
+    tokenizer: AnyTokenizer,
+) -> _ChatTemplateContentFormat:
+    detected_format = _resolve_chat_template_content_format(
+        chat_template,
+        given_format,
+        tokenizer,
+    )
+
+    logger.info(
+        "Detected the chat template content format to be '%s'. "
+        "You can set `--chat-template-content-format` to override this.",
+        detected_format,
+    )
+
+    if given_format != "auto" and given_format != detected_format:
+        logger.warning(
+            "You specified `--chat-template-content-format %s` "
+            "which is different from the detected format '%s'. "
+            "If our automatic detection is incorrect, please consider "
+            "opening a GitHub issue so that we can improve it: "
+            "https://github.com/vllm-project/vllm/issues/new/choose",
+            given_format,
+            detected_format,
+        )
+
+    return detected_format
+
+
 ModalityStr = Literal["image", "audio", "video"]
 _T = TypeVar("_T")

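The detection added above works by walking the Jinja AST and checking whether the template ever iterates over `message['content']`. The following stand-alone sketch (commentary, not the helper above) illustrates that core idea with plain `jinja2`; it only handles the direct `message['content']` / `message.content` loop pattern and deliberately skips the variable aliasing, filters, and slicing that `_is_var_or_elems_access` and `_iter_nodes_assign_var_or_elems` cover.

```python
import jinja2
import jinja2.nodes


def detect_content_format(chat_template: str, default: str = "string") -> str:
    """Return "openai" if the template loops over message['content'], else "string"."""
    try:
        ast = jinja2.Environment().parse(chat_template)
    except jinja2.TemplateSyntaxError:
        return default

    for loop in ast.find_all(jinja2.nodes.For):
        it = loop.iter
        # Matches {% for part in message.content %}
        if isinstance(it, jinja2.nodes.Getattr) and it.attr == "content":
            return "openai"
        # Matches {% for part in message['content'] %}
        if (isinstance(it, jinja2.nodes.Getitem)
                and isinstance(it.arg, jinja2.nodes.Const)
                and it.arg.value == "content"):
            return "openai"
    return "string"


string_tmpl = "{% for m in messages %}{{ m['content'] }}{% endfor %}"
openai_tmpl = ("{% for m in messages %}{% for part in m['content'] %}"
               "{{ part['text'] }}{% endfor %}{% endfor %}")
assert detect_content_format(string_tmpl) == "string"
assert detect_content_format(openai_tmpl) == "openai"
```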
@@ -407,12 +602,23 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]):


 def load_chat_template(
-        chat_template: Optional[Union[Path, str]]) -> Optional[str]:
+    chat_template: Optional[Union[Path, str]],
+    *,
+    is_literal: bool = False,
+) -> Optional[str]:
     if chat_template is None:
         return None
+
+    if is_literal:
+        if isinstance(chat_template, Path):
+            raise TypeError("chat_template is expected to be read directly "
+                            "from its value")
+
+        return codecs.decode(chat_template, "unicode_escape")
+
     try:
         with open(chat_template) as f:
-            resolved_chat_template = f.read()
+            return f.read()
     except OSError as e:
         if isinstance(chat_template, Path):
             raise
@@ -426,10 +632,7 @@ def load_chat_template(

         # If opening a file fails, set chat template to be args to
         # ensure we decode so our escape are interpreted correctly
-        resolved_chat_template = codecs.decode(chat_template, "unicode_escape")
-
-    logger.info("Using supplied chat template:\n%s", resolved_chat_template)
-    return resolved_chat_template
+        return load_chat_template(chat_template, is_literal=True)


 # TODO: Let user specify how to insert multimodal tokens into prompt
@@ -464,7 +667,6 @@ _ImageParser = partial(cast, ChatCompletionContentPartImageParam)
 _AudioParser = partial(cast, ChatCompletionContentPartAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
-MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'}

 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = {
@@ -542,18 +744,12 @@ def _parse_chat_message_content_parts(
     role: str,
     parts: Iterable[ChatCompletionContentPartParam],
     mm_tracker: BaseMultiModalItemTracker,
-    chat_template_text_format: str,
+    *,
+    wrap_dicts: bool,
 ) -> List[ConversationMessage]:
     content: List[Union[str, Dict[str, str]]] = []

     mm_parser = mm_tracker.create_parser()
-    model_config = mm_tracker.model_config
-
-    wrap_dicts = (chat_template_text_format == "openai"
-                  or (model_config.task == "embedding"
-                      and model_config.is_multimodal_model)
-                  or (model_config.hf_config.model_type
-                      in MODEL_KEEP_MULTI_MODAL_CONTENT))

     for part in parts:
         parse_res = _parse_chat_message_content_part(
@@ -580,7 +776,9 @@ def _parse_chat_message_content_parts(
 def _parse_chat_message_content_part(
         part: ChatCompletionContentPartParam,
         mm_parser: BaseMultiModalContentParser,
-        wrap_dicts: bool) -> Optional[Union[str, Dict[str, str]]]:
+        *,
+        wrap_dicts: bool,
+) -> Optional[Union[str, Dict[str, str]]]:
     """Parses a single part of a conversation. If wrap_dicts is True,
     structured dictionary pieces for texts and images will be
     wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
@@ -629,7 +827,7 @@ _ToolParser = partial(cast, ChatCompletionToolMessageParam)
 def _parse_chat_message_content(
     message: ChatCompletionMessageParam,
     mm_tracker: BaseMultiModalItemTracker,
-    chat_template_text_format: str,
+    content_format: _ChatTemplateContentFormat,
 ) -> List[ConversationMessage]:
     role = message["role"]
     content = message.get("content")
@@ -645,7 +843,7 @@ def _parse_chat_message_content(
             role,
             content,  # type: ignore
             mm_tracker,
-            chat_template_text_format,
+            wrap_dicts=(content_format == "openai"),
         )

     for result_msg in result:
@@ -684,6 +882,7 @@ def parse_chat_messages(
     messages: List[ChatCompletionMessageParam],
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
+    content_format: _ChatTemplateContentFormat,
 ) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]:
     conversation: List[ConversationMessage] = []
     mm_tracker = MultiModalItemTracker(model_config, tokenizer)
@@ -692,7 +891,7 @@ def parse_chat_messages(
         sub_messages = _parse_chat_message_content(
             msg,
             mm_tracker,
-            model_config.chat_template_text_format,
+            content_format,
         )

         conversation.extend(sub_messages)
@@ -706,6 +905,7 @@ def parse_chat_messages_futures(
     messages: List[ChatCompletionMessageParam],
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
+    content_format: _ChatTemplateContentFormat,
 ) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]:
     conversation: List[ConversationMessage] = []
     mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
@@ -714,7 +914,7 @@ def parse_chat_messages_futures(
         sub_messages = _parse_chat_message_content(
             msg,
             mm_tracker,
-            model_config.chat_template_text_format,
+            content_format,
         )

         conversation.extend(sub_messages)
@@ -13,9 +13,11 @@ from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
                                    TaskOption)
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                         ChatTemplateContentFormatOption,
                                          apply_hf_chat_template,
                                          apply_mistral_chat_template,
-                                         parse_chat_messages)
+                                         parse_chat_messages,
+                                         resolve_chat_template_content_format)
 from vllm.inputs import PromptType, TextPrompt, TokensPrompt
 from vllm.inputs.parse import parse_and_batch_prompt
 from vllm.logger import init_logger
@@ -523,6 +525,7 @@ class LLM:
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
         chat_template: Optional[str] = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
         tools: Optional[List[Dict[str, Any]]] = None,
@@ -540,8 +543,10 @@ class LLM:

         Args:
             messages: A list of conversations or a single conversation.
+
                 - Each conversation is represented as a list of messages.
                 - Each message is a dictionary with 'role' and 'content' keys.
+
             sampling_params: The sampling parameters for text generation.
                 If None, we use the default sampling parameters. When it
                 is a single value, it is applied to every prompt. When it
@@ -551,11 +556,19 @@ class LLM:
             lora_request: LoRA request to use for generation, if any.
             chat_template: The template to use for structuring the chat.
                 If not provided, the model's default chat template will be used.
+            chat_template_content_format: The format to render message content.
+
+                - "string" will render the content as a string.
+                  Example: ``"Who are you?"``
+                - "openai" will render the content as a list of dictionaries,
+                  similar to OpenAI schema.
+                  Example: ``[{"type": "text", "text": "Who are you?"}]``
+
             add_generation_prompt: If True, adds a generation template
                 to each message.
             continue_final_message: If True, continues the final message in
-                the conversation instead of starting a new one. Cannot be `True`
-                if `add_generation_prompt` is also `True`.
+                the conversation instead of starting a new one. Cannot be
+                ``True`` if ``add_generation_prompt`` is also ``True``.
             mm_processor_kwargs: Multimodal processor kwarg overrides for this
                 chat request. Only used for offline requests.

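For offline inference, the new `chat_template_content_format` parameter documented above is passed straight to `LLM.chat`. A hedged usage sketch follows (the model choice and sampling settings are illustrative only); leaving the argument at its default `"auto"` triggers the AST-based detection instead of forcing a format.

```python
from vllm import LLM, SamplingParams

# Illustrative model; any chat model with a chat template works similarly.
llm = LLM(model="meta-llama/Llama-Guard-3-1B")

outputs = llm.chat(
    messages=[{
        "role": "user",
        "content": [{"type": "text", "text": "Who are you?"}],
    }],
    sampling_params=SamplingParams(max_tokens=32),
    chat_template_content_format="openai",  # or leave as the default "auto"
)
print(outputs[0].outputs[0].text)
```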
@@ -576,17 +589,26 @@ class LLM:
             cast(List[ChatCompletionMessageParam], messages)
         ]

+        tokenizer = self.get_tokenizer()
+        model_config = self.llm_engine.get_model_config()
+        resolved_content_format = resolve_chat_template_content_format(
+            chat_template,
+            chat_template_content_format,
+            tokenizer,
+        )
+
         prompts: List[Union[TokensPrompt, TextPrompt]] = []

         for msgs in list_of_messages:
-            tokenizer = self.get_tokenizer()
-            model_config = self.llm_engine.get_model_config()
-
             # NOTE: _parse_chat_message_content_parts() currently doesn't
             # handle mm_processor_kwargs, since there is no implementation in
             # the chat message parsing for it.
             conversation, mm_data = parse_chat_messages(
-                msgs, model_config, tokenizer)
+                msgs,
+                model_config,
+                tokenizer,
+                content_format=resolved_content_format,
+            )

             prompt_data: Union[str, List[int]]
             if isinstance(tokenizer, MistralTokenizer):
@@ -737,7 +759,7 @@ class LLM:
                 generation, if any.

         Returns:
-            A list of `EmbeddingRequestOutput` objects containing the
+            A list of ``EmbeddingRequestOutput`` objects containing the
             generated embeddings in the same order as the input prompts.

         Note:
@@ -29,6 +29,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.engine.multiprocessing.engine import run_mp_engine
 from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import load_chat_template
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
@@ -529,6 +530,9 @@ def init_app_state(
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats

+    resolved_chat_template = load_chat_template(args.chat_template)
+    logger.info("Using supplied chat template:\n%s", resolved_chat_template)
+
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
@@ -537,7 +541,8 @@ def init_app_state(
         lora_modules=args.lora_modules,
         prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
-        chat_template=args.chat_template,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
         tool_parser=args.tool_call_parser,
@@ -557,7 +562,8 @@ def init_app_state(
         model_config,
         base_model_paths,
         request_logger=request_logger,
-        chat_template=args.chat_template,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
     ) if model_config.task == "embedding" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
@@ -565,7 +571,8 @@ def init_app_state(
         base_model_paths,
         lora_modules=args.lora_modules,
         request_logger=request_logger,
-        chat_template=args.chat_template,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
     )


@@ -7,10 +7,11 @@ purposes.
 import argparse
 import json
 import ssl
-from typing import List, Optional, Sequence, Union
+from typing import List, Optional, Sequence, Union, get_args

 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
-from vllm.entrypoints.chat_utils import validate_chat_template
+from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
+                                         validate_chat_template)
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
                                                     PromptAdapterPath)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
@@ -132,6 +133,18 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                         help="The file path to the chat template, "
                         "or the template in single-line form "
                         "for the specified model")
+    parser.add_argument(
+        '--chat-template-content-format',
+        type=str,
+        default="auto",
+        choices=get_args(ChatTemplateContentFormatOption),
+        help='The format to render message content within a chat template.'
+        '\n\n'
+        '* "string" will render the content as a string. '
+        'Example: "Hello World"\n'
+        '* "openai" will render the content as a list of dictionaries, '
+        'similar to OpenAI schema. '
+        'Example: [{"type": "text", "text": "Hello world!"}]')
     parser.add_argument("--response-role",
                         type=nullable_str,
                         default="assistant",
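One detail of the new flag worth calling out: its `choices` are derived from the `ChatTemplateContentFormatOption` literal via `typing.get_args`, so the CLI definition cannot drift from the type. Below is a self-contained sketch of that pattern (a simplified, assumed argparse setup rather than vLLM's actual parser).

```python
import argparse
from typing import Literal, get_args

ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]

parser = argparse.ArgumentParser()
parser.add_argument(
    "--chat-template-content-format",
    type=str,
    default="auto",
    # Reuse the Literal's members so the CLI choices and the type stay in sync.
    choices=get_args(ChatTemplateContentFormatOption),
)

args = parser.parse_args(["--chat-template-content-format", "openai"])
assert args.chat_template_content_format == "openai"
```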
@@ -5,9 +5,8 @@ from argparse import Namespace
 from typing import Any, Dict, List, Literal, Optional, Union
 
 import torch
-from openai.types.chat import ChatCompletionContentPartParam
 from pydantic import BaseModel, ConfigDict, Field, model_validator
-from typing_extensions import Annotated, Required, TypedDict
+from typing_extensions import Annotated
 
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 from vllm.pooling_params import PoolingParams
@@ -35,26 +34,6 @@ assert _LONG_INFO.min == _MOCK_LONG_INFO.min
 assert _LONG_INFO.max == _MOCK_LONG_INFO.max
 
 
-class CustomChatCompletionMessageParam(TypedDict, total=False):
-    """Enables custom roles in the Chat Completion API."""
-    role: Required[str]
-    """The role of the message's author."""
-
-    content: Union[str, List[ChatCompletionContentPartParam]]
-    """The contents of the message."""
-
-    name: str
-    """An optional name for the participant.
-
-    Provides the model information to differentiate between participants of the
-    same role.
-    """
-
-    tool_call_id: Optional[str]
-
-    tool_calls: Optional[List[dict]]
-
-
 class OpenAIBaseModel(BaseModel):
     # OpenAI API does not allow extra fields
     model_config = ConfigDict(extra="forbid")
@@ -1054,16 +1033,56 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
     model: str
     prompt: str
 
-    add_special_tokens: bool = Field(default=True)
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
 
 
 class TokenizeChatRequest(OpenAIBaseModel):
     model: str
     messages: List[ChatCompletionMessageParam]
 
-    add_generation_prompt: bool = Field(default=True)
-    continue_final_message: bool = Field(default=False)
-    add_special_tokens: bool = Field(default=False)
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=
+        ("If true, the generation prompt will be added to the chat template. "
+         "This is a parameter used by chat template in tokenizer config of the "
+         "model."),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=
+        ("If this is set, the chat will be formatted so that the final "
+         "message in the chat is open-ended, without any EOS tokens. The "
+         "model will continue this message rather than starting a new one. "
+         "This allows you to \"prefill\" part of the model's response for it. "
+         "Cannot be used at the same time as `add_generation_prompt`."),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
 
     @model_validator(mode="before")
     @classmethod
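The `description` strings above are more than comments: pydantic attaches them to the generated JSON schema, so they surface in the server's OpenAPI docs. A minimal standalone sketch of the same pattern (the class and wording here are illustrative, not copied from vLLM):

```
from pydantic import BaseModel, Field

class TokenizeChatRequestSketch(BaseModel):
    add_generation_prompt: bool = Field(
        default=True,
        description="If true, the generation prompt is added by the chat template.",
    )

schema = TokenizeChatRequestSketch.model_json_schema()
# The description is carried into the schema that FastAPI exposes as OpenAPI.
print(schema["properties"]["add_generation_prompt"]["description"])
```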
@@ -222,6 +222,7 @@ async def main(args):
         prompt_adapters=None,
         request_logger=request_logger,
         chat_template=None,
+        chat_template_content_format="auto",
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
     ) if model_config.task == "generate" else None
     openai_serving_embedding = OpenAIServingEmbedding(
@@ -230,6 +231,7 @@ async def main(args):
         base_model_paths,
         request_logger=request_logger,
         chat_template=None,
+        chat_template_content_format="auto",
     ) if model_config.task == "embedding" else None
 
     tracker = BatchProgressTracker()
@@ -10,7 +10,8 @@ from fastapi import Request
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template
+from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
+                                         ConversationMessage)
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionLogProb, ChatCompletionLogProbs,
@@ -38,7 +39,8 @@ logger = init_logger(__name__)
 
 class OpenAIServingChat(OpenAIServing):
 
-    def __init__(self,
+    def __init__(
+        self,
         engine_client: EngineClient,
         model_config: ModelConfig,
         base_model_paths: List[BaseModelPath],
@@ -48,10 +50,12 @@ class OpenAIServingChat(OpenAIServing):
         prompt_adapters: Optional[List[PromptAdapterPath]],
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
+        chat_template_content_format: ChatTemplateContentFormatOption,
         return_tokens_as_token_ids: bool = False,
         enable_auto_tools: bool = False,
         tool_parser: Optional[str] = None,
-        enable_prompt_tokens_details: bool = False):
+        enable_prompt_tokens_details: bool = False,
+    ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
                          base_model_paths=base_model_paths,
@@ -61,8 +65,8 @@ class OpenAIServingChat(OpenAIServing):
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
 
         self.response_role = response_role
-        self.use_tool_use_model_template = False
-        self.chat_template = load_chat_template(chat_template)
+        self.chat_template = chat_template
+        self.chat_template_content_format: Final = chat_template_content_format
 
         # set up tool use
         self.enable_auto_tools: bool = enable_auto_tools
@@ -120,6 +124,7 @@ class OpenAIServingChat(OpenAIServing):
             ) = self._maybe_get_adapters(request)
 
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
+
             tool_parser = self.tool_parser
 
             # validation for OpenAI tools
@@ -157,6 +162,7 @@ class OpenAIServingChat(OpenAIServing):
                 tokenizer,
                 request.messages,
                 chat_template=request.chat_template or self.chat_template,
+                chat_template_content_format=self.chat_template_content_format,
                 add_generation_prompt=request.add_generation_prompt,
                 continue_final_message=request.continue_final_message,
                 tool_dicts=tool_dicts,
@@ -1,7 +1,7 @@
 import asyncio
 import base64
 import time
-from typing import AsyncGenerator, List, Literal, Optional, Union, cast
+from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast
 
 import numpy as np
 from fastapi import Request
@@ -9,7 +9,7 @@ from typing_extensions import assert_never
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import load_chat_template
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
                                               EmbeddingRequest,
@@ -77,7 +77,8 @@ class OpenAIServingEmbedding(OpenAIServing):
         *,
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
-    ):
+        chat_template_content_format: ChatTemplateContentFormatOption,
+    ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
                          base_model_paths=base_model_paths,
@@ -85,7 +86,8 @@ class OpenAIServingEmbedding(OpenAIServing):
                          prompt_adapters=None,
                          request_logger=request_logger)
 
-        self.chat_template = load_chat_template(chat_template)
+        self.chat_template = chat_template
+        self.chat_template_content_format: Final = chat_template_content_format
 
     async def create_embedding(
         self,
@@ -144,6 +146,8 @@ class OpenAIServingEmbedding(OpenAIServing):
             tokenizer,
             request.messages,
             chat_template=request.chat_template or self.chat_template,
+            chat_template_content_format=self.
+            chat_template_content_format,
             add_generation_prompt=request.add_generation_prompt,
             continue_final_message=request.continue_final_message,
             truncate_prompt_tokens=truncate_prompt_tokens,
@@ -11,14 +11,16 @@ from typing_extensions import Annotated
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
+# yapf conflicts with isort for this block
+# yapf: disable
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                         ChatTemplateContentFormatOption,
                                          ConversationMessage,
                                          apply_hf_chat_template,
                                          apply_mistral_chat_template,
-                                         parse_chat_messages_futures)
+                                         parse_chat_messages_futures,
+                                         resolve_chat_template_content_format)
 from vllm.entrypoints.logger import RequestLogger
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest,
                                               DetokenizeRequest,
@@ -426,7 +428,8 @@ class OpenAIServing:
         request: ChatLikeRequest,
         tokenizer: AnyTokenizer,
         messages: List[ChatCompletionMessageParam],
-        chat_template: Optional[str] = None,
+        chat_template: Optional[str],
+        chat_template_content_format: ChatTemplateContentFormatOption,
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
         tool_dicts: Optional[List[Dict[str, Any]]] = None,
@@ -437,10 +440,16 @@ class OpenAIServing:
         add_special_tokens: bool = False,
     ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt],
               List[TokensPrompt]]:
+        resolved_content_format = resolve_chat_template_content_format(
+            chat_template,
+            chat_template_content_format,
+            tokenizer,
+        )
         conversation, mm_data_future = parse_chat_messages_futures(
             messages,
             self.model_config,
             tokenizer,
+            content_format=resolved_content_format,
         )
 
         _chat_template_kwargs: Dict[str, Any] = dict(
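This is where the per-request resolution happens: `_preprocess_chat` now calls `resolve_chat_template_content_format` to turn `"auto"` into a concrete format before parsing the messages. The actual detection walks the Jinja AST of the chat template; a toy sketch of the idea (not vLLM's implementation, which also consults the tokenizer's template and handles more shapes) might look like:

```
import jinja2
from jinja2 import nodes

def detect_content_format(chat_template: str, default: str = "string") -> str:
    """Guess whether a chat template iterates over structured message content."""
    ast = jinja2.Environment().parse(chat_template)
    for loop in ast.find_all(nodes.For):
        it = loop.iter
        # {% for part in message['content'] %} -> OpenAI-style list of parts
        if (isinstance(it, nodes.Getitem) and isinstance(it.arg, nodes.Const)
                and it.arg.value == "content"):
            return "openai"
        # {% for part in message.content %} -> same intent via attribute access
        if isinstance(it, nodes.Getattr) and it.attr == "content":
            return "openai"
    return default

print(detect_content_format("{{ messages[0]['content'] }}"))  # string
print(detect_content_format(
    "{% for part in messages[0]['content'] %}{{ part.text }}{% endfor %}"))  # openai
```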
@@ -1,8 +1,8 @@
-from typing import List, Optional, Union
+from typing import Final, List, Optional, Union
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import load_chat_template
+from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -33,7 +33,8 @@ class OpenAIServingTokenization(OpenAIServing):
         lora_modules: Optional[List[LoRAModulePath]],
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
-    ):
+        chat_template_content_format: ChatTemplateContentFormatOption,
+    ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
                          base_model_paths=base_model_paths,
@@ -41,12 +42,8 @@ class OpenAIServingTokenization(OpenAIServing):
                          prompt_adapters=None,
                          request_logger=request_logger)
 
-        # If this is None we use the tokenizer's default chat template
-        # the list of commonly-used chat template names for HF named templates
-        hf_chat_templates: List[str] = ['default', 'tool_use']
-        self.chat_template = chat_template \
-            if chat_template in hf_chat_templates \
-            else load_chat_template(chat_template)
+        self.chat_template = chat_template
+        self.chat_template_content_format: Final = chat_template_content_format
 
     async def create_tokenize(
         self,
@@ -75,9 +72,12 @@ class OpenAIServingTokenization(OpenAIServing):
                 request,
                 tokenizer,
                 request.messages,
-                chat_template=self.chat_template,
+                chat_template=request.chat_template or self.chat_template,
+                chat_template_content_format=self.
+                chat_template_content_format,
                 add_generation_prompt=request.add_generation_prompt,
                 continue_final_message=request.continue_final_message,
+                chat_template_kwargs=request.chat_template_kwargs,
                 add_special_tokens=request.add_special_tokens,
             )
         else:
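With this wiring in place, a chat-style `/tokenize` request can carry a per-request chat template and `chat_template_kwargs`. A rough illustration of such a request (the server URL, model name, and the `enable_thinking` kwarg are placeholders, not values taken from this commit):

```
import requests

payload = {
    "model": "my-model",  # must match a served model name
    "messages": [{"role": "user", "content": "Hello!"}],
    "add_generation_prompt": True,
    # Forwarded to the Jinja renderer via request.chat_template_kwargs.
    "chat_template_kwargs": {"enable_thinking": False},
}
resp = requests.post("http://localhost:8000/tokenize", json=payload)
print(resp.json())  # token ids and related counts
```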