diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 4ddf11c0b268f..4d98ee40a4bbb 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -316,7 +316,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
+ - tests/tokenizers_
- tests/test_sequence
- tests/test_config
- tests/test_logger
@@ -324,7 +324,7 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- - pytest -v -s tokenization
+ - pytest -v -s tokenizers_
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c38068a9b22c0..16d4907549587 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -282,7 +282,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
+ - tests/tokenizers_
- tests/test_sequence
- tests/test_config
- tests/test_logger
@@ -290,7 +290,7 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- - pytest -v -s tokenization
+ - pytest -v -s tokenizers_
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4021fede72153..d69d74ca61f54 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
- from vllm.transformers_utils.tokenizer import MistralTokenizer
+ from vllm.tokenizers import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index 5f26c7cf182b9..08a0dd69efa90 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -216,14 +216,13 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
# import the required packages
from vllm.reasoning import ReasoningParser, ReasoningParserManager
- from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
- DeltaMessage)
+ from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
# define a reasoning parser and register it to vllm
# the name list in register_module can be used
# in --reasoning-parser.
class ExampleParser(ReasoningParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
def extract_reasoning_streaming(
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 22dda37279ac6..b6dfbf10b4568 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -422,7 +422,7 @@ Here is a summary of a plugin file:
# in --tool-call-parser. you can define as many
# tool parsers as you want here.
class ExampleToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
# adjust request. e.g.: set skip special tokens
diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py
index 29892d0bf38aa..956a06dc5487c 100644
--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -10,7 +10,7 @@ import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
@pytest.fixture()
diff --git a/tests/entrypoints/openai/tool_parsers/conftest.py b/tests/entrypoints/openai/tool_parsers/conftest.py
index f2ac5e5b9a8fa..a40d0ab44cf7f 100644
--- a/tests/entrypoints/openai/tool_parsers/conftest.py
+++ b/tests/entrypoints/openai/tool_parsers/conftest.py
@@ -4,9 +4,9 @@
import pytest
from transformers import AutoTokenizer
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
@pytest.fixture(scope="function")
-def default_tokenizer() -> AnyTokenizer:
+def default_tokenizer() -> TokenizerLike:
return AutoTokenizer.from_pretrained("gpt2")
diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
index 38008dafe32b2..b2303ab0e7b7c 100644
--- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
@@ -7,7 +7,7 @@ import pytest
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from ....utils import RemoteOpenAIServer
@@ -270,14 +270,14 @@ async def test_streaming_product_tool_call():
@pytest.fixture
-def qwen_tokenizer() -> AnyTokenizer:
+def qwen_tokenizer() -> TokenizerLike:
from vllm.transformers_utils.tokenizer import get_tokenizer
return get_tokenizer("Qwen/Qwen3-32B")
@pytest.fixture
-def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser:
+def hermes_parser(qwen_tokenizer: TokenizerLike) -> Hermes2ProToolParser:
return Hermes2ProToolParser(qwen_tokenizer)
@@ -291,7 +291,7 @@ def any_chat_request() -> ChatCompletionRequest:
def test_hermes_parser_streaming_just_forward_text(
- qwen_tokenizer: AnyTokenizer,
+ qwen_tokenizer: TokenizerLike,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
@@ -323,7 +323,7 @@ def test_hermes_parser_streaming_just_forward_text(
def test_hermes_parser_streaming_failure_case_bug_19056(
- qwen_tokenizer: AnyTokenizer,
+ qwen_tokenizer: TokenizerLike,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
@@ -357,7 +357,7 @@ def test_hermes_parser_streaming_failure_case_bug_19056(
def test_hermes_parser_streaming(
- qwen_tokenizer: AnyTokenizer,
+ qwen_tokenizer: TokenizerLike,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
index 37e52d2cdf609..6c286ca90ce48 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
@@ -7,11 +7,11 @@ import pytest
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import Llama3JsonToolParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
@pytest.fixture
-def parser(default_tokenizer: AnyTokenizer):
+def parser(default_tokenizer: TokenizerLike):
return Llama3JsonToolParser(default_tokenizer)
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
index d297432eab644..8aa88a007188f 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
# Test cases similar to pythonic parser but with Llama4 specific format
SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
@@ -64,7 +64,7 @@ PYTHON_TAG_FUNCTION_OUTPUT = (
@pytest.mark.parametrize("streaming", [True, False])
-def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer):
+def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
default_tokenizer
)
@@ -208,7 +208,7 @@ def test_tool_call(
streaming: bool,
model_output: str,
expected_tool_calls: list[FunctionCall],
- default_tokenizer: AnyTokenizer,
+ default_tokenizer: TokenizerLike,
):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
default_tokenizer
@@ -224,7 +224,7 @@ def test_tool_call(
assert actual.function == expected
-def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
+def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
default_tokenizer
)
@@ -246,7 +246,7 @@ def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
@pytest.mark.parametrize("streaming", [False])
-def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer):
+def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
"""test regex timeout is handled gracefully"""
tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
default_tokenizer
diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
index 13cff9a8ebf1e..a0b9a3c563bc2 100644
--- a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
@@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
@@ -69,7 +69,7 @@ ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
@pytest.mark.parametrize("streaming", [True, False])
-def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer):
+def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
default_tokenizer
)
@@ -188,7 +188,7 @@ def test_tool_call(
streaming: bool,
model_output: str,
expected_tool_calls: list[FunctionCall],
- default_tokenizer: AnyTokenizer,
+ default_tokenizer: TokenizerLike,
):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
default_tokenizer
@@ -205,7 +205,7 @@ def test_tool_call(
assert actual.function == expected
-def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
+def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
default_tokenizer
)
@@ -228,7 +228,7 @@ def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
@pytest.mark.parametrize("streaming", [False])
-def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer):
+def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
"""test regex timeout is handled gracefully"""
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
default_tokenizer
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index fcd3df16e5cfa..52202c55e8405 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
@@ -61,7 +61,7 @@ ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
@pytest.mark.parametrize("streaming", [True, False])
-def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer):
+def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
default_tokenizer
)
@@ -168,7 +168,7 @@ def test_tool_call(
streaming: bool,
model_output: str,
expected_tool_calls: list[FunctionCall],
- default_tokenizer: AnyTokenizer,
+ default_tokenizer: TokenizerLike,
):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
default_tokenizer
@@ -185,7 +185,7 @@ def test_tool_call(
assert actual.function == expected
-def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
+def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
default_tokenizer
)
@@ -208,7 +208,7 @@ def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer):
@pytest.mark.parametrize("streaming", [False])
-def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer):
+def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
"""test regex timeout is handled gracefully"""
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
default_tokenizer
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py
index 38899f2632554..2d4f5f1734102 100644
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
@@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers import ToolParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
class StreamingToolReconstructor:
@@ -111,7 +111,7 @@ def run_tool_extraction_nonstreaming(
return tool_parser.extract_tool_calls(model_output, request)
-def split_string_into_token_deltas(tokenizer: AnyTokenizer, text: str) -> list[str]:
+def split_string_into_token_deltas(tokenizer: TokenizerLike, text: str) -> list[str]:
# Split a string into a series of deltas using the provided tokenizer. Each
# delta will be the string equivalent of a single token.
token_ids = tokenizer.encode(text, add_special_tokens=False)
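As context for the `split_string_into_token_deltas` signature above, here is a minimal sketch of the same idea (assumed helper name and behavior, not the vLLM implementation): encode the text without special tokens, then decode each token id on its own so streaming tool-parser tests can replay model output one token-sized delta at a time.

from transformers import AutoTokenizer

def split_into_token_deltas_sketch(tokenizer, text: str) -> list[str]:
    # Encode without special tokens, then decode each id individually so every
    # element of the returned list corresponds to exactly one token of `text`.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [tokenizer.decode([token_id]) for token_id in token_ids]

# Example usage with a small HF tokenizer:
deltas = split_into_token_deltas_sketch(
    AutoTokenizer.from_pretrained("gpt2"), "[get_weather(city='LA', metric='C')]"
)
assert "".join(deltas) == "[get_weather(city='LA', metric='C')]"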
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 7baf564ad01a4..a351cda60621f 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -28,8 +28,8 @@ from vllm.multimodal.utils import (
encode_image_base64,
encode_video_base64,
)
+from vllm.tokenizers import MistralTokenizer
from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import VLLM_PATH
diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index 80e337d570a36..1377776a6d84b 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
from ...utils import check_logprobs_close
diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py
index 18a50c3a555da..9e9087cb0fc4d 100644
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@@ -9,7 +9,7 @@ from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
-from vllm.transformers_utils.tokenizer import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index 03ff3bcf6307b..08cf4b2202dcd 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -9,7 +9,7 @@ import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
@@ -33,7 +33,7 @@ def run_test(
auto_cls: type[_BaseAutoModelClass],
use_tokenizer_eos: bool,
comparator: Callable[..., None],
- get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None,
+ get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None,
stop_str: list[str] | None,
limit_mm_per_prompt: dict[str, int],
vllm_runner_kwargs: dict[str, Any] | None,
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index 5c1bc6ac28fe3..0c03c84497125 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -14,7 +14,7 @@ from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config.model import RunnerOption
from vllm.logprobs import SampleLogprobs
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from .....conftest import (
AUDIO_ASSETS,
@@ -126,7 +126,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs: dict[str, Any] | None = None
# Optional callable which gets a list of token IDs from the model tokenizer
- get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None = None
+ get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str: list[str] | None = None
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 9638791ab5caa..c39e522100901 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -22,8 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
+from vllm.tokenizers import MistralTokenizer
from vllm.transformers_utils.tokenizer import (
- MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens,
)
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index d860c50e7899a..f7fa8da54d54e 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
from contextlib import nullcontext
from typing import cast
@@ -23,7 +24,7 @@ from vllm.multimodal.processing import (
replace_token_matches,
)
from vllm.multimodal.profiling import MultiModalProfiler
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from .utils import random_image
@@ -238,7 +239,7 @@ def test_find_token_matches(
update_type,
):
# Should not be used since there is nothing to convert to token IDs
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
prompt_updates = {
key: update_type(key, target, []).resolve(0)
@@ -385,7 +386,7 @@ def test_find_text_matches(
update_type,
):
# Should not be used since there is nothing to convert to text
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
prompt_updates = {
key: update_type(key, target, []).resolve(0)
@@ -545,7 +546,7 @@ def test_find_update_text(
expected_by_update_type_mm_count,
):
# Should not be used since there is nothing to convert to text
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
for (
update_type,
@@ -750,7 +751,7 @@ def test_find_update_tokens(
expected_by_update_type_mm_count,
):
# Should not be used since there is nothing to convert to tokens
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
for (
update_type,
@@ -900,7 +901,7 @@ def test_find_mm_placeholders(
update_type,
):
# Should not be used since there is nothing to convert to tokens
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
mm_prompt_updates = {
key: [[update_type(key, [], repl).resolve(i)] for i in range(3)]
@@ -1029,7 +1030,7 @@ def test_hf_processor_init_kwargs(
expected_kwargs,
):
# Should not be used since there is nothing to convert to tokens
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
@@ -1065,7 +1066,7 @@ def test_hf_processor_call_kwargs(
expected_kwargs,
):
# Should not be used since there is nothing to convert to tokens
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
@@ -1088,9 +1089,7 @@ def test_apply_matches_no_match_exits_quickly():
With the fix, it should exit immediately when no match is found.
"""
- import time
-
- mock_tokenizer = cast(AnyTokenizer, object())
+ mock_tokenizer = cast(TokenizerLike, object())
# Create a long prompt with no placeholder
long_prompt = "x" * 10000
diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py
index 5163c863863a7..0fe315c2567f9 100644
--- a/tests/reasoning/test_mistral_reasoning_parser.py
+++ b/tests/reasoning/test_mistral_reasoning_parser.py
@@ -5,7 +5,7 @@ import pytest
from tests.reasoning.utils import run_reasoning_extraction_mistral
from vllm.reasoning import ReasoningParser, ReasoningParserManager
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
parser_name = "mistral"
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index bd0b230a847cb..695312a0cadfe 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -4,7 +4,7 @@
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning import ReasoningParser
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
class StreamingReasoningReconstructor:
diff --git a/tests/tokenization/__init__.py b/tests/tokenization/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/tests/tokenization/test_do_lower_case.py b/tests/tokenization/test_do_lower_case.py
deleted file mode 100644
index 8aff50b351e31..0000000000000
--- a/tests/tokenization/test_do_lower_case.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from vllm.transformers_utils.tokenizer import get_tokenizer
-
-TOKENIZER_NAMES = ["BAAI/bge-base-en"]
-
-
-@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
-@pytest.mark.parametrize("n_tokens", [510])
-def test_special_tokens(tokenizer_name: str, n_tokens: int):
- tokenizer = get_tokenizer(tokenizer_name, revision="main")
-
- prompts = "[UNK]" * n_tokens
- prompt_token_ids = tokenizer.encode(prompts)
- assert len(prompt_token_ids) == n_tokens + 2
diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py
deleted file mode 100644
index 921d77b1b335e..0000000000000
--- a/tests/tokenization/test_get_eos.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This test file includes some cases where it is inappropriate to
-only get the `eos_token_id` from the tokenizer as defined by
-{meth}`vllm.LLMEngine._get_eos_token_id`.
-"""
-
-from vllm.transformers_utils.config import try_get_generation_config
-from vllm.transformers_utils.tokenizer import get_tokenizer
-
-
-def test_get_llama3_eos_token():
- model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
- tokenizer = get_tokenizer(model_name)
- assert tokenizer.eos_token_id == 128009
-
- generation_config = try_get_generation_config(model_name, trust_remote_code=False)
- assert generation_config is not None
- assert generation_config.eos_token_id == [128001, 128008, 128009]
-
-
-def test_get_blip2_eos_token():
- model_name = "Salesforce/blip2-opt-2.7b"
-
- tokenizer = get_tokenizer(model_name)
- assert tokenizer.eos_token_id == 2
-
- generation_config = try_get_generation_config(model_name, trust_remote_code=False)
- assert generation_config is not None
- assert generation_config.eos_token_id == 50118
diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py
deleted file mode 100644
index e86bb03883b5e..0000000000000
--- a/tests/tokenization/test_tokenizer.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-from transformers import PreTrainedTokenizerBase
-
-from vllm.transformers_utils.tokenizer import get_tokenizer
-
-TOKENIZER_NAMES = [
- "facebook/opt-125m",
- "gpt2",
-]
-
-
-@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
-def test_tokenizer_revision(tokenizer_name: str):
- # Assume that "main" branch always exists
- tokenizer = get_tokenizer(tokenizer_name, revision="main")
- assert isinstance(tokenizer, PreTrainedTokenizerBase)
-
- # Assume that "never" branch always does not exist
- with pytest.raises(OSError, match="not a valid git identifier"):
- get_tokenizer(tokenizer_name, revision="never")
diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py
deleted file mode 100644
index f13bb4333d619..0000000000000
--- a/tests/tokenization/test_tokenizer_registry.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import TYPE_CHECKING, Any
-
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.transformers_utils.tokenizer_base import TokenizerBase, TokenizerRegistry
-
-if TYPE_CHECKING:
- from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
-
-
-class TestTokenizer(TokenizerBase):
- @classmethod
- def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
- return TestTokenizer()
-
- @property
- def all_special_tokens(self) -> list[str]:
- raise NotImplementedError()
-
- @property
- def all_special_ids(self) -> list[int]:
- raise NotImplementedError()
-
- @property
- def bos_token_id(self) -> int:
- return 0
-
- @property
- def eos_token_id(self) -> int:
- return 1
-
- @property
- def sep_token(self) -> str:
- raise NotImplementedError()
-
- @property
- def pad_token(self) -> str:
- raise NotImplementedError()
-
- @property
- def is_fast(self) -> bool:
- raise NotImplementedError()
-
- @property
- def vocab_size(self) -> int:
- raise NotImplementedError()
-
- @property
- def max_token_id(self) -> int:
- raise NotImplementedError()
-
- @property
- def truncation_side(self) -> str:
- raise NotImplementedError()
-
- def __call__(
- self,
- text: str | list[str] | list[int],
- text_pair: str | None = None,
- add_special_tokens: bool = False,
- truncation: bool = False,
- max_length: int | None = None,
- ):
- raise NotImplementedError()
-
- def get_vocab(self) -> dict[str, int]:
- raise NotImplementedError()
-
- def get_added_vocab(self) -> dict[str, int]:
- raise NotImplementedError()
-
- def encode_one(
- self,
- text: str,
- truncation: bool = False,
- max_length: int | None = None,
- ) -> list[int]:
- raise NotImplementedError()
-
- def encode(self, text: str, add_special_tokens: bool | None = None) -> list[int]:
- raise NotImplementedError()
-
- def apply_chat_template(
- self,
- messages: list["ChatCompletionMessageParam"],
- tools: list[dict[str, Any]] | None = None,
- **kwargs,
- ) -> list[int]:
- raise NotImplementedError()
-
- def convert_tokens_to_string(self, tokens: list[str]) -> str:
- raise NotImplementedError()
-
- def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
- raise NotImplementedError()
-
- def convert_ids_to_tokens(
- self,
- ids: list[int],
- skip_special_tokens: bool = True,
- ) -> list[str]:
- raise NotImplementedError()
-
-
-def test_customized_tokenizer():
- TokenizerRegistry.register(
- "test_tokenizer", "tests.tokenization.test_tokenizer_registry", "TestTokenizer"
- )
-
- tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
- assert isinstance(tokenizer, TestTokenizer)
- assert tokenizer.bos_token_id == 0
- assert tokenizer.eos_token_id == 1
-
- tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
- assert isinstance(tokenizer, TestTokenizer)
- assert tokenizer.bos_token_id == 0
- assert tokenizer.eos_token_id == 1
diff --git a/tests/tokenizers_/__init__.py b/tests/tokenizers_/__init__.py
new file mode 100644
index 0000000000000..a5d7f4b031032
--- /dev/null
+++ b/tests/tokenizers_/__init__.py
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# NOTE: CI runs the tests from the `tests` directory, so this package uses a trailing
+# underscore (`tokenizers_`) to avoid shadowing HF's installed `tokenizers` package
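To make the note above concrete, a hedged sketch (hypothetical paths, not part of this change) of the shadowing problem the trailing underscore avoids: any directory at the front of `sys.path` wins import resolution, so a local package literally named `tokenizers` would be imported instead of HF's library.

import sys

# Hypothetical setup: pytest invoked from inside the repo's `tests` directory,
# which places that directory ahead of site-packages on the import path.
sys.path.insert(0, "tests")

# If a local `tests/tokenizers/` package existed, this would resolve to
# tests/tokenizers/__init__.py rather than HF's installed `tokenizers` library.
import tokenizers

print(tokenizers.__file__)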
diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py
new file mode 100644
index 0000000000000..1fca633cc5cd7
--- /dev/null
+++ b/tests/tokenizers_/test_basic.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import _get_protocol_attrs # type: ignore
+
+import pytest
+from transformers import PreTrainedTokenizerBase
+
+from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+
+def _get_missing_attrs(obj: object, target: type):
+ return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]
+
+
+def test_tokenizer_like_protocol():
+ assert not (
+ missing_attrs := _get_missing_attrs(
+ get_tokenizer("gpt2", use_fast=False),
+ TokenizerLike,
+ )
+ ), f"Missing attrs: {missing_attrs}"
+
+ assert not (
+ missing_attrs := _get_missing_attrs(
+ get_tokenizer("gpt2", use_fast=True),
+ TokenizerLike,
+ )
+ ), f"Missing attrs: {missing_attrs}"
+
+ assert not (
+ missing_attrs := _get_missing_attrs(
+ get_tokenizer(
+ "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
+ ),
+ TokenizerLike,
+ )
+ ), f"Missing attrs: {missing_attrs}"
+
+
+@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
+def test_tokenizer_revision(tokenizer_name: str):
+ # Assume that "main" branch always exists
+ tokenizer = get_tokenizer(tokenizer_name, revision="main")
+ assert isinstance(tokenizer, PreTrainedTokenizerBase)
+
+ # Assume that "never" branch always does not exist
+ with pytest.raises(OSError, match="not a valid git identifier"):
+ get_tokenizer(tokenizer_name, revision="never")
+
+
+@pytest.mark.parametrize("tokenizer_name", ["BAAI/bge-base-en"])
+@pytest.mark.parametrize("n_tokens", [510])
+def test_special_tokens(tokenizer_name: str, n_tokens: int):
+ tokenizer = get_tokenizer(tokenizer_name, revision="main")
+
+ prompts = "[UNK]" * n_tokens
+ prompt_token_ids = tokenizer.encode(prompts)
+ assert len(prompt_token_ids) == n_tokens + 2
diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenizers_/test_cached_tokenizer.py
similarity index 88%
rename from tests/tokenization/test_cached_tokenizer.py
rename to tests/tokenizers_/test_cached_tokenizer.py
index a5bb3dbcfe29d..48234687ea1ea 100644
--- a/tests/tokenization/test_cached_tokenizer.py
+++ b/tests/tokenizers_/test_cached_tokenizer.py
@@ -6,7 +6,8 @@ from copy import deepcopy
import pytest
from transformers import AutoTokenizer
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_cached_tokenizer
+from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.tokenizer import get_cached_tokenizer
@pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"])
@@ -25,7 +26,7 @@ def test_cached_tokenizer(model_id: str):
_check_consistency(unpickled_tokenizer, reference_tokenizer)
-def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer):
+def _check_consistency(target: TokenizerLike, expected: TokenizerLike):
assert isinstance(target, type(expected))
# Cached attributes
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenizers_/test_detokenize.py
similarity index 99%
rename from tests/tokenization/test_detokenize.py
rename to tests/tokenizers_/test_detokenize.py
index f4b43a21daaa8..ae1d6b0956722 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenizers_/test_detokenize.py
@@ -8,7 +8,7 @@ import pytest
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import (
FastIncrementalDetokenizer,
diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenizers_/test_mistral.py
similarity index 98%
rename from tests/tokenization/test_mistral_tokenizer.py
rename to tests/tokenizers_/test_mistral.py
index 4cdfa9df95e1a..0706a94791dc9 100644
--- a/tests/tokenization/test_mistral_tokenizer.py
+++ b/tests/tokenizers_/test_mistral.py
@@ -7,7 +7,7 @@ import pytest
from mistral_common.exceptions import InvalidMessageStructureException
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
-from vllm.transformers_utils.tokenizers.mistral import (
+from vllm.tokenizers.mistral import (
MistralTokenizer,
_prepare_apply_chat_template_tools_and_messages,
)
@@ -308,25 +308,6 @@ class TestMistralTokenizer:
def test_get_added_vocab(self, mistral_tokenizer: MistralTokenizer):
assert mistral_tokenizer.get_added_vocab() == {}
- def test_encode_one(self, mistral_tokenizer: MistralTokenizer):
- token_ids = (
- [22177, 4304, 2662] if mistral_tokenizer.is_tekken else [23325, 2294, 1686]
- )
-
- assert mistral_tokenizer.encode_one("Hello world !") == token_ids
- assert mistral_tokenizer.encode_one("Hello world !", max_length=1) == token_ids
- assert (
- mistral_tokenizer.encode_one("Hello world !", truncation=True, max_length=1)
- == token_ids[:-2]
- )
- assert (
- mistral_tokenizer.encode_one(
- "Hello world !", truncation=False, max_length=1
- )
- == token_ids
- )
- assert mistral_tokenizer.encode_one("") == []
-
def test_encode(self, mistral_tokenizer: MistralTokenizer):
token_ids = (
[1, 22177, 4304, 2662]
diff --git a/tests/tokenizers_/test_registry.py b/tests/tokenizers_/test_registry.py
new file mode 100644
index 0000000000000..1eb19a0996dd9
--- /dev/null
+++ b/tests/tokenizers_/test_registry.py
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.tokenizers import TokenizerLike, TokenizerRegistry
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+
+class TestTokenizer(TokenizerLike):
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
+ return TestTokenizer() # type: ignore
+
+ @property
+ def bos_token_id(self) -> int:
+ return 0
+
+ @property
+ def eos_token_id(self) -> int:
+ return 1
+
+
+def test_customized_tokenizer():
+ TokenizerRegistry.register(
+ "test_tokenizer",
+ __name__,
+ TestTokenizer.__name__,
+ )
+
+ tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
+ assert isinstance(tokenizer, TestTokenizer)
+ assert tokenizer.bos_token_id == 0
+ assert tokenizer.eos_token_id == 1
+
+ tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
+ assert isinstance(tokenizer, TestTokenizer)
+ assert tokenizer.bos_token_id == 0
+ assert tokenizer.eos_token_id == 1
diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py
index 36a07bb561d9e..ee9da4fd6464b 100644
--- a/tests/tool_use/test_ernie45_moe_tool_parser.py
+++ b/tests/tool_use/test_ernie45_moe_tool_parser.py
@@ -14,8 +14,9 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
# Use a common model that is likely to be available
MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"
@@ -173,7 +174,7 @@ def test_extract_tool_calls(
def stream_delta_message_generator(
ernie45_tool_parser: Ernie45ToolParser,
- ernie45_tokenizer: AnyTokenizer,
+ ernie45_tokenizer: TokenizerLike,
model_output: str,
request: ChatCompletionRequest | None = None,
) -> Generator[DeltaMessage, None, None]:
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index 9eb73b80fa9b4..2413b983fe871 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -10,8 +10,9 @@ from partial_json_parser.core.options import Allow
from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
@@ -44,7 +45,9 @@ def assert_tool_calls(
def stream_delta_message_generator(
- jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer, model_output: str
+ jamba_tool_parser: JambaToolParser,
+ jamba_tokenizer: TokenizerLike,
+ model_output: str,
) -> Generator[DeltaMessage, None, None]:
all_token_ids = jamba_tokenizer.encode(model_output, add_special_tokens=False)
diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py
index 93ef1049fc07e..3cf1f4ef89f14 100644
--- a/tests/tool_use/test_qwen3coder_tool_parser.py
+++ b/tests/tool_use/test_qwen3coder_tool_parser.py
@@ -17,8 +17,9 @@ from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
Qwen3CoderToolParser,
)
from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
@@ -104,7 +105,7 @@ def assert_tool_calls(
def stream_delta_message_generator(
qwen3_tool_parser,
- qwen3_tokenizer: AnyTokenizer,
+ qwen3_tokenizer: TokenizerLike,
model_output: str,
request: ChatCompletionRequest | None = None,
) -> Generator[DeltaMessage, None, None]:
diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py
index 1367ad87cb019..8e1ad5e9cedc8 100644
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@@ -15,8 +15,9 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
@@ -256,7 +257,7 @@ def test_streaming_tool_calls_no_tools(seed_oss_tool_parser):
def stream_delta_message_generator(
seed_oss_tool_parser: SeedOssToolParser,
- seed_oss_tokenizer: AnyTokenizer,
+ seed_oss_tokenizer: TokenizerLike,
model_output: str,
request: ChatCompletionRequest | None = None,
) -> Generator[DeltaMessage, None, None]:
diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py
index 122b427d60409..a1852c368eeb8 100644
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@@ -13,8 +13,9 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
@@ -49,7 +50,7 @@ def assert_tool_calls(
def stream_delta_message_generator(
xlam_tool_parser: xLAMToolParser,
- xlam_tokenizer: AnyTokenizer,
+ xlam_tokenizer: TokenizerLike,
model_output: str,
request: ChatCompletionRequest | None = None,
) -> Generator[DeltaMessage, None, None]:
diff --git a/tests/transformers_utils/test_config.py b/tests/transformers_utils/test_config.py
index 7107ad0f7b99d..7b56c9f0189d4 100644
--- a/tests/transformers_utils/test_config.py
+++ b/tests/transformers_utils/test_config.py
@@ -1,62 +1,32 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This test file includes some cases where it is inappropriate to
+only get the `eos_token_id` from the tokenizer as defined by
+`vllm.LLMEngine._get_eos_token_id`.
+"""
+
+from vllm.transformers_utils.config import try_get_generation_config
+from vllm.transformers_utils.tokenizer import get_tokenizer
-import tempfile
-from pathlib import Path
-from unittest.mock import MagicMock, call, patch
+def test_get_llama3_eos_token():
+ model_name = "meta-llama/Llama-3.2-1B-Instruct"
-import pytest
+ tokenizer = get_tokenizer(model_name)
+ assert tokenizer.eos_token_id == 128009
-from vllm.transformers_utils.repo_utils import list_filtered_repo_files
+ generation_config = try_get_generation_config(model_name, trust_remote_code=False)
+ assert generation_config is not None
+ assert generation_config.eos_token_id == [128001, 128008, 128009]
-@pytest.mark.parametrize(
- "allow_patterns,expected_relative_files",
- [
- (
- ["*.json", "correct*.txt"],
- ["json_file.json", "subfolder/correct.txt", "correct_2.txt"],
- ),
- ],
-)
-def test_list_filtered_repo_files(
- allow_patterns: list[str], expected_relative_files: list[str]
-):
- with tempfile.TemporaryDirectory() as tmp_dir:
- # Prep folder and files
- path_tmp_dir = Path(tmp_dir)
- subfolder = path_tmp_dir / "subfolder"
- subfolder.mkdir()
- (path_tmp_dir / "json_file.json").touch()
- (path_tmp_dir / "correct_2.txt").touch()
- (path_tmp_dir / "uncorrect.txt").touch()
- (path_tmp_dir / "uncorrect.jpeg").touch()
- (subfolder / "correct.txt").touch()
- (subfolder / "uncorrect_sub.txt").touch()
+def test_get_blip2_eos_token():
+ model_name = "Salesforce/blip2-opt-2.7b"
- def _glob_path() -> list[str]:
- return [
- str(file.relative_to(path_tmp_dir))
- for file in path_tmp_dir.glob("**/*")
- if file.is_file()
- ]
+ tokenizer = get_tokenizer(model_name)
+ assert tokenizer.eos_token_id == 2
- # Patch list_repo_files called by fn
- with patch(
- "vllm.transformers_utils.repo_utils.list_repo_files",
- MagicMock(return_value=_glob_path()),
- ) as mock_list_repo_files:
- out_files = sorted(
- list_filtered_repo_files(
- tmp_dir, allow_patterns, "revision", "model", "token"
- )
- )
- assert out_files == sorted(expected_relative_files)
- assert mock_list_repo_files.call_count == 1
- assert mock_list_repo_files.call_args_list[0] == call(
- repo_id=tmp_dir,
- revision="revision",
- repo_type="model",
- token="token",
- )
+ generation_config = try_get_generation_config(model_name, trust_remote_code=False)
+ assert generation_config is not None
+ assert generation_config.eos_token_id == 50118
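The docstring above describes engine behavior rather than these specific models; as an illustration, here is a minimal sketch (assumed helper, not vLLM's actual `LLMEngine._get_eos_token_id`) of preferring the generation config's EOS ids over the tokenizer's single `eos_token_id` and normalizing the result to a list.

def resolve_eos_token_ids(tokenizer, generation_config) -> list[int]:
    # Prefer the generation config, which may define several EOS ids
    # (e.g. [128001, 128008, 128009] for Llama-3.2-1B-Instruct above).
    if generation_config is not None and generation_config.eos_token_id is not None:
        eos = generation_config.eos_token_id
        return list(eos) if isinstance(eos, (list, tuple)) else [eos]
    # Fall back to the tokenizer's single eos_token_id.
    return [tokenizer.eos_token_id]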
diff --git a/tests/transformers_utils/test_get_processor_kwargs_from_processor.py b/tests/transformers_utils/test_processor.py
similarity index 100%
rename from tests/transformers_utils/test_get_processor_kwargs_from_processor.py
rename to tests/transformers_utils/test_processor.py
diff --git a/tests/transformers_utils/test_repo_utils.py b/tests/transformers_utils/test_repo_utils.py
new file mode 100644
index 0000000000000..7107ad0f7b99d
--- /dev/null
+++ b/tests/transformers_utils/test_repo_utils.py
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, call, patch
+
+import pytest
+
+from vllm.transformers_utils.repo_utils import list_filtered_repo_files
+
+
+@pytest.mark.parametrize(
+ "allow_patterns,expected_relative_files",
+ [
+ (
+ ["*.json", "correct*.txt"],
+ ["json_file.json", "subfolder/correct.txt", "correct_2.txt"],
+ ),
+ ],
+)
+def test_list_filtered_repo_files(
+ allow_patterns: list[str], expected_relative_files: list[str]
+):
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ # Prep folder and files
+ path_tmp_dir = Path(tmp_dir)
+ subfolder = path_tmp_dir / "subfolder"
+ subfolder.mkdir()
+ (path_tmp_dir / "json_file.json").touch()
+ (path_tmp_dir / "correct_2.txt").touch()
+ (path_tmp_dir / "uncorrect.txt").touch()
+ (path_tmp_dir / "uncorrect.jpeg").touch()
+ (subfolder / "correct.txt").touch()
+ (subfolder / "uncorrect_sub.txt").touch()
+
+ def _glob_path() -> list[str]:
+ return [
+ str(file.relative_to(path_tmp_dir))
+ for file in path_tmp_dir.glob("**/*")
+ if file.is_file()
+ ]
+
+ # Patch list_repo_files called by fn
+ with patch(
+ "vllm.transformers_utils.repo_utils.list_repo_files",
+ MagicMock(return_value=_glob_path()),
+ ) as mock_list_repo_files:
+ out_files = sorted(
+ list_filtered_repo_files(
+ tmp_dir, allow_patterns, "revision", "model", "token"
+ )
+ )
+ assert out_files == sorted(expected_relative_files)
+ assert mock_list_repo_files.call_count == 1
+ assert mock_list_repo_files.call_args_list[0] == call(
+ repo_id=tmp_dir,
+ revision="revision",
+ repo_type="model",
+ token="token",
+ )
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 8e1198b315bd1..990aa9d925855 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -18,7 +18,7 @@ from vllm.logprobs import PromptLogprobs, SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.v1.engine import (
EngineCoreEvent,
EngineCoreEventType,
@@ -31,7 +31,7 @@ from vllm.v1.metrics.stats import IterationStats, SchedulerStats
def _ref_convert_id_to_token(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
token_id: int,
) -> str:
"""Reference impl of logprobs detokenization.
diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py
index b96a6701333de..2bb468da68c2a 100644
--- a/tools/pre_commit/check_pickle_imports.py
+++ b/tools/pre_commit/check_pickle_imports.py
@@ -27,8 +27,8 @@ ALLOWED_FILES = {
"vllm/distributed/device_communicators/shm_broadcast.py",
"vllm/distributed/device_communicators/shm_object_storage.py",
"vllm/utils/hashing.py",
+ "tests/tokenizers_/test_cached_tokenizer.py",
"tests/utils_/test_hashing.py",
- "tests/tokenization/test_cached_tokenizer.py",
"benchmarks/kernels/graph_machete_bench.py",
"benchmarks/kernels/benchmark_lora.py",
"benchmarks/kernels/benchmark_machete.py",
diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 47e01fc93b48b..724b393044266 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -35,6 +35,7 @@ FILES = [
"vllm/multimodal",
"vllm/platforms",
"vllm/plugins",
+ "vllm/tokenizers",
"vllm/transformers_utils",
"vllm/triton_utils",
"vllm/usage",
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 5411ecbb27b27..ec9b0fd6e969c 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -39,7 +39,7 @@ from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.import_utils import PlaceholderModule
try:
@@ -293,7 +293,7 @@ def lora_path_on_disk(lora_path: str) -> str:
# Global cache for LoRA tokenizers.
-lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
+lora_tokenizer_cache: dict[int, TokenizerLike] = {}
def process_image(image: Any) -> Mapping[str, Any]:
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 02741e50f6aa0..f2b19c845018c 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -13,7 +13,7 @@ from vllm.plugins.io_processors import IOProcessor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.input_processor import InputProcessor
@@ -85,7 +85,7 @@ class EngineClient(ABC):
...
@abstractmethod
- async def get_tokenizer(self) -> AnyTokenizer:
+ async def get_tokenizer(self) -> TokenizerLike:
"""Get the tokenizer"""
...
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index bf80856c1bbfc..1643906894c66 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -49,9 +49,9 @@ from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
from vllm.transformers_utils.processor import cached_get_processor
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
from vllm.utils.func_utils import supports_kw
@@ -536,7 +536,7 @@ def resolve_hf_chat_template(
def _resolve_chat_template_content_format(
chat_template: str | None,
tools: list[dict[str, Any]] | None,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
@@ -593,7 +593,7 @@ def resolve_chat_template_content_format(
chat_template: str | None,
tools: list[dict[str, Any]] | None,
given_format: ChatTemplateContentFormatOption,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
@@ -627,7 +627,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
maximum per prompt.
"""
- def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
+ def __init__(self, model_config: ModelConfig, tokenizer: TokenizerLike):
super().__init__()
self._model_config = model_config
@@ -1592,7 +1592,7 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
def parse_chat_messages(
messages: list[ChatCompletionMessageParam],
model_config: ModelConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
content_format: _ChatTemplateContentFormat,
) -> tuple[
list[ConversationMessage],
@@ -1624,7 +1624,7 @@ def parse_chat_messages(
def parse_chat_messages_futures(
messages: list[ChatCompletionMessageParam],
model_config: ModelConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
content_format: _ChatTemplateContentFormat,
) -> tuple[
list[ConversationMessage],
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 2b34f36253edf..4ea213752e394 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -71,11 +71,8 @@ from vllm.platforms import current_platform
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
from vllm.tasks import PoolingTask
-from vllm.transformers_utils.tokenizer import (
- AnyTokenizer,
- MistralTokenizer,
- get_cached_tokenizer,
-)
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
+from vllm.transformers_utils.tokenizer import get_cached_tokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils.collection_utils import as_iter, is_list_of
from vllm.utils.counter import Counter
@@ -350,11 +347,11 @@ class LLM:
self.input_processor = self.llm_engine.input_processor
self.io_processor = self.llm_engine.io_processor
- def get_tokenizer(self) -> AnyTokenizer:
+ def get_tokenizer(self) -> TokenizerLike:
return self.llm_engine.get_tokenizer()
@deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.")
- def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
+ def set_tokenizer(self, tokenizer: TokenizerLike) -> None:
# While CachedTokenizer is dynamic, have no choice but
# compare class name. Misjudgment will arise from
# user-defined tokenizer started with 'Cached'
@@ -1244,7 +1241,7 @@ class LLM:
def _embedding_score(
self,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
text_1: list[str | TextPrompt | TokensPrompt],
text_2: list[str | TextPrompt | TokensPrompt],
truncate_prompt_tokens: int | None = None,
@@ -1276,7 +1273,7 @@ class LLM:
def _cross_encoding_score(
self,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
data_1: list[str] | list[ScoreContentPartParam],
data_2: list[str] | list[ScoreContentPartParam],
truncate_prompt_tokens: int | None = None,
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 9a7051e0920af..cecd1da1e5548 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -62,8 +62,9 @@ from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.transformers_utils.tokenizers import (
+from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers.mistral import (
+ MistralTokenizer,
maybe_serialize_tool_calls,
truncate_tool_call_ids,
validate_request_params,
@@ -530,7 +531,7 @@ class OpenAIServingChat(OpenAIServing):
request_id: str,
model_name: str,
conversation: list[ConversationMessage],
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
request_metadata: RequestResponseMetadata,
) -> AsyncGenerator[str, None]:
created_time = int(time.time())
@@ -1296,7 +1297,7 @@ class OpenAIServingChat(OpenAIServing):
request_id: str,
model_name: str,
conversation: list[ConversationMessage],
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
request_metadata: RequestResponseMetadata,
) -> ErrorResponse | ChatCompletionResponse:
created_time = int(time.time())
@@ -1624,7 +1625,7 @@ class OpenAIServingChat(OpenAIServing):
self,
logprobs: dict[int, Logprob],
top_logprobs: int | None,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
should_return_as_token_id: bool,
) -> list[ChatCompletionLogProb]:
return [
@@ -1648,7 +1649,7 @@ class OpenAIServingChat(OpenAIServing):
self,
token_ids: GenericSequence[int],
top_logprobs: GenericSequence[dict[int, Logprob] | None],
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
num_output_top_logprobs: int | None = None,
return_as_token_id: bool | None = None,
) -> ChatCompletionLogProbs:
diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py
index 167ee152fece3..3b973eb125a83 100644
--- a/vllm/entrypoints/openai/serving_classification.py
+++ b/vllm/entrypoints/openai/serving_classification.py
@@ -221,7 +221,7 @@ class ServingClassification(ClassificationMixin):
def _create_pooling_params(
self,
- ctx: ClassificationServeContext,
+ ctx: ServeContext[ClassificationRequest],
) -> PoolingParams | ErrorResponse:
pooling_params = super()._create_pooling_params(ctx)
if isinstance(pooling_params, ErrorResponse):
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 9681aa8c71e6d..3e421e21e3e80 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -33,7 +33,7 @@ from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
@@ -326,7 +326,7 @@ class OpenAIServingCompletion(OpenAIServing):
created_time: int,
model_name: str,
num_prompts: int,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike | None,
request_metadata: RequestResponseMetadata,
) -> AsyncGenerator[str, None]:
num_choices = 1 if request.n is None else request.n
@@ -511,7 +511,7 @@ class OpenAIServingCompletion(OpenAIServing):
request_id: str,
created_time: int,
model_name: str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike | None,
request_metadata: RequestResponseMetadata,
) -> CompletionResponse:
choices: list[CompletionResponseChoice] = []
@@ -622,7 +622,7 @@ class OpenAIServingCompletion(OpenAIServing):
token_ids: GenericSequence[int],
top_logprobs: GenericSequence[dict[int, Logprob] | None],
num_output_top_logprobs: int,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike | None,
initial_text_offset: int = 0,
return_as_token_id: bool | None = None,
) -> CompletionLogProbs:
@@ -642,9 +642,15 @@ class OpenAIServingCompletion(OpenAIServing):
for i, token_id in enumerate(token_ids):
step_top_logprobs = top_logprobs[i]
if step_top_logprobs is None:
- token = tokenizer.decode(token_id)
if should_return_as_token_id:
token = f"token_id:{token_id}"
+ else:
+ if tokenizer is None:
+ raise ValueError(
+ "Unable to get tokenizer because `skip_tokenizer_init=True`"
+ )
+
+ token = tokenizer.decode(token_id)
out_tokens.append(token)
out_token_logprobs.append(None)
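
Note: the reordered branch above (and the matching guard added to `_get_decoded_token` later in serving_engine.py) follows one pattern: the tokenizer is only touched when the caller did not ask for raw token ids, and the request fails loudly if the server was started with `skip_tokenizer_init=True`. A self-contained sketch of that pattern, with the function name invented for illustration:

    def render_token(token_id: int, tokenizer=None, as_token_id: bool = False) -> str:
        # Mirrors the guard introduced above: the tokenizer is required only on
        # the decode path, never when returning "token_id:<id>" strings.
        if as_token_id:
            return f"token_id:{token_id}"
        if tokenizer is None:
            raise ValueError(
                "Unable to get tokenizer because `skip_tokenizer_init=True`"
            )
        return tokenizer.decode(token_id)
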
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index cca2fd982fe0f..e7a632e025103 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -7,13 +7,14 @@ import time
import traceback
from collections.abc import AsyncGenerator, Callable, Iterable, Mapping, Sequence
from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any, ClassVar, Generic, TypeAlias, TypeVar
import numpy as np
import torch
from fastapi import Request
-from pydantic import BaseModel, ConfigDict, Field, TypeAdapter
+from pydantic import ConfigDict, TypeAdapter
from starlette.datastructures import Headers
from typing_extensions import TypeIs
@@ -96,12 +97,12 @@ from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import BeamSearchParams, SamplingParams
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tracing import (
contains_trace_headers,
extract_trace_headers,
log_tracing_disabled_warning,
)
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
from vllm.utils.async_utils import (
AsyncMicrobatchTokenizer,
@@ -184,19 +185,19 @@ def is_embeds_prompt(prompt: RequestPrompt) -> TypeIs[EmbedsPrompt]:
RequestT = TypeVar("RequestT", bound=AnyRequest)
-class RequestProcessingMixin(BaseModel):
+@dataclass(kw_only=True)
+class RequestProcessingMixin:
"""
Mixin for request processing,
handling prompt preparation and engine input.
"""
- request_prompts: Sequence[RequestPrompt] | None = []
- engine_prompts: list[EngineTokensPrompt] | None = []
-
- model_config = ConfigDict(arbitrary_types_allowed=True)
+ request_prompts: Sequence[RequestPrompt] | None = field(default_factory=list)
+ engine_prompts: list[EngineTokensPrompt] | None = field(default_factory=list)
-class ResponseGenerationMixin(BaseModel):
+@dataclass(kw_only=True)
+class ResponseGenerationMixin:
"""
Mixin for response generation,
managing result generators and final batch results.
@@ -205,54 +206,38 @@ class ResponseGenerationMixin(BaseModel):
result_generator: (
AsyncGenerator[tuple[int, RequestOutput | PoolingRequestOutput], None] | None
) = None
- final_res_batch: list[RequestOutput | PoolingRequestOutput] = Field(
+ final_res_batch: list[RequestOutput | PoolingRequestOutput] = field(
default_factory=list
)
model_config = ConfigDict(arbitrary_types_allowed=True)
-class ServeContext(
- RequestProcessingMixin,
- ResponseGenerationMixin,
- BaseModel,
- Generic[RequestT],
-):
+@dataclass(kw_only=True)
+class ServeContext(RequestProcessingMixin, ResponseGenerationMixin, Generic[RequestT]):
# Shared across all requests
request: RequestT
raw_request: Request | None = None
model_name: str
request_id: str
- created_time: int = Field(default_factory=lambda: int(time.time()))
+ created_time: int = field(default_factory=lambda: int(time.time()))
lora_request: LoRARequest | None = None
# Shared across most requests
- tokenizer: AnyTokenizer | None = None
-
- # `protected_namespaces` resolves Pydantic v2's warning
- # on conflict with protected namespace "model_"
- model_config = ConfigDict(
- protected_namespaces=(),
- arbitrary_types_allowed=True,
- )
+ tokenizer: TokenizerLike | None = None
-ClassificationServeContext = ServeContext[ClassificationRequest]
+@dataclass(kw_only=True)
+class ClassificationServeContext(ServeContext[ClassificationRequest]):
+ pass
+@dataclass(kw_only=True)
class EmbeddingServeContext(ServeContext[EmbeddingRequest]):
chat_template: str | None = None
chat_template_content_format: ChatTemplateContentFormatOption
-# Used to resolve the Pydantic error related to
-# forward reference of MultiModalDataDict in TokensPrompt
-RequestProcessingMixin.model_rebuild()
-ServeContext.model_rebuild()
-ClassificationServeContext.model_rebuild()
-EmbeddingServeContext.model_rebuild()
-
-
class OpenAIServing:
request_id_prefix: ClassVar[str] = """
A short string prepended to every request’s ID (e.g. "embd", "classify")
@@ -281,7 +266,7 @@ class OpenAIServing:
apply_mistral_chat_template, executor=self._tokenizer_executor
)
- self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {}
+ self._async_tokenizer_pool: dict[TokenizerLike, AsyncMicrobatchTokenizer] = {}
self.log_error_stack = log_error_stack
self.input_processor = self.models.input_processor
@@ -291,7 +276,7 @@ class OpenAIServing:
def _get_tool_parser(
self, tool_parser_name: str | None = None, enable_auto_tools: bool = False
- ) -> Callable[[AnyTokenizer], ToolParser] | None:
+ ) -> Callable[[TokenizerLike], ToolParser] | None:
"""Get the tool parser based on the name."""
parser = None
if not enable_auto_tools or tool_parser_name is None:
@@ -317,7 +302,7 @@ class OpenAIServing:
def _get_reasoning_parser(
self,
reasoning_parser_name: str,
- ) -> Callable[[AnyTokenizer], ReasoningParser] | None:
+ ) -> Callable[[TokenizerLike], ReasoningParser] | None:
"""Get the reasoning parser based on the name."""
parser = None
if not reasoning_parser_name:
@@ -547,7 +532,7 @@ class OpenAIServing:
prompt_logprobs=None,
)
- def _get_renderer(self, tokenizer: AnyTokenizer | None) -> BaseRenderer:
+ def _get_renderer(self, tokenizer: TokenizerLike | None) -> BaseRenderer:
"""
Get a Renderer instance with the provided tokenizer.
Uses shared async tokenizer pool for efficiency.
@@ -877,7 +862,7 @@ class OpenAIServing:
self,
request: AnyRequest,
prompt: str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
add_special_tokens: bool,
) -> TextTokensPrompt:
async_tokenizer = self._get_async_tokenizer(tokenizer)
@@ -919,7 +904,7 @@ class OpenAIServing:
self,
request: AnyRequest,
prompt_ids: list[int],
- tokenizer: AnyTokenizer | None,
+ tokenizer: TokenizerLike | None,
) -> TextTokensPrompt:
truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
@@ -1015,7 +1000,7 @@ class OpenAIServing:
async def _tokenize_prompt_input_async(
self,
request: AnyRequest,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
prompt_input: str | list[int],
add_special_tokens: bool = True,
) -> TextTokensPrompt:
@@ -1034,7 +1019,7 @@ class OpenAIServing:
async def _tokenize_prompt_inputs_async(
self,
request: AnyRequest,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
prompt_inputs: Iterable[str | list[int]],
add_special_tokens: bool = True,
) -> AsyncGenerator[TextTokensPrompt, None]:
@@ -1079,7 +1064,7 @@ class OpenAIServing:
async def _preprocess_chat(
self,
request: ChatLikeRequest | ResponsesRequest,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike | None,
messages: list[ChatCompletionMessageParam],
chat_template: str | None,
chat_template_content_format: ChatTemplateContentFormatOption,
@@ -1088,13 +1073,18 @@ class OpenAIServing:
tool_dicts: list[dict[str, Any]] | None = None,
documents: list[dict[str, str]] | None = None,
chat_template_kwargs: dict[str, Any] | None = None,
- tool_parser: Callable[[AnyTokenizer], ToolParser] | None = None,
+ tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
add_special_tokens: bool = False,
) -> tuple[
list[ConversationMessage],
Sequence[RequestPrompt],
list[EngineTokensPrompt],
]:
+ if tokenizer is None:
+ raise ValueError(
+ "Unable to get tokenizer because `skip_tokenizer_init=True`"
+ )
+
model_config = self.model_config
resolved_content_format = resolve_chat_template_content_format(
@@ -1370,9 +1360,9 @@ class OpenAIServing:
@staticmethod
def _parse_tool_calls_from_content(
request: ResponsesRequest | ChatCompletionRequest,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
enable_auto_tools: bool,
- tool_parser_cls: Callable[[AnyTokenizer], ToolParser] | None,
+ tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
content: str | None = None,
) -> tuple[list[FunctionCall] | None, str | None]:
function_calls = list[FunctionCall]()
@@ -1442,7 +1432,7 @@ class OpenAIServing:
def _get_decoded_token(
logprob: Logprob,
token_id: int,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike | None,
return_as_token_id: bool = False,
) -> str:
if return_as_token_id:
@@ -1450,6 +1440,12 @@ class OpenAIServing:
if logprob.decoded_token is not None:
return logprob.decoded_token
+
+ if tokenizer is None:
+ raise ValueError(
+ "Unable to get tokenizer because `skip_tokenizer_init=True`"
+ )
+
return tokenizer.decode(token_id)
def _is_model_supported(self, model_name: str | None) -> bool:
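
Note: besides the type rename, this file drops Pydantic from the request/response mixins in favour of keyword-only dataclasses, which also removes the `model_rebuild()` calls and the `protected_namespaces` workaround. A runnable analogue of the conversion pattern; the class and field names here are illustrative, not vLLM's:

    import time
    from dataclasses import dataclass, field

    @dataclass(kw_only=True)
    class ExampleServeContext:
        request_id: str
        model_name: str
        created_time: int = field(default_factory=lambda: int(time.time()))
        final_res_batch: list = field(default_factory=list)

    # kw_only=True forces keyword construction, matching how the contexts are
    # built throughout the serving layer.
    ctx = ExampleServeContext(request_id="req-1", model_name="demo")
    print(ctx.created_time, ctx.final_res_batch)
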
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index f546dbda7fef5..5144916ba71e9 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -105,7 +105,7 @@ from vllm.logprobs import Logprob as SampleLogprob
from vllm.logprobs import SampleLogprobs
from vllm.outputs import CompletionOutput
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils import random_uuid
logger = init_logger(__name__)
@@ -492,7 +492,7 @@ class OpenAIServingResponses(OpenAIServing):
self,
request: ResponsesRequest,
prev_response: ResponsesResponse | None,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
):
if request.tools is None or (
request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
@@ -563,7 +563,7 @@ class OpenAIServingResponses(OpenAIServing):
result_generator: AsyncIterator[ConversationContext],
context: ConversationContext,
model_name: str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
request_metadata: RequestResponseMetadata,
created_time: int | None = None,
) -> ErrorResponse | ResponsesResponse:
@@ -675,7 +675,7 @@ class OpenAIServingResponses(OpenAIServing):
self,
logprobs: dict[int, SampleLogprob],
top_logprobs: int,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> list[LogprobTopLogprob]:
"""Returns the top-k logprobs from the logprobs dictionary."""
out = []
@@ -700,7 +700,7 @@ class OpenAIServingResponses(OpenAIServing):
self,
token_ids: Sequence[int],
logprobs: SampleLogprobs | None,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
top_logprobs: int | None = None,
) -> list[Logprob]:
assert logprobs is not None, "logprobs must be provided"
@@ -736,7 +736,7 @@ class OpenAIServingResponses(OpenAIServing):
self,
token_ids: Sequence[int],
logprobs: SampleLogprobs | None,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
top_logprobs: int | None = None,
) -> list[response_text_delta_event.Logprob]:
lgs = self._create_response_logprobs(
@@ -763,7 +763,7 @@ class OpenAIServingResponses(OpenAIServing):
self,
request: ResponsesRequest,
final_output: CompletionOutput,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> list[ResponseOutputItem]:
if self.reasoning_parser:
try:
@@ -1135,7 +1135,7 @@ class OpenAIServingResponses(OpenAIServing):
result_generator: AsyncIterator[ConversationContext | None],
context: ConversationContext,
model_name: str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
request_metadata: RequestResponseMetadata,
created_time: int,
_increment_sequence_number_and_return: Callable[
@@ -1438,7 +1438,7 @@ class OpenAIServingResponses(OpenAIServing):
result_generator: AsyncIterator[ConversationContext | None],
context: ConversationContext,
model_name: str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
request_metadata: RequestResponseMetadata,
created_time: int,
_increment_sequence_number_and_return: Callable[
@@ -1891,7 +1891,7 @@ class OpenAIServingResponses(OpenAIServing):
result_generator: AsyncIterator[ConversationContext | None],
context: ConversationContext,
model_name: str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
request_metadata: RequestResponseMetadata,
created_time: int | None = None,
) -> AsyncGenerator[StreamingResponsesResponse, None]:
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 9cbfc9791819e..0874c01c1f2a7 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -36,7 +36,7 @@ from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.utils.async_utils import make_async, merge_async_iterators
logger = init_logger(__name__)
@@ -60,7 +60,7 @@ class ServingScores(OpenAIServing):
async def _embedding_score(
self,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
texts_1: list[str],
texts_2: list[str],
request: RerankRequest | ScoreRequest,
@@ -153,7 +153,7 @@ class ServingScores(OpenAIServing):
def _preprocess_score(
self,
request: RerankRequest | ScoreRequest,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
tokenization_kwargs: dict[str, Any],
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
@@ -175,7 +175,7 @@ class ServingScores(OpenAIServing):
async def _cross_encoding_score(
self,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
data_1: list[str] | list[ScoreContentPartParam],
data_2: list[str] | list[ScoreContentPartParam],
request: RerankRequest | ScoreRequest,
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 39aae0cd04956..979da02d14500 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -22,7 +22,7 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
@@ -170,7 +170,7 @@ class OpenAIServingTokenization(OpenAIServing):
@dataclass
class TokenizerInfo:
- tokenizer: AnyTokenizer
+ tokenizer: TokenizerLike
chat_template: str | None
def to_dict(self) -> dict[str, Any]:
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index e99e405f5de65..87ef2e0786a94 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -22,7 +22,7 @@ from vllm.logger import init_logger
from vllm.sampling_params import (
StructuredOutputsParams,
)
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path
@@ -36,7 +36,7 @@ class ToolParser:
derived classes.
"""
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
self.prev_tool_call_arr: list[dict] = []
# the index of the tool call that is currently being parsed
self.current_tool_id: int = -1
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
index cbeb879969ece..10de3dabf985c 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
@@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class DeepSeekV31ToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.current_tool_name_sent: bool = False
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
index bf7f6fa61ab90..66b14875dce25 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class DeepSeekV3ToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.current_tool_name_sent: bool = False
diff --git a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
index 82370323cb00d..d054d8e4b8651 100644
--- a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py
@@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class Ernie45ToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
"""
Ernie thinking model format:
abc\n\n\n\n\ndef\n\n
diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
index 389e9754b34da..165346adb3d93 100644
--- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
@@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class Glm4MoeModelToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.current_tool_name_sent = False
self.prev_tool_call_arr: list[dict] = []
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index ae9217426fb51..df1b590526b1a 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -29,7 +29,7 @@ from vllm.entrypoints.openai.tool_parsers.utils import (
partial_json_loads,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
@@ -44,7 +44,7 @@ class Granite20bFCToolParser(ToolParser):
are all set
"""
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.bot_token = ""
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index d29c427694dc9..14b0ca0abe357 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -27,7 +27,7 @@ from vllm.entrypoints.openai.tool_parsers.utils import (
partial_json_loads,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
@@ -42,7 +42,7 @@ class GraniteToolParser(ToolParser):
are all set
"""
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
# for granite 3.0, the token `<|tool_call|>`
self.bot_token = "<|tool_call|>"
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index 4336a5438109f..19c1c83268ed4 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -22,18 +22,18 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
logger = init_logger(__name__)
class Hermes2ProToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
- if isinstance(self.model_tokenizer, MistralTokenizer):
+ if isinstance(tokenizer, MistralTokenizer):
logger.error("Detected Mistral tokenizer when using a Hermes model")
- self.model_tokenizer = self.model_tokenizer.tokenizer
+ self.model_tokenizer = tokenizer.tokenizer
self.current_tool_name_sent: bool = False
self.prev_tool_call_arr: list[dict] = []
diff --git a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
index 920675c8389b8..d2419b5d84ead 100644
--- a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
@@ -22,14 +22,14 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
)
from vllm.entrypoints.openai.tool_parsers.utils import consume_space
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils import random_uuid
logger = init_logger(__name__)
class HunyuanA13BToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
# Initialize state for streaming mode
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index 1dd327f645b3a..67788358543e9 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
)
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class Internlm2ToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.position = 0
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index 6f53ddea4f0ef..4655da8dd4542 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -21,14 +21,13 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.transformers_utils.tokenizers import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
logger = init_logger(__name__)
class JambaToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
if isinstance(self.model_tokenizer, MistralTokenizer):
diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
index 2b84c60a3b841..07db52ebd5af1 100644
--- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
@@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class KimiK2ToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.current_tool_name_sent: bool = False
self.prev_tool_call_arr: list[dict] = []
diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
index c6c8ae8ae95f1..76d76a4aa35a1 100644
--- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py
@@ -4,11 +4,11 @@
import regex as re
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
class LongcatFlashToolParser(Hermes2ProToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.tool_call_start_token: str = ""
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
index 5c2258ba62b29..b595a98f35555 100644
--- a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
@@ -21,13 +21,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class MinimaxM2ToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.prev_tool_call_arr: list[dict] = []
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
index 982518a52e3da..1025041037c6e 100644
--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
)
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class MinimaxToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
# Initialize streaming state for tracking tool call progress
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index 85671271522d3..7e2d67a1fb659 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
)
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
logger = init_logger(__name__)
@@ -46,7 +46,7 @@ class MistralToolCall(ToolCall):
return id.isalnum() and len(id) == 9
-def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
+def _is_fn_name_regex_support(model_tokenizer: TokenizerLike) -> bool:
return (
isinstance(model_tokenizer, MistralTokenizer) and model_tokenizer.version >= 11
)
@@ -61,7 +61,7 @@ class MistralToolParser(ToolParser):
Used when --enable-auto-tool-choice --tool-call-parser mistral are all set
"""
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
if not isinstance(self.model_tokenizer, MistralTokenizer):
diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
index d1b36a297e0b1..8bdf35d408805 100644
--- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
@@ -18,15 +18,15 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
if TYPE_CHECKING:
- from vllm.transformers_utils.tokenizer import AnyTokenizer
+ from vllm.tokenizers import TokenizerLike
else:
- AnyTokenizer = object
+ TokenizerLike = object
logger = init_logger(__name__)
class OpenAIToolParser(ToolParser):
- def __init__(self, tokenizer: "AnyTokenizer"):
+ def __init__(self, tokenizer: "TokenizerLike"):
super().__init__(tokenizer)
def extract_tool_calls(
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
index 9d4c079eba188..d49b14690ef03 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
@@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class Qwen3CoderToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.current_tool_name_sent: bool = False
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
index 432c419db189a..03862ff432a5d 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
@@ -23,7 +23,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
@@ -1165,7 +1165,7 @@ class StreamingXMLToolCallParser:
class Qwen3XMLToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.parser = StreamingXMLToolCallParser()
diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
index 8aed7f0e9fc96..c7947faad1923 100644
--- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
@@ -34,7 +34,7 @@ class SeedOssToolParser(ToolParser):
TOOL_CALL_START = ""
TOOL_CALL_END = ""
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
# --- streaming state ---
diff --git a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
index adcb9f4765473..9213d6859dd93 100644
--- a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils import random_uuid
logger = init_logger(__name__)
@@ -41,7 +41,7 @@ class Step3ToolParser(ToolParser):
TOOL_SEP = "<|tool_sep|>"
SPECIAL_TOKENS = [TOOL_CALLS_BEGIN, TOOL_CALLS_END, TOOL_CALL_BEGIN, TOOL_CALL_END]
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
self.position = 0
# Explicit state flags for robust streaming
diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
index 9d308af4de601..effd2bd08b42a 100644
--- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
@@ -21,14 +21,14 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils import random_uuid
logger = init_logger(__name__)
class xLAMToolParser(ToolParser):
- def __init__(self, tokenizer: AnyTokenizer):
+ def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer)
# Initialize state for streaming mode
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index 3c5a396a99f93..10b90bbbb0f32 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -16,7 +16,7 @@ from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
from vllm.inputs.data import TextPrompt as EngineTextPrompt
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import AsyncMicrobatchTokenizer
@@ -85,7 +85,7 @@ class BaseRenderer(ABC):
def __init__(
self,
model_config: ModelConfig,
- tokenizer: AnyTokenizer | None = None,
+ tokenizer: TokenizerLike | None = None,
):
super().__init__()
self.model_config = model_config
@@ -200,8 +200,8 @@ class CompletionRenderer(BaseRenderer):
def __init__(
self,
model_config: ModelConfig,
- tokenizer: AnyTokenizer | None = None,
- async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer]
+ tokenizer: TokenizerLike | None = None,
+ async_tokenizer_pool: dict[TokenizerLike, AsyncMicrobatchTokenizer]
| None = None,
):
super().__init__(model_config, tokenizer)
@@ -373,7 +373,7 @@ class CompletionRenderer(BaseRenderer):
return async_tokenizer
tokenizer = self.tokenizer
- if self.tokenizer is None:
+ if tokenizer is None:
raise ValueError("No tokenizer available for text input processing")
if self.async_tokenizer_pool is None:
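
Note: the last renderer hunk is a small correctness fix rather than a rename: the guard now checks the local `tokenizer` variable just read from `self.tokenizer`, so the later use is guaranteed (and type-narrowed) to be non-None. A minimal illustration of the difference, using a plain string as a stand-in for the tokenizer type:

    class Holder:
        def __init__(self, tokenizer: str | None = None):
            self.tokenizer = tokenizer

        def require_tokenizer(self) -> str:
            tokenizer = self.tokenizer
            # Checking the local binding (as the hunk above now does) lets type
            # checkers narrow `tokenizer` to str; checking self.tokenizer would not.
            if tokenizer is None:
                raise ValueError("No tokenizer available for text input processing")
            return tokenizer
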
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 309a4c996392d..04d5a192918dd 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -19,11 +19,7 @@ from vllm.inputs import TokensPrompt
from vllm.model_executor.models.interfaces import supports_score_template
from vllm.multimodal.inputs import MultiModalDataDict
from vllm.outputs import PoolingRequestOutput
-from vllm.transformers_utils.tokenizer import (
- AnyTokenizer,
- PreTrainedTokenizer,
- PreTrainedTokenizerFast,
-)
+from vllm.tokenizers import TokenizerLike
ScoreContentPartParam: TypeAlias = (
ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam
@@ -45,7 +41,7 @@ class ScoreMultiModalParam(TypedDict, total=False):
def _cosine_similarity(
- tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
+ tokenizer: TokenizerLike,
embed_1: list[PoolingRequestOutput],
embed_2: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]:
@@ -93,7 +89,7 @@ def parse_score_data(
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
model_config: ModelConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> tuple[str, str, MultiModalDataDict | None]:
mm_tracker = MultiModalItemTracker(model_config, tokenizer)
@@ -118,12 +114,14 @@ def _parse_score_content(
mm_tracker: BaseMultiModalItemTracker,
) -> _ContentPart | None:
if isinstance(data, str):
- data = ChatCompletionContentPartTextParam(type="text", text=data)
+ part = ChatCompletionContentPartTextParam(type="text", text=data)
+ else:
+ part = data
mm_parser = mm_tracker.create_parser()
parse_res = _parse_chat_message_content_part(
- data,
+ part,
mm_parser,
wrap_dicts=False,
interleave_strings=False,
@@ -181,7 +179,7 @@ def post_process_tokens(
def get_score_prompt(
model_config: ModelConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
tokenization_kwargs: dict[str, Any],
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
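
Note: the `_parse_score_content` hunk stops reassigning the `data` parameter to a value of a different type; the normalized part is bound to a new name instead. A standalone analogue of that pattern, with an illustrative dict shape:

    def normalize_part(data: str | dict) -> dict:
        # Binding the normalized value to `part` keeps the parameter's declared
        # type intact, which is what the rename in the hunk above achieves.
        if isinstance(data, str):
            part = {"type": "text", "text": data}
        else:
            part = data
        return part

    print(normalize_part("hello"))           # {'type': 'text', 'text': 'hello'}
    print(normalize_part({"type": "image"}))  # passed through unchanged
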
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 088bb679fef40..daeeb995bc749 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -30,7 +30,7 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.transformers_utils.tokenizers import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 839c13868a16c..46d1bed38aa85 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -17,7 +17,7 @@ from vllm.multimodal.inputs import (
MultiModalUUIDDict,
)
from vllm.multimodal.processing import BaseMultiModalProcessor
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.jsontree import json_iter_leaves
from vllm.v1.metrics.stats import MultiModalCacheStats
@@ -46,7 +46,7 @@ class InputPreprocessor:
def __init__(
self,
model_config: ModelConfig,
- tokenizer: AnyTokenizer | None,
+ tokenizer: TokenizerLike | None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
mm_processor_cache: BaseMultiModalProcessorCache | None = None,
) -> None:
@@ -59,7 +59,7 @@ class InputPreprocessor:
self.mm_cache_stats = MultiModalCacheStats() if mm_processor_cache else None
- def get_tokenizer(self) -> AnyTokenizer:
+ def get_tokenizer(self) -> TokenizerLike:
if self.tokenizer is None:
raise ValueError(
"You cannot pass text prompts when `skip_tokenizer_init` is True"
@@ -228,11 +228,11 @@ class InputPreprocessor:
return tokenizer.encode(prompt, **tokenization_kwargs)
- def _get_mm_tokenizer(self) -> AnyTokenizer:
+ def _get_mm_tokenizer(self) -> TokenizerLike:
# PrithviGeoSpatialMAE needs to be initialized without a tokenizer
# while using also multi-modal input
if not self.tokenizer:
- return cast(AnyTokenizer, object()) # Dummy
+ return cast(TokenizerLike, object()) # Dummy
tokenizer = self.get_tokenizer()
return tokenizer
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
index 7b6a6528e20e8..1bf97c2535fb7 100644
--- a/vllm/logits_process.py
+++ b/vllm/logits_process.py
@@ -5,7 +5,7 @@ from typing import TypeAlias
import torch
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
LogitsProcessor: TypeAlias = (
Callable[[list[int], torch.Tensor], torch.Tensor]
@@ -19,7 +19,7 @@ to sample from."""
def get_bad_words_logits_processors(
- bad_words: list[str], tokenizer: AnyTokenizer
+ bad_words: list[str], tokenizer: TokenizerLike
) -> list[LogitsProcessor]:
bad_words_ids: list[list[int]] = list()
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 81c6b34bd6ce0..6276c3d675411 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -28,7 +28,7 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from .intern_vit import InternVisionModel
from .internvl import (
@@ -241,7 +241,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index ccbde115009d2..fccddf3a6b293 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -50,7 +50,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_num_threads
@@ -347,7 +347,7 @@ class BaseInternVLProcessor(ABC):
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
@@ -561,7 +561,7 @@ class InternVLProcessor(BaseInternVLProcessor):
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 5529089e06ae9..11beeddabe307 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -73,9 +73,9 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.tokenizer import (
- AnyTokenizer,
cached_tokenizer_from_config,
encode_tokens,
)
@@ -284,7 +284,7 @@ class BaseNanoNemotronVLProcessor(ABC):
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*args,
max_num_tiles: int | None = None,
**kwargs,
@@ -434,7 +434,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
max_num_tiles: int | None = None,
min_dynamic_patch: int | None = None,
@@ -645,7 +645,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame: list[int],
frames_indices: list[int],
frame_duration_ms: int,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
img_start_token_ids: list[int],
img_end_token_ids: list[int],
img_context_token_ids: list[int],
@@ -670,7 +670,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame (list[int]): number of tokens per frame
frames_indices (list[int]): frame indices
frame_duration_ms (int): duration of each frame in milliseconds
- tokenizer (AnyTokenizer): tokenizer to use for tokenizing frame separators
+ tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 5a1dda8aac2c1..a57668b21fb86 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -34,8 +34,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_image_processor_from_config
-from vllm.transformers_utils.tokenizer import AnyTokenizer
from .interfaces import (
MultiModalEmbeddings,
@@ -203,7 +203,7 @@ class NemotronVLProcessor(InternVLProcessor):
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
image_processor: BaseImageProcessorFast,
*,
min_dynamic_patch: int | None = None,
diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py
index 121bf896fa6ba..4338918663378 100644
--- a/vllm/model_executor/models/opencua.py
+++ b/vllm/model_executor/models/opencua.py
@@ -31,7 +31,7 @@ from vllm.multimodal.processing import (
PromptReplacement,
PromptUpdate,
)
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from .qwen2_5_vl import (
Qwen2_5_VisionTransformer as OpenCUAVisionTransformer,
@@ -79,7 +79,7 @@ class OpenCUAProcessor(Qwen2VLProcessor):
def __init__(
self,
vision_config: dict,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
**kwargs,
):
image_processor = Qwen2VLImageProcessor(**vision_config)
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 3464de472add5..54bde75cc0131 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -59,10 +59,8 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import (
- MistralTokenizer,
- cached_tokenizer_from_config,
-)
+from vllm.tokenizers import MistralTokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 672659aa6042c..8fbd896223944 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -91,7 +91,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
@@ -1533,7 +1533,7 @@ class Tarsier2Processor(Qwen2VLProcessor):
def __init__(
self,
vision_config: dict,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
**kwargs,
):
self.image_processor = Tarsier2ImageProcessor(**vision_config)
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index d825eb3a1c134..55c25ce6190fb 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -47,7 +47,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -282,7 +282,7 @@ class SkyworkR1VProcessor:
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 1c60cb4148121..3e55ada0ed2e1 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -43,8 +43,8 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.configs import Step3VisionEncoderConfig
-from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -321,7 +321,7 @@ class Step3VLProcessor:
def __init__(
self,
config: PretrainedConfig,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> None:
super().__init__()
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 18ad8851fccda..0a39ea7ef5bff 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -51,10 +51,8 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import (
- MistralTokenizer,
- cached_tokenizer_from_config,
-)
+from vllm.tokenizers import MistralTokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
from .utils import init_vllm_registered_model, maybe_prefix
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 27bf12a5f3169..aab657b24ba23 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -23,8 +23,9 @@ import torch
from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
+from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
+from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens
from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
from vllm.utils.jsontree import JSONTree, json_map_leaves
@@ -76,7 +77,7 @@ PromptSeq: TypeAlias = str | list[int]
@lru_cache(maxsize=2048)
def _cached_encode(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
text: str,
*,
add_special_tokens: bool | None = None,
@@ -86,7 +87,7 @@ def _cached_encode(
@lru_cache(maxsize=2048)
def _cached_decode(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
token_ids: tuple[int, ...],
*,
skip_special_tokens: bool | None = None,
@@ -96,14 +97,14 @@ def _cached_decode(
)
-def _seq2text(tokenizer: AnyTokenizer, seq: PromptSeq) -> str:
+def _seq2text(tokenizer: TokenizerLike, seq: PromptSeq) -> str:
if isinstance(seq, str):
return seq
return _cached_decode(tokenizer, tuple(seq))
-def _seq2tokens(tokenizer: AnyTokenizer, seq: PromptSeq) -> list[int]:
+def _seq2tokens(tokenizer: TokenizerLike, seq: PromptSeq) -> list[int]:
if isinstance(seq, str):
return _cached_encode(tokenizer, seq, add_special_tokens=False)
@@ -113,7 +114,7 @@ def _seq2tokens(tokenizer: AnyTokenizer, seq: PromptSeq) -> list[int]:
class _GetMatchIndex(Protocol):
def __call__(
self,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
prompt: PromptSeq,
start_idx: int = 0,
) -> int | None: ...
@@ -143,7 +144,7 @@ class PromptIndexTargets:
"""
def get_match_index(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
prompt: PromptSeq,
start_idx: int = 0,
) -> int | None:
@@ -199,7 +200,7 @@ class PromptUpdateDetails(Generic[_S]):
full: _S
"""The full content."""
- is_embed: Callable[[AnyTokenizer, PromptSeq], torch.Tensor] | None = None
+ is_embed: Callable[[TokenizerLike, PromptSeq], torch.Tensor] | None = None
"""
Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full],
return a boolean mask of shape `(len(full),)` indicating which positions
@@ -220,7 +221,7 @@ class PromptUpdateDetails(Generic[_S]):
seq: _S,
embed_text: str,
) -> "PromptUpdateDetails[_S]":
- def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor:
+ def is_embed(tokenizer: TokenizerLike, full: PromptSeq) -> torch.Tensor:
embed_token_ids = encode_tokens(tokenizer, embed_text)
token_ids = _seq2tokens(tokenizer, full)
@@ -236,7 +237,7 @@ class PromptUpdateDetails(Generic[_S]):
seq: _S,
embed_token_id: int,
) -> "PromptUpdateDetails[_S]":
- def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor:
+ def is_embed(tokenizer: TokenizerLike, full: PromptSeq) -> torch.Tensor:
token_ids = _seq2tokens(tokenizer, full)
return torch.tensor(token_ids) == embed_token_id
@@ -522,7 +523,7 @@ class ResolvedPromptUpdate:
def iter_token_matches(
self,
prompt: list[int],
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
start_idx: int = 0,
) -> Generator[PromptTargetMatch]:
@@ -544,7 +545,7 @@ class ResolvedPromptUpdate:
def iter_text_matches(
self,
prompt: str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
start_idx: int = 0,
) -> Generator[PromptTargetMatch]:
@@ -566,7 +567,7 @@ class ResolvedPromptUpdate:
def iter_matches(
self,
prompt: list[int] | str,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
start_idx: int = 0,
) -> Generator[PromptTargetMatch]:
@@ -675,7 +676,7 @@ _MatchToApply = tuple[tuple[str, int], tuple[PromptTargetMatch, int]]
def _find_matches(
prompt: _S,
mm_prompt_updates: "MultiModalPromptUpdates",
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
*,
prev_end_idx: int = 0,
current_result: "MultiModalPromptUpdatesApplyResult",
@@ -740,7 +741,7 @@ def _all_items_found(
def _apply_matches(
prompt: _S,
mm_prompt_updates: "MultiModalPromptUpdates",
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> tuple[list[_S], "MultiModalPromptUpdatesApplyResult"]:
mm_item_counts = {m: len(items) for m, items in mm_prompt_updates.items()}
@@ -806,7 +807,7 @@ def _apply_matches(
def apply_token_matches(
prompt: list[int],
mm_prompt_updates: "MultiModalPromptUpdates",
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> tuple[list[int], "MultiModalPromptUpdatesApplyResult"]:
"""
Apply the updates in `mm_prompt_updates` to `prompt`.
@@ -823,7 +824,7 @@ def apply_token_matches(
def apply_text_matches(
prompt: str,
mm_prompt_updates: "MultiModalPromptUpdates",
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> tuple[str, "MultiModalPromptUpdatesApplyResult"]:
"""
Apply the updates in `mm_prompt_updates` to `prompt`.
@@ -840,7 +841,7 @@ def apply_text_matches(
def _iter_placeholders(
prompt: list[int],
mm_prompt_updates: "MultiModalPromptUpdates",
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> Iterable[PlaceholderFeaturesInfo]:
"""
Yield each set of placeholder tokens found in `prompt`.
@@ -909,7 +910,7 @@ def _iter_placeholders(
def find_mm_placeholders(
prompt: list[int],
mm_prompt_updates: "MultiModalPromptUpdates",
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
it = _iter_placeholders(prompt, mm_prompt_updates, tokenizer)
return dict(full_groupby_modality(it))
@@ -930,7 +931,7 @@ class InputProcessingContext:
model_config: ModelConfig
"""The configuration of the model."""
- tokenizer: AnyTokenizer
+ tokenizer: TokenizerLike
"""The tokenizer used to tokenize the inputs."""
@overload
@@ -1146,7 +1147,7 @@ class BaseProcessingInfo:
def model_id(self) -> str:
return self.ctx.model_config.model
- def get_tokenizer(self) -> AnyTokenizer:
+ def get_tokenizer(self) -> TokenizerLike:
return self.ctx.tokenizer
def get_hf_config(self) -> PretrainedConfig:
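A note on the `lru_cache`-wrapped helpers above (`_cached_encode` / `_cached_decode`): the tokenizer argument is used as a cache key, so anything typed as `TokenizerLike` must stay hashable, which is consistent with the explicit identity-based `__hash__` that both the protocol and `MistralTokenizer` gain later in this diff. A minimal sketch of that requirement (the `_FakeTokenizer` class below is hypothetical and only illustrates the hashability contract):

```python
from functools import lru_cache


class _FakeTokenizer:
    # Hypothetical stand-in; mirrors the identity-based __hash__ that
    # TokenizerLike provides by default in this change.
    def __hash__(self) -> int:
        return hash(id(self))

    def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
        return " ".join(str(i) for i in ids)


@lru_cache(maxsize=2048)
def _cached_decode(tokenizer, token_ids: tuple[int, ...]) -> str:
    return tokenizer.decode(list(token_ids))


tok = _FakeTokenizer()
# Second call with the same tokenizer instance and ids is a cache hit.
assert _cached_decode(tok, (1, 2)) == _cached_decode(tok, (1, 2))
```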
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index a7eafa76ad17e..ee90570b24aaf 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,7 +6,8 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
+from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .cache import BaseMultiModalProcessorCache
from .processing import (
@@ -231,17 +232,20 @@ class MultiModalRegistry:
def _create_processing_ctx(
self,
model_config: "ModelConfig",
- tokenizer: AnyTokenizer | None = None,
+ tokenizer: TokenizerLike | None = None,
) -> InputProcessingContext:
- if tokenizer is None and not model_config.skip_tokenizer_init:
+ if model_config.skip_tokenizer_init:
+ tokenizer = cast(TokenizerLike, object())
+ elif tokenizer is None:
tokenizer = cached_tokenizer_from_config(model_config)
+
return InputProcessingContext(model_config, tokenizer)
def _create_processing_info(
self,
model_config: "ModelConfig",
*,
- tokenizer: AnyTokenizer | None = None,
+ tokenizer: TokenizerLike | None = None,
) -> BaseProcessingInfo:
model_cls = self._get_model_cls(model_config)
factories = model_cls._processor_factory
@@ -252,7 +256,7 @@ class MultiModalRegistry:
self,
model_config: "ModelConfig",
*,
- tokenizer: AnyTokenizer | None = None,
+ tokenizer: TokenizerLike | None = None,
cache: BaseMultiModalProcessorCache | None = None,
) -> BaseMultiModalProcessor[BaseProcessingInfo]:
"""
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index d26e4ffc9c163..4a04292be009e 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
DeltaMessage,
ResponsesRequest,
)
- from vllm.transformers_utils.tokenizer import AnyTokenizer
+ from vllm.tokenizers import TokenizerLike
else:
ChatCompletionRequest = Any
DeltaMessage = Any
ResponsesRequest = Any
- AnyTokenizer = Any
+ TokenizerLike = Any
logger = init_logger(__name__)
@@ -37,7 +37,7 @@ class ReasoningParser:
It is used to extract reasoning content from the model output.
"""
- def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
+ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
self.model_tokenizer = tokenizer
@cached_property
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index 0268947732726..35084c0e7cc86 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any
from vllm.entrypoints.openai.protocol import DeltaMessage
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
if TYPE_CHECKING:
from vllm.entrypoints.openai.protocol import (
@@ -43,7 +43,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
"""The token that ends reasoning content."""
raise NotImplementedError
- def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
+ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
if not self.model_tokenizer:
diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
index 30f5f2f88caf7..138d1b4e6dacf 100644
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (
from vllm.logger import init_logger
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
@@ -37,7 +37,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
Reasoning parser for MiniMax M2 model.
"""
- def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
+ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
self.end_token_id = self.vocab.get("</think>")
diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py
index af6d179bf6d01..b61e50c188f8c 100644
--- a/vllm/reasoning/mistral_reasoning_parser.py
+++ b/vllm/reasoning/mistral_reasoning_parser.py
@@ -6,7 +6,7 @@ from functools import cached_property
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
logger = init_logger(__name__)
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index 7149f8c4123b3..2742a24a2c3e7 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING
import regex as re
if TYPE_CHECKING:
- from vllm.transformers_utils.tokenizer import AnyTokenizer
+ from vllm.tokenizers import TokenizerLike
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
@@ -220,7 +220,7 @@ class Olmo3ReasoningParser(ReasoningParser):
token is missing from generation.
"""
- def __init__(self, tokenizer: "AnyTokenizer", *args, **kwargs):
+ def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
self.think_start = r"<think>"
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 8de961e62db1b..453100f2e5135 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass
from vllm.logger import init_logger
from vllm.logits_process import LogitsProcessor
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
from vllm.v1.serial_utils import PydanticMsgspecMixin
logger = init_logger(__name__)
@@ -477,7 +477,7 @@ class SamplingParams(
eos_ids.update(self.stop_token_ids)
self.stop_token_ids = list(eos_ids)
- def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None:
+ def update_from_tokenizer(self, tokenizer: TokenizerLike) -> None:
if not self.bad_words:
return
self._bad_words_token_ids = []
diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py
new file mode 100644
index 0000000000000..e26b4e8797ec8
--- /dev/null
+++ b/vllm/tokenizers/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .mistral import MistralTokenizer
+from .protocol import TokenizerLike
+from .registry import TokenizerRegistry
+
+__all__ = ["TokenizerLike", "MistralTokenizer", "TokenizerRegistry"]
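For downstream code, the practical change is the import path and the protocol name. A hedged sketch of a helper annotated against the new surface (`count_tokens` is a hypothetical function, not part of vLLM):

```python
from vllm.tokenizers import TokenizerLike


def count_tokens(tokenizer: TokenizerLike, text: str) -> int:
    # Hypothetical helper: works with both Hugging Face tokenizers and
    # MistralTokenizer, since both satisfy the TokenizerLike protocol.
    return len(tokenizer.encode(text, add_special_tokens=False))
```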
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
similarity index 96%
rename from vllm/transformers_utils/tokenizers/mistral.py
rename to vllm/tokenizers/mistral.py
index 1954e2a815b03..a42fb0e1e5f14 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -4,7 +4,8 @@
from typing import TYPE_CHECKING, Any, cast
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer_base import TokenizerBase
+
+from .protocol import TokenizerLike
if TYPE_CHECKING:
from mistral_common.protocol.instruct.request import (
@@ -163,7 +164,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
return tokenizer.unk_id
-class MistralTokenizer(TokenizerBase):
+class MistralTokenizer(TokenizerLike):
def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
from mistral_common.protocol.instruct.validator import ValidationMode
from mistral_common.tokens.tokenizers.sentencepiece import (
@@ -270,14 +271,6 @@ class MistralTokenizer(TokenizerBase):
def eos_token_id(self) -> int:
return self.tokenizer.eos_id
- @property
- def sep_token(self) -> str:
- raise NotImplementedError()
-
- @property
- def pad_token(self) -> str:
- return self.transformers_tokenizer.pad_token
-
@property
def is_fast(self) -> bool:
return True
@@ -292,11 +285,14 @@ class MistralTokenizer(TokenizerBase):
@property
def truncation_side(self) -> str:
- raise NotImplementedError()
+ return self.transformers_tokenizer.truncation_side
def _is_special_token_id(self, token_id: int) -> bool:
return token_id in self._special_token_ids_set
+ def __hash__(self) -> int:
+ return hash(id(self))
+
def __len__(self) -> int:
return self.vocab_size
@@ -341,17 +337,6 @@ class MistralTokenizer(TokenizerBase):
# Mistral tokenizers have no added vocabulary
return {}
- def encode_one(
- self,
- text: str,
- truncation: bool = False,
- max_length: int | None = None,
- ) -> list[int]:
- # Mistral Tokenizers should not add special tokens
- return self.transformers_tokenizer.encode(
- text, add_special_tokens=False, truncation=truncation, max_length=max_length
- )
-
def encode(
self,
text: str,
diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py
new file mode 100644
index 0000000000000..58a1a7c23f21c
--- /dev/null
+++ b/vllm/tokenizers/protocol.py
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import TYPE_CHECKING, Any, Protocol
+
+from typing_extensions import Self
+
+if TYPE_CHECKING:
+ from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+
+
+class TokenizerLike(Protocol):
+ @classmethod
+ def from_pretrained(
+ cls,
+ pretrained_model_name_or_path: str,
+ /,
+ *,
+ revision: str | None = None,
+ ) -> Self:
+ raise NotImplementedError
+
+ @property
+ def all_special_tokens(self) -> list[str]:
+ raise NotImplementedError
+
+ @property
+ def all_special_ids(self) -> list[int]:
+ raise NotImplementedError
+
+ @property
+ def bos_token_id(self) -> int:
+ raise NotImplementedError
+
+ @property
+ def eos_token_id(self) -> int:
+ raise NotImplementedError
+
+ @property
+ def is_fast(self) -> bool:
+ raise NotImplementedError
+
+ @property
+ def vocab_size(self) -> int:
+ raise NotImplementedError
+
+ @property
+ def max_token_id(self) -> int:
+ raise NotImplementedError
+
+ @property
+ def truncation_side(self) -> str:
+ raise NotImplementedError
+
+ def __hash__(self) -> int:
+ return hash(id(self))
+
+ def __len__(self) -> int:
+ return self.vocab_size
+
+ def __call__(
+ self,
+ text: str | list[str] | list[int],
+ text_pair: str | None = None,
+ add_special_tokens: bool = False,
+ truncation: bool = False,
+ max_length: int | None = None,
+ ):
+ raise NotImplementedError
+
+ def get_vocab(self) -> dict[str, int]:
+ raise NotImplementedError
+
+ def get_added_vocab(self) -> dict[str, int]:
+ raise NotImplementedError
+
+ def encode(
+ self,
+ text: str,
+ truncation: bool | None = None,
+ max_length: int | None = None,
+ add_special_tokens: bool | None = None,
+ ) -> list[int]:
+ raise NotImplementedError
+
+ def apply_chat_template(
+ self,
+ messages: list["ChatCompletionMessageParam"],
+ tools: list[dict[str, Any]] | None = None,
+ **kwargs,
+ ) -> list[int]:
+ raise NotImplementedError
+
+ def convert_tokens_to_string(self, tokens: list[str]) -> str:
+ raise NotImplementedError
+
+ def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
+ raise NotImplementedError
+
+ def convert_ids_to_tokens(
+ self,
+ ids: list[int],
+ skip_special_tokens: bool = True,
+ ) -> list[str]:
+ raise NotImplementedError
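Because `TokenizerLike` is a `typing.Protocol`, conformance is structural, but concrete tokenizers can also subclass it directly to inherit the default `__hash__`/`__len__` (as `MistralTokenizer` now does). A minimal sketch of a conforming implementation, assuming only the protocol above (the `WhitespaceTokenizer` class is purely illustrative):

```python
from vllm.tokenizers import TokenizerLike


class WhitespaceTokenizer(TokenizerLike):
    """Hypothetical tokenizer used only to illustrate the protocol surface."""

    def __init__(self, vocab: dict[str, int]) -> None:
        self._vocab = vocab
        self._inv = {i: t for t, i in vocab.items()}

    @classmethod
    def from_pretrained(cls, path: str, /, *, revision: str | None = None):
        # A real implementation would load vocabulary files from `path`.
        return cls({"hello": 0, "world": 1})

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    def get_vocab(self) -> dict[str, int]:
        return dict(self._vocab)

    def encode(self, text, truncation=None, max_length=None, add_special_tokens=None):
        return [self._vocab[t] for t in text.split()]

    def decode(self, ids, skip_special_tokens=True):
        ids = [ids] if isinstance(ids, int) else ids
        return " ".join(self._inv[i] for i in ids)
```

Methods that are not overridden fall back to the protocol's `raise NotImplementedError` bodies, so partial implementations fail loudly rather than silently.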
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
new file mode 100644
index 0000000000000..3a236c99b3564
--- /dev/null
+++ b/vllm/tokenizers/registry.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
+
+from .protocol import TokenizerLike
+
+
+class TokenizerRegistry:
+ # Tokenizer name -> (tokenizer module, tokenizer class)
+ REGISTRY: dict[str, tuple[str, str]] = {}
+
+ @staticmethod
+ def register(name: str, module: str, class_name: str) -> None:
+ TokenizerRegistry.REGISTRY[name] = (module, class_name)
+
+ @staticmethod
+ def get_tokenizer(
+ tokenizer_name: str,
+ *args,
+ **kwargs,
+ ) -> "TokenizerLike":
+ tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
+ if tokenizer_cls is None:
+ raise ValueError(f"Tokenizer {tokenizer_name} not found.")
+
+ tokenizer_module = importlib.import_module(tokenizer_cls[0])
+ class_ = getattr(tokenizer_module, tokenizer_cls[1])
+ return class_.from_pretrained(*args, **kwargs)
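The new registry keeps the same registration scheme as the old `TokenizerRegistry` in `tokenizer_base.py`: a name maps to a `(module, class)` pair, and the module is only imported when the tokenizer is actually requested, which is what `get_tokenizer(..., tokenizer_mode="custom")` below relies on. A hedged usage sketch, where `my_pkg.tokenizer` and `MyTokenizer` are hypothetical:

```python
from vllm.tokenizers import TokenizerRegistry

# Register by module path and class name; my_pkg is imported lazily,
# only when get_tokenizer() is called for this name.
TokenizerRegistry.register("my_tokenizer", "my_pkg.tokenizer", "MyTokenizer")

# Resolves the name, imports my_pkg.tokenizer, and forwards the
# remaining arguments to MyTokenizer.from_pretrained(...).
tokenizer = TokenizerRegistry.get_tokenizer("my_tokenizer", "/path/to/tokenizer")
```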
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 45c4358bbc8f2..8f2cd3315ab94 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -26,8 +26,9 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
from vllm import envs
from vllm.logger import init_logger
-from vllm.transformers_utils.config_parser_base import ConfigParserBase
-from vllm.transformers_utils.repo_utils import (
+
+from .config_parser_base import ConfigParserBase
+from .repo_utils import (
_get_hf_token,
file_or_path_exists,
get_hf_file_to_dict,
@@ -35,7 +36,7 @@ from vllm.transformers_utils.repo_utils import (
try_get_local_file,
with_retry,
)
-from vllm.transformers_utils.utils import (
+from .utils import (
check_gguf_file,
is_gguf,
is_remote_gguf,
diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py
index 560526bfd823e..e586a5d46cb82 100644
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from .tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike
def _replace_none_with_empty(tokens: list[str | None]):
@@ -12,7 +12,7 @@ def _replace_none_with_empty(tokens: list[str | None]):
def _convert_tokens_to_string_with_added_encoders(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
output_tokens: list[str],
skip_special_tokens: bool,
spaces_between_special_tokens: bool,
@@ -57,7 +57,7 @@ INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
def convert_prompt_ids_to_tokens(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
prompt_ids: list[int],
skip_special_tokens: bool = False,
) -> tuple[list[str], int, int]:
@@ -81,7 +81,7 @@ def convert_prompt_ids_to_tokens(
def convert_ids_list_to_tokens(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
token_ids: list[int],
) -> list[str]:
"""Detokenize the input ids individually.
@@ -108,7 +108,7 @@ def convert_ids_list_to_tokens(
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
all_input_ids: list[int],
prev_tokens: list[str] | None,
prefix_offset: int,
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index c5b4d3f000901..cb1fc2d092e01 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -9,7 +9,8 @@ from gguf.constants import Keys, VisionProjectorType
from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
from vllm.logger import init_logger
-from vllm.transformers_utils.repo_utils import list_filtered_repo_files
+
+from .repo_utils import list_filtered_repo_files
logger = init_logger(__name__)
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index be4325ab9101d..87d5cc2b483fb 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -5,41 +5,48 @@ import contextlib
import copy
import importlib.util
import os
+import warnings
from functools import lru_cache
from pathlib import Path
-from typing import TYPE_CHECKING, Any, TypeAlias
+from typing import TYPE_CHECKING, Any
import huggingface_hub
-from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
from typing_extensions import assert_never
from vllm import envs
from vllm.logger import init_logger
-from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
-from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
-from vllm.transformers_utils.repo_utils import list_filtered_repo_files
-from vllm.transformers_utils.tokenizers import MistralTokenizer
-from vllm.transformers_utils.utils import (
- check_gguf_file,
- is_gguf,
- is_remote_gguf,
- split_remote_gguf,
-)
+from vllm.tokenizers import MistralTokenizer, TokenizerLike, TokenizerRegistry
+
+from .config import get_sentence_transformer_tokenizer_config
+from .gguf_utils import get_gguf_file_path_from_hf
+from .repo_utils import list_filtered_repo_files
+from .utils import check_gguf_file, is_gguf, is_remote_gguf, split_remote_gguf
if TYPE_CHECKING:
from vllm.config import ModelConfig
- from vllm.transformers_utils.tokenizer_base import TokenizerBase
-else:
- ModelConfig = Any
- TokenizerBase = Any
+
logger = init_logger(__name__)
-AnyTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast | TokenizerBase
+
+def __getattr__(name: str):
+ if name == "AnyTokenizer":
+ warnings.warn(
+ "`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
+ "`vllm.tokenizers.TokenizerLike`. "
+ "The old name will be removed in v0.13.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ return TokenizerLike
+
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def decode_tokens(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
token_ids: list[int],
*,
skip_special_tokens: bool | None = None,
@@ -58,7 +65,7 @@ def decode_tokens(
def encode_tokens(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
text: str,
*,
truncation: bool | None = None,
@@ -86,7 +93,7 @@ def encode_tokens(
return tokenizer.encode(text, **kw_args)
-def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
+def get_cached_tokenizer(tokenizer: TokenizerLike) -> TokenizerLike:
"""
By default, transformers will recompute multiple tokenizer properties
each time they are called, leading to a significant slowdown.
@@ -144,7 +151,7 @@ def get_tokenizer(
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
-) -> AnyTokenizer:
+) -> TokenizerLike:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
@@ -206,15 +213,13 @@ def get_tokenizer(
if len(files_list) > 0:
tokenizer_mode = "mistral"
- tokenizer: AnyTokenizer
+ tokenizer: TokenizerLike
if tokenizer_mode == "mistral":
logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}")
tokenizer = MistralTokenizer.from_pretrained(
str(tokenizer_name), revision=revision
)
elif tokenizer_mode == "custom":
- from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
-
logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}")
tokenizer = TokenizerRegistry.get_tokenizer(
str(tokenizer_name),
@@ -260,12 +265,13 @@ def get_tokenizer(
if isinstance(encoder_config, dict) and encoder_config.get(
"do_lower_case", False
):
+ assert isinstance(tokenizer, PreTrainedTokenizerBase)
special_tokens_map = {
k: v.lower() for k, v in tokenizer.special_tokens_map.items()
}
tokenizer.add_special_tokens(special_tokens_map)
- if not isinstance(tokenizer, PreTrainedTokenizerFast):
+ if not tokenizer.is_fast:
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
@@ -279,7 +285,7 @@ cached_get_tokenizer = lru_cache(get_tokenizer)
def cached_tokenizer_from_config(
- model_config: ModelConfig,
+ model_config: "ModelConfig",
**kwargs: Any,
):
return cached_get_tokenizer(
@@ -291,7 +297,7 @@ def cached_tokenizer_from_config(
)
-def init_tokenizer_from_configs(model_config: ModelConfig):
+def init_tokenizer_from_configs(model_config: "ModelConfig"):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
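The module-level `__getattr__` added above keeps the old `AnyTokenizer` import path working through the deprecation window while emitting a `DeprecationWarning`. A quick sanity check of the shim, assuming vLLM is installed (sketch only):

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Old import path: still resolves, but via the deprecation shim.
    from vllm.transformers_utils.tokenizer import AnyTokenizer

from vllm.tokenizers import TokenizerLike

assert AnyTokenizer is TokenizerLike
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```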
diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py
index 52f221d1e373e..78fb6edc8b9ed 100644
--- a/vllm/transformers_utils/tokenizer_base.py
+++ b/vllm/transformers_utils/tokenizer_base.py
@@ -1,150 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import importlib
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
- from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+import warnings
-class TokenizerBase(ABC):
- @property
- @abstractmethod
- def all_special_tokens(self) -> list[str]:
- raise NotImplementedError()
+def __getattr__(name: str):
+ if name == "TokenizerBase":
+ from vllm.tokenizers import TokenizerLike
- @property
- @abstractmethod
- def all_special_ids(self) -> list[int]:
- raise NotImplementedError()
+ warnings.warn(
+ "`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
+ "moved to `vllm.tokenizers.TokenizerLike`. "
+ "The old name will be removed in v0.13.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
- @property
- @abstractmethod
- def bos_token_id(self) -> int:
- raise NotImplementedError()
+ return TokenizerLike
+ if name == "TokenizerRegistry":
+ from vllm.tokenizers import TokenizerRegistry
- @property
- @abstractmethod
- def eos_token_id(self) -> int:
- raise NotImplementedError()
+ warnings.warn(
+ "`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
+ "moved to `vllm.tokenizers.TokenizerRegistry`. "
+ "The old name will be removed in v0.13.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
- @property
- @abstractmethod
- def sep_token(self) -> str:
- raise NotImplementedError()
+ return TokenizerRegistry
- @property
- @abstractmethod
- def pad_token(self) -> str:
- raise NotImplementedError()
-
- @property
- @abstractmethod
- def is_fast(self) -> bool:
- raise NotImplementedError()
-
- @property
- @abstractmethod
- def vocab_size(self) -> int:
- raise NotImplementedError()
-
- @property
- @abstractmethod
- def max_token_id(self) -> int:
- raise NotImplementedError()
-
- @property
- @abstractmethod
- def truncation_side(self) -> str:
- raise NotImplementedError()
-
- def __len__(self) -> int:
- return self.vocab_size
-
- @abstractmethod
- def __call__(
- self,
- text: str | list[str] | list[int],
- text_pair: str | None = None,
- add_special_tokens: bool = False,
- truncation: bool = False,
- max_length: int | None = None,
- ):
- raise NotImplementedError()
-
- @abstractmethod
- def get_vocab(self) -> dict[str, int]:
- raise NotImplementedError()
-
- @abstractmethod
- def get_added_vocab(self) -> dict[str, int]:
- raise NotImplementedError()
-
- @abstractmethod
- def encode_one(
- self,
- text: str,
- truncation: bool = False,
- max_length: int | None = None,
- ) -> list[int]:
- raise NotImplementedError()
-
- @abstractmethod
- def encode(
- self,
- text: str,
- truncation: bool | None = None,
- max_length: int | None = None,
- add_special_tokens: bool | None = None,
- ) -> list[int]:
- raise NotImplementedError()
-
- @abstractmethod
- def apply_chat_template(
- self,
- messages: list["ChatCompletionMessageParam"],
- tools: list[dict[str, Any]] | None = None,
- **kwargs,
- ) -> list[int]:
- raise NotImplementedError()
-
- @abstractmethod
- def convert_tokens_to_string(self, tokens: list[str]) -> str:
- raise NotImplementedError()
-
- @abstractmethod
- def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
- raise NotImplementedError()
-
- @abstractmethod
- def convert_ids_to_tokens(
- self,
- ids: list[int],
- skip_special_tokens: bool = True,
- ) -> list[str]:
- raise NotImplementedError()
-
-
-class TokenizerRegistry:
- # Tokenizer name -> (tokenizer module, tokenizer class)
- REGISTRY: dict[str, tuple[str, str]] = {}
-
- @staticmethod
- def register(name: str, module: str, class_name: str) -> None:
- TokenizerRegistry.REGISTRY[name] = (module, class_name)
-
- @staticmethod
- def get_tokenizer(
- tokenizer_name: str,
- *args,
- **kwargs,
- ) -> TokenizerBase:
- tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
- if tokenizer_cls is None:
- raise ValueError(f"Tokenizer {tokenizer_name} not found.")
-
- tokenizer_module = importlib.import_module(tokenizer_cls[0])
- class_ = getattr(tokenizer_module, tokenizer_cls[1])
- return class_.from_pretrained(*args, **kwargs)
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py
deleted file mode 100644
index b63cb26af46dd..0000000000000
--- a/vllm/transformers_utils/tokenizers/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from .mistral import (
- MistralTokenizer,
- maybe_serialize_tool_calls,
- truncate_tool_call_ids,
- validate_request_params,
-)
-
-__all__ = [
- "MistralTokenizer",
- "maybe_serialize_tool_calls",
- "truncate_tool_call_ids",
- "validate_request_params",
-]
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index bd28c41fb50e8..336d3e9fa1d20 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -26,9 +26,10 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
+from vllm.tokenizers import TokenizerLike
from vllm.tracing import init_tracer
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
-from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
+from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils.async_utils import cancel_task_threadsafe
from vllm.utils.collection_utils import as_list
@@ -120,9 +121,10 @@ class AsyncLLM(EngineClient):
)
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
- stream_interval = self.vllm_config.scheduler_config.stream_interval
self.output_processor = OutputProcessor(
- self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval
+ self.tokenizer,
+ log_stats=self.log_stats,
+ stream_interval=self.vllm_config.scheduler_config.stream_interval,
)
endpoint = self.observability_config.otlp_traces_endpoint
if endpoint is not None:
@@ -703,17 +705,17 @@ class AsyncLLM(EngineClient):
raise EngineGenerateError() from e
@property
- def tokenizer(self) -> AnyTokenizer | None:
+ def tokenizer(self) -> TokenizerLike | None:
return self.input_processor.tokenizer
@tokenizer.setter
- def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
+ def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
self.input_processor.tokenizer = tokenizer
- async def get_tokenizer(self) -> AnyTokenizer:
+ async def get_tokenizer(self) -> TokenizerLike:
if self.tokenizer is None:
raise ValueError(
- "Unable to get tokenizer because skip_tokenizer_init is True"
+ "Unable to get tokenizer because `skip_tokenizer_init=True`"
)
return self.tokenizer
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index b7a24096bf15f..c55240c40f6f0 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -10,7 +10,7 @@ from transformers import PreTrainedTokenizerFast
from vllm.logger import init_logger
from vllm.transformers_utils.detokenizer_utils import (
- AnyTokenizer,
+ TokenizerLike,
convert_prompt_ids_to_tokens,
detokenize_incrementally,
)
@@ -45,7 +45,7 @@ class IncrementalDetokenizer:
@classmethod
def from_new_request(
cls,
- tokenizer: AnyTokenizer | None,
+ tokenizer: TokenizerLike | None,
request: EngineCoreRequest,
) -> "IncrementalDetokenizer":
assert request.sampling_params is not None
@@ -256,7 +256,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
- def __init__(self, tokenizer: AnyTokenizer, request: EngineCoreRequest):
+ def __init__(self, tokenizer: TokenizerLike, request: EngineCoreRequest):
super().__init__(request)
self.tokenizer = tokenizer
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index cfd637931a1ce..e6a94f4e3de5d 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -19,8 +19,7 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import MultiModalCacheStats
@@ -40,7 +39,7 @@ class InputProcessor:
def __init__(
self,
vllm_config: VllmConfig,
- tokenizer: AnyTokenizer | None,
+ tokenizer: TokenizerLike | None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
) -> None:
self.vllm_config = vllm_config
@@ -62,11 +61,11 @@ class InputProcessor:
)
@property
- def tokenizer(self) -> AnyTokenizer | None:
+ def tokenizer(self) -> TokenizerLike | None:
return self.input_preprocessor.tokenizer
@tokenizer.setter
- def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
+ def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
self.input_preprocessor.tokenizer = tokenizer
def _validate_logprobs(
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index ead553e98a978..a3bde7ba8d64d 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -23,8 +23,9 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
+from vllm.tokenizers import TokenizerLike
from vllm.tracing import init_tracer
-from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
+from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
@@ -95,9 +96,10 @@ class LLMEngine:
)
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
- stream_interval = self.vllm_config.scheduler_config.stream_interval
self.output_processor = OutputProcessor(
- self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval
+ self.tokenizer,
+ log_stats=self.log_stats,
+ stream_interval=self.vllm_config.scheduler_config.stream_interval,
)
endpoint = self.observability_config.otlp_traces_endpoint
if endpoint is not None:
@@ -350,17 +352,17 @@ class LLMEngine:
return get_metrics_snapshot()
@property
- def tokenizer(self) -> AnyTokenizer | None:
+ def tokenizer(self) -> TokenizerLike | None:
return self.input_processor.tokenizer
@tokenizer.setter
- def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
+ def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
self.input_processor.tokenizer = tokenizer
- def get_tokenizer(self) -> AnyTokenizer:
+ def get_tokenizer(self) -> TokenizerLike:
if self.tokenizer is None:
raise ValueError(
- "Unable to get tokenizer because skip_tokenizer_init is True"
+ "Unable to get tokenizer because `skip_tokenizer_init=True`"
)
return self.tokenizer
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 63064a2c65d67..1c8f808bc25ba 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -13,7 +13,7 @@ from vllm.logprobs import (
create_sample_logprobs,
)
from vllm.transformers_utils.detokenizer_utils import (
- AnyTokenizer,
+ TokenizerLike,
convert_ids_list_to_tokens,
)
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
@@ -28,7 +28,7 @@ NONES = itertools.repeat(None)
class LogprobsProcessor:
# Tokenizer for this request,
# None if detokenization is disabled.
- tokenizer: AnyTokenizer | None
+ tokenizer: TokenizerLike | None
# Logprobs for this request
logprobs: SampleLogprobs | None
@@ -40,7 +40,7 @@ class LogprobsProcessor:
@classmethod
def from_new_request(
cls,
- tokenizer: AnyTokenizer | None,
+ tokenizer: TokenizerLike | None,
request: EngineCoreRequest,
) -> "LogprobsProcessor":
sampling_params = request.sampling_params
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 0453c4a77f0cd..e85fbb4ee0fb0 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -15,8 +15,8 @@ from vllm.outputs import (
RequestOutput,
)
from vllm.sampling_params import RequestOutputKind
+from vllm.tokenizers import TokenizerLike
from vllm.tracing import SpanAttributes, SpanKind, Tracer, extract_trace_context
-from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
@@ -139,7 +139,7 @@ class RequestState:
@classmethod
def from_new_request(
cls,
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike | None,
request: EngineCoreRequest,
prompt: str | None,
parent_req: ParentRequest | None,
@@ -341,7 +341,10 @@ class OutputProcessor:
"""Process EngineCoreOutputs into RequestOutputs."""
def __init__(
- self, tokenizer: AnyTokenizer, log_stats: bool, stream_interval: int = 1
+ self,
+ tokenizer: TokenizerLike | None,
+ log_stats: bool,
+ stream_interval: int = 1,
):
self.log_stats = log_stats
self.tokenizer = tokenizer
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index 7dc9589b63b86..5c09b7b0634f2 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -10,10 +10,10 @@ if TYPE_CHECKING:
import torch
from vllm.config import VllmConfig
- from vllm.transformers_utils.tokenizer import AnyTokenizer
+ from vllm.tokenizers import TokenizerLike
else:
VllmConfig = object
- AnyTokenizer = object
+ TokenizerLike = object
class StructuredOutputOptions(enum.Enum):
@@ -100,7 +100,7 @@ class StructuredOutputBackend(ABC):
"""Engine-level backend for structured output requests."""
vllm_config: VllmConfig
- tokenizer: AnyTokenizer
+ tokenizer: TokenizerLike
vocab_size: int
@abstractmethod
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index c9f2dc07da786..f8a2df43dd90e 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -10,7 +10,7 @@ import torch
import vllm.envs
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.tokenizers import MistralTokenizer
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import (
StructuredOutputBackend,
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index d2d14fcfc4362..ae42b33f80f88 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2
import xgrammar as xgr
- from vllm.transformers_utils.tokenizer import AnyTokenizer
+ from vllm.tokenizers import TokenizerLike
from vllm.v1.worker.gpu_input_batch import InputBatch
else:
xgr = LazyLoader("xgr", globals(), "xgrammar")
@@ -36,7 +36,7 @@ else:
"transformers.models.gpt2.tokenization_gpt2",
)
- AnyTokenizer = object
+ TokenizerLike = object
SchedulerOutput = object
InputBatch = object
@@ -195,7 +195,7 @@ re_replacement_seq = re.compile(r"^.{0,6}�+.{0,6}$")
def _reduced_vocabulary(
- tokenizer: AnyTokenizer,
+ tokenizer: TokenizerLike,
eos_token_id: int,
) -> dict[bytes, list[int]]:
"""Create a map from vocabulary tokens to lists of equivalent token ids.
@@ -222,7 +222,7 @@ def _reduced_vocabulary(
vocabulary: dict[bytes, list[int]] = {}
empty_token_ids: list[int] = []
for token, token_idx in tokenizer.get_vocab().items():
- if token in tokenizer.all_special_tokens: # type: ignore
+ if token in tokenizer.all_special_tokens:
continue
token_str = convert_token_to_string(token)
@@ -261,7 +261,7 @@ def _reduced_vocabulary(
return vocabulary
-def get_outlines_vocabulary(tokenizer: AnyTokenizer) -> oc.Vocabulary:
+def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary:
"""Get the `Vocabulary` object for a given tokenizer."""
if hasattr(tokenizer, "_outlines_vocabulary"):
return tokenizer._outlines_vocabulary # type: ignore