From 34a984274eae2f8fb9d1d6413abd08d7fcde741c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 29 Nov 2025 20:02:21 +0800 Subject: [PATCH] [Misc] Refactor tokenizer interface (#29693) Signed-off-by: DarkLight1337 --- .buildkite/test-amd.yaml | 4 +- .buildkite/test-pipeline.yaml | 4 +- benchmarks/backend_request_func.py | 2 +- docs/features/reasoning_outputs.md | 5 +- docs/features/tool_calling.md | 2 +- .../entrypoints/openai/test_serving_engine.py | 2 +- .../openai/tool_parsers/conftest.py | 4 +- .../tool_parsers/test_hermes_tool_parser.py | 12 +- .../test_llama3_json_tool_parser.py | 4 +- .../test_llama4_pythonic_tool_parser.py | 10 +- .../tool_parsers/test_olmo3_tool_parser.py | 10 +- .../tool_parsers/test_pythonic_tool_parser.py | 10 +- .../entrypoints/openai/tool_parsers/utils.py | 4 +- tests/entrypoints/test_chat_utils.py | 2 +- .../language/generation/test_mistral.py | 2 +- .../multimodal/generation/test_voxtral.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 4 +- .../multimodal/generation/vlm_utils/types.py | 4 +- .../multimodal/processing/test_common.py | 2 +- tests/multimodal/test_processing.py | 21 ++- .../test_mistral_reasoning_parser.py | 2 +- tests/reasoning/utils.py | 2 +- tests/tokenization/__init__.py | 0 tests/tokenization/test_do_lower_case.py | 18 -- tests/tokenization/test_get_eos.py | 32 ---- tests/tokenization/test_tokenizer.py | 23 --- tests/tokenization/test_tokenizer_registry.py | 120 ------------- tests/tokenizers_/__init__.py | 4 + tests/tokenizers_/test_basic.py | 59 +++++++ .../test_cached_tokenizer.py | 5 +- .../test_detokenize.py | 2 +- .../test_mistral.py} | 21 +-- tests/tokenizers_/test_registry.py | 36 ++++ .../tool_use/test_ernie45_moe_tool_parser.py | 5 +- tests/tool_use/test_jamba_tool_parser.py | 7 +- tests/tool_use/test_qwen3coder_tool_parser.py | 5 +- tests/tool_use/test_seed_oss_tool_parser.py | 5 +- tests/tool_use/test_xlam_tool_parser.py | 5 +- tests/transformers_utils/test_config.py | 74 +++----- ...gs_from_processor.py => test_processor.py} | 0 tests/transformers_utils/test_repo_utils.py | 62 +++++++ tests/v1/engine/test_output_processor.py | 4 +- tools/pre_commit/check_pickle_imports.py | 2 +- tools/pre_commit/mypy.py | 1 + vllm/benchmarks/datasets.py | 4 +- vllm/engine/protocol.py | 4 +- vllm/entrypoints/chat_utils.py | 12 +- vllm/entrypoints/llm.py | 15 +- vllm/entrypoints/openai/serving_chat.py | 13 +- .../openai/serving_classification.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 16 +- vllm/entrypoints/openai/serving_engine.py | 88 +++++----- vllm/entrypoints/openai/serving_responses.py | 20 +-- vllm/entrypoints/openai/serving_score.py | 8 +- .../openai/serving_tokenization.py | 4 +- .../tool_parsers/abstract_tool_parser.py | 4 +- .../tool_parsers/deepseekv31_tool_parser.py | 4 +- .../tool_parsers/deepseekv3_tool_parser.py | 4 +- .../tool_parsers/ernie45_tool_parser.py | 4 +- .../tool_parsers/glm4_moe_tool_parser.py | 4 +- .../granite_20b_fc_tool_parser.py | 4 +- .../tool_parsers/granite_tool_parser.py | 4 +- .../openai/tool_parsers/hermes_tool_parser.py | 8 +- .../tool_parsers/hunyuan_a13b_tool_parser.py | 4 +- .../tool_parsers/internlm2_tool_parser.py | 4 +- .../openai/tool_parsers/jamba_tool_parser.py | 5 +- .../tool_parsers/kimi_k2_tool_parser.py | 4 +- .../tool_parsers/longcat_tool_parser.py | 4 +- .../tool_parsers/minimax_m2_tool_parser.py | 4 +- .../tool_parsers/minimax_tool_parser.py | 4 +- .../tool_parsers/mistral_tool_parser.py | 6 +- .../openai/tool_parsers/openai_tool_parser.py | 6 +- 
.../tool_parsers/qwen3coder_tool_parser.py | 4 +- .../tool_parsers/qwen3xml_tool_parser.py | 4 +- .../tool_parsers/seed_oss_tool_parser.py | 4 +- .../openai/tool_parsers/step3_tool_parser.py | 4 +- .../openai/tool_parsers/xlam_tool_parser.py | 4 +- vllm/entrypoints/renderer.py | 10 +- vllm/entrypoints/score_utils.py | 18 +- vllm/entrypoints/utils.py | 2 +- vllm/inputs/preprocess.py | 10 +- vllm/logits_process.py | 4 +- vllm/model_executor/models/h2ovl.py | 4 +- vllm/model_executor/models/internvl.py | 6 +- .../model_executor/models/nano_nemotron_vl.py | 10 +- vllm/model_executor/models/nemotron_vl.py | 4 +- vllm/model_executor/models/opencua.py | 4 +- vllm/model_executor/models/pixtral.py | 6 +- vllm/model_executor/models/qwen2_vl.py | 4 +- vllm/model_executor/models/skyworkr1v.py | 4 +- vllm/model_executor/models/step3_vl.py | 4 +- vllm/model_executor/models/voxtral.py | 6 +- vllm/multimodal/processing.py | 43 ++--- vllm/multimodal/registry.py | 14 +- vllm/reasoning/abs_reasoning_parsers.py | 6 +- vllm/reasoning/basic_parsers.py | 4 +- vllm/reasoning/minimax_m2_reasoning_parser.py | 4 +- vllm/reasoning/mistral_reasoning_parser.py | 2 +- vllm/reasoning/olmo3_reasoning_parser.py | 4 +- vllm/sampling_params.py | 4 +- vllm/tokenizers/__init__.py | 8 + .../tokenizers/mistral.py | 29 +--- vllm/tokenizers/protocol.py | 105 +++++++++++ vllm/tokenizers/registry.py | 28 +++ vllm/transformers_utils/config.py | 7 +- vllm/transformers_utils/detokenizer_utils.py | 10 +- vllm/transformers_utils/gguf_utils.py | 3 +- vllm/transformers_utils/tokenizer.py | 60 ++++--- vllm/transformers_utils/tokenizer_base.py | 163 +++--------------- .../transformers_utils/tokenizers/__init__.py | 16 -- vllm/v1/engine/async_llm.py | 16 +- vllm/v1/engine/detokenizer.py | 6 +- vllm/v1/engine/input_processor.py | 9 +- vllm/v1/engine/llm_engine.py | 16 +- vllm/v1/engine/logprobs.py | 6 +- vllm/v1/engine/output_processor.py | 9 +- vllm/v1/structured_output/backend_types.py | 6 +- vllm/v1/structured_output/backend_xgrammar.py | 2 +- vllm/v1/structured_output/utils.py | 10 +- 119 files changed, 752 insertions(+), 821 deletions(-) delete mode 100644 tests/tokenization/__init__.py delete mode 100644 tests/tokenization/test_do_lower_case.py delete mode 100644 tests/tokenization/test_get_eos.py delete mode 100644 tests/tokenization/test_tokenizer.py delete mode 100644 tests/tokenization/test_tokenizer_registry.py create mode 100644 tests/tokenizers_/__init__.py create mode 100644 tests/tokenizers_/test_basic.py rename tests/{tokenization => tokenizers_}/test_cached_tokenizer.py (88%) rename tests/{tokenization => tokenizers_}/test_detokenize.py (99%) rename tests/{tokenization/test_mistral_tokenizer.py => tokenizers_/test_mistral.py} (98%) create mode 100644 tests/tokenizers_/test_registry.py rename tests/transformers_utils/{test_get_processor_kwargs_from_processor.py => test_processor.py} (100%) create mode 100644 tests/transformers_utils/test_repo_utils.py create mode 100644 vllm/tokenizers/__init__.py rename vllm/{transformers_utils => }/tokenizers/mistral.py (96%) create mode 100644 vllm/tokenizers/protocol.py create mode 100644 vllm/tokenizers/registry.py delete mode 100644 vllm/transformers_utils/tokenizers/__init__.py diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 4ddf11c0b268f..4d98ee40a4bbb 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -316,7 +316,7 @@ steps: source_file_dependencies: - vllm/ - tests/engine - - tests/tokenization + - tests/tokenizers_ - tests/test_sequence - 
tests/test_config - tests/test_logger @@ -324,7 +324,7 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py # OOM in the CI unless we run this separately - - pytest -v -s tokenization + - pytest -v -s tokenizers_ - label: V1 Test e2e + engine # 30min timeout_in_minutes: 45 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c38068a9b22c0..16d4907549587 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -282,7 +282,7 @@ steps: source_file_dependencies: - vllm/ - tests/engine - - tests/tokenization + - tests/tokenizers_ - tests/test_sequence - tests/test_config - tests/test_logger @@ -290,7 +290,7 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py # OOM in the CI unless we run this separately - - pytest -v -s tokenization + - pytest -v -s tokenizers_ - label: V1 Test e2e + engine # 30min timeout_in_minutes: 45 diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4021fede72153..d69d74ca61f54 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -620,7 +620,7 @@ def get_tokenizer( kwargs["use_fast"] = False if tokenizer_mode == "mistral": try: - from vllm.transformers_utils.tokenizer import MistralTokenizer + from vllm.tokenizers import MistralTokenizer except ImportError as e: raise ImportError( "MistralTokenizer requires vllm package.\n" diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 5f26c7cf182b9..08a0dd69efa90 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -216,14 +216,13 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso # import the required packages from vllm.reasoning import ReasoningParser, ReasoningParserManager - from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) + from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage # define a reasoning parser and register it to vllm # the name list in register_module can be used # in --reasoning-parser. class ExampleParser(ReasoningParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) def extract_reasoning_streaming( diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 22dda37279ac6..b6dfbf10b4568 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -422,7 +422,7 @@ Here is a summary of a plugin file: # in --tool-call-parser. you can define as many # tool parsers as you want here. class ExampleToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) # adjust request. 
e.g.: set skip special tokens diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py index 29892d0bf38aa..956a06dc5487c 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_engine.py @@ -10,7 +10,7 @@ import pytest from vllm.config import ModelConfig from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.tokenizers import MistralTokenizer @pytest.fixture() diff --git a/tests/entrypoints/openai/tool_parsers/conftest.py b/tests/entrypoints/openai/tool_parsers/conftest.py index f2ac5e5b9a8fa..a40d0ab44cf7f 100644 --- a/tests/entrypoints/openai/tool_parsers/conftest.py +++ b/tests/entrypoints/openai/tool_parsers/conftest.py @@ -4,9 +4,9 @@ import pytest from transformers import AutoTokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike @pytest.fixture(scope="function") -def default_tokenizer() -> AnyTokenizer: +def default_tokenizer() -> TokenizerLike: return AutoTokenizer.from_pretrained("gpt2") diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index 38008dafe32b2..b2303ab0e7b7c 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -7,7 +7,7 @@ import pytest from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from ....utils import RemoteOpenAIServer @@ -270,14 +270,14 @@ async def test_streaming_product_tool_call(): @pytest.fixture -def qwen_tokenizer() -> AnyTokenizer: +def qwen_tokenizer() -> TokenizerLike: from vllm.transformers_utils.tokenizer import get_tokenizer return get_tokenizer("Qwen/Qwen3-32B") @pytest.fixture -def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser: +def hermes_parser(qwen_tokenizer: TokenizerLike) -> Hermes2ProToolParser: return Hermes2ProToolParser(qwen_tokenizer) @@ -291,7 +291,7 @@ def any_chat_request() -> ChatCompletionRequest: def test_hermes_parser_streaming_just_forward_text( - qwen_tokenizer: AnyTokenizer, + qwen_tokenizer: TokenizerLike, hermes_parser: Hermes2ProToolParser, any_chat_request: ChatCompletionRequest, ) -> None: @@ -323,7 +323,7 @@ def test_hermes_parser_streaming_just_forward_text( def test_hermes_parser_streaming_failure_case_bug_19056( - qwen_tokenizer: AnyTokenizer, + qwen_tokenizer: TokenizerLike, hermes_parser: Hermes2ProToolParser, any_chat_request: ChatCompletionRequest, ) -> None: @@ -357,7 +357,7 @@ def test_hermes_parser_streaming_failure_case_bug_19056( def test_hermes_parser_streaming( - qwen_tokenizer: AnyTokenizer, + qwen_tokenizer: TokenizerLike, hermes_parser: Hermes2ProToolParser, any_chat_request: ChatCompletionRequest, ) -> None: diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py index 37e52d2cdf609..6c286ca90ce48 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py @@ -7,11 +7,11 
@@ import pytest from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import Llama3JsonToolParser -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike @pytest.fixture -def parser(default_tokenizer: AnyTokenizer): +def parser(default_tokenizer: TokenizerLike): return Llama3JsonToolParser(default_tokenizer) diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py index d297432eab644..8aa88a007188f 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import ( ) from vllm.entrypoints.openai.protocol import FunctionCall from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike # Test cases similar to pythonic parser but with Llama4 specific format SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]" @@ -64,7 +64,7 @@ PYTHON_TAG_FUNCTION_OUTPUT = ( @pytest.mark.parametrize("streaming", [True, False]) -def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer): +def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike): tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( default_tokenizer ) @@ -208,7 +208,7 @@ def test_tool_call( streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall], - default_tokenizer: AnyTokenizer, + default_tokenizer: TokenizerLike, ): tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( default_tokenizer @@ -224,7 +224,7 @@ def test_tool_call( assert actual.function == expected -def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer): +def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike): tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( default_tokenizer ) @@ -246,7 +246,7 @@ def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer): @pytest.mark.parametrize("streaming", [False]) -def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer): +def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike): """test regex timeout is handled gracefully""" tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( default_tokenizer diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py index 13cff9a8ebf1e..a0b9a3c563bc2 100644 --- a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py @@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import ( ) from vllm.entrypoints.openai.protocol import FunctionCall from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike # https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1 SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')" @@ -69,7 +69,7 @@ 
ESCAPED_STRING_FUNCTION_CALL = FunctionCall( @pytest.mark.parametrize("streaming", [True, False]) -def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer): +def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike): tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")( default_tokenizer ) @@ -188,7 +188,7 @@ def test_tool_call( streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall], - default_tokenizer: AnyTokenizer, + default_tokenizer: TokenizerLike, ): tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")( default_tokenizer @@ -205,7 +205,7 @@ def test_tool_call( assert actual.function == expected -def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer): +def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike): tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")( default_tokenizer ) @@ -228,7 +228,7 @@ def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer): @pytest.mark.parametrize("streaming", [False]) -def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer): +def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike): """test regex timeout is handled gracefully""" tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")( default_tokenizer diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index fcd3df16e5cfa..52202c55e8405 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import ( ) from vllm.entrypoints.openai.protocol import FunctionCall from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike # https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1 SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')" @@ -61,7 +61,7 @@ ESCAPED_STRING_FUNCTION_CALL = FunctionCall( @pytest.mark.parametrize("streaming", [True, False]) -def test_no_tool_call(streaming: bool, default_tokenizer: AnyTokenizer): +def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike): tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( default_tokenizer ) @@ -168,7 +168,7 @@ def test_tool_call( streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall], - default_tokenizer: AnyTokenizer, + default_tokenizer: TokenizerLike, ): tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( default_tokenizer @@ -185,7 +185,7 @@ def test_tool_call( assert actual.function == expected -def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer): +def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike): tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( default_tokenizer ) @@ -208,7 +208,7 @@ def test_streaming_tool_call_with_large_steps(default_tokenizer: AnyTokenizer): @pytest.mark.parametrize("streaming", [False]) -def test_regex_timeout_handling(streaming: bool, default_tokenizer: AnyTokenizer): +def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike): """test regex timeout 
is handled gracefully""" tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( default_tokenizer diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index 38899f2632554..2d4f5f1734102 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers import ToolParser -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike class StreamingToolReconstructor: @@ -111,7 +111,7 @@ def run_tool_extraction_nonstreaming( return tool_parser.extract_tool_calls(model_output, request) -def split_string_into_token_deltas(tokenizer: AnyTokenizer, text: str) -> list[str]: +def split_string_into_token_deltas(tokenizer: TokenizerLike, text: str) -> list[str]: # Split a string into a series of deltas using the provided tokenizer. Each # delta will be the string equivalent of a single token. token_ids = tokenizer.encode(text, add_special_tokens=False) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 7baf564ad01a4..a351cda60621f 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -28,8 +28,8 @@ from vllm.multimodal.utils import ( encode_image_base64, encode_video_base64, ) +from vllm.tokenizers import MistralTokenizer from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from ..models.registry import HF_EXAMPLE_MODELS from ..utils import VLLM_PATH diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index 80e337d570a36..1377776a6d84b 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -10,7 +10,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolParser, ) from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer import MistralTokenizer +from vllm.tokenizers import MistralTokenizer from ...utils import check_logprobs_close diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py index 18a50c3a555da..9e9087cb0fc4d 100644 --- a/tests/models/multimodal/generation/test_voxtral.py +++ b/tests/models/multimodal/generation/test_voxtral.py @@ -9,7 +9,7 @@ from mistral_common.audio import Audio from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk from mistral_common.protocol.instruct.messages import UserMessage -from vllm.transformers_utils.tokenizer import MistralTokenizer +from vllm.tokenizers import MistralTokenizer from ....conftest import AudioTestAssets from ....utils import RemoteOpenAIServer diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 03ff3bcf6307b..08cf4b2202dcd 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -9,7 +9,7 @@ import torch from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config.model import RunnerOption -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from .....conftest import HfRunner, VllmRunner from ....registry import 
HF_EXAMPLE_MODELS @@ -33,7 +33,7 @@ def run_test( auto_cls: type[_BaseAutoModelClass], use_tokenizer_eos: bool, comparator: Callable[..., None], - get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None, + get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None, stop_str: list[str] | None, limit_mm_per_prompt: dict[str, int], vllm_runner_kwargs: dict[str, Any] | None, diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 5c1bc6ac28fe3..0c03c84497125 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -14,7 +14,7 @@ from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config.model import RunnerOption from vllm.logprobs import SampleLogprobs -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from .....conftest import ( AUDIO_ASSETS, @@ -126,7 +126,7 @@ class VLMTestInfo(NamedTuple): vllm_runner_kwargs: dict[str, Any] | None = None # Optional callable which gets a list of token IDs from the model tokenizer - get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None = None + get_stop_token_ids: Callable[[TokenizerLike], list[int]] | None = None # Optional list of strings to stop generation, useful when stop tokens are # not special tokens in the tokenizer stop_str: list[str] | None = None diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 9638791ab5caa..c39e522100901 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -22,8 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext +from vllm.tokenizers import MistralTokenizer from vllm.transformers_utils.tokenizer import ( - MistralTokenizer, cached_tokenizer_from_config, encode_tokens, ) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index d860c50e7899a..f7fa8da54d54e 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import time from contextlib import nullcontext from typing import cast @@ -23,7 +24,7 @@ from vllm.multimodal.processing import ( replace_token_matches, ) from vllm.multimodal.profiling import MultiModalProfiler -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from .utils import random_image @@ -238,7 +239,7 @@ def test_find_token_matches( update_type, ): # Should not be used since there is nothing to convert to token IDs - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) prompt_updates = { key: update_type(key, target, []).resolve(0) @@ -385,7 +386,7 @@ def test_find_text_matches( update_type, ): # Should not be used since there is nothing to convert to text - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) prompt_updates = { key: update_type(key, target, []).resolve(0) @@ -545,7 +546,7 @@ def test_find_update_text( expected_by_update_type_mm_count, ): # Should not be used since 
there is nothing to convert to text - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) for ( update_type, @@ -750,7 +751,7 @@ def test_find_update_tokens( expected_by_update_type_mm_count, ): # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) for ( update_type, @@ -900,7 +901,7 @@ def test_find_mm_placeholders( update_type, ): # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) mm_prompt_updates = { key: [[update_type(key, [], repl).resolve(i)] for i in range(3)] @@ -1029,7 +1030,7 @@ def test_hf_processor_init_kwargs( expected_kwargs, ): # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) ctx = InputProcessingContext( model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), @@ -1065,7 +1066,7 @@ def test_hf_processor_call_kwargs( expected_kwargs, ): # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) ctx = InputProcessingContext( model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), @@ -1088,9 +1089,7 @@ def test_apply_matches_no_match_exits_quickly(): With the fix, it should exit immediately when no match is found. """ - import time - - mock_tokenizer = cast(AnyTokenizer, object()) + mock_tokenizer = cast(TokenizerLike, object()) # Create a long prompt with no placeholder long_prompt = "x" * 10000 diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py index 5163c863863a7..0fe315c2567f9 100644 --- a/tests/reasoning/test_mistral_reasoning_parser.py +++ b/tests/reasoning/test_mistral_reasoning_parser.py @@ -5,7 +5,7 @@ import pytest from tests.reasoning.utils import run_reasoning_extraction_mistral from vllm.reasoning import ReasoningParser, ReasoningParserManager -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.tokenizers import MistralTokenizer parser_name = "mistral" diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py index bd0b230a847cb..695312a0cadfe 100644 --- a/tests/reasoning/utils.py +++ b/tests/reasoning/utils.py @@ -4,7 +4,7 @@ from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from vllm.reasoning import ReasoningParser -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.tokenizers import MistralTokenizer class StreamingReasoningReconstructor: diff --git a/tests/tokenization/__init__.py b/tests/tokenization/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/tokenization/test_do_lower_case.py b/tests/tokenization/test_do_lower_case.py deleted file mode 100644 index 8aff50b351e31..0000000000000 --- a/tests/tokenization/test_do_lower_case.py +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.transformers_utils.tokenizer import get_tokenizer - -TOKENIZER_NAMES = ["BAAI/bge-base-en"] - - -@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) -@pytest.mark.parametrize("n_tokens", [510]) -def test_special_tokens(tokenizer_name: str, n_tokens: int): - 
tokenizer = get_tokenizer(tokenizer_name, revision="main") - - prompts = "[UNK]" * n_tokens - prompt_token_ids = tokenizer.encode(prompts) - assert len(prompt_token_ids) == n_tokens + 2 diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py deleted file mode 100644 index 921d77b1b335e..0000000000000 --- a/tests/tokenization/test_get_eos.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This test file includes some cases where it is inappropriate to -only get the `eos_token_id` from the tokenizer as defined by -{meth}`vllm.LLMEngine._get_eos_token_id`. -""" - -from vllm.transformers_utils.config import try_get_generation_config -from vllm.transformers_utils.tokenizer import get_tokenizer - - -def test_get_llama3_eos_token(): - model_name = "meta-llama/Llama-3.2-1B-Instruct" - - tokenizer = get_tokenizer(model_name) - assert tokenizer.eos_token_id == 128009 - - generation_config = try_get_generation_config(model_name, trust_remote_code=False) - assert generation_config is not None - assert generation_config.eos_token_id == [128001, 128008, 128009] - - -def test_get_blip2_eos_token(): - model_name = "Salesforce/blip2-opt-2.7b" - - tokenizer = get_tokenizer(model_name) - assert tokenizer.eos_token_id == 2 - - generation_config = try_get_generation_config(model_name, trust_remote_code=False) - assert generation_config is not None - assert generation_config.eos_token_id == 50118 diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py deleted file mode 100644 index e86bb03883b5e..0000000000000 --- a/tests/tokenization/test_tokenizer.py +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -from transformers import PreTrainedTokenizerBase - -from vllm.transformers_utils.tokenizer import get_tokenizer - -TOKENIZER_NAMES = [ - "facebook/opt-125m", - "gpt2", -] - - -@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) -def test_tokenizer_revision(tokenizer_name: str): - # Assume that "main" branch always exists - tokenizer = get_tokenizer(tokenizer_name, revision="main") - assert isinstance(tokenizer, PreTrainedTokenizerBase) - - # Assume that "never" branch always does not exist - with pytest.raises(OSError, match="not a valid git identifier"): - get_tokenizer(tokenizer_name, revision="never") diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py deleted file mode 100644 index f13bb4333d619..0000000000000 --- a/tests/tokenization/test_tokenizer_registry.py +++ /dev/null @@ -1,120 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import TYPE_CHECKING, Any - -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.transformers_utils.tokenizer_base import TokenizerBase, TokenizerRegistry - -if TYPE_CHECKING: - from vllm.entrypoints.chat_utils import ChatCompletionMessageParam - - -class TestTokenizer(TokenizerBase): - @classmethod - def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer": - return TestTokenizer() - - @property - def all_special_tokens(self) -> list[str]: - raise NotImplementedError() - - @property - def all_special_ids(self) -> list[int]: - raise NotImplementedError() - - @property - def bos_token_id(self) -> int: - return 0 - - @property - def eos_token_id(self) -> 
int: - return 1 - - @property - def sep_token(self) -> str: - raise NotImplementedError() - - @property - def pad_token(self) -> str: - raise NotImplementedError() - - @property - def is_fast(self) -> bool: - raise NotImplementedError() - - @property - def vocab_size(self) -> int: - raise NotImplementedError() - - @property - def max_token_id(self) -> int: - raise NotImplementedError() - - @property - def truncation_side(self) -> str: - raise NotImplementedError() - - def __call__( - self, - text: str | list[str] | list[int], - text_pair: str | None = None, - add_special_tokens: bool = False, - truncation: bool = False, - max_length: int | None = None, - ): - raise NotImplementedError() - - def get_vocab(self) -> dict[str, int]: - raise NotImplementedError() - - def get_added_vocab(self) -> dict[str, int]: - raise NotImplementedError() - - def encode_one( - self, - text: str, - truncation: bool = False, - max_length: int | None = None, - ) -> list[int]: - raise NotImplementedError() - - def encode(self, text: str, add_special_tokens: bool | None = None) -> list[int]: - raise NotImplementedError() - - def apply_chat_template( - self, - messages: list["ChatCompletionMessageParam"], - tools: list[dict[str, Any]] | None = None, - **kwargs, - ) -> list[int]: - raise NotImplementedError() - - def convert_tokens_to_string(self, tokens: list[str]) -> str: - raise NotImplementedError() - - def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str: - raise NotImplementedError() - - def convert_ids_to_tokens( - self, - ids: list[int], - skip_special_tokens: bool = True, - ) -> list[str]: - raise NotImplementedError() - - -def test_customized_tokenizer(): - TokenizerRegistry.register( - "test_tokenizer", "tests.tokenization.test_tokenizer_registry", "TestTokenizer" - ) - - tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer") - assert isinstance(tokenizer, TestTokenizer) - assert tokenizer.bos_token_id == 0 - assert tokenizer.eos_token_id == 1 - - tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom") - assert isinstance(tokenizer, TestTokenizer) - assert tokenizer.bos_token_id == 0 - assert tokenizer.eos_token_id == 1 diff --git a/tests/tokenizers_/__init__.py b/tests/tokenizers_/__init__.py new file mode 100644 index 0000000000000..a5d7f4b031032 --- /dev/null +++ b/tests/tokenizers_/__init__.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# NOTE: Since CI runs the tests from the `tests` directory, it is necessary to rename +# this module to avoid conflicting with HF's `tokenizers` package diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py new file mode 100644 index 0000000000000..1fca633cc5cd7 --- /dev/null +++ b/tests/tokenizers_/test_basic.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import _get_protocol_attrs # type: ignore + +import pytest +from transformers import PreTrainedTokenizerBase + +from vllm.tokenizers import TokenizerLike +from vllm.transformers_utils.tokenizer import get_tokenizer + + +def _get_missing_attrs(obj: object, target: type): + return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)] + + +def test_tokenizer_like_protocol(): + assert not ( + missing_attrs := _get_missing_attrs( + get_tokenizer("gpt2", use_fast=False), + TokenizerLike, + ) + ), f"Missing attrs: {missing_attrs}" + + assert not ( + missing_attrs := 
_get_missing_attrs( + get_tokenizer("gpt2", use_fast=True), + TokenizerLike, + ) + ), f"Missing attrs: {missing_attrs}" + + assert not ( + missing_attrs := _get_missing_attrs( + get_tokenizer( + "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral" + ), + TokenizerLike, + ) + ), f"Missing attrs: {missing_attrs}" + + +@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"]) +def test_tokenizer_revision(tokenizer_name: str): + # Assume that "main" branch always exists + tokenizer = get_tokenizer(tokenizer_name, revision="main") + assert isinstance(tokenizer, PreTrainedTokenizerBase) + + # Assume that "never" branch always does not exist + with pytest.raises(OSError, match="not a valid git identifier"): + get_tokenizer(tokenizer_name, revision="never") + + +@pytest.mark.parametrize("tokenizer_name", ["BAAI/bge-base-en"]) +@pytest.mark.parametrize("n_tokens", [510]) +def test_special_tokens(tokenizer_name: str, n_tokens: int): + tokenizer = get_tokenizer(tokenizer_name, revision="main") + + prompts = "[UNK]" * n_tokens + prompt_token_ids = tokenizer.encode(prompts) + assert len(prompt_token_ids) == n_tokens + 2 diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenizers_/test_cached_tokenizer.py similarity index 88% rename from tests/tokenization/test_cached_tokenizer.py rename to tests/tokenizers_/test_cached_tokenizer.py index a5bb3dbcfe29d..48234687ea1ea 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenizers_/test_cached_tokenizer.py @@ -6,7 +6,8 @@ from copy import deepcopy import pytest from transformers import AutoTokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_cached_tokenizer +from vllm.tokenizers import TokenizerLike +from vllm.transformers_utils.tokenizer import get_cached_tokenizer @pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"]) @@ -25,7 +26,7 @@ def test_cached_tokenizer(model_id: str): _check_consistency(unpickled_tokenizer, reference_tokenizer) -def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer): +def _check_consistency(target: TokenizerLike, expected: TokenizerLike): assert isinstance(target, type(expected)) # Cached attributes diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenizers_/test_detokenize.py similarity index 99% rename from tests/tokenization/test_detokenize.py rename to tests/tokenizers_/test_detokenize.py index f4b43a21daaa8..ae1d6b0956722 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenizers_/test_detokenize.py @@ -8,7 +8,7 @@ import pytest from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.tokenizers import MistralTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import ( FastIncrementalDetokenizer, diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenizers_/test_mistral.py similarity index 98% rename from tests/tokenization/test_mistral_tokenizer.py rename to tests/tokenizers_/test_mistral.py index 4cdfa9df95e1a..0706a94791dc9 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenizers_/test_mistral.py @@ -7,7 +7,7 @@ import pytest from mistral_common.exceptions import InvalidMessageStructureException from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy -from vllm.transformers_utils.tokenizers.mistral import ( +from vllm.tokenizers.mistral 
import ( MistralTokenizer, _prepare_apply_chat_template_tools_and_messages, ) @@ -308,25 +308,6 @@ class TestMistralTokenizer: def test_get_added_vocab(self, mistral_tokenizer: MistralTokenizer): assert mistral_tokenizer.get_added_vocab() == {} - def test_encode_one(self, mistral_tokenizer: MistralTokenizer): - token_ids = ( - [22177, 4304, 2662] if mistral_tokenizer.is_tekken else [23325, 2294, 1686] - ) - - assert mistral_tokenizer.encode_one("Hello world !") == token_ids - assert mistral_tokenizer.encode_one("Hello world !", max_length=1) == token_ids - assert ( - mistral_tokenizer.encode_one("Hello world !", truncation=True, max_length=1) - == token_ids[:-2] - ) - assert ( - mistral_tokenizer.encode_one( - "Hello world !", truncation=False, max_length=1 - ) - == token_ids - ) - assert mistral_tokenizer.encode_one("") == [] - def test_encode(self, mistral_tokenizer: MistralTokenizer): token_ids = ( [1, 22177, 4304, 2662] diff --git a/tests/tokenizers_/test_registry.py b/tests/tokenizers_/test_registry.py new file mode 100644 index 0000000000000..1eb19a0996dd9 --- /dev/null +++ b/tests/tokenizers_/test_registry.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.tokenizers import TokenizerLike, TokenizerRegistry +from vllm.transformers_utils.tokenizer import get_tokenizer + + +class TestTokenizer(TokenizerLike): + @classmethod + def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer": + return TestTokenizer() # type: ignore + + @property + def bos_token_id(self) -> int: + return 0 + + @property + def eos_token_id(self) -> int: + return 1 + + +def test_customized_tokenizer(): + TokenizerRegistry.register( + "test_tokenizer", + __name__, + TestTokenizer.__name__, + ) + + tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer") + assert isinstance(tokenizer, TestTokenizer) + assert tokenizer.bos_token_id == 0 + assert tokenizer.eos_token_id == 1 + + tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom") + assert isinstance(tokenizer, TestTokenizer) + assert tokenizer.bos_token_id == 0 + assert tokenizer.eos_token_id == 1 diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py index 36a07bb561d9e..ee9da4fd6464b 100644 --- a/tests/tool_use/test_ernie45_moe_tool_parser.py +++ b/tests/tool_use/test_ernie45_moe_tool_parser.py @@ -14,8 +14,9 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from vllm.transformers_utils.tokenizer import get_tokenizer # Use a common model that is likely to be available MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking" @@ -173,7 +174,7 @@ def test_extract_tool_calls( def stream_delta_message_generator( ernie45_tool_parser: Ernie45ToolParser, - ernie45_tokenizer: AnyTokenizer, + ernie45_tokenizer: TokenizerLike, model_output: str, request: ChatCompletionRequest | None = None, ) -> Generator[DeltaMessage, None, None]: diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index 9eb73b80fa9b4..2413b983fe871 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -10,8 +10,9 @@ from partial_json_parser.core.options import Allow from 
vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test @@ -44,7 +45,9 @@ def assert_tool_calls( def stream_delta_message_generator( - jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer, model_output: str + jamba_tool_parser: JambaToolParser, + jamba_tokenizer: TokenizerLike, + model_output: str, ) -> Generator[DeltaMessage, None, None]: all_token_ids = jamba_tokenizer.encode(model_output, add_special_tokens=False) diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index 93ef1049fc07e..3cf1f4ef89f14 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -17,8 +17,9 @@ from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( Qwen3CoderToolParser, ) from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test @@ -104,7 +105,7 @@ def assert_tool_calls( def stream_delta_message_generator( qwen3_tool_parser, - qwen3_tokenizer: AnyTokenizer, + qwen3_tokenizer: TokenizerLike, model_output: str, request: ChatCompletionRequest | None = None, ) -> Generator[DeltaMessage, None, None]: diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py index 1367ad87cb019..8e1ad5e9cedc8 100644 --- a/tests/tool_use/test_seed_oss_tool_parser.py +++ b/tests/tool_use/test_seed_oss_tool_parser.py @@ -15,8 +15,9 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test @@ -256,7 +257,7 @@ def test_streaming_tool_calls_no_tools(seed_oss_tool_parser): def stream_delta_message_generator( seed_oss_tool_parser: SeedOssToolParser, - seed_oss_tokenizer: AnyTokenizer, + seed_oss_tokenizer: TokenizerLike, model_output: str, request: ChatCompletionRequest | None = None, ) -> Generator[DeltaMessage, None, None]: diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py index 122b427d60409..a1852c368eeb8 100644 --- a/tests/tool_use/test_xlam_tool_parser.py +++ b/tests/tool_use/test_xlam_tool_parser.py @@ -13,8 +13,9 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = 
pytest.mark.cpu_test @@ -49,7 +50,7 @@ def assert_tool_calls( def stream_delta_message_generator( xlam_tool_parser: xLAMToolParser, - xlam_tokenizer: AnyTokenizer, + xlam_tokenizer: TokenizerLike, model_output: str, request: ChatCompletionRequest | None = None, ) -> Generator[DeltaMessage, None, None]: diff --git a/tests/transformers_utils/test_config.py b/tests/transformers_utils/test_config.py index 7107ad0f7b99d..7b56c9f0189d4 100644 --- a/tests/transformers_utils/test_config.py +++ b/tests/transformers_utils/test_config.py @@ -1,62 +1,32 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This test file includes some cases where it is inappropriate to +only get the `eos_token_id` from the tokenizer as defined by +`vllm.LLMEngine._get_eos_token_id`. +""" + +from vllm.transformers_utils.config import try_get_generation_config +from vllm.transformers_utils.tokenizer import get_tokenizer -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, call, patch +def test_get_llama3_eos_token(): + model_name = "meta-llama/Llama-3.2-1B-Instruct" -import pytest + tokenizer = get_tokenizer(model_name) + assert tokenizer.eos_token_id == 128009 -from vllm.transformers_utils.repo_utils import list_filtered_repo_files + generation_config = try_get_generation_config(model_name, trust_remote_code=False) + assert generation_config is not None + assert generation_config.eos_token_id == [128001, 128008, 128009] -@pytest.mark.parametrize( - "allow_patterns,expected_relative_files", - [ - ( - ["*.json", "correct*.txt"], - ["json_file.json", "subfolder/correct.txt", "correct_2.txt"], - ), - ], -) -def test_list_filtered_repo_files( - allow_patterns: list[str], expected_relative_files: list[str] -): - with tempfile.TemporaryDirectory() as tmp_dir: - # Prep folder and files - path_tmp_dir = Path(tmp_dir) - subfolder = path_tmp_dir / "subfolder" - subfolder.mkdir() - (path_tmp_dir / "json_file.json").touch() - (path_tmp_dir / "correct_2.txt").touch() - (path_tmp_dir / "uncorrect.txt").touch() - (path_tmp_dir / "uncorrect.jpeg").touch() - (subfolder / "correct.txt").touch() - (subfolder / "uncorrect_sub.txt").touch() +def test_get_blip2_eos_token(): + model_name = "Salesforce/blip2-opt-2.7b" - def _glob_path() -> list[str]: - return [ - str(file.relative_to(path_tmp_dir)) - for file in path_tmp_dir.glob("**/*") - if file.is_file() - ] + tokenizer = get_tokenizer(model_name) + assert tokenizer.eos_token_id == 2 - # Patch list_repo_files called by fn - with patch( - "vllm.transformers_utils.repo_utils.list_repo_files", - MagicMock(return_value=_glob_path()), - ) as mock_list_repo_files: - out_files = sorted( - list_filtered_repo_files( - tmp_dir, allow_patterns, "revision", "model", "token" - ) - ) - assert out_files == sorted(expected_relative_files) - assert mock_list_repo_files.call_count == 1 - assert mock_list_repo_files.call_args_list[0] == call( - repo_id=tmp_dir, - revision="revision", - repo_type="model", - token="token", - ) + generation_config = try_get_generation_config(model_name, trust_remote_code=False) + assert generation_config is not None + assert generation_config.eos_token_id == 50118 diff --git a/tests/transformers_utils/test_get_processor_kwargs_from_processor.py b/tests/transformers_utils/test_processor.py similarity index 100% rename from tests/transformers_utils/test_get_processor_kwargs_from_processor.py rename to tests/transformers_utils/test_processor.py diff --git 
a/tests/transformers_utils/test_repo_utils.py b/tests/transformers_utils/test_repo_utils.py new file mode 100644 index 0000000000000..7107ad0f7b99d --- /dev/null +++ b/tests/transformers_utils/test_repo_utils.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +import pytest + +from vllm.transformers_utils.repo_utils import list_filtered_repo_files + + +@pytest.mark.parametrize( + "allow_patterns,expected_relative_files", + [ + ( + ["*.json", "correct*.txt"], + ["json_file.json", "subfolder/correct.txt", "correct_2.txt"], + ), + ], +) +def test_list_filtered_repo_files( + allow_patterns: list[str], expected_relative_files: list[str] +): + with tempfile.TemporaryDirectory() as tmp_dir: + # Prep folder and files + path_tmp_dir = Path(tmp_dir) + subfolder = path_tmp_dir / "subfolder" + subfolder.mkdir() + (path_tmp_dir / "json_file.json").touch() + (path_tmp_dir / "correct_2.txt").touch() + (path_tmp_dir / "uncorrect.txt").touch() + (path_tmp_dir / "uncorrect.jpeg").touch() + (subfolder / "correct.txt").touch() + (subfolder / "uncorrect_sub.txt").touch() + + def _glob_path() -> list[str]: + return [ + str(file.relative_to(path_tmp_dir)) + for file in path_tmp_dir.glob("**/*") + if file.is_file() + ] + + # Patch list_repo_files called by fn + with patch( + "vllm.transformers_utils.repo_utils.list_repo_files", + MagicMock(return_value=_glob_path()), + ) as mock_list_repo_files: + out_files = sorted( + list_filtered_repo_files( + tmp_dir, allow_patterns, "revision", "model", "token" + ) + ) + assert out_files == sorted(expected_relative_files) + assert mock_list_repo_files.call_count == 1 + assert mock_list_repo_files.call_args_list[0] == call( + repo_id=tmp_dir, + revision="revision", + repo_type="model", + token="token", + ) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 8e1198b315bd1..990aa9d925855 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -18,7 +18,7 @@ from vllm.logprobs import PromptLogprobs, SampleLogprobs from vllm.lora.request import LoRARequest from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.v1.engine import ( EngineCoreEvent, EngineCoreEventType, @@ -31,7 +31,7 @@ from vllm.v1.metrics.stats import IterationStats, SchedulerStats def _ref_convert_id_to_token( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, token_id: int, ) -> str: """Reference impl of logprobs detokenization. 
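
The pattern repeated across the hunks above is mechanical: the `AnyTokenizer` union imported from `vllm.transformers_utils.tokenizer` is replaced by the `TokenizerLike` protocol exported from the new `vllm.tokenizers` package, and `MistralTokenizer` moves there as well. A minimal before/after sketch of a downstream call site (the helper name `count_tokens` is illustrative and not part of this patch; the `encode` call mirrors the one used in `split_string_into_token_deltas` above):

```python
# Before this patch, call sites imported the union type:
#   from vllm.transformers_utils.tokenizer import AnyTokenizer
# After this patch, they import the protocol instead:
from vllm.tokenizers import TokenizerLike


def count_tokens(tokenizer: TokenizerLike, text: str) -> int:
    # TokenizerLike exposes the same HF-style surface the union did,
    # so existing call sites only need the import and annotation updated.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)
```
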
diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py index b96a6701333de..2bb468da68c2a 100644 --- a/tools/pre_commit/check_pickle_imports.py +++ b/tools/pre_commit/check_pickle_imports.py @@ -27,8 +27,8 @@ ALLOWED_FILES = { "vllm/distributed/device_communicators/shm_broadcast.py", "vllm/distributed/device_communicators/shm_object_storage.py", "vllm/utils/hashing.py", + "tests/tokenizers_/test_cached_tokenizer.py", "tests/utils_/test_hashing.py", - "tests/tokenization/test_cached_tokenizer.py", "benchmarks/kernels/graph_machete_bench.py", "benchmarks/kernels/benchmark_lora.py", "benchmarks/kernels/benchmark_machete.py", diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 47e01fc93b48b..724b393044266 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -35,6 +35,7 @@ FILES = [ "vllm/multimodal", "vllm/platforms", "vllm/plugins", + "vllm/tokenizers", "vllm/transformers_utils", "vllm/triton_utils", "vllm/usage", diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 5411ecbb27b27..ec9b0fd6e969c 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -39,7 +39,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict from vllm.multimodal.image import convert_image_mode -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.import_utils import PlaceholderModule try: @@ -293,7 +293,7 @@ def lora_path_on_disk(lora_path: str) -> str: # Global cache for LoRA tokenizers. -lora_tokenizer_cache: dict[int, AnyTokenizer] = {} +lora_tokenizer_cache: dict[int, TokenizerLike] = {} def process_image(image: Any) -> Mapping[str, Any]: diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 02741e50f6aa0..f2b19c845018c 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -13,7 +13,7 @@ from vllm.plugins.io_processors import IOProcessor from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.input_processor import InputProcessor @@ -85,7 +85,7 @@ class EngineClient(ABC): ... @abstractmethod - async def get_tokenizer(self) -> AnyTokenizer: + async def get_tokenizer(self) -> TokenizerLike: """Get the tokenizer""" ... 
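
For plugin authors, the registry entry points exercised by the new `tests/tokenizers_/test_registry.py` are the intended extension surface: subclass `TokenizerLike`, register the class by name, module, and class name, then resolve it either directly or via `get_tokenizer(..., tokenizer_mode="custom")`. A condensed sketch of that flow, mirroring the test (the class name `MyTokenizer` is illustrative; as in the test, only a couple of protocol members are overridden):

```python
from vllm.tokenizers import TokenizerLike, TokenizerRegistry
from vllm.transformers_utils.tokenizer import get_tokenizer


class MyTokenizer(TokenizerLike):
    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> "MyTokenizer":
        return MyTokenizer()  # type: ignore

    @property
    def bos_token_id(self) -> int:
        return 0

    @property
    def eos_token_id(self) -> int:
        return 1


# Register by (name, module, class name), then resolve through either path.
TokenizerRegistry.register("my_tokenizer", __name__, MyTokenizer.__name__)

tokenizer = TokenizerRegistry.get_tokenizer("my_tokenizer")
assert isinstance(tokenizer, MyTokenizer)

tokenizer = get_tokenizer("my_tokenizer", tokenizer_mode="custom")
assert tokenizer.eos_token_id == 1
```
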
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index bf80856c1bbfc..1643906894c66 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -49,9 +49,9 @@ from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector +from vllm.tokenizers import MistralTokenizer, TokenizerLike from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path from vllm.transformers_utils.processor import cached_get_processor -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import random_uuid from vllm.utils.func_utils import supports_kw @@ -536,7 +536,7 @@ def resolve_hf_chat_template( def _resolve_chat_template_content_format( chat_template: str | None, tools: list[dict[str, Any]] | None, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, model_config: ModelConfig, ) -> _ChatTemplateContentFormat: @@ -593,7 +593,7 @@ def resolve_chat_template_content_format( chat_template: str | None, tools: list[dict[str, Any]] | None, given_format: ChatTemplateContentFormatOption, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, model_config: ModelConfig, ) -> _ChatTemplateContentFormat: @@ -627,7 +627,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): maximum per prompt. """ - def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + def __init__(self, model_config: ModelConfig, tokenizer: TokenizerLike): super().__init__() self._model_config = model_config @@ -1592,7 +1592,7 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: def parse_chat_messages( messages: list[ChatCompletionMessageParam], model_config: ModelConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], @@ -1624,7 +1624,7 @@ def parse_chat_messages( def parse_chat_messages_futures( messages: list[ChatCompletionMessageParam], model_config: ModelConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2b34f36253edf..4ea213752e394 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -71,11 +71,8 @@ from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams from vllm.tasks import PoolingTask -from vllm.transformers_utils.tokenizer import ( - AnyTokenizer, - MistralTokenizer, - get_cached_tokenizer, -) +from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.transformers_utils.tokenizer import get_cached_tokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils.collection_utils import as_iter, is_list_of from vllm.utils.counter import Counter @@ -350,11 +347,11 @@ class LLM: self.input_processor = self.llm_engine.input_processor self.io_processor = self.llm_engine.io_processor - def get_tokenizer(self) -> AnyTokenizer: + def get_tokenizer(self) -> TokenizerLike: return self.llm_engine.get_tokenizer() @deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.") - def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: + def set_tokenizer(self, tokenizer: TokenizerLike) -> None: 
# Since CachedTokenizer is created dynamically, we have no choice but to # compare class names. This can misjudge user-defined tokenizers whose # class names start with 'Cached' @@ -1244,7 +1241,7 @@ class LLM: def _embedding_score( self, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, text_1: list[str | TextPrompt | TokensPrompt], text_2: list[str | TextPrompt | TokensPrompt], truncate_prompt_tokens: int | None = None, @@ -1276,7 +1273,7 @@ class LLM: def _cross_encoding_score( self, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, data_1: list[str] | list[ScoreContentPartParam], data_2: list[str] | list[ScoreContentPartParam], truncate_prompt_tokens: int | None = None, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9a7051e0920af..cecd1da1e5548 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -62,8 +62,9 @@ from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.transformers_utils.tokenizers import ( +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import ( + MistralTokenizer, maybe_serialize_tool_calls, truncate_tool_call_ids, validate_request_params, @@ -530,7 +531,7 @@ class OpenAIServingChat(OpenAIServing): request_id: str, model_name: str, conversation: list[ConversationMessage], - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, ) -> AsyncGenerator[str, None]: created_time = int(time.time()) @@ -1296,7 +1297,7 @@ class OpenAIServingChat(OpenAIServing): request_id: str, model_name: str, conversation: list[ConversationMessage], - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, ) -> ErrorResponse | ChatCompletionResponse: created_time = int(time.time()) @@ -1624,7 +1625,7 @@ class OpenAIServingChat(OpenAIServing): self, logprobs: dict[int, Logprob], top_logprobs: int | None, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, should_return_as_token_id: bool, ) -> list[ChatCompletionLogProb]: return [ @@ -1648,7 +1649,7 @@ class OpenAIServingChat(OpenAIServing): self, token_ids: GenericSequence[int], top_logprobs: GenericSequence[dict[int, Logprob] | None], - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, num_output_top_logprobs: int | None = None, return_as_token_id: bool | None = None, ) -> ChatCompletionLogProbs: diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 167ee152fece3..3b973eb125a83 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -221,7 +221,7 @@ class ServingClassification(ClassificationMixin): def _create_pooling_params( self, - ctx: ClassificationServeContext, + ctx: ServeContext[ClassificationRequest], ) -> PoolingParams | ErrorResponse: pooling_params = super()._create_pooling_params(ctx) if isinstance(pooling_params, ErrorResponse): diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 9681aa8c71e6d..3e421e21e3e80 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -33,7 +33,7 @@ from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.outputs
import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.async_utils import merge_async_iterators from vllm.utils.collection_utils import as_list from vllm.v1.sample.logits_processor import validate_logits_processors_parameters @@ -326,7 +326,7 @@ class OpenAIServingCompletion(OpenAIServing): created_time: int, model_name: str, num_prompts: int, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike | None, request_metadata: RequestResponseMetadata, ) -> AsyncGenerator[str, None]: num_choices = 1 if request.n is None else request.n @@ -511,7 +511,7 @@ class OpenAIServingCompletion(OpenAIServing): request_id: str, created_time: int, model_name: str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike | None, request_metadata: RequestResponseMetadata, ) -> CompletionResponse: choices: list[CompletionResponseChoice] = [] @@ -622,7 +622,7 @@ class OpenAIServingCompletion(OpenAIServing): token_ids: GenericSequence[int], top_logprobs: GenericSequence[dict[int, Logprob] | None], num_output_top_logprobs: int, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike | None, initial_text_offset: int = 0, return_as_token_id: bool | None = None, ) -> CompletionLogProbs: @@ -642,9 +642,15 @@ class OpenAIServingCompletion(OpenAIServing): for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: - token = tokenizer.decode(token_id) if should_return_as_token_id: token = f"token_id:{token_id}" + else: + if tokenizer is None: + raise ValueError( + "Unable to get tokenizer because `skip_tokenizer_init=True`" + ) + + token = tokenizer.decode(token_id) out_tokens.append(token) out_token_logprobs.append(None) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index cca2fd982fe0f..e7a632e025103 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -7,13 +7,14 @@ import time import traceback from collections.abc import AsyncGenerator, Callable, Iterable, Mapping, Sequence from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass, field from http import HTTPStatus from typing import Any, ClassVar, Generic, TypeAlias, TypeVar import numpy as np import torch from fastapi import Request -from pydantic import BaseModel, ConfigDict, Field, TypeAdapter +from pydantic import ConfigDict, TypeAdapter from starlette.datastructures import Headers from typing_extensions import TypeIs @@ -96,12 +97,12 @@ from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams +from vllm.tokenizers import MistralTokenizer, TokenizerLike from vllm.tracing import ( contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning, ) -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import random_uuid from vllm.utils.async_utils import ( AsyncMicrobatchTokenizer, @@ -184,19 +185,19 @@ def is_embeds_prompt(prompt: RequestPrompt) -> TypeIs[EmbedsPrompt]: RequestT = TypeVar("RequestT", bound=AnyRequest) -class RequestProcessingMixin(BaseModel): +@dataclass(kw_only=True) +class RequestProcessingMixin: """ Mixin for request processing, handling prompt preparation and engine input. 
""" - request_prompts: Sequence[RequestPrompt] | None = [] - engine_prompts: list[EngineTokensPrompt] | None = [] - - model_config = ConfigDict(arbitrary_types_allowed=True) + request_prompts: Sequence[RequestPrompt] | None = field(default_factory=list) + engine_prompts: list[EngineTokensPrompt] | None = field(default_factory=list) -class ResponseGenerationMixin(BaseModel): +@dataclass(kw_only=True) +class ResponseGenerationMixin: """ Mixin for response generation, managing result generators and final batch results. @@ -205,54 +206,38 @@ class ResponseGenerationMixin(BaseModel): result_generator: ( AsyncGenerator[tuple[int, RequestOutput | PoolingRequestOutput], None] | None ) = None - final_res_batch: list[RequestOutput | PoolingRequestOutput] = Field( + final_res_batch: list[RequestOutput | PoolingRequestOutput] = field( default_factory=list ) model_config = ConfigDict(arbitrary_types_allowed=True) -class ServeContext( - RequestProcessingMixin, - ResponseGenerationMixin, - BaseModel, - Generic[RequestT], -): +@dataclass(kw_only=True) +class ServeContext(RequestProcessingMixin, ResponseGenerationMixin, Generic[RequestT]): # Shared across all requests request: RequestT raw_request: Request | None = None model_name: str request_id: str - created_time: int = Field(default_factory=lambda: int(time.time())) + created_time: int = field(default_factory=lambda: int(time.time())) lora_request: LoRARequest | None = None # Shared across most requests - tokenizer: AnyTokenizer | None = None - - # `protected_namespaces` resolves Pydantic v2's warning - # on conflict with protected namespace "model_" - model_config = ConfigDict( - protected_namespaces=(), - arbitrary_types_allowed=True, - ) + tokenizer: TokenizerLike | None = None -ClassificationServeContext = ServeContext[ClassificationRequest] +@dataclass(kw_only=True) +class ClassificationServeContext(ServeContext[ClassificationRequest]): + pass +@dataclass(kw_only=True) class EmbeddingServeContext(ServeContext[EmbeddingRequest]): chat_template: str | None = None chat_template_content_format: ChatTemplateContentFormatOption -# Used to resolve the Pydantic error related to -# forward reference of MultiModalDataDict in TokensPrompt -RequestProcessingMixin.model_rebuild() -ServeContext.model_rebuild() -ClassificationServeContext.model_rebuild() -EmbeddingServeContext.model_rebuild() - - class OpenAIServing: request_id_prefix: ClassVar[str] = """ A short string prepended to every request’s ID (e.g. 
"embd", "classify") @@ -281,7 +266,7 @@ class OpenAIServing: apply_mistral_chat_template, executor=self._tokenizer_executor ) - self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {} + self._async_tokenizer_pool: dict[TokenizerLike, AsyncMicrobatchTokenizer] = {} self.log_error_stack = log_error_stack self.input_processor = self.models.input_processor @@ -291,7 +276,7 @@ class OpenAIServing: def _get_tool_parser( self, tool_parser_name: str | None = None, enable_auto_tools: bool = False - ) -> Callable[[AnyTokenizer], ToolParser] | None: + ) -> Callable[[TokenizerLike], ToolParser] | None: """Get the tool parser based on the name.""" parser = None if not enable_auto_tools or tool_parser_name is None: @@ -317,7 +302,7 @@ class OpenAIServing: def _get_reasoning_parser( self, reasoning_parser_name: str, - ) -> Callable[[AnyTokenizer], ReasoningParser] | None: + ) -> Callable[[TokenizerLike], ReasoningParser] | None: """Get the reasoning parser based on the name.""" parser = None if not reasoning_parser_name: @@ -547,7 +532,7 @@ class OpenAIServing: prompt_logprobs=None, ) - def _get_renderer(self, tokenizer: AnyTokenizer | None) -> BaseRenderer: + def _get_renderer(self, tokenizer: TokenizerLike | None) -> BaseRenderer: """ Get a Renderer instance with the provided tokenizer. Uses shared async tokenizer pool for efficiency. @@ -877,7 +862,7 @@ class OpenAIServing: self, request: AnyRequest, prompt: str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, add_special_tokens: bool, ) -> TextTokensPrompt: async_tokenizer = self._get_async_tokenizer(tokenizer) @@ -919,7 +904,7 @@ class OpenAIServing: self, request: AnyRequest, prompt_ids: list[int], - tokenizer: AnyTokenizer | None, + tokenizer: TokenizerLike | None, ) -> TextTokensPrompt: truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None) @@ -1015,7 +1000,7 @@ class OpenAIServing: async def _tokenize_prompt_input_async( self, request: AnyRequest, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, prompt_input: str | list[int], add_special_tokens: bool = True, ) -> TextTokensPrompt: @@ -1034,7 +1019,7 @@ class OpenAIServing: async def _tokenize_prompt_inputs_async( self, request: AnyRequest, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, prompt_inputs: Iterable[str | list[int]], add_special_tokens: bool = True, ) -> AsyncGenerator[TextTokensPrompt, None]: @@ -1079,7 +1064,7 @@ class OpenAIServing: async def _preprocess_chat( self, request: ChatLikeRequest | ResponsesRequest, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike | None, messages: list[ChatCompletionMessageParam], chat_template: str | None, chat_template_content_format: ChatTemplateContentFormatOption, @@ -1088,13 +1073,18 @@ class OpenAIServing: tool_dicts: list[dict[str, Any]] | None = None, documents: list[dict[str, str]] | None = None, chat_template_kwargs: dict[str, Any] | None = None, - tool_parser: Callable[[AnyTokenizer], ToolParser] | None = None, + tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, add_special_tokens: bool = False, ) -> tuple[ list[ConversationMessage], Sequence[RequestPrompt], list[EngineTokensPrompt], ]: + if tokenizer is None: + raise ValueError( + "Unable to get tokenizer because `skip_tokenizer_init=True`" + ) + model_config = self.model_config resolved_content_format = resolve_chat_template_content_format( @@ -1370,9 +1360,9 @@ class OpenAIServing: @staticmethod def _parse_tool_calls_from_content( request: ResponsesRequest | ChatCompletionRequest, - tokenizer: AnyTokenizer, 
+ tokenizer: TokenizerLike, enable_auto_tools: bool, - tool_parser_cls: Callable[[AnyTokenizer], ToolParser] | None, + tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None, content: str | None = None, ) -> tuple[list[FunctionCall] | None, str | None]: function_calls = list[FunctionCall]() @@ -1442,7 +1432,7 @@ class OpenAIServing: def _get_decoded_token( logprob: Logprob, token_id: int, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike | None, return_as_token_id: bool = False, ) -> str: if return_as_token_id: @@ -1450,6 +1440,12 @@ class OpenAIServing: if logprob.decoded_token is not None: return logprob.decoded_token + + if tokenizer is None: + raise ValueError( + "Unable to get tokenizer because `skip_tokenizer_init=True`" + ) + return tokenizer.decode(token_id) def _is_model_supported(self, model_name: str | None) -> bool: diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index f546dbda7fef5..5144916ba71e9 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -105,7 +105,7 @@ from vllm.logprobs import Logprob as SampleLogprob from vllm.logprobs import SampleLogprobs from vllm.outputs import CompletionOutput from vllm.sampling_params import SamplingParams, StructuredOutputsParams -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils import random_uuid logger = init_logger(__name__) @@ -492,7 +492,7 @@ class OpenAIServingResponses(OpenAIServing): self, request: ResponsesRequest, prev_response: ResponsesResponse | None, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ): if request.tools is None or ( request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none @@ -563,7 +563,7 @@ class OpenAIServingResponses(OpenAIServing): result_generator: AsyncIterator[ConversationContext], context: ConversationContext, model_name: str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, created_time: int | None = None, ) -> ErrorResponse | ResponsesResponse: @@ -675,7 +675,7 @@ class OpenAIServingResponses(OpenAIServing): self, logprobs: dict[int, SampleLogprob], top_logprobs: int, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> list[LogprobTopLogprob]: """Returns the top-k logprobs from the logprobs dictionary.""" out = [] @@ -700,7 +700,7 @@ class OpenAIServingResponses(OpenAIServing): self, token_ids: Sequence[int], logprobs: SampleLogprobs | None, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, top_logprobs: int | None = None, ) -> list[Logprob]: assert logprobs is not None, "logprobs must be provided" @@ -736,7 +736,7 @@ class OpenAIServingResponses(OpenAIServing): self, token_ids: Sequence[int], logprobs: SampleLogprobs | None, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, top_logprobs: int | None = None, ) -> list[response_text_delta_event.Logprob]: lgs = self._create_response_logprobs( @@ -763,7 +763,7 @@ class OpenAIServingResponses(OpenAIServing): self, request: ResponsesRequest, final_output: CompletionOutput, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> list[ResponseOutputItem]: if self.reasoning_parser: try: @@ -1135,7 +1135,7 @@ class OpenAIServingResponses(OpenAIServing): result_generator: AsyncIterator[ConversationContext | None], context: ConversationContext, model_name: str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, created_time: int, 
_increment_sequence_number_and_return: Callable[ @@ -1438,7 +1438,7 @@ class OpenAIServingResponses(OpenAIServing): result_generator: AsyncIterator[ConversationContext | None], context: ConversationContext, model_name: str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, created_time: int, _increment_sequence_number_and_return: Callable[ @@ -1891,7 +1891,7 @@ class OpenAIServingResponses(OpenAIServing): result_generator: AsyncIterator[ConversationContext | None], context: ConversationContext, model_name: str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, request_metadata: RequestResponseMetadata, created_time: int | None = None, ) -> AsyncGenerator[StreamingResponsesResponse, None]: diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 9cbfc9791819e..0874c01c1f2a7 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -36,7 +36,7 @@ from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.tokenizers import MistralTokenizer, TokenizerLike from vllm.utils.async_utils import make_async, merge_async_iterators logger = init_logger(__name__) @@ -60,7 +60,7 @@ class ServingScores(OpenAIServing): async def _embedding_score( self, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, texts_1: list[str], texts_2: list[str], request: RerankRequest | ScoreRequest, @@ -153,7 +153,7 @@ class ServingScores(OpenAIServing): def _preprocess_score( self, request: RerankRequest | ScoreRequest, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, @@ -175,7 +175,7 @@ class ServingScores(OpenAIServing): async def _cross_encoding_score( self, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, data_1: list[str] | list[ScoreContentPartParam], data_2: list[str] | list[ScoreContentPartParam], request: RerankRequest | ScoreRequest, diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 39aae0cd04956..979da02d14500 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -22,7 +22,7 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.renderer import RenderConfig from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) @@ -170,7 +170,7 @@ class OpenAIServingTokenization(OpenAIServing): @dataclass class TokenizerInfo: - tokenizer: AnyTokenizer + tokenizer: TokenizerLike chat_template: str | None def to_dict(self) -> dict[str, Any]: diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index e99e405f5de65..87ef2e0786a94 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -22,7 +22,7 @@ from vllm.logger import init_logger from vllm.sampling_params import ( StructuredOutputsParams, ) -from vllm.transformers_utils.tokenizer 
import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.collection_utils import is_list_of from vllm.utils.import_utils import import_from_path @@ -36,7 +36,7 @@ class ToolParser: derived classes. """ - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): self.prev_tool_call_arr: list[dict] = [] # the index of the tool call that is currently being parsed self.current_tool_id: int = -1 diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py index cbeb879969ece..10de3dabf985c 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py @@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class DeepSeekV31ToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.current_tool_name_sent: bool = False diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index bf7f6fa61ab90..66b14875dce25 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class DeepSeekV3ToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.current_tool_name_sent: bool = False diff --git a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py index 82370323cb00d..d054d8e4b8651 100644 --- a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py @@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class Ernie45ToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): """ Ernie thinking model format: abc\n\n\n\n\ndef\n\n diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py index 389e9754b34da..165346adb3d93 100644 --- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py @@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class Glm4MoeModelToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: 
TokenizerLike): super().__init__(tokenizer) self.current_tool_name_sent = False self.prev_tool_call_arr: list[dict] = [] diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index ae9217426fb51..df1b590526b1a 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -29,7 +29,7 @@ from vllm.entrypoints.openai.tool_parsers.utils import ( partial_json_loads, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) @@ -44,7 +44,7 @@ class Granite20bFCToolParser(ToolParser): are all set """ - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.bot_token = "" diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index d29c427694dc9..14b0ca0abe357 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -27,7 +27,7 @@ from vllm.entrypoints.openai.tool_parsers.utils import ( partial_json_loads, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) @@ -42,7 +42,7 @@ class GraniteToolParser(ToolParser): are all set """ - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) # for granite 3.0, the token `<|tool_call|>` self.bot_token = "<|tool_call|>" diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 4336a5438109f..19c1c83268ed4 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -22,18 +22,18 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.tokenizers import MistralTokenizer, TokenizerLike logger = init_logger(__name__) class Hermes2ProToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) - if isinstance(self.model_tokenizer, MistralTokenizer): + if isinstance(tokenizer, MistralTokenizer): logger.error("Detected Mistral tokenizer when using a Hermes model") - self.model_tokenizer = self.model_tokenizer.tokenizer + self.model_tokenizer = tokenizer.tokenizer self.current_tool_name_sent: bool = False self.prev_tool_call_arr: list[dict] = [] diff --git a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py index 920675c8389b8..d2419b5d84ead 100644 --- a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py @@ -22,14 +22,14 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ) from vllm.entrypoints.openai.tool_parsers.utils import consume_space from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import 
TokenizerLike from vllm.utils import random_uuid logger = init_logger(__name__) class HunyuanA13BToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) # Initialize state for streaming mode diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 1dd327f645b3a..67788358543e9 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ) from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class Internlm2ToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.position = 0 diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 6f53ddea4f0ef..4655da8dd4542 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -21,14 +21,13 @@ from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizers import MistralTokenizer +from vllm.tokenizers import MistralTokenizer, TokenizerLike logger = init_logger(__name__) class JambaToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) if isinstance(self.model_tokenizer, MistralTokenizer): diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index 2b84c60a3b841..07db52ebd5af1 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -19,13 +19,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class KimiK2ToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.current_tool_name_sent: bool = False self.prev_tool_call_arr: list[dict] = [] diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index c6c8ae8ae95f1..76d76a4aa35a1 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -4,11 +4,11 @@ import regex as re from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike class LongcatFlashToolParser(Hermes2ProToolParser): - def __init__(self, tokenizer: 
AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.tool_call_start_token: str = "" diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py index 5c2258ba62b29..b595a98f35555 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py @@ -21,13 +21,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class MinimaxM2ToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.prev_tool_call_arr: list[dict] = [] diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 982518a52e3da..1025041037c6e 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ) from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class MinimaxToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) # Initialize streaming state for tracking tool call progress diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 85671271522d3..7e2d67a1fb659 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -25,7 +25,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ) from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.tokenizers import MistralTokenizer, TokenizerLike logger = init_logger(__name__) @@ -46,7 +46,7 @@ class MistralToolCall(ToolCall): return id.isalnum() and len(id) == 9 -def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool: +def _is_fn_name_regex_support(model_tokenizer: TokenizerLike) -> bool: return ( isinstance(model_tokenizer, MistralTokenizer) and model_tokenizer.version >= 11 ) @@ -61,7 +61,7 @@ class MistralToolParser(ToolParser): Used when --enable-auto-tool-choice --tool-call-parser mistral are all set """ - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) if not isinstance(self.model_tokenizer, MistralTokenizer): diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py index d1b36a297e0b1..8bdf35d408805 100644 --- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py @@ -18,15 +18,15 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( from vllm.logger import 
init_logger if TYPE_CHECKING: - from vllm.transformers_utils.tokenizer import AnyTokenizer + from vllm.tokenizers import TokenizerLike else: - AnyTokenizer = object + TokenizerLike = object logger = init_logger(__name__) class OpenAIToolParser(ToolParser): - def __init__(self, tokenizer: "AnyTokenizer"): + def __init__(self, tokenizer: "TokenizerLike"): super().__init__(tokenizer) def extract_tool_calls( diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index 9d4c079eba188..d49b14690ef03 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -22,13 +22,13 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) class Qwen3CoderToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.current_tool_name_sent: bool = False diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py index 432c419db189a..03862ff432a5d 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -23,7 +23,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) @@ -1165,7 +1165,7 @@ class StreamingXMLToolCallParser: class Qwen3XMLToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.parser = StreamingXMLToolCallParser() diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py index 8aed7f0e9fc96..c7947faad1923 100644 --- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py @@ -25,7 +25,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) @@ -34,7 +34,7 @@ class SeedOssToolParser(ToolParser): TOOL_CALL_START = "" TOOL_CALL_END = "" - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) # --- streaming state --- diff --git a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py index adcb9f4765473..9213d6859dd93 100644 --- a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py @@ -21,7 +21,7 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils import random_uuid logger = init_logger(__name__) @@ -41,7 +41,7 @@ class Step3ToolParser(ToolParser): 
TOOL_SEP = "<|tool_sep|>" SPECIAL_TOKENS = [TOOL_CALLS_BEGIN, TOOL_CALLS_END, TOOL_CALL_BEGIN, TOOL_CALL_END] - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) self.position = 0 # Explicit state flags for robust streaming diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index 9d308af4de601..effd2bd08b42a 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -21,14 +21,14 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils import random_uuid logger = init_logger(__name__) class xLAMToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: TokenizerLike): super().__init__(tokenizer) # Initialize state for streaming mode diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 3c5a396a99f93..10b90bbbb0f32 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -16,7 +16,7 @@ from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt from vllm.inputs.data import TextPrompt as EngineTextPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.inputs.parse import get_prompt_components, parse_raw_prompts -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.async_utils import AsyncMicrobatchTokenizer @@ -85,7 +85,7 @@ class BaseRenderer(ABC): def __init__( self, model_config: ModelConfig, - tokenizer: AnyTokenizer | None = None, + tokenizer: TokenizerLike | None = None, ): super().__init__() self.model_config = model_config @@ -200,8 +200,8 @@ class CompletionRenderer(BaseRenderer): def __init__( self, model_config: ModelConfig, - tokenizer: AnyTokenizer | None = None, - async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] + tokenizer: TokenizerLike | None = None, + async_tokenizer_pool: dict[TokenizerLike, AsyncMicrobatchTokenizer] | None = None, ): super().__init__(model_config, tokenizer) @@ -373,7 +373,7 @@ class CompletionRenderer(BaseRenderer): return async_tokenizer tokenizer = self.tokenizer - if self.tokenizer is None: + if tokenizer is None: raise ValueError("No tokenizer available for text input processing") if self.async_tokenizer_pool is None: diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 309a4c996392d..04d5a192918dd 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -19,11 +19,7 @@ from vllm.inputs import TokensPrompt from vllm.model_executor.models.interfaces import supports_score_template from vllm.multimodal.inputs import MultiModalDataDict from vllm.outputs import PoolingRequestOutput -from vllm.transformers_utils.tokenizer import ( - AnyTokenizer, - PreTrainedTokenizer, - PreTrainedTokenizerFast, -) +from vllm.transformers_utils.tokenizer import TokenizerLike ScoreContentPartParam: TypeAlias = ( ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam @@ -45,7 +41,7 @@ class ScoreMultiModalParam(TypedDict, total=False): def _cosine_similarity( - tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, + tokenizer: TokenizerLike, embed_1: list[PoolingRequestOutput], 
embed_2: list[PoolingRequestOutput], ) -> list[PoolingRequestOutput]: @@ -93,7 +89,7 @@ def parse_score_data( data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, model_config: ModelConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> tuple[str, str, MultiModalDataDict | None]: mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -118,12 +114,14 @@ def _parse_score_content( mm_tracker: BaseMultiModalItemTracker, ) -> _ContentPart | None: if isinstance(data, str): - data = ChatCompletionContentPartTextParam(type="text", text=data) + part = ChatCompletionContentPartTextParam(type="text", text=data) + else: + part = data mm_parser = mm_tracker.create_parser() parse_res = _parse_chat_message_content_part( - data, + part, mm_parser, wrap_dicts=False, interleave_strings=False, @@ -181,7 +179,7 @@ def post_process_tokens( def get_score_prompt( model_config: ModelConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 088bb679fef40..daeeb995bc749 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -30,7 +30,7 @@ from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.transformers_utils.tokenizers import MistralTokenizer +from vllm.tokenizers import MistralTokenizer from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 839c13868a16c..46d1bed38aa85 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -17,7 +17,7 @@ from vllm.multimodal.inputs import ( MultiModalUUIDDict, ) from vllm.multimodal.processing import BaseMultiModalProcessor -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.jsontree import json_iter_leaves from vllm.v1.metrics.stats import MultiModalCacheStats @@ -46,7 +46,7 @@ class InputPreprocessor: def __init__( self, model_config: ModelConfig, - tokenizer: AnyTokenizer | None, + tokenizer: TokenizerLike | None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: @@ -59,7 +59,7 @@ class InputPreprocessor: self.mm_cache_stats = MultiModalCacheStats() if mm_processor_cache else None - def get_tokenizer(self) -> AnyTokenizer: + def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( "You cannot pass text prompts when `skip_tokenizer_init` is True" @@ -228,11 +228,11 @@ class InputPreprocessor: return tokenizer.encode(prompt, **tokenization_kwargs) - def _get_mm_tokenizer(self) -> AnyTokenizer: + def _get_mm_tokenizer(self) -> TokenizerLike: # PrithviGeoSpatialMAE needs to be initialized without a tokenizer # while using also multi-modal input if not self.tokenizer: - return cast(AnyTokenizer, object()) # Dummy + return cast(TokenizerLike, object()) # Dummy tokenizer = self.get_tokenizer() return tokenizer diff --git a/vllm/logits_process.py b/vllm/logits_process.py index 7b6a6528e20e8..1bf97c2535fb7 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -5,7 +5,7 @@ from typing import TypeAlias import torch -from vllm.transformers_utils.tokenizer import AnyTokenizer +from 
vllm.tokenizers import TokenizerLike LogitsProcessor: TypeAlias = ( Callable[[list[int], torch.Tensor], torch.Tensor] @@ -19,7 +19,7 @@ to sample from.""" def get_bad_words_logits_processors( - bad_words: list[str], tokenizer: AnyTokenizer + bad_words: list[str], tokenizer: TokenizerLike ) -> list[LogitsProcessor]: bad_words_ids: list[list[int]] = list() diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 81c6b34bd6ce0..6276c3d675411 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -28,7 +28,7 @@ from vllm.multimodal.processing import ( PromptUpdate, PromptUpdateDetails, ) -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from .intern_vit import InternVisionModel from .internvl import ( @@ -241,7 +241,7 @@ class H2OVLProcessor(BaseInternVLProcessor): def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, min_dynamic_patch: int | None = None, max_dynamic_patch: int | None = None, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index ccbde115009d2..fccddf3a6b293 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -50,7 +50,7 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_num_threads @@ -347,7 +347,7 @@ class BaseInternVLProcessor(ABC): def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, min_dynamic_patch: int | None = None, max_dynamic_patch: int | None = None, @@ -561,7 +561,7 @@ class InternVLProcessor(BaseInternVLProcessor): def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, min_dynamic_patch: int | None = None, max_dynamic_patch: int | None = None, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 5529089e06ae9..11beeddabe307 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -73,9 +73,9 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.configs.radio import RadioConfig from vllm.transformers_utils.tokenizer import ( - AnyTokenizer, cached_tokenizer_from_config, encode_tokens, ) @@ -284,7 +284,7 @@ class BaseNanoNemotronVLProcessor(ABC): def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *args, max_num_tiles: int | None = None, **kwargs, @@ -434,7 +434,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, max_num_tiles: int | None = None, min_dynamic_patch: int | None = None, @@ -645,7 +645,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): tokens_per_frame: list[int], frames_indices: list[int], frame_duration_ms: int, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, img_start_token_ids: list[int], 
img_end_token_ids: list[int], img_context_token_ids: list[int], @@ -670,7 +670,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): tokens_per_frame (list[int]): number of tokens per frame frames_indices (list[int]): frame indices frame_duration_ms (int): duration of each frame in milliseconds - tokenizer (AnyTokenizer): tokenizer to use for tokenizing frame separators + tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators img_start_token_ids (list[int]): pre-tokenized IMG_START tokens img_end_token_ids (list[int]): pre-tokenized IMG_END tokens img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 5a1dda8aac2c1..a57668b21fb86 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -34,8 +34,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.processing import PromptUpdateDetails from vllm.sequence import IntermediateTensors +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processor import cached_image_processor_from_config -from vllm.transformers_utils.tokenizer import AnyTokenizer from .interfaces import ( MultiModalEmbeddings, @@ -203,7 +203,7 @@ class NemotronVLProcessor(InternVLProcessor): def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, image_processor: BaseImageProcessorFast, *, min_dynamic_patch: int | None = None, diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py index 121bf896fa6ba..4338918663378 100644 --- a/vllm/model_executor/models/opencua.py +++ b/vllm/model_executor/models/opencua.py @@ -31,7 +31,7 @@ from vllm.multimodal.processing import ( PromptReplacement, PromptUpdate, ) -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from .qwen2_5_vl import ( Qwen2_5_VisionTransformer as OpenCUAVisionTransformer, @@ -79,7 +79,7 @@ class OpenCUAProcessor(Qwen2VLProcessor): def __init__( self, vision_config: dict, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, **kwargs, ): image_processor = Qwen2VLImageProcessor(**vision_config) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 3464de472add5..54bde75cc0131 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -59,10 +59,8 @@ from vllm.multimodal.processing import ( from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.tokenizer import ( - MistralTokenizer, - cached_tokenizer_from_config, -) +from vllm.tokenizers import MistralTokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 672659aa6042c..8fbd896223944 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -91,7 +91,7 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from 
vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( @@ -1533,7 +1533,7 @@ class Tarsier2Processor(Qwen2VLProcessor): def __init__( self, vision_config: dict, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, **kwargs, ): self.image_processor = Tarsier2ImageProcessor(**vision_config) diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index d825eb3a1c134..55c25ce6190fb 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -47,7 +47,7 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -282,7 +282,7 @@ class SkyworkR1VProcessor: def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, min_dynamic_patch: int | None = None, max_dynamic_patch: int | None = None, diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 1c60cb4148121..3e55ada0ed2e1 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -43,8 +43,8 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.configs import Step3VisionEncoderConfig -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -321,7 +321,7 @@ class Step3VLProcessor: def __init__( self, config: PretrainedConfig, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> None: super().__init__() diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 18ad8851fccda..0a39ea7ef5bff 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -51,10 +51,8 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.tokenizer import ( - MistralTokenizer, - cached_tokenizer_from_config, -) +from vllm.tokenizers import MistralTokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription from .utils import init_vllm_registered_model, maybe_prefix diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 27bf12a5f3169..aab657b24ba23 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -23,8 +23,9 @@ import torch from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens +from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens from 
vllm.utils.collection_utils import flatten_2d_lists, full_groupby from vllm.utils.func_utils import get_allowed_kwarg_only_overrides from vllm.utils.jsontree import JSONTree, json_map_leaves @@ -76,7 +77,7 @@ PromptSeq: TypeAlias = str | list[int] @lru_cache(maxsize=2048) def _cached_encode( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, text: str, *, add_special_tokens: bool | None = None, @@ -86,7 +87,7 @@ def _cached_encode( @lru_cache(maxsize=2048) def _cached_decode( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, token_ids: tuple[int, ...], *, skip_special_tokens: bool | None = None, @@ -96,14 +97,14 @@ def _cached_decode( ) -def _seq2text(tokenizer: AnyTokenizer, seq: PromptSeq) -> str: +def _seq2text(tokenizer: TokenizerLike, seq: PromptSeq) -> str: if isinstance(seq, str): return seq return _cached_decode(tokenizer, tuple(seq)) -def _seq2tokens(tokenizer: AnyTokenizer, seq: PromptSeq) -> list[int]: +def _seq2tokens(tokenizer: TokenizerLike, seq: PromptSeq) -> list[int]: if isinstance(seq, str): return _cached_encode(tokenizer, seq, add_special_tokens=False) @@ -113,7 +114,7 @@ def _seq2tokens(tokenizer: AnyTokenizer, seq: PromptSeq) -> list[int]: class _GetMatchIndex(Protocol): def __call__( self, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, prompt: PromptSeq, start_idx: int = 0, ) -> int | None: ... @@ -143,7 +144,7 @@ class PromptIndexTargets: """ def get_match_index( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, prompt: PromptSeq, start_idx: int = 0, ) -> int | None: @@ -199,7 +200,7 @@ class PromptUpdateDetails(Generic[_S]): full: _S """The full content.""" - is_embed: Callable[[AnyTokenizer, PromptSeq], torch.Tensor] | None = None + is_embed: Callable[[TokenizerLike, PromptSeq], torch.Tensor] | None = None """ Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full], return a boolean mask of shape `(len(full),)` indicating which positions @@ -220,7 +221,7 @@ class PromptUpdateDetails(Generic[_S]): seq: _S, embed_text: str, ) -> "PromptUpdateDetails[_S]": - def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor: + def is_embed(tokenizer: TokenizerLike, full: PromptSeq) -> torch.Tensor: embed_token_ids = encode_tokens(tokenizer, embed_text) token_ids = _seq2tokens(tokenizer, full) @@ -236,7 +237,7 @@ class PromptUpdateDetails(Generic[_S]): seq: _S, embed_token_id: int, ) -> "PromptUpdateDetails[_S]": - def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor: + def is_embed(tokenizer: TokenizerLike, full: PromptSeq) -> torch.Tensor: token_ids = _seq2tokens(tokenizer, full) return torch.tensor(token_ids) == embed_token_id @@ -522,7 +523,7 @@ class ResolvedPromptUpdate: def iter_token_matches( self, prompt: list[int], - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, start_idx: int = 0, ) -> Generator[PromptTargetMatch]: @@ -544,7 +545,7 @@ class ResolvedPromptUpdate: def iter_text_matches( self, prompt: str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, start_idx: int = 0, ) -> Generator[PromptTargetMatch]: @@ -566,7 +567,7 @@ class ResolvedPromptUpdate: def iter_matches( self, prompt: list[int] | str, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, start_idx: int = 0, ) -> Generator[PromptTargetMatch]: @@ -675,7 +676,7 @@ _MatchToApply = tuple[tuple[str, int], tuple[PromptTargetMatch, int]] def _find_matches( prompt: _S, mm_prompt_updates: "MultiModalPromptUpdates", - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, *, prev_end_idx: int = 0, current_result: 
"MultiModalPromptUpdatesApplyResult", @@ -740,7 +741,7 @@ def _all_items_found( def _apply_matches( prompt: _S, mm_prompt_updates: "MultiModalPromptUpdates", - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> tuple[list[_S], "MultiModalPromptUpdatesApplyResult"]: mm_item_counts = {m: len(items) for m, items in mm_prompt_updates.items()} @@ -806,7 +807,7 @@ def _apply_matches( def apply_token_matches( prompt: list[int], mm_prompt_updates: "MultiModalPromptUpdates", - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> tuple[list[int], "MultiModalPromptUpdatesApplyResult"]: """ Apply the updates in `mm_prompt_updates` to `prompt`. @@ -823,7 +824,7 @@ def apply_token_matches( def apply_text_matches( prompt: str, mm_prompt_updates: "MultiModalPromptUpdates", - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> tuple[str, "MultiModalPromptUpdatesApplyResult"]: """ Apply the updates in `mm_prompt_updates` to `prompt`. @@ -840,7 +841,7 @@ def apply_text_matches( def _iter_placeholders( prompt: list[int], mm_prompt_updates: "MultiModalPromptUpdates", - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> Iterable[PlaceholderFeaturesInfo]: """ Yield each set of placeholder tokens found in `prompt`. @@ -909,7 +910,7 @@ def _iter_placeholders( def find_mm_placeholders( prompt: list[int], mm_prompt_updates: "MultiModalPromptUpdates", - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: it = _iter_placeholders(prompt, mm_prompt_updates, tokenizer) return dict(full_groupby_modality(it)) @@ -930,7 +931,7 @@ class InputProcessingContext: model_config: ModelConfig """The configuration of the model.""" - tokenizer: AnyTokenizer + tokenizer: TokenizerLike """The tokenizer used to tokenize the inputs.""" @overload @@ -1146,7 +1147,7 @@ class BaseProcessingInfo: def model_id(self) -> str: return self.ctx.model_config.model - def get_tokenizer(self) -> AnyTokenizer: + def get_tokenizer(self) -> TokenizerLike: return self.ctx.tokenizer def get_hf_config(self) -> PretrainedConfig: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index a7eafa76ad17e..ee90570b24aaf 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,8 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config +from vllm.tokenizers import TokenizerLike +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from .cache import BaseMultiModalProcessorCache from .processing import ( @@ -231,17 +232,20 @@ class MultiModalRegistry: def _create_processing_ctx( self, model_config: "ModelConfig", - tokenizer: AnyTokenizer | None = None, + tokenizer: TokenizerLike | None = None, ) -> InputProcessingContext: - if tokenizer is None and not model_config.skip_tokenizer_init: + if model_config.skip_tokenizer_init: + tokenizer = cast(TokenizerLike, object()) + elif tokenizer is None: tokenizer = cached_tokenizer_from_config(model_config) + return InputProcessingContext(model_config, tokenizer) def _create_processing_info( self, model_config: "ModelConfig", *, - tokenizer: AnyTokenizer | None = None, + tokenizer: TokenizerLike | None = None, ) -> BaseProcessingInfo: model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory @@ -252,7 +256,7 @@ class MultiModalRegistry: self, model_config: 
"ModelConfig", *, - tokenizer: AnyTokenizer | None = None, + tokenizer: TokenizerLike | None = None, cache: BaseMultiModalProcessorCache | None = None, ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index d26e4ffc9c163..4a04292be009e 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -19,12 +19,12 @@ if TYPE_CHECKING: DeltaMessage, ResponsesRequest, ) - from vllm.transformers_utils.tokenizer import AnyTokenizer + from vllm.tokenizers import TokenizerLike else: ChatCompletionRequest = Any DeltaMessage = Any ResponsesRequest = Any - AnyTokenizer = Any + TokenizerLike = Any logger = init_logger(__name__) @@ -37,7 +37,7 @@ class ReasoningParser: It is used to extract reasoning content from the model output. """ - def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs): + def __init__(self, tokenizer: TokenizerLike, *args, **kwargs): self.model_tokenizer = tokenizer @cached_property diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index 0268947732726..35084c0e7cc86 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any from vllm.entrypoints.openai.protocol import DeltaMessage from vllm.reasoning.abs_reasoning_parsers import ReasoningParser -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike if TYPE_CHECKING: from vllm.entrypoints.openai.protocol import ( @@ -43,7 +43,7 @@ class BaseThinkingReasoningParser(ReasoningParser): """The token that ends reasoning content.""" raise NotImplementedError - def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs): + def __init__(self, tokenizer: TokenizerLike, *args, **kwargs): super().__init__(tokenizer, *args, **kwargs) if not self.model_tokenizer: diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py index 30f5f2f88caf7..138d1b4e6dacf 100644 --- a/vllm/reasoning/minimax_m2_reasoning_parser.py +++ b/vllm/reasoning/minimax_m2_reasoning_parser.py @@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import ( from vllm.logger import init_logger from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) @@ -37,7 +37,7 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser): Reasoning parser for MiniMax M2 model. 
""" - def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs): + def __init__(self, tokenizer: TokenizerLike, *args, **kwargs): super().__init__(tokenizer, *args, **kwargs) self.end_token_id = self.vocab.get("") diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index af6d179bf6d01..b61e50c188f8c 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -6,7 +6,7 @@ from functools import cached_property from vllm.logger import init_logger from vllm.reasoning import ReasoningParser from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.tokenizers import MistralTokenizer logger = init_logger(__name__) diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py index 7149f8c4123b3..2742a24a2c3e7 100644 --- a/vllm/reasoning/olmo3_reasoning_parser.py +++ b/vllm/reasoning/olmo3_reasoning_parser.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING import regex as re if TYPE_CHECKING: - from vllm.transformers_utils.tokenizer import AnyTokenizer + from vllm.tokenizers import TokenizerLike from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, @@ -220,7 +220,7 @@ class Olmo3ReasoningParser(ReasoningParser): token is missing from generation. """ - def __init__(self, tokenizer: "AnyTokenizer", *args, **kwargs): + def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs): super().__init__(tokenizer, *args, **kwargs) self.think_start = r"" diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8de961e62db1b..453100f2e5135 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike from vllm.v1.serial_utils import PydanticMsgspecMixin logger = init_logger(__name__) @@ -477,7 +477,7 @@ class SamplingParams( eos_ids.update(self.stop_token_ids) self.stop_token_ids = list(eos_ids) - def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: + def update_from_tokenizer(self, tokenizer: TokenizerLike) -> None: if not self.bad_words: return self._bad_words_token_ids = [] diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py new file mode 100644 index 0000000000000..e26b4e8797ec8 --- /dev/null +++ b/vllm/tokenizers/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .mistral import MistralTokenizer +from .protocol import TokenizerLike +from .registry import TokenizerRegistry + +__all__ = ["TokenizerLike", "MistralTokenizer", "TokenizerRegistry"] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/tokenizers/mistral.py similarity index 96% rename from vllm/transformers_utils/tokenizers/mistral.py rename to vllm/tokenizers/mistral.py index 1954e2a815b03..a42fb0e1e5f14 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -4,7 +4,8 @@ from typing import TYPE_CHECKING, Any, cast from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer_base import TokenizerBase + +from .protocol import TokenizerLike if TYPE_CHECKING: from mistral_common.protocol.instruct.request import ( @@ -163,7 +164,7 @@ def 
_tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int: return tokenizer.unk_id -class MistralTokenizer(TokenizerBase): +class MistralTokenizer(TokenizerLike): def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None: from mistral_common.protocol.instruct.validator import ValidationMode from mistral_common.tokens.tokenizers.sentencepiece import ( @@ -270,14 +271,6 @@ class MistralTokenizer(TokenizerBase): def eos_token_id(self) -> int: return self.tokenizer.eos_id - @property - def sep_token(self) -> str: - raise NotImplementedError() - - @property - def pad_token(self) -> str: - return self.transformers_tokenizer.pad_token - @property def is_fast(self) -> bool: return True @@ -292,11 +285,14 @@ class MistralTokenizer(TokenizerBase): @property def truncation_side(self) -> str: - raise NotImplementedError() + return self.transformers_tokenizer.truncation_side def _is_special_token_id(self, token_id: int) -> bool: return token_id in self._special_token_ids_set + def __hash__(self) -> int: + return hash(id(self)) + def __len__(self) -> int: return self.vocab_size @@ -341,17 +337,6 @@ class MistralTokenizer(TokenizerBase): # Mistral tokenizers have no added vocabulary return {} - def encode_one( - self, - text: str, - truncation: bool = False, - max_length: int | None = None, - ) -> list[int]: - # Mistral Tokenizers should not add special tokens - return self.transformers_tokenizer.encode( - text, add_special_tokens=False, truncation=truncation, max_length=max_length - ) - def encode( self, text: str, diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py new file mode 100644 index 0000000000000..58a1a7c23f21c --- /dev/null +++ b/vllm/tokenizers/protocol.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, Any, Protocol + +from typing_extensions import Self + +if TYPE_CHECKING: + from vllm.entrypoints.chat_utils import ChatCompletionMessageParam + + +class TokenizerLike(Protocol): + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + /, + *, + revision: str | None = None, + ) -> Self: + raise NotImplementedError + + @property + def all_special_tokens(self) -> list[str]: + raise NotImplementedError + + @property + def all_special_ids(self) -> list[int]: + raise NotImplementedError + + @property + def bos_token_id(self) -> int: + raise NotImplementedError + + @property + def eos_token_id(self) -> int: + raise NotImplementedError + + @property + def is_fast(self) -> bool: + raise NotImplementedError + + @property + def vocab_size(self) -> int: + raise NotImplementedError + + @property + def max_token_id(self) -> int: + raise NotImplementedError + + @property + def truncation_side(self) -> str: + raise NotImplementedError + + def __hash__(self) -> int: + return hash(id(self)) + + def __len__(self) -> int: + return self.vocab_size + + def __call__( + self, + text: str | list[str] | list[int], + text_pair: str | None = None, + add_special_tokens: bool = False, + truncation: bool = False, + max_length: int | None = None, + ): + raise NotImplementedError + + def get_vocab(self) -> dict[str, int]: + raise NotImplementedError + + def get_added_vocab(self) -> dict[str, int]: + raise NotImplementedError + + def encode( + self, + text: str, + truncation: bool | None = None, + max_length: int | None = None, + add_special_tokens: bool | None = None, + ) -> list[int]: + raise NotImplementedError + + def apply_chat_template( + 
self, + messages: list["ChatCompletionMessageParam"], + tools: list[dict[str, Any]] | None = None, + **kwargs, + ) -> list[int]: + raise NotImplementedError + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + raise NotImplementedError + + def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str: + raise NotImplementedError + + def convert_ids_to_tokens( + self, + ids: list[int], + skip_special_tokens: bool = True, + ) -> list[str]: + raise NotImplementedError diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py new file mode 100644 index 0000000000000..3a236c99b3564 --- /dev/null +++ b/vllm/tokenizers/registry.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib + +from .protocol import TokenizerLike + + +class TokenizerRegistry: + # Tokenizer name -> (tokenizer module, tokenizer class) + REGISTRY: dict[str, tuple[str, str]] = {} + + @staticmethod + def register(name: str, module: str, class_name: str) -> None: + TokenizerRegistry.REGISTRY[name] = (module, class_name) + + @staticmethod + def get_tokenizer( + tokenizer_name: str, + *args, + **kwargs, + ) -> "TokenizerLike": + tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name) + if tokenizer_cls is None: + raise ValueError(f"Tokenizer {tokenizer_name} not found.") + + tokenizer_module = importlib.import_module(tokenizer_cls[0]) + class_ = getattr(tokenizer_module, tokenizer_cls[1]) + return class_.from_pretrained(*args, **kwargs) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 45c4358bbc8f2..8f2cd3315ab94 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -26,8 +26,9 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs from vllm.logger import init_logger -from vllm.transformers_utils.config_parser_base import ConfigParserBase -from vllm.transformers_utils.repo_utils import ( + +from .config_parser_base import ConfigParserBase +from .repo_utils import ( _get_hf_token, file_or_path_exists, get_hf_file_to_dict, @@ -35,7 +36,7 @@ from vllm.transformers_utils.repo_utils import ( try_get_local_file, with_retry, ) -from vllm.transformers_utils.utils import ( +from .utils import ( check_gguf_file, is_gguf, is_remote_gguf, diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 560526bfd823e..e586a5d46cb82 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .tokenizer import AnyTokenizer +from vllm.tokenizers import TokenizerLike def _replace_none_with_empty(tokens: list[str | None]): @@ -12,7 +12,7 @@ def _replace_none_with_empty(tokens: list[str | None]): def _convert_tokens_to_string_with_added_encoders( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, output_tokens: list[str], skip_special_tokens: bool, spaces_between_special_tokens: bool, @@ -57,7 +57,7 @@ INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5 def convert_prompt_ids_to_tokens( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, prompt_ids: list[int], skip_special_tokens: bool = False, ) -> tuple[list[str], int, int]: @@ -81,7 +81,7 @@ def convert_prompt_ids_to_tokens( def convert_ids_list_to_tokens( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, token_ids: list[int], ) -> list[str]: """Detokenize the 
input ids individually. @@ -108,7 +108,7 @@ def convert_ids_list_to_tokens( # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 # under Apache 2.0 license def detokenize_incrementally( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, all_input_ids: list[int], prev_tokens: list[str] | None, prefix_offset: int, diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py index c5b4d3f000901..cb1fc2d092e01 100644 --- a/vllm/transformers_utils/gguf_utils.py +++ b/vllm/transformers_utils/gguf_utils.py @@ -9,7 +9,8 @@ from gguf.constants import Keys, VisionProjectorType from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig from vllm.logger import init_logger -from vllm.transformers_utils.repo_utils import list_filtered_repo_files + +from .repo_utils import list_filtered_repo_files logger = init_logger(__name__) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index be4325ab9101d..87d5cc2b483fb 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -5,41 +5,48 @@ import contextlib import copy import importlib.util import os +import warnings from functools import lru_cache from pathlib import Path -from typing import TYPE_CHECKING, Any, TypeAlias +from typing import TYPE_CHECKING, Any import huggingface_hub -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import AutoTokenizer, PreTrainedTokenizerBase from typing_extensions import assert_never from vllm import envs from vllm.logger import init_logger -from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config -from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf -from vllm.transformers_utils.repo_utils import list_filtered_repo_files -from vllm.transformers_utils.tokenizers import MistralTokenizer -from vllm.transformers_utils.utils import ( - check_gguf_file, - is_gguf, - is_remote_gguf, - split_remote_gguf, -) +from vllm.tokenizers import MistralTokenizer, TokenizerLike, TokenizerRegistry + +from .config import get_sentence_transformer_tokenizer_config +from .gguf_utils import get_gguf_file_path_from_hf +from .repo_utils import list_filtered_repo_files +from .utils import check_gguf_file, is_gguf, is_remote_gguf, split_remote_gguf if TYPE_CHECKING: from vllm.config import ModelConfig - from vllm.transformers_utils.tokenizer_base import TokenizerBase -else: - ModelConfig = Any - TokenizerBase = Any + logger = init_logger(__name__) -AnyTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast | TokenizerBase + +def __getattr__(name: str): + if name == "AnyTokenizer": + warnings.warn( + "`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to " + "`vllm.tokenizers.TokenizerLike`. 
" + "The old name will be removed in v0.13.", + DeprecationWarning, + stacklevel=2, + ) + + return TokenizerLike + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") def decode_tokens( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, token_ids: list[int], *, skip_special_tokens: bool | None = None, @@ -58,7 +65,7 @@ def decode_tokens( def encode_tokens( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, text: str, *, truncation: bool | None = None, @@ -86,7 +93,7 @@ def encode_tokens( return tokenizer.encode(text, **kw_args) -def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: +def get_cached_tokenizer(tokenizer: TokenizerLike) -> TokenizerLike: """ By default, transformers will recompute multiple tokenizer properties each time they are called, leading to a significant slowdown. @@ -144,7 +151,7 @@ def get_tokenizer( revision: str | None = None, download_dir: str | None = None, **kwargs, -) -> AnyTokenizer: +) -> TokenizerLike: """Gets a tokenizer for the given model name via HuggingFace or ModelScope.""" if envs.VLLM_USE_MODELSCOPE: # download model from ModelScope hub, @@ -206,15 +213,13 @@ def get_tokenizer( if len(files_list) > 0: tokenizer_mode = "mistral" - tokenizer: AnyTokenizer + tokenizer: TokenizerLike if tokenizer_mode == "mistral": logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}") tokenizer = MistralTokenizer.from_pretrained( str(tokenizer_name), revision=revision ) elif tokenizer_mode == "custom": - from vllm.transformers_utils.tokenizer_base import TokenizerRegistry - logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}") tokenizer = TokenizerRegistry.get_tokenizer( str(tokenizer_name), @@ -260,12 +265,13 @@ def get_tokenizer( if isinstance(encoder_config, dict) and encoder_config.get( "do_lower_case", False ): + assert isinstance(tokenizer, PreTrainedTokenizerBase) special_tokens_map = { k: v.lower() for k, v in tokenizer.special_tokens_map.items() } tokenizer.add_special_tokens(special_tokens_map) - if not isinstance(tokenizer, PreTrainedTokenizerFast): + if not tokenizer.is_fast: logger.warning( "Using a slow tokenizer. This might cause a significant " "slowdown. Consider using a fast tokenizer instead." 
@@ -279,7 +285,7 @@ cached_get_tokenizer = lru_cache(get_tokenizer) def cached_tokenizer_from_config( - model_config: ModelConfig, + model_config: "ModelConfig", **kwargs: Any, ): return cached_get_tokenizer( @@ -291,7 +297,7 @@ def cached_tokenizer_from_config( ) -def init_tokenizer_from_configs(model_config: ModelConfig): +def init_tokenizer_from_configs(model_config: "ModelConfig"): runner_type = model_config.runner_type if runner_type == "generate" or runner_type == "draft": truncation_side = "left" diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index 52f221d1e373e..78fb6edc8b9ed 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -1,150 +1,33 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +import warnings -class TokenizerBase(ABC): - @property - @abstractmethod - def all_special_tokens(self) -> list[str]: - raise NotImplementedError() +def __getattr__(name: str): + if name == "TokenizerBase": + from vllm.tokenizers import TokenizerLike - @property - @abstractmethod - def all_special_ids(self) -> list[int]: - raise NotImplementedError() + warnings.warn( + "`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been " + "moved to `vllm.tokenizers.TokenizerLike`. " + "The old name will be removed in v0.13.", + DeprecationWarning, + stacklevel=2, + ) - @property - @abstractmethod - def bos_token_id(self) -> int: - raise NotImplementedError() + return TokenizerLike + if name == "TokenizerRegistry": + from vllm.tokenizers import TokenizerRegistry - @property - @abstractmethod - def eos_token_id(self) -> int: - raise NotImplementedError() + warnings.warn( + "`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been " + "moved to `vllm.tokenizers.TokenizerRegistry`. 
" + "The old name will be removed in v0.13.", + DeprecationWarning, + stacklevel=2, + ) - @property - @abstractmethod - def sep_token(self) -> str: - raise NotImplementedError() + return TokenizerRegistry - @property - @abstractmethod - def pad_token(self) -> str: - raise NotImplementedError() - - @property - @abstractmethod - def is_fast(self) -> bool: - raise NotImplementedError() - - @property - @abstractmethod - def vocab_size(self) -> int: - raise NotImplementedError() - - @property - @abstractmethod - def max_token_id(self) -> int: - raise NotImplementedError() - - @property - @abstractmethod - def truncation_side(self) -> str: - raise NotImplementedError() - - def __len__(self) -> int: - return self.vocab_size - - @abstractmethod - def __call__( - self, - text: str | list[str] | list[int], - text_pair: str | None = None, - add_special_tokens: bool = False, - truncation: bool = False, - max_length: int | None = None, - ): - raise NotImplementedError() - - @abstractmethod - def get_vocab(self) -> dict[str, int]: - raise NotImplementedError() - - @abstractmethod - def get_added_vocab(self) -> dict[str, int]: - raise NotImplementedError() - - @abstractmethod - def encode_one( - self, - text: str, - truncation: bool = False, - max_length: int | None = None, - ) -> list[int]: - raise NotImplementedError() - - @abstractmethod - def encode( - self, - text: str, - truncation: bool | None = None, - max_length: int | None = None, - add_special_tokens: bool | None = None, - ) -> list[int]: - raise NotImplementedError() - - @abstractmethod - def apply_chat_template( - self, - messages: list["ChatCompletionMessageParam"], - tools: list[dict[str, Any]] | None = None, - **kwargs, - ) -> list[int]: - raise NotImplementedError() - - @abstractmethod - def convert_tokens_to_string(self, tokens: list[str]) -> str: - raise NotImplementedError() - - @abstractmethod - def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str: - raise NotImplementedError() - - @abstractmethod - def convert_ids_to_tokens( - self, - ids: list[int], - skip_special_tokens: bool = True, - ) -> list[str]: - raise NotImplementedError() - - -class TokenizerRegistry: - # Tokenizer name -> (tokenizer module, tokenizer class) - REGISTRY: dict[str, tuple[str, str]] = {} - - @staticmethod - def register(name: str, module: str, class_name: str) -> None: - TokenizerRegistry.REGISTRY[name] = (module, class_name) - - @staticmethod - def get_tokenizer( - tokenizer_name: str, - *args, - **kwargs, - ) -> TokenizerBase: - tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name) - if tokenizer_cls is None: - raise ValueError(f"Tokenizer {tokenizer_name} not found.") - - tokenizer_module = importlib.import_module(tokenizer_cls[0]) - class_ = getattr(tokenizer_module, tokenizer_cls[1]) - return class_.from_pretrained(*args, **kwargs) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py deleted file mode 100644 index b63cb26af46dd..0000000000000 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from .mistral import ( - MistralTokenizer, - maybe_serialize_tool_calls, - truncate_tool_call_ids, - validate_request_params, -) - -__all__ = [ - "MistralTokenizer", - "maybe_serialize_tool_calls", - "truncate_tool_call_ids", - "validate_request_params", -] 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index bd28c41fb50e8..336d3e9fa1d20 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,9 +26,10 @@ from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask +from vllm.tokenizers import TokenizerLike from vllm.tracing import init_tracer from vllm.transformers_utils.config import maybe_register_config_serialize_by_value -from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs +from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils.async_utils import cancel_task_threadsafe from vllm.utils.collection_utils import as_list @@ -120,9 +121,10 @@ class AsyncLLM(EngineClient): ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). - stream_interval = self.vllm_config.scheduler_config.stream_interval self.output_processor = OutputProcessor( - self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval + self.tokenizer, + log_stats=self.log_stats, + stream_interval=self.vllm_config.scheduler_config.stream_interval, ) endpoint = self.observability_config.otlp_traces_endpoint if endpoint is not None: @@ -703,17 +705,17 @@ class AsyncLLM(EngineClient): raise EngineGenerateError() from e @property - def tokenizer(self) -> AnyTokenizer | None: + def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer @tokenizer.setter - def tokenizer(self, tokenizer: AnyTokenizer | None) -> None: + def tokenizer(self, tokenizer: TokenizerLike | None) -> None: self.input_processor.tokenizer = tokenizer - async def get_tokenizer(self) -> AnyTokenizer: + async def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( - "Unable to get tokenizer because skip_tokenizer_init is True" + "Unable to get tokenizer because `skip_tokenizer_init=True`" ) return self.tokenizer diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index b7a24096bf15f..c55240c40f6f0 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -10,7 +10,7 @@ from transformers import PreTrainedTokenizerFast from vllm.logger import init_logger from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, + TokenizerLike, convert_prompt_ids_to_tokens, detokenize_incrementally, ) @@ -45,7 +45,7 @@ class IncrementalDetokenizer: @classmethod def from_new_request( cls, - tokenizer: AnyTokenizer | None, + tokenizer: TokenizerLike | None, request: EngineCoreRequest, ) -> "IncrementalDetokenizer": assert request.sampling_params is not None @@ -256,7 +256,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): - def __init__(self, tokenizer: AnyTokenizer, request: EngineCoreRequest): + def __init__(self, tokenizer: TokenizerLike, request: EngineCoreRequest): super().__init__(request) self.tokenizer = tokenizer diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index cfd637931a1ce..e6a94f4e3de5d 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -19,8 +19,7 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import 
SamplingParams -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.tokenizers import MistralTokenizer, TokenizerLike from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats @@ -40,7 +39,7 @@ class InputProcessor: def __init__( self, vllm_config: VllmConfig, - tokenizer: AnyTokenizer | None, + tokenizer: TokenizerLike | None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: self.vllm_config = vllm_config @@ -62,11 +61,11 @@ class InputProcessor: ) @property - def tokenizer(self) -> AnyTokenizer | None: + def tokenizer(self) -> TokenizerLike | None: return self.input_preprocessor.tokenizer @tokenizer.setter - def tokenizer(self, tokenizer: AnyTokenizer | None) -> None: + def tokenizer(self, tokenizer: TokenizerLike | None) -> None: self.input_preprocessor.tokenizer = tokenizer def _validate_logprobs( diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index ead553e98a978..a3bde7ba8d64d 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -23,8 +23,9 @@ from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask +from vllm.tokenizers import TokenizerLike from vllm.tracing import init_tracer -from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs +from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient @@ -95,9 +96,10 @@ class LLMEngine: ) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). - stream_interval = self.vllm_config.scheduler_config.stream_interval self.output_processor = OutputProcessor( - self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval + self.tokenizer, + log_stats=self.log_stats, + stream_interval=self.vllm_config.scheduler_config.stream_interval, ) endpoint = self.observability_config.otlp_traces_endpoint if endpoint is not None: @@ -350,17 +352,17 @@ class LLMEngine: return get_metrics_snapshot() @property - def tokenizer(self) -> AnyTokenizer | None: + def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer @tokenizer.setter - def tokenizer(self, tokenizer: AnyTokenizer | None) -> None: + def tokenizer(self, tokenizer: TokenizerLike | None) -> None: self.input_processor.tokenizer = tokenizer - def get_tokenizer(self) -> AnyTokenizer: + def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( - "Unable to get tokenizer because skip_tokenizer_init is True" + "Unable to get tokenizer because `skip_tokenizer_init=True`" ) return self.tokenizer diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 63064a2c65d67..1c8f808bc25ba 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -13,7 +13,7 @@ from vllm.logprobs import ( create_sample_logprobs, ) from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, + TokenizerLike, convert_ids_list_to_tokens, ) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest @@ -28,7 +28,7 @@ NONES = itertools.repeat(None) class LogprobsProcessor: # Tokenizer for this request, # None if detokenization is disabled. 
- tokenizer: AnyTokenizer | None + tokenizer: TokenizerLike | None # Logprobs for this request logprobs: SampleLogprobs | None @@ -40,7 +40,7 @@ class LogprobsProcessor: @classmethod def from_new_request( cls, - tokenizer: AnyTokenizer | None, + tokenizer: TokenizerLike | None, request: EngineCoreRequest, ) -> "LogprobsProcessor": sampling_params = request.sampling_params diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 0453c4a77f0cd..e85fbb4ee0fb0 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -15,8 +15,8 @@ from vllm.outputs import ( RequestOutput, ) from vllm.sampling_params import RequestOutputKind +from vllm.tokenizers import TokenizerLike from vllm.tracing import SpanAttributes, SpanKind, Tracer, extract_trace_context -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason from vllm.v1.engine.detokenizer import IncrementalDetokenizer @@ -139,7 +139,7 @@ class RequestState: @classmethod def from_new_request( cls, - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike | None, request: EngineCoreRequest, prompt: str | None, parent_req: ParentRequest | None, @@ -341,7 +341,10 @@ class OutputProcessor: """Process EngineCoreOutputs into RequestOutputs.""" def __init__( - self, tokenizer: AnyTokenizer, log_stats: bool, stream_interval: int = 1 + self, + tokenizer: TokenizerLike | None, + log_stats: bool, + stream_interval: int = 1, ): self.log_stats = log_stats self.tokenizer = tokenizer diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 7dc9589b63b86..5c09b7b0634f2 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -10,10 +10,10 @@ if TYPE_CHECKING: import torch from vllm.config import VllmConfig - from vllm.transformers_utils.tokenizer import AnyTokenizer + from vllm.tokenizers import TokenizerLike else: VllmConfig = object - AnyTokenizer = object + TokenizerLike = object class StructuredOutputOptions(enum.Enum): @@ -100,7 +100,7 @@ class StructuredOutputBackend(ABC): """Engine-level backend for structured output requests.""" vllm_config: VllmConfig - tokenizer: AnyTokenizer + tokenizer: TokenizerLike vocab_size: int @abstractmethod diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index c9f2dc07da786..f8a2df43dd90e 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -10,7 +10,7 @@ import torch import vllm.envs from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.tokenizers import MistralTokenizer from vllm.utils.import_utils import LazyLoader from vllm.v1.structured_output.backend_types import ( StructuredOutputBackend, diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index d2d14fcfc4362..ae42b33f80f88 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -24,7 +24,7 @@ if TYPE_CHECKING: import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2 import xgrammar as xgr - from vllm.transformers_utils.tokenizer import AnyTokenizer + from vllm.tokenizers import TokenizerLike from vllm.v1.worker.gpu_input_batch import InputBatch else: xgr = 
LazyLoader("xgr", globals(), "xgrammar") @@ -36,7 +36,7 @@ else: "transformers.models.gpt2.tokenization_gpt2", ) - AnyTokenizer = object + TokenizerLike = object SchedulerOutput = object InputBatch = object @@ -195,7 +195,7 @@ re_replacement_seq = re.compile(r"^.{0,6}�+.{0,6}$") def _reduced_vocabulary( - tokenizer: AnyTokenizer, + tokenizer: TokenizerLike, eos_token_id: int, ) -> dict[bytes, list[int]]: """Create a map from vocabulary tokens to lists of equivalent token ids. @@ -222,7 +222,7 @@ def _reduced_vocabulary( vocabulary: dict[bytes, list[int]] = {} empty_token_ids: list[int] = [] for token, token_idx in tokenizer.get_vocab().items(): - if token in tokenizer.all_special_tokens: # type: ignore + if token in tokenizer.all_special_tokens: continue token_str = convert_token_to_string(token) @@ -261,7 +261,7 @@ def _reduced_vocabulary( return vocabulary -def get_outlines_vocabulary(tokenizer: AnyTokenizer) -> oc.Vocabulary: +def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary: """Get the `Vocabulary` object for a given tokenizer.""" if hasattr(tokenizer, "_outlines_vocabulary"): return tokenizer._outlines_vocabulary # type: ignore
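
The deprecation shims keep the old import paths usable until v0.13. A quick sanity
check of that behavior (assuming this patch is applied; the old alias resolves via
the module-level `__getattr__` shims added above):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Old location; triggers the __getattr__ shim and its warning.
        from vllm.transformers_utils.tokenizer import AnyTokenizer

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    from vllm.tokenizers import TokenizerLike

    # The shim returns the very same object, so annotations and checks
    # written against the old alias keep working unchanged.
    assert AnyTokenizer is TokenizerLike
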