From 653591d5e73b34ffd9186c61e964474bcc4b7c80 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Tue, 2 Dec 2025 13:33:37 +0800
Subject: [PATCH] [Chore] Move tokenizer initialization methods (#29793)

Signed-off-by: DarkLight1337
---
 benchmarks/benchmark_prefix_caching.py        |  2 +-
 .../benchmark_serving_structured_output.py    |  2 +-
 .../test_dynamic_shapes_compilation.py        |  2 +-
 .../entrypoints/openai/test_chat_template.py  |  2 +-
 .../entrypoints/openai/test_lora_resolvers.py |  2 +-
 .../openai/test_return_token_ids.py           |  2 +-
 .../openai/test_return_tokens_as_ids.py       |  2 +-
 tests/entrypoints/openai/test_serving_chat.py |  2 +-
 .../openai/test_token_in_token_out.py         |  2 +-
 tests/entrypoints/openai/test_tokenization.py |  2 +-
 .../tool_parsers/test_hermes_tool_parser.py   |  2 +-
 .../entrypoints/pooling/embed/test_online.py  |  2 +-
 .../pooling/pooling/test_online.py            |  2 +-
 tests/entrypoints/test_chat_utils.py          |  3 +-
 .../multimodal/processing/test_common.py      |  7 +-
 .../processing/test_tensor_schema.py          |  2 +-
 tests/models/utils.py                         |  2 +-
 tests/test_inputs.py                          |  4 +-
 tests/tokenizers_/test_basic.py               |  3 +-
 tests/tokenizers_/test_registry.py            |  3 +-
 .../tool_use/test_deepseekv31_tool_parser.py  |  2 +-
 .../tool_use/test_ernie45_moe_tool_parser.py  |  3 +-
 tests/tool_use/test_glm4_moe_tool_parser.py   |  2 +-
 tests/tool_use/test_jamba_tool_parser.py      |  3 +-
 tests/tool_use/test_kimi_k2_tool_parser.py    |  2 +-
 tests/tool_use/test_minimax_tool_parser.py    |  2 +-
 tests/tool_use/test_openai_tool_parser.py     |  2 +-
 tests/tool_use/test_qwen3coder_tool_parser.py |  3 +-
 tests/tool_use/test_seed_oss_tool_parser.py   |  3 +-
 tests/tool_use/test_xlam_tool_parser.py       |  3 +-
 tests/transformers_utils/test_config.py       |  2 +-
 tests/utils.py                                |  2 +-
 .../v1/entrypoints/openai/test_completion.py  |  2 +-
 tests/v1/tpu/test_perf.py                     |  2 +-
 vllm/benchmarks/serve.py                      |  2 +-
 vllm/model_executor/models/adapters.py        |  4 +-
 vllm/model_executor/models/deepseek_ocr.py    |  2 +-
 vllm/model_executor/models/deepseek_vl2.py    |  2 +-
 vllm/model_executor/models/granite_speech.py  |  8 +-
 vllm/model_executor/models/gritlm.py          |  2 +-
 .../model_executor/models/nano_nemotron_vl.py |  7 +-
 vllm/model_executor/models/pixtral.py         |  3 +-
 vllm/model_executor/models/voxtral.py         |  3 +-
 vllm/model_executor/models/whisper.py         |  6 +-
 vllm/multimodal/registry.py                   |  3 +-
 vllm/tokenizers/__init__.py                   | 11 ++-
 vllm/tokenizers/registry.py                   | 38 +++++++-
 vllm/transformers_utils/tokenizer.py          | 91 +++++++++----------
 vllm/v1/engine/async_llm.py                   |  5 +-
 vllm/v1/engine/llm_engine.py                  |  5 +-
 vllm/v1/structured_output/__init__.py         |  4 +-
 51 files changed, 150 insertions(+), 129 deletions(-)

diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 28fc383a318dd..e6391134ff932 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -40,7 +40,7 @@
 from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
     from backend_request_func import get_tokenizer
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 55001cf3722a0..df122b4c5e8db 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -46,7 +46,7 @@
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
     from backend_request_func import get_tokenizer
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
index c20aea822fe81..1966b03cd9c89 100644
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -8,7 +8,7 @@
 import torch
 
 from vllm import LLM, SamplingParams
 from vllm.config.compilation import CompilationMode, DynamicShapesType
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.utils.torch_utils import is_torch_equal_or_newer
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index ee79ed59c4102..77087ac21ea8b 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -6,7 +6,7 @@ import pytest
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 from ...models.registry import HF_EXAMPLE_MODELS
 from ...utils import VLLM_PATH
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index 4856cafef44b3..ea6b3d812d8fe 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM
 
 MODEL_NAME = "openai-community/gpt2"
diff --git a/tests/entrypoints/openai/test_return_token_ids.py b/tests/entrypoints/openai/test_return_token_ids.py
index feef48a36dfa1..8537082e3f8d1 100644
--- a/tests/entrypoints/openai/test_return_token_ids.py
+++ b/tests/entrypoints/openai/test_return_token_ids.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index cedf6ce160607..d4d9a6c5b6120 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -7,7 +7,7 @@
 
 import pytest
 
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 6a1b15c4131e0..9ea65f9fa6e7a 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -14,7 +14,7 @@ from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM
 
 from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py
index 25eb5882be89c..c7f8abe27e6e0 100644
--- a/tests/entrypoints/openai/test_token_in_token_out.py
+++ b/tests/entrypoints/openai/test_token_in_token_out.py
@@ -7,7 +7,7 @@ import tempfile
 
 import pytest
 
 from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 751f94319eb9f..052f9fecc18de 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -5,7 +5,7 @@
 import pytest
 import pytest_asyncio
 import requests
 
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
index b2303ab0e7b7c..ce6727bb04f6c 100644
--- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
@@ -271,7 +271,7 @@ async def test_streaming_product_tool_call():
 
 @pytest.fixture
 def qwen_tokenizer() -> TokenizerLike:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 
     return get_tokenizer("Qwen/Qwen3-32B")
diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py
index 6aac649bc3035..ddba1c790ba8c 100644
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -18,7 +18,7 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
 from vllm.platforms import current_platform
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.utils.serial_utils import (
     EMBED_DTYPE_TO_TORCH_DTYPE,
     ENDIANNESS,
diff --git a/tests/entrypoints/pooling/pooling/test_online.py b/tests/entrypoints/pooling/pooling/test_online.py
index 977c74d54a351..cc5c2f26f80fb 100644
--- a/tests/entrypoints/pooling/pooling/test_online.py
+++ b/tests/entrypoints/pooling/pooling/test_online.py
@@ -12,7 +12,7 @@ import torch
 from tests.models.utils import check_embeddings_close
 from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.utils.serial_utils import (
     EMBED_DTYPE_TO_TORCH_DTYPE,
     ENDIANNESS,
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index a351cda60621f..03a0c058ea690 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -28,8 +28,7 @@ from vllm.multimodal.utils import (
     encode_image_base64,
     encode_video_base64,
 )
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import MistralTokenizer, get_tokenizer
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import VLLM_PATH
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index c39e522100901..90158a028b0bd 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import (
-    cached_tokenizer_from_config,
-    encode_tokens,
-)
+from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
+from vllm.transformers_utils.tokenizer import encode_tokens
 
 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import (
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 66a3fbe11b6a5..7628ab4fe2349 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -31,7 +31,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.platforms import current_platform
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_dtype
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 9843887a13204..d84b4b820533e 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -13,7 +13,7 @@ from transformers import PretrainedConfig
 from vllm.config.model import ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
 
 from .. import ci_envs
 from .registry import HF_EXAMPLE_MODELS
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index b1fb4e06a6906..c4339827de8b6 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -7,7 +7,7 @@
 from vllm.config import ModelConfig
 from vllm.inputs import zip_enc_dec_prompts
 from vllm.inputs.parse import parse_raw_prompts
 from vllm.inputs.preprocess import InputPreprocessor
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
+from vllm.tokenizers import init_tokenizer_from_config
 
 pytestmark = pytest.mark.cpu_test
 
@@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
 )
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_configs(model_config)
+    tokenizer = init_tokenizer_from_config(model_config)
     input_preprocessor = InputPreprocessor(model_config, tokenizer)
 
     # HF processor adds sep token
diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py
index 1fca633cc5cd7..b152227a5a50f 100644
--- a/tests/tokenizers_/test_basic.py
+++ b/tests/tokenizers_/test_basic.py
@@ -5,8 +5,7 @@ from typing import _get_protocol_attrs  # type: ignore
 
 import pytest
 from transformers import PreTrainedTokenizerBase
-from vllm.tokenizers import TokenizerLike
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 
 
 def _get_missing_attrs(obj: object, target: type):
diff --git a/tests/tokenizers_/test_registry.py b/tests/tokenizers_/test_registry.py
index 57b6a14a54b3f..7e795350d64c8 100644
--- a/tests/tokenizers_/test_registry.py
+++ b/tests/tokenizers_/test_registry.py
@@ -2,8 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from pathlib import Path
-from vllm.tokenizers import TokenizerLike, TokenizerRegistry
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
 
 
 class TestTokenizer(TokenizerLike):
diff --git a/tests/tool_use/test_deepseekv31_tool_parser.py b/tests/tool_use/test_deepseekv31_tool_parser.py
index db5168071fbce..8beb7739b6081 100644
--- a/tests/tool_use/test_deepseekv31_tool_parser.py
+++ b/tests/tool_use/test_deepseekv31_tool_parser.py
@@ -6,7 +6,7 @@ import pytest
 from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
     DeepSeekV31ToolParser,
 )
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 MODEL = "deepseek-ai/DeepSeek-V3.1"
diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py
index 8fbbbba325385..92f86de23267b 100644
--- a/tests/tool_use/test_ernie45_moe_tool_parser.py
+++ b/tests/tool_use/test_ernie45_moe_tool_parser.py
@@ -14,9 +14,8 @@ from vllm.entrypoints.openai.protocol import (
     ToolCall,
 )
 from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 # Use a common model that is likely to be available
 MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"
diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py
index f545f52c02dcb..753b3f1c23adf 100644
--- a/tests/tool_use/test_glm4_moe_tool_parser.py
+++ b/tests/tool_use/test_glm4_moe_tool_parser.py
@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
 from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
     Glm4MoeModelToolParser,
 )
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index c7ca024f3a767..9036bd32dd704 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -10,9 +10,8 @@
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
 from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py
index 3a48b5206141d..1558a9c3e01f2 100644
--- a/tests/tool_use/test_kimi_k2_tool_parser.py
+++ b/tests/tool_use/test_kimi_k2_tool_parser.py
@@ -8,7 +8,7 @@
 import pytest
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
 from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py
index 4332984083dab..dda63f984a832 100644
--- a/tests/tool_use/test_minimax_tool_parser.py
+++ b/tests/tool_use/test_minimax_tool_parser.py
@@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (
     ToolCall,
 )
 from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_openai_tool_parser.py b/tests/tool_use/test_openai_tool_parser.py
index c874a9601ae70..6537f281c0e1b 100644
--- a/tests/tool_use/test_openai_tool_parser.py
+++ b/tests/tool_use/test_openai_tool_parser.py
@@ -16,7 +16,7 @@ from openai_harmony import (
 
 from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
 from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 MODEL = "gpt2"
diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py
index 864bb0d0c06c2..5a56768805fdf 100644
--- a/tests/tool_use/test_qwen3coder_tool_parser.py
+++ b/tests/tool_use/test_qwen3coder_tool_parser.py
@@ -17,9 +17,8 @@ from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
     Qwen3CoderToolParser,
 )
 from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py
index d94df61128c9c..8795c35a1347f 100644
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@@ -15,9 +15,8 @@ from vllm.entrypoints.openai.protocol import (
     ToolCall,
 )
 from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py
index fdcdd4038131a..3098fda036a81 100644
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@@ -13,9 +13,8 @@ from vllm.entrypoints.openai.protocol import (
     ToolCall,
 )
 from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/transformers_utils/test_config.py b/tests/transformers_utils/test_config.py
index 7b56c9f0189d4..85680c41ed74d 100644
--- a/tests/transformers_utils/test_config.py
+++ b/tests/transformers_utils/test_config.py
@@ -6,8 +6,8 @@ only get the `eos_token_id` from the tokenizer as defined by
 `vllm.LLMEngine._get_eos_token_id`.
 """
 
+from vllm.tokenizers import get_tokenizer
 from vllm.transformers_utils.config import try_get_generation_config
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 
 def test_get_llama3_eos_token():
diff --git a/tests/utils.py b/tests/utils.py
index 9565b0ff06e36..539f67c47ac1d 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -44,7 +44,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.cli.serve import ServeSubcommand
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GB_bytes
 from vllm.utils.network_utils import get_open_port
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 736ccbefbc4da..ddab006d0d31a 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -9,7 +9,7 @@ import regex as re
 from openai import BadRequestError
 
 from tests.utils import RemoteOpenAIServer
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py
index e230491cddb01..e62b969fe3b95 100644
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -14,7 +14,7 @@ import pytest
 
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 
 if TYPE_CHECKING:
     from tests.conftest import VllmRunner
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 519303c0bfa0a..2933f5d01b274 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -47,7 +47,7 @@ from vllm.benchmarks.lib.endpoint_request_func import (
 )
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.utils.gc_utils import freeze_gc_heap
 from vllm.utils.network_utils import join_host_port
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 05f257feea3ee..007d847ac3b7b 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -444,7 +444,7 @@ def load_weights_using_from_2_way_softmax(
     )
     loaded_weights = pooling_model_cls.load_weights(model, weights, load_lm_head=True)
 
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
         model_config.tokenizer,
@@ -498,7 +498,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     # Skip ModelForSequenceClassification in MRO to avoid infinite recursion
     loaded_weights = type(model).__mro__[1].load_weights(model, weights)
 
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
         model_config.tokenizer,
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index 8179f916ff417..019fb3e29ab91 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -45,6 +45,7 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.processors.deepseek_ocr import (
     BASE_SIZE,
@@ -53,7 +54,6 @@ from vllm.transformers_utils.processors.deepseek_ocr import (
     DeepseekOCRProcessor,
     count_tiles,
 )
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.v1.sample.logits_processor import (
     AdapterLogitsProcessor,
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 1b6e4110039c4..56c1a87a25401 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -41,13 +41,13 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.transformers_utils.configs.deepseek_vl2 import (
     DeepseekVLV2Config,
     MlpProjectorConfig,
     VisionEncoderConfig,
 )
 from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.utils.torch_utils import set_default_torch_dtype
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 1797adab8d146..accf7e6ef2f47 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -59,8 +59,8 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processor import cached_get_processor
-from vllm.transformers_utils.tokenizer import cached_get_tokenizer
+from vllm.tokenizers import cached_tokenizer_from_config
+from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .blip2 import Blip2QFormerModel
@@ -862,7 +862,7 @@ class GraniteSpeechForConditionalGeneration(
         else:
             raise ValueError(f"Unsupported task type {task_type}")
 
-        tokenizer = cached_get_tokenizer(model_config.model)
+        tokenizer = cached_tokenizer_from_config(model_config)
         chat = [dict(role="user", content=user_prompt)]
         prompt = tokenizer.apply_chat_template(
             chat,
@@ -886,7 +886,7 @@ class GraniteSpeechForConditionalGeneration(
         model_config: ModelConfig,
     ) -> int | None:
         """Get the number of audio tokens for an audio duration in sec."""
-        processor = cached_get_processor(model_config.model)
+        processor = cached_processor_from_config(model_config)
         hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
         proj_win_size = processor.audio_processor.projector_window_size
         ds_rate = processor.audio_processor.projector_downsample_rate
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 181c4ed2dca5a..550e8b014d5e7 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -19,7 +19,7 @@ from vllm.model_executor.layers.pooler import (
 )
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.tasks import PoolingTask
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.v1.outputs import PoolerOutput
 from vllm.v1.pool.metadata import PoolingMetadata
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 11beeddabe307..0f86a17752802 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -73,12 +73,9 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
 from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.tokenizer import (
-    cached_tokenizer_from_config,
-    encode_tokens,
-)
+from vllm.transformers_utils.tokenizer import encode_tokens
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .utils import _merge_multimodal_embeddings
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 54bde75cc0131..cad241842cd30 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -59,8 +59,7 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 0a39ea7ef5bff..45f8fa079c714 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -51,8 +51,7 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
 from .utils import init_vllm_registered_model, maybe_prefix
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 1ed6ae4366d0c..0daf6bda61ccb 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -48,7 +48,7 @@ from vllm.multimodal.processing import (
     PromptUpdate,
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.transformers_utils.processor import cached_get_processor
+from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.utils.torch_utils import set_default_torch_dtype
@@ -850,7 +850,7 @@ class WhisperForConditionalGeneration(
     def get_speech_to_text_config(
         cls, model_config: ModelConfig, task_type: str
     ) -> SpeechToTextConfig:
-        processor = cached_get_processor(model_config.model)
+        processor = cached_processor_from_config(model_config)
 
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
@@ -864,7 +864,7 @@ class WhisperForConditionalGeneration(
         stt_config: SpeechToTextConfig,
         model_config: ModelConfig,
     ) -> int | None:
-        processor = cached_get_processor(model_config.model)
+        processor = cached_processor_from_config(model_config)
         hop_length = processor.feature_extractor.hop_length
         assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 2fdae46e547b0..00a84f9dec4f7 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,8 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
 
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
 
 from .cache import BaseMultiModalProcessorCache
 from .processing import (
diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py
index 14f0148cf7ba8..42487f5f51651 100644
--- a/vllm/tokenizers/__init__.py
+++ b/vllm/tokenizers/__init__.py
@@ -4,12 +4,21 @@
 from .hf import HfTokenizer
 from .mistral import MistralTokenizer
 from .protocol import TokenizerLike
-from .registry import TokenizerRegistry, get_tokenizer
+from .registry import (
+    TokenizerRegistry,
+    cached_get_tokenizer,
+    cached_tokenizer_from_config,
+    get_tokenizer,
+    init_tokenizer_from_config,
+)
 
 __all__ = [
     "TokenizerLike",
     "HfTokenizer",
     "MistralTokenizer",
     "TokenizerRegistry",
+    "cached_get_tokenizer",
     "get_tokenizer",
+    "cached_tokenizer_from_config",
+    "init_tokenizer_from_config",
 ]
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index d5e7899321615..bf9d295de23ae 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -2,10 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib.util
 from collections.abc import Callable
+from functools import lru_cache
 from pathlib import Path
-from typing import TypeVar, overload
+from typing import TYPE_CHECKING, TypeVar, overload
 
 import huggingface_hub
+from typing_extensions import assert_never
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -21,6 +23,9 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
 
 from .protocol import TokenizerLike
 
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
 logger = init_logger(__name__)
 
 _T = TypeVar("_T", bound=type[TokenizerLike])
@@ -195,3 +200,34 @@ def get_tokenizer(
     )
 
     return tokenizer
+
+
+cached_get_tokenizer = lru_cache(get_tokenizer)
+
+
+def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
+    return cached_get_tokenizer(
+        model_config.tokenizer,
+        tokenizer_mode=model_config.tokenizer_mode,
+        revision=model_config.tokenizer_revision,
+        trust_remote_code=model_config.trust_remote_code,
+        **kwargs,
+    )
+
+
+def init_tokenizer_from_config(model_config: "ModelConfig"):
+    runner_type = model_config.runner_type
+    if runner_type == "generate" or runner_type == "draft":
+        truncation_side = "left"
+    elif runner_type == "pooling":
+        truncation_side = "right"
+    else:
+        assert_never(runner_type)
+
+    return get_tokenizer(
+        model_config.tokenizer,
+        tokenizer_mode=model_config.tokenizer_mode,
+        trust_remote_code=model_config.trust_remote_code,
+        revision=model_config.tokenizer_revision,
+        truncation_side=truncation_side,
+    )
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 0911848c02e14..617d16779ca26 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -2,17 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
-from functools import lru_cache
-from typing import TYPE_CHECKING, Any
-
-from typing_extensions import assert_never
+from typing import Any
 
 from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike, get_tokenizer
-
-if TYPE_CHECKING:
-    from vllm.config import ModelConfig
-
+from vllm.tokenizers import TokenizerLike
 
 logger = init_logger(__name__)
@@ -28,18 +21,54 @@ def __getattr__(name: str):
         )
 
         return TokenizerLike
-    if name == "get_cached_tokenizer":
-        from vllm.tokenizers.hf import get_cached_tokenizer
+    if name == "get_tokenizer":
+        from vllm.tokenizers import get_tokenizer
 
         warnings.warn(
-            "`vllm.transformers_utils.tokenizer.get_cached_tokenizer` "
-            "has been moved to `vllm.tokenizers.hf.get_cached_tokenizer`. "
+            "`vllm.transformers_utils.tokenizer.get_tokenizer` "
+            "has been moved to `vllm.tokenizers.get_tokenizer`. "
             "The old name will be removed in v0.13.",
             DeprecationWarning,
             stacklevel=2,
         )
 
-        return get_cached_tokenizer
+        return get_tokenizer
+    if name == "cached_get_tokenizer":
+        from vllm.tokenizers import cached_get_tokenizer
+
+        warnings.warn(
+            "`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
+            "has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
+            "The old name will be removed in v0.13.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        return cached_get_tokenizer
+    if name == "cached_tokenizer_from_config":
+        from vllm.tokenizers import cached_tokenizer_from_config
+
+        warnings.warn(
+            "`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
+            "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
+            "The old name will be removed in v0.13.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        return cached_tokenizer_from_config
+    if name == "init_tokenizer_from_configs":
+        from vllm.tokenizers import init_tokenizer_from_config
+
+        warnings.warn(
+            "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
+            "has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
+            "The old name will be removed in v0.13.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        return init_tokenizer_from_config
 
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -92,37 +121,3 @@ def encode_tokens(
         kw_args["add_special_tokens"] = add_special_tokens
 
     return tokenizer.encode(text, **kw_args)
-
-
-cached_get_tokenizer = lru_cache(get_tokenizer)
-
-
-def cached_tokenizer_from_config(
-    model_config: "ModelConfig",
-    **kwargs: Any,
-):
-    return cached_get_tokenizer(
-        model_config.tokenizer,
-        tokenizer_mode=model_config.tokenizer_mode,
-        revision=model_config.tokenizer_revision,
-        trust_remote_code=model_config.trust_remote_code,
-        **kwargs,
-    )
-
-
-def init_tokenizer_from_configs(model_config: "ModelConfig"):
-    runner_type = model_config.runner_type
-    if runner_type == "generate" or runner_type == "draft":
-        truncation_side = "left"
-    elif runner_type == "pooling":
-        truncation_side = "right"
-    else:
-        assert_never(runner_type)
-
-    return get_tokenizer(
-        model_config.tokenizer,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
-        revision=model_config.tokenizer_revision,
-        truncation_side=truncation_side,
-    )
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 17a271ca42e26..ec5d6e95ce3aa 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -26,10 +26,9 @@ from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
 from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils.async_utils import cancel_task_threadsafe
 from vllm.utils.collection_utils import as_list
@@ -112,7 +111,7 @@
         if self.model_config.skip_tokenizer_init:
             tokenizer = None
         else:
-            tokenizer = init_tokenizer_from_configs(self.model_config)
+            tokenizer = init_tokenizer_from_config(self.model_config)
 
         self.input_processor = InputProcessor(self.vllm_config, tokenizer)
         self.io_processor = get_io_processor(
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index e7dfc554e76fa..d21cdf04ead26 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -23,9 +23,8 @@ from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
 from vllm.tracing import init_tracer
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
@@ -87,7 +86,7 @@
         if self.model_config.skip_tokenizer_init:
             tokenizer = None
         else:
-            tokenizer = init_tokenizer_from_configs(self.model_config)
+            tokenizer = init_tokenizer_from_config(self.model_config)
 
         self.input_processor = InputProcessor(self.vllm_config, tokenizer)
         self.io_processor = get_io_processor(
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 029129cf1a475..d087d28b1dae3 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -7,7 +7,7 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
+from vllm.tokenizers import init_tokenizer_from_config
 from vllm.utils.import_utils import LazyLoader
 from vllm.v1.structured_output.backend_guidance import GuidanceBackend
 from vllm.v1.structured_output.backend_types import (
@@ -61,7 +61,7 @@
             # of CPUs.
             max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
             self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = init_tokenizer_from_configs(
+            self.tokenizer = init_tokenizer_from_config(
                 model_config=self.vllm_config.model_config
            )
             reasoning_parser = (
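
For downstream code, this patch amounts to an import move plus one rename: get_tokenizer, cached_get_tokenizer, cached_tokenizer_from_config, and init_tokenizer_from_configs (renamed to the singular init_tokenizer_from_config) now live in vllm.tokenizers, while encode_tokens stays behind in vllm.transformers_utils.tokenizer. The __getattr__ shim above keeps the old names importable until v0.13, emitting a DeprecationWarning on each access. A minimal migration sketch (the model name is illustrative):

    # Old location: still importable until v0.13, but each access now goes
    # through the __getattr__ shim in vllm/transformers_utils/tokenizer.py
    # and emits a DeprecationWarning.
    # from vllm.transformers_utils.tokenizer import get_tokenizer

    # New location introduced by this patch:
    from vllm.tokenizers import get_tokenizer

    # "gpt2" is only an illustrative model name.
    tokenizer = get_tokenizer("gpt2")
    print(tokenizer.encode("Hello, world!"))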
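The renamed init_tokenizer_from_config also carries a behavioral choice visible in the new vllm/tokenizers/registry.py: it derives truncation_side from the model's runner_type, "left" for "generate"/"draft" runners and "right" for "pooling" runners. A hedged usage sketch (the ModelConfig construction mirrors tests/test_inputs.py; the model name is illustrative):

    from vllm.config import ModelConfig
    from vllm.tokenizers import init_tokenizer_from_config

    # "facebook/opt-125m" is only an illustrative model name.
    model_config = ModelConfig(model="facebook/opt-125m")

    # For runner_type "generate" or "draft" the helper passes
    # truncation_side="left"; for "pooling" it passes truncation_side="right".
    tokenizer = init_tokenizer_from_config(model_config)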