[Misc] Unify tokenizer registration (#29767)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-12-01 19:34:58 +08:00 committed by GitHub
parent 86e178f7c4
commit f0a28bf661
14 changed files with 237 additions and 183 deletions

View File

@@ -53,7 +53,7 @@ async def test_tokenize_completions(
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
for add_special in [False, True]:
prompt = "vllm1 This is a test prompt."
@@ -87,7 +87,7 @@ async def test_tokenize_chat(
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
for add_generation in [False, True]:
for add_special in [False, True]:
@@ -140,7 +140,7 @@ async def test_tokenize_chat_with_tools(
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
for add_generation in [False, True]:
for add_special in [False, True]:
@@ -210,7 +210,7 @@ async def test_tokenize_with_return_token_strs(
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
prompt = "This is a token_strs test prompt! vllm1"
response = requests.post(
@@ -240,7 +240,7 @@ async def test_detokenize(
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
prompt = "This is a test prompt. vllm1"
tokens = tokenizer.encode(prompt, add_special_tokens=False)

View File

@@ -197,7 +197,7 @@ async def test_conversation_embedding(
chat_response.raise_for_status()
chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
tokenizer = get_tokenizer(tokenizer_name=model_name)
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,

View File

@@ -158,11 +158,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
chat_response.raise_for_status()
chat_poolings = PoolingResponse.model_validate(chat_response.json())
tokenizer = get_tokenizer(
tokenizer_name=model_name,
tokenizer_mode="fast",
trust_remote_code=True,
)
tokenizer = get_tokenizer(tokenizer_name=model_name, trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,

View File

@@ -23,7 +23,7 @@ class _HfExamplesInfo:
tokenizer: str | None = None
"""Set the tokenizer to load for this architecture."""
tokenizer_mode: TokenizerMode = "auto"
tokenizer_mode: TokenizerMode | str = "auto"
"""Set the tokenizer type for this architecture."""
speculative_model: str | None = None

View File

@@ -1,13 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from vllm.tokenizers import TokenizerLike, TokenizerRegistry
from vllm.transformers_utils.tokenizer import get_tokenizer
class TestTokenizer(TokenizerLike):
@classmethod
def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
return TestTokenizer() # type: ignore
def from_pretrained(
cls,
path_or_repo_id: str | Path,
*args,
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> "TestTokenizer":
return TestTokenizer(path_or_repo_id) # type: ignore
def __init__(self, path_or_repo_id: str | Path) -> None:
super().__init__()
self.path_or_repo_id = path_or_repo_id
@property
def bos_token_id(self) -> int:
@@ -29,14 +44,16 @@ class TestTokenizer(TokenizerLike):
def test_customized_tokenizer():
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)
tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.path_or_repo_id == "abc"
assert tokenizer.bos_token_id == 0
assert tokenizer.eos_token_id == 1
assert tokenizer.pad_token_id == 2
tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
tokenizer = get_tokenizer("abc", tokenizer_mode="test_tokenizer")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.path_or_repo_id == "abc"
assert tokenizer.bos_token_id == 0
assert tokenizer.eos_token_id == 1
assert tokenizer.pad_token_id == 2
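
For reference, a minimal sketch of how an out-of-tree plugin could use the unified registry after this change (the "my_tokenizer" mode, the "my_plugin.tokenizer" module, and the model id below are hypothetical; only the registry API itself comes from this commit):

from vllm.tokenizers import TokenizerRegistry
from vllm.transformers_utils.tokenizer import get_tokenizer

# Out-of-tree: register by module path and class name, keyed by the mode string.
# In-tree classes instead use the decorator form, e.g. @TokenizerRegistry.register("hf").
TokenizerRegistry.register("my_tokenizer", "my_plugin.tokenizer", "MyTokenizer")

# The mode string (not the tokenizer name) now selects the registered class;
# the positional argument is forwarded to MyTokenizer.from_pretrained().
tokenizer = get_tokenizer("my-org/my-model", tokenizer_mode="my_tokenizer")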

View File

@@ -4,7 +4,7 @@
import json
from enum import Enum
from typing import TYPE_CHECKING, Any
from typing import Any
import jsonschema
import pytest
@@ -24,11 +24,6 @@ from vllm.sampling_params import (
StructuredOutputsParams,
)
if TYPE_CHECKING:
from vllm.config.model import TokenizerMode
else:
TokenizerMode = str
NGRAM_SPEC_CONFIG = {
"model": "[ngram]",
"num_speculative_tokens": 5,
@@ -627,7 +622,7 @@ Make the response as short as possible.
)
def test_structured_output_with_reasoning_matrices(
backend: str,
tokenizer_mode: TokenizerMode,
tokenizer_mode: str,
reasoning_parser: str,
model_name: str,
speculative_config: dict[str, Any] | None,

View File

@@ -86,7 +86,7 @@ TaskOption = Literal[
"transcription",
"draft",
]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom"]
TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -137,13 +137,13 @@ class ModelConfig:
tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode = "auto"
tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n
- "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "custom" will use --tokenizer to select the preregistered tokenizer."""
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
@@ -708,9 +708,6 @@ class ModelConfig:
# can be correctly capped to sliding window size
self.hf_text_config.sliding_window = None
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
# Avoid running try_verify_and_update_config multiple times
self.config_updated = False
@@ -718,6 +715,10 @@
self._verify_cuda_graph()
self._verify_bnb_config()
@field_validator("tokenizer_mode", mode="after")
def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str:
return tokenizer_mode.lower()
@field_validator("quantization", mode="before")
@classmethod
def validate_quantization_before(cls, value: Any) -> Any:
@@ -829,15 +830,6 @@ class ModelConfig:
model, _ = split_remote_gguf(model)
return get_sentence_transformer_tokenizer_config(model, self.revision)
def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
if tokenizer_mode not in get_args(TokenizerMode):
raise ValueError(
f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
f"one of {get_args(TokenizerMode)}."
)
self.tokenizer_mode = tokenizer_mode
def _get_default_runner_type(
self,
architectures: list[str],
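
For illustration, the relaxed TokenizerMode | str typing together with the new field validator means ModelConfig only lowercases the mode and defers validation to the registry; an unknown mode now fails when the tokenizer is loaded rather than in the removed _verify_tokenizer_mode (the mode name and model id below are hypothetical):

from vllm.tokenizers import TokenizerRegistry

# ModelConfig's field_validator lowercases the mode, so "MISTRAL" behaves like
# "mistral". Unregistered names are only rejected at lookup time:
try:
    TokenizerRegistry.get_tokenizer("not_registered", "my-org/my-model")
except ValueError as exc:
    print(exc)  # No tokenizer registered for tokenizer_mode='not_registered'.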

View File

@@ -360,7 +360,7 @@ class EngineArgs:
task: TaskOption | None = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
trust_remote_code: bool = ModelConfig.trust_remote_code
allowed_local_media_path: str = ModelConfig.allowed_local_media_path
allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains

View File

@@ -188,7 +188,7 @@ class LLM:
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer: str | None = None,
tokenizer_mode: TokenizerMode = "auto",
tokenizer_mode: TokenizerMode | str = "auto",
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
allowed_local_media_path: str = "",

View File

@@ -4,6 +4,12 @@
from .hf import HfTokenizer
from .mistral import MistralTokenizer
from .protocol import TokenizerLike
from .registry import TokenizerRegistry
from .registry import TokenizerRegistry, get_tokenizer
__all__ = ["TokenizerLike", "HfTokenizer", "MistralTokenizer", "TokenizerRegistry"]
__all__ = [
"TokenizerLike",
"HfTokenizer",
"MistralTokenizer",
"TokenizerRegistry",
"get_tokenizer",
]

View File

@@ -10,6 +10,7 @@ from transformers import AutoTokenizer
from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
from .protocol import TokenizerLike
from .registry import TokenizerRegistry
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -67,6 +68,7 @@ def get_cached_tokenizer(
return cached_tokenizer # type: ignore
@TokenizerRegistry.register("hf")
class HfTokenizer(TokenizerLike):
@classmethod
def from_pretrained(

View File

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, cast
from vllm.logger import init_logger
from .protocol import TokenizerLike
from .registry import TokenizerRegistry
if TYPE_CHECKING:
from mistral_common.protocol.instruct.request import (
@@ -165,6 +166,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
return tokenizer.unk_id
@TokenizerRegistry.register("mistral")
class MistralTokenizer(TokenizerLike):
@classmethod
def from_pretrained(

View File

@@ -1,28 +1,197 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.util
from collections.abc import Callable
from pathlib import Path
from typing import TypeVar, overload
import huggingface_hub
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
from vllm.transformers_utils.repo_utils import list_filtered_repo_files
from vllm.transformers_utils.utils import (
check_gguf_file,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike
logger = init_logger(__name__)
_T = TypeVar("_T", bound=type[TokenizerLike])
class TokenizerRegistry:
# Tokenizer name -> (tokenizer module, tokenizer class)
REGISTRY: dict[str, tuple[str, str]] = {}
# Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class)
REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {}
# In-tree tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str) -> Callable[[_T], _T]: ...
# OOT tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str, module: str, class_name: str) -> None: ...
@staticmethod
def register(name: str, module: str, class_name: str) -> None:
TokenizerRegistry.REGISTRY[name] = (module, class_name)
def register(
tokenizer_mode: str,
module: str | None = None,
class_name: str | None = None,
) -> Callable[[_T], _T] | None:
# In-tree tokenizers
if module is None or class_name is None:
def wrapper(tokenizer_cls: _T) -> _T:
assert tokenizer_mode not in TokenizerRegistry.REGISTRY
TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls
return tokenizer_cls
return wrapper
# OOT tokenizers
if tokenizer_mode in TokenizerRegistry.REGISTRY:
logger.warning(
"%s.%s is already registered for tokenizer_mode=%r. "
"It is overwritten by the new one.",
module,
class_name,
tokenizer_mode,
)
TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name)
return None
@staticmethod
def get_tokenizer(
tokenizer_name: str,
*args,
**kwargs,
) -> "TokenizerLike":
tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
if tokenizer_cls is None:
raise ValueError(f"Tokenizer {tokenizer_name} not found.")
def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike":
if tokenizer_mode not in TokenizerRegistry.REGISTRY:
raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")
tokenizer_module = importlib.import_module(tokenizer_cls[0])
class_ = getattr(tokenizer_module, tokenizer_cls[1])
item = TokenizerRegistry.REGISTRY[tokenizer_mode]
if isinstance(item, type):
return item.from_pretrained(*args, **kwargs)
module, class_name = item
logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
class_ = resolve_obj_by_qualname(f"{module}.{class_name}")
return class_.from_pretrained(*args, **kwargs)
def get_tokenizer(
tokenizer_name: str | Path,
*args,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> TokenizerLike:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
from modelscope.hub.snapshot_download import snapshot_download
# avoid circular import
from vllm.model_executor.model_loader.weight_utils import get_lock
# Only set the tokenizer here, model will be downloaded on the workers.
if not Path(tokenizer_name).exists():
# Use file lock to prevent multiple processes from
# downloading the same file at the same time.
with get_lock(tokenizer_name, download_dir):
tokenizer_path = snapshot_download(
model_id=str(tokenizer_name),
cache_dir=download_dir,
revision=revision,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
# Ignore weights - we only need the tokenizer.
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
)
tokenizer_name = tokenizer_path
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
tokenizer_mode = "hf"
kwargs["use_fast"] = False
if "truncation_side" not in kwargs:
kwargs["truncation_side"] = "left"
# Separate model folder from file path for GGUF models
if is_gguf(tokenizer_name):
if check_gguf_file(tokenizer_name):
kwargs["gguf_file"] = Path(tokenizer_name).name
tokenizer_name = Path(tokenizer_name).parent
elif is_remote_gguf(tokenizer_name):
tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
# Get the HuggingFace Hub path for the GGUF file
gguf_file = get_gguf_file_path_from_hf(
tokenizer_name,
quant_type,
revision=revision,
)
kwargs["gguf_file"] = gguf_file
# Try to use official Mistral tokenizer if possible
if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
allow_patterns = ["tekken.json", "tokenizer.model.v*"]
files_list = list_filtered_repo_files(
model_name_or_path=str(tokenizer_name),
allow_patterns=allow_patterns,
revision=revision,
)
if len(files_list) > 0:
tokenizer_mode = "mistral"
# Fallback to HF tokenizer
if tokenizer_mode == "auto":
tokenizer_mode = "hf"
tokenizer_args = (tokenizer_name, *args)
tokenizer_kwargs = dict(
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)
if tokenizer_mode == "custom":
logger.warning_once(
"TokenizerRegistry now uses `tokenizer_mode` as the registry key "
"instead of `tokenizer_name`. "
"Please update the definition of `.from_pretrained` in "
"your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
"Then, you can pass `tokenizer_mode=%r` instead of "
"`tokenizer_mode='custom'` when initializing vLLM.",
tokenizer_args,
str(tokenizer_kwargs),
tokenizer_mode,
)
tokenizer_mode = str(tokenizer_name)
tokenizer = TokenizerRegistry.get_tokenizer(
tokenizer_mode,
*tokenizer_args,
**tokenizer_kwargs,
)
if not tokenizer.is_fast:
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)
return tokenizer
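
To summarize the dispatch above, a hedged usage sketch (the repo id and the "my_tokenizer" mode are placeholders):

from vllm.tokenizers import get_tokenizer

# "auto" prefers the Mistral tokenizer when mistral_common is installed and the
# repo ships tekken.json or tokenizer.model.v*, otherwise it falls back to "hf".
tok = get_tokenizer("my-org/my-model")

# "slow" is rewritten to mode "hf" with use_fast=False before the registry lookup.
slow_tok = get_tokenizer("my-org/my-model", tokenizer_mode="slow")

# Any other string is resolved directly through TokenizerRegistry, so plugin
# modes registered out of tree work without further changes in vLLM.
plugin_tok = get_tokenizer("my-org/my-model", tokenizer_mode="my_tokenizer")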

View File

@@ -1,28 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
import os
import warnings
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, Any
import huggingface_hub
from typing_extensions import assert_never
from vllm import envs
from vllm.logger import init_logger
from vllm.tokenizers import (
HfTokenizer,
MistralTokenizer,
TokenizerLike,
TokenizerRegistry,
)
from .gguf_utils import get_gguf_file_path_from_hf
from .repo_utils import list_filtered_repo_files
from .utils import check_gguf_file, is_gguf, is_remote_gguf, split_remote_gguf
from vllm.tokenizers import TokenizerLike, get_tokenizer
if TYPE_CHECKING:
from vllm.config import ModelConfig
@@ -108,117 +94,6 @@ def encode_tokens(
return tokenizer.encode(text, **kw_args)
def get_tokenizer(
tokenizer_name: str | Path,
*args,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> TokenizerLike:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
# pylint: disable=C.
from modelscope.hub.snapshot_download import snapshot_download
# avoid circuit import
from vllm.model_executor.model_loader.weight_utils import get_lock
# Only set the tokenizer here, model will be downloaded on the workers.
if not os.path.exists(tokenizer_name):
# Use file lock to prevent multiple processes from
# downloading the same file at the same time.
with get_lock(tokenizer_name, download_dir):
tokenizer_path = snapshot_download(
model_id=tokenizer_name,
cache_dir=download_dir,
revision=revision,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
# Ignore weights - we only need the tokenizer.
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
)
tokenizer_name = tokenizer_path
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
if "truncation_side" not in kwargs:
kwargs["truncation_side"] = "left"
# Separate model folder from file path for GGUF models
if is_gguf(tokenizer_name):
if check_gguf_file(tokenizer_name):
kwargs["gguf_file"] = Path(tokenizer_name).name
tokenizer_name = Path(tokenizer_name).parent
elif is_remote_gguf(tokenizer_name):
tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
# Get the HuggingFace Hub path for the GGUF file
gguf_file = get_gguf_file_path_from_hf(
tokenizer_name,
quant_type,
revision=revision,
)
kwargs["gguf_file"] = gguf_file
# if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
# first to use official Mistral tokenizer if possible.
mistral_common_installed = importlib.util.find_spec("mistral_common") is not None
if tokenizer_mode == "auto" and mistral_common_installed:
allow_patterns = ["tekken.json", "tokenizer.model.v*"]
files_list = list_filtered_repo_files(
model_name_or_path=str(tokenizer_name),
allow_patterns=allow_patterns,
revision=revision,
)
if len(files_list) > 0:
tokenizer_mode = "mistral"
tokenizer: TokenizerLike
if tokenizer_mode == "mistral":
logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}")
tokenizer = MistralTokenizer.from_pretrained(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)
elif tokenizer_mode == "custom":
logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}")
tokenizer = TokenizerRegistry.get_tokenizer(
str(tokenizer_name),
*args,
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)
else:
logger.debug_once(f"Loading HfTokenizer from {tokenizer_name}")
tokenizer = HfTokenizer.from_pretrained(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)
if not tokenizer.is_fast:
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)
return tokenizer
cached_get_tokenizer = lru_cache(get_tokenizer)
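
A small usage note on the retained lru_cache wrapper (the model id is a placeholder; arguments must stay hashable for the cache to apply):

from vllm.transformers_utils.tokenizer import cached_get_tokenizer

# Repeated calls with identical hashable arguments reuse the same instance.
tok_a = cached_get_tokenizer("my-org/my-model", trust_remote_code=True)
tok_b = cached_get_tokenizer("my-org/my-model", trust_remote_code=True)
assert tok_a is tok_b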