[Misc] Unify tokenizer registration (#29767)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung, 2025-12-01 19:34:58 +08:00 (committed by GitHub)
Parent: 86e178f7c4
Commit: f0a28bf661
14 changed files with 237 additions and 183 deletions

View File

@@ -53,7 +53,7 @@ async def test_tokenize_completions(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)

     for add_special in [False, True]:
         prompt = "vllm1 This is a test prompt."

@@ -87,7 +87,7 @@ async def test_tokenize_chat(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)

     for add_generation in [False, True]:
         for add_special in [False, True]:

@@ -140,7 +140,7 @@ async def test_tokenize_chat_with_tools(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)

     for add_generation in [False, True]:
         for add_special in [False, True]:

@@ -210,7 +210,7 @@ async def test_tokenize_with_return_token_strs(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)

     prompt = "This is a token_strs test prompt! vllm1"
     response = requests.post(

@@ -240,7 +240,7 @@ async def test_detokenize(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)

     prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)

View File

@@ -197,7 +197,7 @@ async def test_conversation_embedding(
     chat_response.raise_for_status()
     chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=model_name)
     prompt = tokenizer.apply_chat_template(
         messages,
         chat_template=DUMMY_CHAT_TEMPLATE,

View File

@@ -158,11 +158,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
     chat_response.raise_for_status()
     chat_poolings = PoolingResponse.model_validate(chat_response.json())

-    tokenizer = get_tokenizer(
-        tokenizer_name=model_name,
-        tokenizer_mode="fast",
-        trust_remote_code=True,
-    )
+    tokenizer = get_tokenizer(tokenizer_name=model_name, trust_remote_code=True)
     prompt = tokenizer.apply_chat_template(
         messages,
         chat_template=DUMMY_CHAT_TEMPLATE,

View File

@@ -23,7 +23,7 @@ class _HfExamplesInfo:
     tokenizer: str | None = None
     """Set the tokenizer to load for this architecture."""

-    tokenizer_mode: TokenizerMode = "auto"
+    tokenizer_mode: TokenizerMode | str = "auto"
     """Set the tokenizer type for this architecture."""

     speculative_model: str | None = None

View File

@@ -1,13 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from pathlib import Path
+
 from vllm.tokenizers import TokenizerLike, TokenizerRegistry
 from vllm.transformers_utils.tokenizer import get_tokenizer


 class TestTokenizer(TokenizerLike):
     @classmethod
-    def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
-        return TestTokenizer()  # type: ignore
+    def from_pretrained(
+        cls,
+        path_or_repo_id: str | Path,
+        *args,
+        trust_remote_code: bool = False,
+        revision: str | None = None,
+        download_dir: str | None = None,
+        **kwargs,
+    ) -> "TestTokenizer":
+        return TestTokenizer(path_or_repo_id)  # type: ignore
+
+    def __init__(self, path_or_repo_id: str | Path) -> None:
+        super().__init__()
+        self.path_or_repo_id = path_or_repo_id

     @property
     def bos_token_id(self) -> int:

@@ -29,14 +44,16 @@ class TestTokenizer(TokenizerLike):
 def test_customized_tokenizer():
     TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)

-    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
+    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
     assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.path_or_repo_id == "abc"
     assert tokenizer.bos_token_id == 0
     assert tokenizer.eos_token_id == 1
     assert tokenizer.pad_token_id == 2

-    tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
+    tokenizer = get_tokenizer("abc", tokenizer_mode="test_tokenizer")
     assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.path_or_repo_id == "abc"
     assert tokenizer.bos_token_id == 0
     assert tokenizer.eos_token_id == 1
     assert tokenizer.pad_token_id == 2
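
Note: for out-of-tree tokenizer authors, the updated test above illustrates the migration this commit expects: `from_pretrained` now receives the model path or repo ID as its first positional argument, and the registry key is selected via `tokenizer_mode` rather than `tokenizer_name`. A minimal sketch (the class and mode names below are illustrative, not part of this diff):

from pathlib import Path

from vllm.tokenizers import TokenizerLike, TokenizerRegistry


class MyTokenizer(TokenizerLike):  # hypothetical plugin tokenizer
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,     # now passed positionally by the registry
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "MyTokenizer":
        ...  # load vocab/config from path_or_repo_id and return an instance


# Register the class under a mode name; vLLM then selects it with
# tokenizer_mode="my_tokenizer" instead of the removed tokenizer_mode="custom".
TokenizerRegistry.register("my_tokenizer", __name__, "MyTokenizer")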

View File

@@ -4,7 +4,7 @@

 import json
 from enum import Enum
-from typing import TYPE_CHECKING, Any
+from typing import Any

 import jsonschema
 import pytest

@@ -24,11 +24,6 @@ from vllm.sampling_params import (
     StructuredOutputsParams,
 )

-if TYPE_CHECKING:
-    from vllm.config.model import TokenizerMode
-else:
-    TokenizerMode = str
-
 NGRAM_SPEC_CONFIG = {
     "model": "[ngram]",
     "num_speculative_tokens": 5,

@@ -627,7 +622,7 @@ Make the response as short as possible.
 )
 def test_structured_output_with_reasoning_matrices(
     backend: str,
-    tokenizer_mode: TokenizerMode,
+    tokenizer_mode: str,
     reasoning_parser: str,
     model_name: str,
     speculative_config: dict[str, Any] | None,

View File

@@ -86,7 +86,7 @@ TaskOption = Literal[
     "transcription",
     "draft",
 ]
-TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom"]
+TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
     "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"

@@ -137,13 +137,13 @@ class ModelConfig:
     tokenizer: SkipValidation[str] = None  # type: ignore
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
-    tokenizer_mode: TokenizerMode = "auto"
+    tokenizer_mode: TokenizerMode | str = "auto"
     """Tokenizer mode:\n
     - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
     - "hf" will use the fast tokenizer if available.\n
     - "slow" will always use the slow tokenizer.\n
     - "mistral" will always use the tokenizer from `mistral_common`.\n
-    - "custom" will use --tokenizer to select the preregistered tokenizer."""
+    - Other custom values can be supported via plugins."""
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
     and tokenizer."""

@@ -708,9 +708,6 @@ class ModelConfig:
             # can be correctly capped to sliding window size
             self.hf_text_config.sliding_window = None

-        if not self.skip_tokenizer_init:
-            self._verify_tokenizer_mode()
-
         # Avoid running try_verify_and_update_config multiple times
         self.config_updated = False

@@ -718,6 +715,10 @@ class ModelConfig:
         self._verify_cuda_graph()
         self._verify_bnb_config()

+    @field_validator("tokenizer_mode", mode="after")
+    def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str:
+        return tokenizer_mode.lower()
+
     @field_validator("quantization", mode="before")
     @classmethod
     def validate_quantization_before(cls, value: Any) -> Any:

@@ -829,15 +830,6 @@ class ModelConfig:
         model, _ = split_remote_gguf(model)
         return get_sentence_transformer_tokenizer_config(model, self.revision)

-    def _verify_tokenizer_mode(self) -> None:
-        tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
-        if tokenizer_mode not in get_args(TokenizerMode):
-            raise ValueError(
-                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
-                f"one of {get_args(TokenizerMode)}."
-            )
-        self.tokenizer_mode = tokenizer_mode
-
     def _get_default_runner_type(
         self,
         architectures: list[str],

View File

@@ -360,7 +360,7 @@ class EngineArgs:
     task: TaskOption | None = ModelConfig.task
     skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
     enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
-    tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
+    tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
     allowed_local_media_path: str = ModelConfig.allowed_local_media_path
     allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains

View File

@@ -188,7 +188,7 @@ class LLM:
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
         tokenizer: str | None = None,
-        tokenizer_mode: TokenizerMode = "auto",
+        tokenizer_mode: TokenizerMode | str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         allowed_local_media_path: str = "",
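
Note: since `tokenizer_mode` is now typed as `TokenizerMode | str`, a registered mode name can be passed straight through the public entry points. A small sketch (the model and mode names are illustrative):

from vllm import LLM

# "my_tokenizer" is assumed to have been registered via TokenizerRegistry,
# e.g. by a plugin; built-in values such as "auto", "hf", "slow", and "mistral"
# continue to work unchanged.
llm = LLM(model="facebook/opt-125m", tokenizer_mode="my_tokenizer")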

View File

@@ -4,6 +4,12 @@
 from .hf import HfTokenizer
 from .mistral import MistralTokenizer
 from .protocol import TokenizerLike
-from .registry import TokenizerRegistry
+from .registry import TokenizerRegistry, get_tokenizer

-__all__ = ["TokenizerLike", "HfTokenizer", "MistralTokenizer", "TokenizerRegistry"]
+__all__ = [
+    "TokenizerLike",
+    "HfTokenizer",
+    "MistralTokenizer",
+    "TokenizerRegistry",
+    "get_tokenizer",
+]

View File

@@ -10,6 +10,7 @@ from transformers import AutoTokenizer
 from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config

 from .protocol import TokenizerLike
+from .registry import TokenizerRegistry

 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

@@ -67,6 +68,7 @@ def get_cached_tokenizer(
     return cached_tokenizer  # type: ignore


+@TokenizerRegistry.register("hf")
 class HfTokenizer(TokenizerLike):
     @classmethod
     def from_pretrained(

View File

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, cast
 from vllm.logger import init_logger

 from .protocol import TokenizerLike
+from .registry import TokenizerRegistry

 if TYPE_CHECKING:
     from mistral_common.protocol.instruct.request import (

@@ -165,6 +166,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
         return tokenizer.unk_id


+@TokenizerRegistry.register("mistral")
 class MistralTokenizer(TokenizerLike):
     @classmethod
     def from_pretrained(

View File

@@ -1,28 +1,197 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import importlib
+import importlib.util
+from collections.abc import Callable
+from pathlib import Path
+from typing import TypeVar, overload
+
+import huggingface_hub
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
+from vllm.transformers_utils.repo_utils import list_filtered_repo_files
+from vllm.transformers_utils.utils import (
+    check_gguf_file,
+    is_gguf,
+    is_remote_gguf,
+    split_remote_gguf,
+)
+from vllm.utils.import_utils import resolve_obj_by_qualname

 from .protocol import TokenizerLike

+logger = init_logger(__name__)
+
+_T = TypeVar("_T", bound=type[TokenizerLike])
+

 class TokenizerRegistry:
-    # Tokenizer name -> (tokenizer module, tokenizer class)
-    REGISTRY: dict[str, tuple[str, str]] = {}
+    # Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class)
+    REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {}
+
+    # In-tree tokenizers
+    @staticmethod
+    @overload
+    def register(tokenizer_mode: str) -> Callable[[_T], _T]: ...
+
+    # OOT tokenizers
+    @staticmethod
+    @overload
+    def register(tokenizer_mode: str, module: str, class_name: str) -> None: ...

     @staticmethod
-    def register(name: str, module: str, class_name: str) -> None:
-        TokenizerRegistry.REGISTRY[name] = (module, class_name)
+    def register(
+        tokenizer_mode: str,
+        module: str | None = None,
+        class_name: str | None = None,
+    ) -> Callable[[_T], _T] | None:
+        # In-tree tokenizers
+        if module is None or class_name is None:
+
+            def wrapper(tokenizer_cls: _T) -> _T:
+                assert tokenizer_mode not in TokenizerRegistry.REGISTRY
+                TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls
+                return tokenizer_cls
+
+            return wrapper
+
+        # OOT tokenizers
+        if tokenizer_mode in TokenizerRegistry.REGISTRY:
+            logger.warning(
+                "%s.%s is already registered for tokenizer_mode=%r. "
+                "It is overwritten by the new one.",
+                module,
+                class_name,
+                tokenizer_mode,
+            )
+
+        TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name)
+        return None

     @staticmethod
-    def get_tokenizer(
-        tokenizer_name: str,
-        *args,
-        **kwargs,
-    ) -> "TokenizerLike":
-        tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
-        if tokenizer_cls is None:
-            raise ValueError(f"Tokenizer {tokenizer_name} not found.")
-
-        tokenizer_module = importlib.import_module(tokenizer_cls[0])
-        class_ = getattr(tokenizer_module, tokenizer_cls[1])
+    def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike":
+        if tokenizer_mode not in TokenizerRegistry.REGISTRY:
+            raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")
+
+        item = TokenizerRegistry.REGISTRY[tokenizer_mode]
+        if isinstance(item, type):
+            return item.from_pretrained(*args, **kwargs)
+
+        module, class_name = item
+        logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
+
+        class_ = resolve_obj_by_qualname(f"{module}.{class_name}")
         return class_.from_pretrained(*args, **kwargs)
+
+
+def get_tokenizer(
+    tokenizer_name: str | Path,
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    **kwargs,
+) -> TokenizerLike:
+    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
+    if envs.VLLM_USE_MODELSCOPE:
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        # avoid circular import
+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
+        # Only set the tokenizer here, model will be downloaded on the workers.
+        if not Path(tokenizer_name).exists():
+            # Use file lock to prevent multiple processes from
+            # downloading the same file at the same time.
+            with get_lock(tokenizer_name, download_dir):
+                tokenizer_path = snapshot_download(
+                    model_id=str(tokenizer_name),
+                    cache_dir=download_dir,
+                    revision=revision,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    # Ignore weights - we only need the tokenizer.
+                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+                )
+                tokenizer_name = tokenizer_path
+
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
+
+        tokenizer_mode = "hf"
+        kwargs["use_fast"] = False
+
+    if "truncation_side" not in kwargs:
+        kwargs["truncation_side"] = "left"
+
+    # Separate model folder from file path for GGUF models
+    if is_gguf(tokenizer_name):
+        if check_gguf_file(tokenizer_name):
+            kwargs["gguf_file"] = Path(tokenizer_name).name
+            tokenizer_name = Path(tokenizer_name).parent
+        elif is_remote_gguf(tokenizer_name):
+            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
+            # Get the HuggingFace Hub path for the GGUF file
+            gguf_file = get_gguf_file_path_from_hf(
+                tokenizer_name,
+                quant_type,
+                revision=revision,
+            )
+            kwargs["gguf_file"] = gguf_file
+
+    # Try to use official Mistral tokenizer if possible
+    if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
+        allow_patterns = ["tekken.json", "tokenizer.model.v*"]
+        files_list = list_filtered_repo_files(
+            model_name_or_path=str(tokenizer_name),
+            allow_patterns=allow_patterns,
+            revision=revision,
+        )
+        if len(files_list) > 0:
+            tokenizer_mode = "mistral"
+
+    # Fallback to HF tokenizer
+    if tokenizer_mode == "auto":
+        tokenizer_mode = "hf"
+
+    tokenizer_args = (tokenizer_name, *args)
+    tokenizer_kwargs = dict(
+        trust_remote_code=trust_remote_code,
+        revision=revision,
+        download_dir=download_dir,
+        **kwargs,
+    )
+
+    if tokenizer_mode == "custom":
+        logger.warning_once(
+            "TokenizerRegistry now uses `tokenizer_mode` as the registry key "
+            "instead of `tokenizer_name`. "
+            "Please update the definition of `.from_pretrained` in "
+            "your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
+            "Then, you can pass `tokenizer_mode=%r` instead of "
+            "`tokenizer_mode='custom'` when initializing vLLM.",
+            tokenizer_args,
+            str(tokenizer_kwargs),
+            tokenizer_mode,
+        )
+        tokenizer_mode = str(tokenizer_name)
+
+    tokenizer = TokenizerRegistry.get_tokenizer(
+        tokenizer_mode,
+        *tokenizer_args,
+        **tokenizer_kwargs,
+    )
+
+    if not tokenizer.is_fast:
+        logger.warning(
+            "Using a slow tokenizer. This might cause a significant "
+            "slowdown. Consider using a fast tokenizer instead."
+        )
+
+    return tokenizer
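
Note: the rewritten registry supports two registration styles, matching the two `register()` overloads above. In-tree tokenizers are registered eagerly with a class decorator (as done for `HfTokenizer` and `MistralTokenizer` in this commit), while out-of-tree tokenizers are registered by module path and class name and resolved lazily via `resolve_obj_by_qualname` on first use. A sketch under assumed names (nothing below is part of the diff itself):

from vllm.tokenizers import TokenizerLike, TokenizerRegistry


# In-tree style: the decorator stores the class object itself in REGISTRY.
@TokenizerRegistry.register("my_inline")
class MyInlineTokenizer(TokenizerLike):  # hypothetical
    ...


# Out-of-tree style: only the (module, class name) pair is stored; the class is
# imported lazily when the mode is first requested.
TokenizerRegistry.register("my_plugin", "my_pkg.tokenizers", "MyPluginTokenizer")

# Either entry is then loaded the same way, keyed by tokenizer_mode, e.g.:
# TokenizerRegistry.get_tokenizer("my_inline", "org/some-model", revision=None)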

View File

@@ -1,28 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import importlib.util
-import os
 import warnings
 from functools import lru_cache
-from pathlib import Path
 from typing import TYPE_CHECKING, Any

-import huggingface_hub
 from typing_extensions import assert_never

-from vllm import envs
 from vllm.logger import init_logger
-from vllm.tokenizers import (
-    HfTokenizer,
-    MistralTokenizer,
-    TokenizerLike,
-    TokenizerRegistry,
-)
-
-from .gguf_utils import get_gguf_file_path_from_hf
-from .repo_utils import list_filtered_repo_files
-from .utils import check_gguf_file, is_gguf, is_remote_gguf, split_remote_gguf
+from vllm.tokenizers import TokenizerLike, get_tokenizer

 if TYPE_CHECKING:
     from vllm.config import ModelConfig

@@ -108,117 +94,6 @@ def encode_tokens(
     return tokenizer.encode(text, **kw_args)


-def get_tokenizer(
-    tokenizer_name: str | Path,
-    *args,
-    tokenizer_mode: str = "auto",
-    trust_remote_code: bool = False,
-    revision: str | None = None,
-    download_dir: str | None = None,
-    **kwargs,
-) -> TokenizerLike:
-    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
-    if envs.VLLM_USE_MODELSCOPE:
-        # download model from ModelScope hub,
-        # lazy import so that modelscope is not required for normal use.
-        # pylint: disable=C.
-        from modelscope.hub.snapshot_download import snapshot_download
-
-        # avoid circuit import
-        from vllm.model_executor.model_loader.weight_utils import get_lock
-
-        # Only set the tokenizer here, model will be downloaded on the workers.
-        if not os.path.exists(tokenizer_name):
-            # Use file lock to prevent multiple processes from
-            # downloading the same file at the same time.
-            with get_lock(tokenizer_name, download_dir):
-                tokenizer_path = snapshot_download(
-                    model_id=tokenizer_name,
-                    cache_dir=download_dir,
-                    revision=revision,
-                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                    # Ignore weights - we only need the tokenizer.
-                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
-                )
-                tokenizer_name = tokenizer_path
-
-    if tokenizer_mode == "slow":
-        if kwargs.get("use_fast", False):
-            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
-        kwargs["use_fast"] = False
-
-    if "truncation_side" not in kwargs:
-        kwargs["truncation_side"] = "left"
-
-    # Separate model folder from file path for GGUF models
-    if is_gguf(tokenizer_name):
-        if check_gguf_file(tokenizer_name):
-            kwargs["gguf_file"] = Path(tokenizer_name).name
-            tokenizer_name = Path(tokenizer_name).parent
-        elif is_remote_gguf(tokenizer_name):
-            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
-            # Get the HuggingFace Hub path for the GGUF file
-            gguf_file = get_gguf_file_path_from_hf(
-                tokenizer_name,
-                quant_type,
-                revision=revision,
-            )
-            kwargs["gguf_file"] = gguf_file
-
-    # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
-    # first to use official Mistral tokenizer if possible.
-    mistral_common_installed = importlib.util.find_spec("mistral_common") is not None
-    if tokenizer_mode == "auto" and mistral_common_installed:
-        allow_patterns = ["tekken.json", "tokenizer.model.v*"]
-        files_list = list_filtered_repo_files(
-            model_name_or_path=str(tokenizer_name),
-            allow_patterns=allow_patterns,
-            revision=revision,
-        )
-        if len(files_list) > 0:
-            tokenizer_mode = "mistral"
-
-    tokenizer: TokenizerLike
-    if tokenizer_mode == "mistral":
-        logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}")
-        tokenizer = MistralTokenizer.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-    elif tokenizer_mode == "custom":
-        logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}")
-        tokenizer = TokenizerRegistry.get_tokenizer(
-            str(tokenizer_name),
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-    else:
-        logger.debug_once(f"Loading HfTokenizer from {tokenizer_name}")
-        tokenizer = HfTokenizer.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-
-    if not tokenizer.is_fast:
-        logger.warning(
-            "Using a slow tokenizer. This might cause a significant "
-            "slowdown. Consider using a fast tokenizer instead."
-        )
-    return tokenizer
-
-
 cached_get_tokenizer = lru_cache(get_tokenizer)