[Misc] Unify tokenizer registration (#29767)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

parent 86e178f7c4
commit f0a28bf661
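
Note: with this change the registry key is the `tokenizer_mode` string rather than the tokenizer name. Below is a minimal sketch of the two registration paths under the new API; the names `my_mode`, `my_other_mode`, `my_pkg.tokenizer`, `MyTokenizer`, and `my-org/my-model` are placeholders, not part of this patch.

from vllm.tokenizers import TokenizerLike, TokenizerRegistry


# In-tree/decorator form (how the built-in "hf" and "mistral" tokenizers register):
@TokenizerRegistry.register("my_mode")
class MyTokenizer(TokenizerLike):
    @classmethod
    def from_pretrained(cls, path_or_repo_id, *args, **kwargs) -> "MyTokenizer":
        ...  # construct the tokenizer from path_or_repo_id here


# Out-of-tree/string form for plugins (module path and class name are placeholders):
# TokenizerRegistry.register("my_other_mode", "my_pkg.tokenizer", "MyTokenizer")

# Lookup takes the mode string first; the remaining arguments are forwarded to
# the registered class's from_pretrained():
tokenizer = TokenizerRegistry.get_tokenizer("my_mode", "my-org/my-model")
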
@@ -53,7 +53,7 @@ async def test_tokenize_completions(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     for add_special in [False, True]:
         prompt = "vllm1 This is a test prompt."
@@ -87,7 +87,7 @@ async def test_tokenize_chat(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     for add_generation in [False, True]:
         for add_special in [False, True]:
@@ -140,7 +140,7 @@ async def test_tokenize_chat_with_tools(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     for add_generation in [False, True]:
         for add_special in [False, True]:
@@ -210,7 +210,7 @@ async def test_tokenize_with_return_token_strs(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     prompt = "This is a token_strs test prompt! vllm1"
     response = requests.post(
@@ -240,7 +240,7 @@ async def test_detokenize(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)
@@ -197,7 +197,7 @@ async def test_conversation_embedding(
     chat_response.raise_for_status()
     chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
 
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=model_name)
     prompt = tokenizer.apply_chat_template(
         messages,
         chat_template=DUMMY_CHAT_TEMPLATE,
@@ -158,11 +158,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
     chat_response.raise_for_status()
     chat_poolings = PoolingResponse.model_validate(chat_response.json())
 
-    tokenizer = get_tokenizer(
-        tokenizer_name=model_name,
-        tokenizer_mode="fast",
-        trust_remote_code=True,
-    )
+    tokenizer = get_tokenizer(tokenizer_name=model_name, trust_remote_code=True)
     prompt = tokenizer.apply_chat_template(
         messages,
         chat_template=DUMMY_CHAT_TEMPLATE,
@@ -23,7 +23,7 @@ class _HfExamplesInfo:
     tokenizer: str | None = None
     """Set the tokenizer to load for this architecture."""
 
-    tokenizer_mode: TokenizerMode = "auto"
+    tokenizer_mode: TokenizerMode | str = "auto"
    """Set the tokenizer type for this architecture."""
 
     speculative_model: str | None = None
@@ -1,13 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from pathlib import Path
+
 from vllm.tokenizers import TokenizerLike, TokenizerRegistry
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 
 class TestTokenizer(TokenizerLike):
     @classmethod
-    def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
-        return TestTokenizer()  # type: ignore
+    def from_pretrained(
+        cls,
+        path_or_repo_id: str | Path,
+        *args,
+        trust_remote_code: bool = False,
+        revision: str | None = None,
+        download_dir: str | None = None,
+        **kwargs,
+    ) -> "TestTokenizer":
+        return TestTokenizer(path_or_repo_id)  # type: ignore
+
+    def __init__(self, path_or_repo_id: str | Path) -> None:
+        super().__init__()
+
+        self.path_or_repo_id = path_or_repo_id
 
     @property
     def bos_token_id(self) -> int:
@@ -29,14 +44,16 @@ class TestTokenizer(TokenizerLike):
 def test_customized_tokenizer():
     TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)
 
-    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
+    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
     assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.path_or_repo_id == "abc"
     assert tokenizer.bos_token_id == 0
     assert tokenizer.eos_token_id == 1
     assert tokenizer.pad_token_id == 2
 
-    tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
+    tokenizer = get_tokenizer("abc", tokenizer_mode="test_tokenizer")
     assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.path_or_repo_id == "abc"
     assert tokenizer.bos_token_id == 0
     assert tokenizer.eos_token_id == 1
     assert tokenizer.pad_token_id == 2
@@ -4,7 +4,7 @@
 
 import json
 from enum import Enum
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import jsonschema
 import pytest
@@ -24,11 +24,6 @@ from vllm.sampling_params import (
     StructuredOutputsParams,
 )
 
-if TYPE_CHECKING:
-    from vllm.config.model import TokenizerMode
-else:
-    TokenizerMode = str
-
 NGRAM_SPEC_CONFIG = {
     "model": "[ngram]",
     "num_speculative_tokens": 5,
@@ -627,7 +622,7 @@ Make the response as short as possible.
 )
 def test_structured_output_with_reasoning_matrices(
     backend: str,
-    tokenizer_mode: TokenizerMode,
+    tokenizer_mode: str,
     reasoning_parser: str,
     model_name: str,
     speculative_config: dict[str, Any] | None,
@@ -86,7 +86,7 @@ TaskOption = Literal[
     "transcription",
     "draft",
 ]
-TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom"]
+TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
     "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -137,13 +137,13 @@ class ModelConfig:
     tokenizer: SkipValidation[str] = None  # type: ignore
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
-    tokenizer_mode: TokenizerMode = "auto"
+    tokenizer_mode: TokenizerMode | str = "auto"
     """Tokenizer mode:\n
     - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
     - "hf" will use the fast tokenizer if available.\n
     - "slow" will always use the slow tokenizer.\n
     - "mistral" will always use the tokenizer from `mistral_common`.\n
-    - "custom" will use --tokenizer to select the preregistered tokenizer."""
+    - Other custom values can be supported via plugins."""
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
     and tokenizer."""
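
For context, a usage sketch (the model names and `my_mode` below are placeholders): because `tokenizer_mode` now also accepts plugin-registered strings, the field documented above is passed through the user-facing constructors unchanged.

from vllm import LLM

# Built-in modes behave as before:
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral")

# A plugin-registered mode string is accepted the same way:
llm = LLM(model="my-org/my-model", tokenizer_mode="my_mode")
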
@@ -708,9 +708,6 @@ class ModelConfig:
             # can be correctly capped to sliding window size
             self.hf_text_config.sliding_window = None
 
-        if not self.skip_tokenizer_init:
-            self._verify_tokenizer_mode()
-
         # Avoid running try_verify_and_update_config multiple times
         self.config_updated = False
 
@@ -718,6 +715,10 @@
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
+    @field_validator("tokenizer_mode", mode="after")
+    def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str:
+        return tokenizer_mode.lower()
+
     @field_validator("quantization", mode="before")
     @classmethod
     def validate_quantization_before(cls, value: Any) -> Any:
@@ -829,15 +830,6 @@ class ModelConfig:
             model, _ = split_remote_gguf(model)
         return get_sentence_transformer_tokenizer_config(model, self.revision)
 
-    def _verify_tokenizer_mode(self) -> None:
-        tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
-        if tokenizer_mode not in get_args(TokenizerMode):
-            raise ValueError(
-                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
-                f"one of {get_args(TokenizerMode)}."
-            )
-        self.tokenizer_mode = tokenizer_mode
-
     def _get_default_runner_type(
         self,
         architectures: list[str],
@@ -360,7 +360,7 @@ class EngineArgs:
     task: TaskOption | None = ModelConfig.task
     skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
     enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
-    tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
+    tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
     allowed_local_media_path: str = ModelConfig.allowed_local_media_path
     allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
@@ -188,7 +188,7 @@ class LLM:
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
         tokenizer: str | None = None,
-        tokenizer_mode: TokenizerMode = "auto",
+        tokenizer_mode: TokenizerMode | str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         allowed_local_media_path: str = "",
@@ -4,6 +4,12 @@
 from .hf import HfTokenizer
 from .mistral import MistralTokenizer
 from .protocol import TokenizerLike
-from .registry import TokenizerRegistry
+from .registry import TokenizerRegistry, get_tokenizer
 
-__all__ = ["TokenizerLike", "HfTokenizer", "MistralTokenizer", "TokenizerRegistry"]
+__all__ = [
+    "TokenizerLike",
+    "HfTokenizer",
+    "MistralTokenizer",
+    "TokenizerRegistry",
+    "get_tokenizer",
+]
@@ -10,6 +10,7 @@ from transformers import AutoTokenizer
 from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
 
 from .protocol import TokenizerLike
+from .registry import TokenizerRegistry
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -67,6 +68,7 @@ def get_cached_tokenizer(
     return cached_tokenizer  # type: ignore
 
 
+@TokenizerRegistry.register("hf")
 class HfTokenizer(TokenizerLike):
     @classmethod
     def from_pretrained(
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, cast
 from vllm.logger import init_logger
 
 from .protocol import TokenizerLike
+from .registry import TokenizerRegistry
 
 if TYPE_CHECKING:
     from mistral_common.protocol.instruct.request import (
@@ -165,6 +166,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
     return tokenizer.unk_id
 
 
+@TokenizerRegistry.register("mistral")
 class MistralTokenizer(TokenizerLike):
     @classmethod
     def from_pretrained(
@@ -1,28 +1,197 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import importlib
+import importlib.util
+from collections.abc import Callable
+from pathlib import Path
+from typing import TypeVar, overload
+
+import huggingface_hub
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
+from vllm.transformers_utils.repo_utils import list_filtered_repo_files
+from vllm.transformers_utils.utils import (
+    check_gguf_file,
+    is_gguf,
+    is_remote_gguf,
+    split_remote_gguf,
+)
+from vllm.utils.import_utils import resolve_obj_by_qualname
 
 from .protocol import TokenizerLike
 
+logger = init_logger(__name__)
+
+_T = TypeVar("_T", bound=type[TokenizerLike])
+
 
 class TokenizerRegistry:
-    # Tokenizer name -> (tokenizer module, tokenizer class)
-    REGISTRY: dict[str, tuple[str, str]] = {}
+    # Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class)
+    REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {}
 
+    # In-tree tokenizers
+    @staticmethod
+    @overload
+    def register(tokenizer_mode: str) -> Callable[[_T], _T]: ...
+
+    # OOT tokenizers
+    @staticmethod
+    @overload
+    def register(tokenizer_mode: str, module: str, class_name: str) -> None: ...
+
     @staticmethod
-    def register(name: str, module: str, class_name: str) -> None:
-        TokenizerRegistry.REGISTRY[name] = (module, class_name)
+    def register(
+        tokenizer_mode: str,
+        module: str | None = None,
+        class_name: str | None = None,
+    ) -> Callable[[_T], _T] | None:
+        # In-tree tokenizers
+        if module is None or class_name is None:
+
+            def wrapper(tokenizer_cls: _T) -> _T:
+                assert tokenizer_mode not in TokenizerRegistry.REGISTRY
+                TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls
+
+                return tokenizer_cls
+
+            return wrapper
+
+        # OOT tokenizers
+        if tokenizer_mode in TokenizerRegistry.REGISTRY:
+            logger.warning(
+                "%s.%s is already registered for tokenizer_mode=%r. "
+                "It is overwritten by the new one.",
+                module,
+                class_name,
+                tokenizer_mode,
+            )
+
+        TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name)
+
+        return None
 
     @staticmethod
-    def get_tokenizer(
-        tokenizer_name: str,
-        *args,
-        **kwargs,
-    ) -> "TokenizerLike":
-        tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
-        if tokenizer_cls is None:
-            raise ValueError(f"Tokenizer {tokenizer_name} not found.")
+    def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike":
+        if tokenizer_mode not in TokenizerRegistry.REGISTRY:
+            raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")
 
-        tokenizer_module = importlib.import_module(tokenizer_cls[0])
-        class_ = getattr(tokenizer_module, tokenizer_cls[1])
+        item = TokenizerRegistry.REGISTRY[tokenizer_mode]
+        if isinstance(item, type):
+            return item.from_pretrained(*args, **kwargs)
+
+        module, class_name = item
+        logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
+
+        class_ = resolve_obj_by_qualname(f"{module}.{class_name}")
         return class_.from_pretrained(*args, **kwargs)
+
+
+def get_tokenizer(
+    tokenizer_name: str | Path,
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    **kwargs,
+) -> TokenizerLike:
+    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
+    if envs.VLLM_USE_MODELSCOPE:
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        # avoid circular import
+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
+        # Only set the tokenizer here, model will be downloaded on the workers.
+        if not Path(tokenizer_name).exists():
+            # Use file lock to prevent multiple processes from
+            # downloading the same file at the same time.
+            with get_lock(tokenizer_name, download_dir):
+                tokenizer_path = snapshot_download(
+                    model_id=str(tokenizer_name),
+                    cache_dir=download_dir,
+                    revision=revision,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    # Ignore weights - we only need the tokenizer.
+                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+                )
+                tokenizer_name = tokenizer_path
+
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
+
+        tokenizer_mode = "hf"
+        kwargs["use_fast"] = False
+
+    if "truncation_side" not in kwargs:
+        kwargs["truncation_side"] = "left"
+
+    # Separate model folder from file path for GGUF models
+    if is_gguf(tokenizer_name):
+        if check_gguf_file(tokenizer_name):
+            kwargs["gguf_file"] = Path(tokenizer_name).name
+            tokenizer_name = Path(tokenizer_name).parent
+        elif is_remote_gguf(tokenizer_name):
+            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
+            # Get the HuggingFace Hub path for the GGUF file
+            gguf_file = get_gguf_file_path_from_hf(
+                tokenizer_name,
+                quant_type,
+                revision=revision,
+            )
+            kwargs["gguf_file"] = gguf_file
+
+    # Try to use official Mistral tokenizer if possible
+    if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
+        allow_patterns = ["tekken.json", "tokenizer.model.v*"]
+        files_list = list_filtered_repo_files(
+            model_name_or_path=str(tokenizer_name),
+            allow_patterns=allow_patterns,
+            revision=revision,
+        )
+        if len(files_list) > 0:
+            tokenizer_mode = "mistral"
+
+    # Fallback to HF tokenizer
+    if tokenizer_mode == "auto":
+        tokenizer_mode = "hf"
+
+    tokenizer_args = (tokenizer_name, *args)
+    tokenizer_kwargs = dict(
+        trust_remote_code=trust_remote_code,
+        revision=revision,
+        download_dir=download_dir,
+        **kwargs,
+    )
+
+    if tokenizer_mode == "custom":
+        logger.warning_once(
+            "TokenizerRegistry now uses `tokenizer_mode` as the registry key "
+            "instead of `tokenizer_name`. "
+            "Please update the definition of `.from_pretrained` in "
+            "your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
+            "Then, you can pass `tokenizer_mode=%r` instead of "
+            "`tokenizer_mode='custom'` when initializing vLLM.",
+            tokenizer_args,
+            str(tokenizer_kwargs),
+            tokenizer_mode,
+        )
+
+        tokenizer_mode = str(tokenizer_name)
+
+    tokenizer = TokenizerRegistry.get_tokenizer(
+        tokenizer_mode,
+        *tokenizer_args,
+        **tokenizer_kwargs,
+    )
+    if not tokenizer.is_fast:
+        logger.warning(
+            "Using a slow tokenizer. This might cause a significant "
+            "slowdown. Consider using a fast tokenizer instead."
+        )
+
+    return tokenizer
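
The `tokenizer_mode == "custom"` branch above warns about the new calling convention. A sketch of what an existing custom tokenizer would change, mirroring the updated test file; the class and mode names (`MyCustomTokenizer`, `my_custom_mode`) are placeholders:

from pathlib import Path

from vllm.tokenizers import TokenizerLike, TokenizerRegistry


class MyCustomTokenizer(TokenizerLike):
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "MyCustomTokenizer":
        ...  # build the tokenizer from path_or_repo_id here


# Register under a mode string, then pass tokenizer_mode="my_custom_mode"
# instead of the old tokenizer_mode="custom":
TokenizerRegistry.register("my_custom_mode", __name__, "MyCustomTokenizer")
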
@@ -1,28 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import importlib.util
 import os
 import warnings
 from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-import huggingface_hub
 from typing_extensions import assert_never
 
-from vllm import envs
-from vllm.logger import init_logger
-from vllm.tokenizers import (
-    HfTokenizer,
-    MistralTokenizer,
-    TokenizerLike,
-    TokenizerRegistry,
-)
-
-from .gguf_utils import get_gguf_file_path_from_hf
-from .repo_utils import list_filtered_repo_files
-from .utils import check_gguf_file, is_gguf, is_remote_gguf, split_remote_gguf
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -108,117 +94,6 @@ def encode_tokens(
     return tokenizer.encode(text, **kw_args)
 
 
-def get_tokenizer(
-    tokenizer_name: str | Path,
-    *args,
-    tokenizer_mode: str = "auto",
-    trust_remote_code: bool = False,
-    revision: str | None = None,
-    download_dir: str | None = None,
-    **kwargs,
-) -> TokenizerLike:
-    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
-    if envs.VLLM_USE_MODELSCOPE:
-        # download model from ModelScope hub,
-        # lazy import so that modelscope is not required for normal use.
-        # pylint: disable=C.
-        from modelscope.hub.snapshot_download import snapshot_download
-
-        # avoid circuit import
-        from vllm.model_executor.model_loader.weight_utils import get_lock
-
-        # Only set the tokenizer here, model will be downloaded on the workers.
-        if not os.path.exists(tokenizer_name):
-            # Use file lock to prevent multiple processes from
-            # downloading the same file at the same time.
-            with get_lock(tokenizer_name, download_dir):
-                tokenizer_path = snapshot_download(
-                    model_id=tokenizer_name,
-                    cache_dir=download_dir,
-                    revision=revision,
-                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                    # Ignore weights - we only need the tokenizer.
-                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
-                )
-                tokenizer_name = tokenizer_path
-
-    if tokenizer_mode == "slow":
-        if kwargs.get("use_fast", False):
-            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
-        kwargs["use_fast"] = False
-
-    if "truncation_side" not in kwargs:
-        kwargs["truncation_side"] = "left"
-
-    # Separate model folder from file path for GGUF models
-    if is_gguf(tokenizer_name):
-        if check_gguf_file(tokenizer_name):
-            kwargs["gguf_file"] = Path(tokenizer_name).name
-            tokenizer_name = Path(tokenizer_name).parent
-        elif is_remote_gguf(tokenizer_name):
-            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
-            # Get the HuggingFace Hub path for the GGUF file
-            gguf_file = get_gguf_file_path_from_hf(
-                tokenizer_name,
-                quant_type,
-                revision=revision,
-            )
-            kwargs["gguf_file"] = gguf_file
-
-    # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
-    # first to use official Mistral tokenizer if possible.
-    mistral_common_installed = importlib.util.find_spec("mistral_common") is not None
-    if tokenizer_mode == "auto" and mistral_common_installed:
-        allow_patterns = ["tekken.json", "tokenizer.model.v*"]
-        files_list = list_filtered_repo_files(
-            model_name_or_path=str(tokenizer_name),
-            allow_patterns=allow_patterns,
-            revision=revision,
-        )
-        if len(files_list) > 0:
-            tokenizer_mode = "mistral"
-
-    tokenizer: TokenizerLike
-    if tokenizer_mode == "mistral":
-        logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}")
-        tokenizer = MistralTokenizer.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-    elif tokenizer_mode == "custom":
-        logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}")
-        tokenizer = TokenizerRegistry.get_tokenizer(
-            str(tokenizer_name),
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-    else:
-        logger.debug_once(f"Loading HfTokenizer from {tokenizer_name}")
-        tokenizer = HfTokenizer.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-
-    if not tokenizer.is_fast:
-        logger.warning(
-            "Using a slow tokenizer. This might cause a significant "
-            "slowdown. Consider using a fast tokenizer instead."
-        )
-
-    return tokenizer
-
-
 cached_get_tokenizer = lru_cache(get_tokenizer)