[Misc] Unify tokenizer registration (#29767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 86e178f7c4
commit f0a28bf661
@@ -53,7 +53,7 @@ async def test_tokenize_completions(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     for add_special in [False, True]:
         prompt = "vllm1 This is a test prompt."
@@ -87,7 +87,7 @@ async def test_tokenize_chat(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     for add_generation in [False, True]:
         for add_special in [False, True]:
@@ -140,7 +140,7 @@ async def test_tokenize_chat_with_tools(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     for add_generation in [False, True]:
         for add_special in [False, True]:
@@ -210,7 +210,7 @@ async def test_tokenize_with_return_token_strs(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     prompt = "This is a token_strs test prompt! vllm1"
     response = requests.post(
@@ -240,7 +240,7 @@ async def test_detokenize(
     model_name: str,
     tokenizer_name: str,
 ):
-    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
 
     prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)
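Note on the hunks above: the explicit tokenizer_mode="fast" argument is dropped because the unified get_tokenizer has no "fast" branch; the default "auto" mode already resolves to the Hugging Face (fast) tokenizer unless the repo ships Mistral tokenizer files, and "slow" is mapped to the HF backend with use_fast=False. A minimal sketch of the equivalent calls (the model name is a placeholder):

from vllm.transformers_utils.tokenizer import get_tokenizer

# Default mode ("auto"): HF fast tokenizer unless the repo ships Mistral files.
tok = get_tokenizer(tokenizer_name="gpt2")

# Forcing the slow tokenizer is still possible; it routes to the "hf" backend
# with use_fast=False under the hood.
slow_tok = get_tokenizer(tokenizer_name="gpt2", tokenizer_mode="slow")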
@@ -197,7 +197,7 @@ async def test_conversation_embedding(
     chat_response.raise_for_status()
     chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
 
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=model_name)
     prompt = tokenizer.apply_chat_template(
         messages,
         chat_template=DUMMY_CHAT_TEMPLATE,
@@ -158,11 +158,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
     chat_response.raise_for_status()
     chat_poolings = PoolingResponse.model_validate(chat_response.json())
 
-    tokenizer = get_tokenizer(
-        tokenizer_name=model_name,
-        tokenizer_mode="fast",
-        trust_remote_code=True,
-    )
+    tokenizer = get_tokenizer(tokenizer_name=model_name, trust_remote_code=True)
     prompt = tokenizer.apply_chat_template(
         messages,
         chat_template=DUMMY_CHAT_TEMPLATE,
@@ -23,7 +23,7 @@ class _HfExamplesInfo:
     tokenizer: str | None = None
     """Set the tokenizer to load for this architecture."""
 
-    tokenizer_mode: TokenizerMode = "auto"
+    tokenizer_mode: TokenizerMode | str = "auto"
     """Set the tokenizer type for this architecture."""
 
     speculative_model: str | None = None
@@ -1,13 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from pathlib import Path
 
 from vllm.tokenizers import TokenizerLike, TokenizerRegistry
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 
 class TestTokenizer(TokenizerLike):
     @classmethod
-    def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
-        return TestTokenizer()  # type: ignore
+    def from_pretrained(
+        cls,
+        path_or_repo_id: str | Path,
+        *args,
+        trust_remote_code: bool = False,
+        revision: str | None = None,
+        download_dir: str | None = None,
+        **kwargs,
+    ) -> "TestTokenizer":
+        return TestTokenizer(path_or_repo_id)  # type: ignore
 
+    def __init__(self, path_or_repo_id: str | Path) -> None:
+        super().__init__()
+
+        self.path_or_repo_id = path_or_repo_id
+
     @property
     def bos_token_id(self) -> int:
@@ -29,14 +44,16 @@ class TestTokenizer(TokenizerLike):
 def test_customized_tokenizer():
     TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)
 
-    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
+    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
     assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.path_or_repo_id == "abc"
     assert tokenizer.bos_token_id == 0
     assert tokenizer.eos_token_id == 1
     assert tokenizer.pad_token_id == 2
 
-    tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
+    tokenizer = get_tokenizer("abc", tokenizer_mode="test_tokenizer")
    assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.path_or_repo_id == "abc"
     assert tokenizer.bos_token_id == 0
     assert tokenizer.eos_token_id == 1
     assert tokenizer.pad_token_id == 2
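For context, a minimal sketch (not part of this commit) of how an out-of-tree plugin would use the reworked registry; the plugin module, class, and mode names below are hypothetical:

from vllm.tokenizers import TokenizerRegistry
from vllm.transformers_utils.tokenizer import get_tokenizer

# Out-of-tree registration keeps the string form, so the plugin module is only
# imported when the mode is actually requested.
TokenizerRegistry.register("my_tokenizer", "my_plugin.tokenizer", "MyTokenizer")

# The registry key is now the tokenizer *mode*; the path/repo id is forwarded
# to MyTokenizer.from_pretrained(...) as its first argument.
tokenizer = get_tokenizer("my-org/my-model", tokenizer_mode="my_tokenizer")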
@@ -4,7 +4,7 @@
 
 import json
 from enum import Enum
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import jsonschema
 import pytest
@@ -24,11 +24,6 @@ from vllm.sampling_params import (
     StructuredOutputsParams,
 )
 
-if TYPE_CHECKING:
-    from vllm.config.model import TokenizerMode
-else:
-    TokenizerMode = str
-
 NGRAM_SPEC_CONFIG = {
     "model": "[ngram]",
     "num_speculative_tokens": 5,
@@ -627,7 +622,7 @@ Make the response as short as possible.
 )
 def test_structured_output_with_reasoning_matrices(
     backend: str,
-    tokenizer_mode: TokenizerMode,
+    tokenizer_mode: str,
     reasoning_parser: str,
     model_name: str,
     speculative_config: dict[str, Any] | None,
@@ -86,7 +86,7 @@ TaskOption = Literal[
     "transcription",
     "draft",
 ]
-TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom"]
+TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
     "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -137,13 +137,13 @@ class ModelConfig:
     tokenizer: SkipValidation[str] = None  # type: ignore
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
-    tokenizer_mode: TokenizerMode = "auto"
+    tokenizer_mode: TokenizerMode | str = "auto"
     """Tokenizer mode:\n
     - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
     - "hf" will use the fast tokenizer if available.\n
     - "slow" will always use the slow tokenizer.\n
     - "mistral" will always use the tokenizer from `mistral_common`.\n
-    - "custom" will use --tokenizer to select the preregistered tokenizer."""
+    - Other custom values can be supported via plugins."""
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
     and tokenizer."""
@@ -708,9 +708,6 @@ class ModelConfig:
         # can be correctly capped to sliding window size
         self.hf_text_config.sliding_window = None
 
-        if not self.skip_tokenizer_init:
-            self._verify_tokenizer_mode()
-
         # Avoid running try_verify_and_update_config multiple times
         self.config_updated = False
 
@@ -718,6 +715,10 @@ class ModelConfig:
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
+    @field_validator("tokenizer_mode", mode="after")
+    def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str:
+        return tokenizer_mode.lower()
+
     @field_validator("quantization", mode="before")
     @classmethod
     def validate_quantization_before(cls, value: Any) -> Any:
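The validator added above replaces the _verify_tokenizer_mode method removed further down: tokenizer_mode is now only lowercased at config time, and an unknown mode surfaces later as a registry-lookup error instead of a config-validation error. A standalone illustration of the same pydantic pattern (not vLLM code), assuming pydantic v2:

from pydantic import BaseModel, field_validator


class ExampleConfig(BaseModel):
    tokenizer_mode: str = "auto"

    # An "after" validator receives the parsed value and may rewrite it,
    # which is how ModelConfig normalizes arbitrary registered mode strings.
    @field_validator("tokenizer_mode", mode="after")
    @classmethod
    def _lowercase(cls, value: str) -> str:
        return value.lower()


assert ExampleConfig(tokenizer_mode="Mistral").tokenizer_mode == "mistral"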
@@ -829,15 +830,6 @@ class ModelConfig:
             model, _ = split_remote_gguf(model)
         return get_sentence_transformer_tokenizer_config(model, self.revision)
 
-    def _verify_tokenizer_mode(self) -> None:
-        tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
-        if tokenizer_mode not in get_args(TokenizerMode):
-            raise ValueError(
-                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
-                f"one of {get_args(TokenizerMode)}."
-            )
-        self.tokenizer_mode = tokenizer_mode
-
     def _get_default_runner_type(
         self,
         architectures: list[str],
@@ -360,7 +360,7 @@ class EngineArgs:
     task: TaskOption | None = ModelConfig.task
     skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
     enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
-    tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
+    tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
     allowed_local_media_path: str = ModelConfig.allowed_local_media_path
     allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
@@ -188,7 +188,7 @@ class LLM:
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
         tokenizer: str | None = None,
-        tokenizer_mode: TokenizerMode = "auto",
+        tokenizer_mode: TokenizerMode | str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         allowed_local_media_path: str = "",
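Because tokenizer_mode is widened to TokenizerMode | str here and in EngineArgs/ModelConfig above, a mode string registered by a plugin can be passed straight through the public API. A hedged usage sketch (model names and the plugin mode are placeholders):

from vllm import LLM

# Built-in modes behave as before.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral")

# A plugin-registered mode replaces the old tokenizer_mode="custom" +
# --tokenizer indirection.
llm_plugin = LLM(model="my-org/my-model", tokenizer_mode="my_tokenizer")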
@@ -4,6 +4,12 @@
 from .hf import HfTokenizer
 from .mistral import MistralTokenizer
 from .protocol import TokenizerLike
-from .registry import TokenizerRegistry
+from .registry import TokenizerRegistry, get_tokenizer
 
-__all__ = ["TokenizerLike", "HfTokenizer", "MistralTokenizer", "TokenizerRegistry"]
+__all__ = [
+    "TokenizerLike",
+    "HfTokenizer",
+    "MistralTokenizer",
+    "TokenizerRegistry",
+    "get_tokenizer",
+]
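get_tokenizer now lives in vllm.tokenizers.registry and is re-exported from vllm.tokenizers; the old vllm.transformers_utils.tokenizer module keeps importing it (see the hunk near the end of this diff), so existing call sites should keep working. A quick sketch of the two import paths:

# New canonical location.
from vllm.tokenizers import get_tokenizer

# Legacy location, kept as a re-export for backwards compatibility.
from vllm.transformers_utils.tokenizer import get_tokenizer as legacy_get_tokenizer

assert get_tokenizer is legacy_get_tokenizer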
@@ -10,6 +10,7 @@ from transformers import AutoTokenizer
 from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
 
 from .protocol import TokenizerLike
+from .registry import TokenizerRegistry
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -67,6 +68,7 @@ def get_cached_tokenizer(
     return cached_tokenizer  # type: ignore
 
 
+@TokenizerRegistry.register("hf")
 class HfTokenizer(TokenizerLike):
     @classmethod
     def from_pretrained(
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, cast
 from vllm.logger import init_logger
 
 from .protocol import TokenizerLike
+from .registry import TokenizerRegistry
 
 if TYPE_CHECKING:
     from mistral_common.protocol.instruct.request import (
@@ -165,6 +166,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
     return tokenizer.unk_id
 
 
+@TokenizerRegistry.register("mistral")
 class MistralTokenizer(TokenizerLike):
     @classmethod
     def from_pretrained(
@@ -1,28 +1,197 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import importlib
+import importlib.util
+from collections.abc import Callable
+from pathlib import Path
+from typing import TypeVar, overload
 
+import huggingface_hub
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
+from vllm.transformers_utils.repo_utils import list_filtered_repo_files
+from vllm.transformers_utils.utils import (
+    check_gguf_file,
+    is_gguf,
+    is_remote_gguf,
+    split_remote_gguf,
+)
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
 from .protocol import TokenizerLike
 
+logger = init_logger(__name__)
+
+_T = TypeVar("_T", bound=type[TokenizerLike])
+
 
 class TokenizerRegistry:
-    # Tokenizer name -> (tokenizer module, tokenizer class)
-    REGISTRY: dict[str, tuple[str, str]] = {}
+    # Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class)
+    REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {}
 
+    # In-tree tokenizers
+    @staticmethod
+    @overload
+    def register(tokenizer_mode: str) -> Callable[[_T], _T]: ...
+
+    # OOT tokenizers
+    @staticmethod
+    @overload
+    def register(tokenizer_mode: str, module: str, class_name: str) -> None: ...
+
     @staticmethod
-    def register(name: str, module: str, class_name: str) -> None:
-        TokenizerRegistry.REGISTRY[name] = (module, class_name)
+    def register(
+        tokenizer_mode: str,
+        module: str | None = None,
+        class_name: str | None = None,
+    ) -> Callable[[_T], _T] | None:
+        # In-tree tokenizers
+        if module is None or class_name is None:
+
+            def wrapper(tokenizer_cls: _T) -> _T:
+                assert tokenizer_mode not in TokenizerRegistry.REGISTRY
+                TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls
+
+                return tokenizer_cls
+
+            return wrapper
+
+        # OOT tokenizers
+        if tokenizer_mode in TokenizerRegistry.REGISTRY:
+            logger.warning(
+                "%s.%s is already registered for tokenizer_mode=%r. "
+                "It is overwritten by the new one.",
+                module,
+                class_name,
+                tokenizer_mode,
+            )
+
+        TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name)
+
+        return None
 
     @staticmethod
-    def get_tokenizer(
-        tokenizer_name: str,
-        *args,
-        **kwargs,
-    ) -> "TokenizerLike":
-        tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
-        if tokenizer_cls is None:
-            raise ValueError(f"Tokenizer {tokenizer_name} not found.")
+    def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike":
+        if tokenizer_mode not in TokenizerRegistry.REGISTRY:
+            raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")
 
-        tokenizer_module = importlib.import_module(tokenizer_cls[0])
-        class_ = getattr(tokenizer_module, tokenizer_cls[1])
+        item = TokenizerRegistry.REGISTRY[tokenizer_mode]
+        if isinstance(item, type):
+            return item.from_pretrained(*args, **kwargs)
+
+        module, class_name = item
+        logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
+
+        class_ = resolve_obj_by_qualname(f"{module}.{class_name}")
         return class_.from_pretrained(*args, **kwargs)
+
+
+def get_tokenizer(
+    tokenizer_name: str | Path,
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    **kwargs,
+) -> TokenizerLike:
+    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
+    if envs.VLLM_USE_MODELSCOPE:
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        # avoid circular import
+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
+        # Only set the tokenizer here, model will be downloaded on the workers.
+        if not Path(tokenizer_name).exists():
+            # Use file lock to prevent multiple processes from
+            # downloading the same file at the same time.
+            with get_lock(tokenizer_name, download_dir):
+                tokenizer_path = snapshot_download(
+                    model_id=str(tokenizer_name),
+                    cache_dir=download_dir,
+                    revision=revision,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    # Ignore weights - we only need the tokenizer.
+                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+                )
+                tokenizer_name = tokenizer_path
+
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
+
+        tokenizer_mode = "hf"
+        kwargs["use_fast"] = False
+
+    if "truncation_side" not in kwargs:
+        kwargs["truncation_side"] = "left"
+
+    # Separate model folder from file path for GGUF models
+    if is_gguf(tokenizer_name):
+        if check_gguf_file(tokenizer_name):
+            kwargs["gguf_file"] = Path(tokenizer_name).name
+            tokenizer_name = Path(tokenizer_name).parent
+        elif is_remote_gguf(tokenizer_name):
+            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
+            # Get the HuggingFace Hub path for the GGUF file
+            gguf_file = get_gguf_file_path_from_hf(
+                tokenizer_name,
+                quant_type,
+                revision=revision,
+            )
+            kwargs["gguf_file"] = gguf_file
+
+    # Try to use official Mistral tokenizer if possible
+    if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
+        allow_patterns = ["tekken.json", "tokenizer.model.v*"]
+        files_list = list_filtered_repo_files(
+            model_name_or_path=str(tokenizer_name),
+            allow_patterns=allow_patterns,
+            revision=revision,
+        )
+        if len(files_list) > 0:
+            tokenizer_mode = "mistral"
+
+    # Fallback to HF tokenizer
+    if tokenizer_mode == "auto":
+        tokenizer_mode = "hf"
+
+    tokenizer_args = (tokenizer_name, *args)
+    tokenizer_kwargs = dict(
+        trust_remote_code=trust_remote_code,
+        revision=revision,
+        download_dir=download_dir,
+        **kwargs,
+    )
+
+    if tokenizer_mode == "custom":
+        logger.warning_once(
+            "TokenizerRegistry now uses `tokenizer_mode` as the registry key "
+            "instead of `tokenizer_name`. "
+            "Please update the definition of `.from_pretrained` in "
+            "your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
+            "Then, you can pass `tokenizer_mode=%r` instead of "
+            "`tokenizer_mode='custom'` when initializing vLLM.",
+            tokenizer_args,
+            str(tokenizer_kwargs),
+            tokenizer_mode,
+        )
+
+        tokenizer_mode = str(tokenizer_name)
+
+    tokenizer = TokenizerRegistry.get_tokenizer(
+        tokenizer_mode,
+        *tokenizer_args,
+        **tokenizer_kwargs,
+    )
+    if not tokenizer.is_fast:
+        logger.warning(
+            "Using a slow tokenizer. This might cause a significant "
+            "slowdown. Consider using a fast tokenizer instead."
+        )
+
+    return tokenizer
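Putting the relocated get_tokenizer above together, mode resolution now works roughly as sketched below (model names are placeholders):

from vllm.tokenizers import get_tokenizer

# "slow" is rewritten to the "hf" backend with use_fast=False.
tok_slow = get_tokenizer("gpt2", tokenizer_mode="slow")

# "auto" probes the repo for Mistral tokenizer files (tekken.json,
# tokenizer.model.v*) when mistral_common is installed and picks "mistral" if
# found; otherwise it falls back to "hf".
tok_auto = get_tokenizer("gpt2")

# Any other string is treated as a registry key; an unregistered mode raises
# ValueError("No tokenizer registered for tokenizer_mode=..."), and the legacy
# tokenizer_mode="custom" path still works but logs a migration warning.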
@@ -1,28 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import importlib.util
-import os
 import warnings
 from functools import lru_cache
-from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-import huggingface_hub
 from typing_extensions import assert_never
 
-from vllm import envs
 from vllm.logger import init_logger
-from vllm.tokenizers import (
-    HfTokenizer,
-    MistralTokenizer,
-    TokenizerLike,
-    TokenizerRegistry,
-)
-
-from .gguf_utils import get_gguf_file_path_from_hf
-from .repo_utils import list_filtered_repo_files
-from .utils import check_gguf_file, is_gguf, is_remote_gguf, split_remote_gguf
+from vllm.tokenizers import TokenizerLike, get_tokenizer
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -108,117 +94,6 @@ def encode_tokens(
     return tokenizer.encode(text, **kw_args)
 
 
-def get_tokenizer(
-    tokenizer_name: str | Path,
-    *args,
-    tokenizer_mode: str = "auto",
-    trust_remote_code: bool = False,
-    revision: str | None = None,
-    download_dir: str | None = None,
-    **kwargs,
-) -> TokenizerLike:
-    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
-    if envs.VLLM_USE_MODELSCOPE:
-        # download model from ModelScope hub,
-        # lazy import so that modelscope is not required for normal use.
-        # pylint: disable=C.
-        from modelscope.hub.snapshot_download import snapshot_download
-
-        # avoid circuit import
-        from vllm.model_executor.model_loader.weight_utils import get_lock
-
-        # Only set the tokenizer here, model will be downloaded on the workers.
-        if not os.path.exists(tokenizer_name):
-            # Use file lock to prevent multiple processes from
-            # downloading the same file at the same time.
-            with get_lock(tokenizer_name, download_dir):
-                tokenizer_path = snapshot_download(
-                    model_id=tokenizer_name,
-                    cache_dir=download_dir,
-                    revision=revision,
-                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                    # Ignore weights - we only need the tokenizer.
-                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
-                )
-                tokenizer_name = tokenizer_path
-
-    if tokenizer_mode == "slow":
-        if kwargs.get("use_fast", False):
-            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
-        kwargs["use_fast"] = False
-
-    if "truncation_side" not in kwargs:
-        kwargs["truncation_side"] = "left"
-
-    # Separate model folder from file path for GGUF models
-    if is_gguf(tokenizer_name):
-        if check_gguf_file(tokenizer_name):
-            kwargs["gguf_file"] = Path(tokenizer_name).name
-            tokenizer_name = Path(tokenizer_name).parent
-        elif is_remote_gguf(tokenizer_name):
-            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
-            # Get the HuggingFace Hub path for the GGUF file
-            gguf_file = get_gguf_file_path_from_hf(
-                tokenizer_name,
-                quant_type,
-                revision=revision,
-            )
-            kwargs["gguf_file"] = gguf_file
-
-    # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
-    # first to use official Mistral tokenizer if possible.
-    mistral_common_installed = importlib.util.find_spec("mistral_common") is not None
-    if tokenizer_mode == "auto" and mistral_common_installed:
-        allow_patterns = ["tekken.json", "tokenizer.model.v*"]
-        files_list = list_filtered_repo_files(
-            model_name_or_path=str(tokenizer_name),
-            allow_patterns=allow_patterns,
-            revision=revision,
-        )
-        if len(files_list) > 0:
-            tokenizer_mode = "mistral"
-
-    tokenizer: TokenizerLike
-    if tokenizer_mode == "mistral":
-        logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}")
-        tokenizer = MistralTokenizer.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-    elif tokenizer_mode == "custom":
-        logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}")
-        tokenizer = TokenizerRegistry.get_tokenizer(
-            str(tokenizer_name),
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-    else:
-        logger.debug_once(f"Loading HfTokenizer from {tokenizer_name}")
-        tokenizer = HfTokenizer.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            download_dir=download_dir,
-            **kwargs,
-        )
-
-    if not tokenizer.is_fast:
-        logger.warning(
-            "Using a slow tokenizer. This might cause a significant "
-            "slowdown. Consider using a fast tokenizer instead."
-        )
-
-    return tokenizer
-
-
 cached_get_tokenizer = lru_cache(get_tokenizer)