mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 22:44:54 +08:00
Remove all_special_tokens_extended from tokenizer code (#29686)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
8d9338fae4
commit
fecae12cd7
@@ -31,7 +31,6 @@ def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer):
|
|||||||
# Cached attributes
|
# Cached attributes
|
||||||
assert target.all_special_ids == expected.all_special_ids
|
assert target.all_special_ids == expected.all_special_ids
|
||||||
assert target.all_special_tokens == expected.all_special_tokens
|
assert target.all_special_tokens == expected.all_special_tokens
|
||||||
assert target.all_special_tokens_extended == expected.all_special_tokens_extended
|
|
||||||
assert target.get_vocab() == expected.get_vocab()
|
assert target.get_vocab() == expected.get_vocab()
|
||||||
assert len(target) == len(expected)
|
assert len(target) == len(expected)
|
||||||
|
|
||||||
|
|||||||
@@ -258,52 +258,46 @@ def mistral_tokenizer(request) -> MistralTokenizer:
|
|||||||
)
|
)
|
||||||
class TestMistralTokenizer:
|
class TestMistralTokenizer:
|
||||||
def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer):
|
def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer):
|
||||||
attributes = [
|
if mistral_tokenizer.is_tekken:
|
||||||
mistral_tokenizer.all_special_tokens,
|
assert mistral_tokenizer.all_special_tokens == [
|
||||||
mistral_tokenizer.all_special_tokens_extended,
|
"<unk>",
|
||||||
]
|
"<s>",
|
||||||
|
"</s>",
|
||||||
for attribute in attributes:
|
"[INST]",
|
||||||
if mistral_tokenizer.is_tekken:
|
"[/INST]",
|
||||||
assert attribute == [
|
"[AVAILABLE_TOOLS]",
|
||||||
"<unk>",
|
"[/AVAILABLE_TOOLS]",
|
||||||
"<s>",
|
"[TOOL_RESULTS]",
|
||||||
"</s>",
|
"[/TOOL_RESULTS]",
|
||||||
"[INST]",
|
"[TOOL_CALLS]",
|
||||||
"[/INST]",
|
"[IMG]",
|
||||||
"[AVAILABLE_TOOLS]",
|
"<pad>",
|
||||||
"[/AVAILABLE_TOOLS]",
|
"[IMG_BREAK]",
|
||||||
"[TOOL_RESULTS]",
|
"[IMG_END]",
|
||||||
"[/TOOL_RESULTS]",
|
"[PREFIX]",
|
||||||
"[TOOL_CALLS]",
|
"[MIDDLE]",
|
||||||
"[IMG]",
|
"[SUFFIX]",
|
||||||
"<pad>",
|
"[SYSTEM_PROMPT]",
|
||||||
"[IMG_BREAK]",
|
"[/SYSTEM_PROMPT]",
|
||||||
"[IMG_END]",
|
"[TOOL_CONTENT]",
|
||||||
"[PREFIX]",
|
] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
|
||||||
"[MIDDLE]",
|
"[ARGS]",
|
||||||
"[SUFFIX]",
|
"[CALL_ID]",
|
||||||
"[SYSTEM_PROMPT]",
|
"[THINK]",
|
||||||
"[/SYSTEM_PROMPT]",
|
"[/THINK]",
|
||||||
"[TOOL_CONTENT]",
|
] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
|
||||||
] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
|
else:
|
||||||
"[ARGS]",
|
assert mistral_tokenizer.all_special_tokens == [
|
||||||
"[CALL_ID]",
|
"<s>",
|
||||||
"[THINK]",
|
"</s>",
|
||||||
"[/THINK]",
|
"[INST]",
|
||||||
] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
|
"[/INST]",
|
||||||
else:
|
"[TOOL_CALLS]",
|
||||||
assert attribute == [
|
"[AVAILABLE_TOOLS]",
|
||||||
"<s>",
|
"[/AVAILABLE_TOOLS]",
|
||||||
"</s>",
|
"[TOOL_RESULTS]",
|
||||||
"[INST]",
|
"[/TOOL_RESULTS]",
|
||||||
"[/INST]",
|
] + [f"[control_{i}]" for i in range(8, 769)]
|
||||||
"[TOOL_CALLS]",
|
|
||||||
"[AVAILABLE_TOOLS]",
|
|
||||||
"[/AVAILABLE_TOOLS]",
|
|
||||||
"[TOOL_RESULTS]",
|
|
||||||
"[/TOOL_RESULTS]",
|
|
||||||
] + [f"[control_{i}]" for i in range(8, 769)]
|
|
||||||
|
|
||||||
def get_vocab(self, mistral_tokenizer: MistralTokenizer):
|
def get_vocab(self, mistral_tokenizer: MistralTokenizer):
|
||||||
assert (
|
assert (
|
||||||
|
|||||||
@@ -15,10 +15,6 @@ class TestTokenizer(TokenizerBase):
|
|||||||
def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
|
def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
|
||||||
return TestTokenizer()
|
return TestTokenizer()
|
||||||
|
|
||||||
@property
|
|
||||||
def all_special_tokens_extended(self) -> list[str]:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def all_special_tokens(self) -> list[str]:
|
def all_special_tokens(self) -> list[str]:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|||||||
@@ -96,7 +96,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
|
|||||||
|
|
||||||
tokenizer_all_special_ids = tokenizer.all_special_ids
|
tokenizer_all_special_ids = tokenizer.all_special_ids
|
||||||
tokenizer_all_special_tokens = tokenizer.all_special_tokens
|
tokenizer_all_special_tokens = tokenizer.all_special_tokens
|
||||||
tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended
|
|
||||||
tokenizer_vocab = tokenizer.get_vocab()
|
tokenizer_vocab = tokenizer.get_vocab()
|
||||||
tokenizer_len = len(tokenizer)
|
tokenizer_len = len(tokenizer)
|
||||||
|
|
||||||
@@ -118,10 +117,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
|
|||||||
def all_special_tokens(self) -> list[str]:
|
def all_special_tokens(self) -> list[str]:
|
||||||
return tokenizer_all_special_tokens
|
return tokenizer_all_special_tokens
|
||||||
|
|
||||||
@property
|
|
||||||
def all_special_tokens_extended(self) -> list[str]:
|
|
||||||
return tokenizer_all_special_tokens_extended
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_token_id(self) -> int:
|
def max_token_id(self) -> int:
|
||||||
return max_token_id
|
return max_token_id
|
||||||
|
|||||||
@@ -10,11 +10,6 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class TokenizerBase(ABC):
|
class TokenizerBase(ABC):
|
||||||
@property
|
|
||||||
@abstractmethod
|
|
||||||
def all_special_tokens_extended(self) -> list[str]:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def all_special_tokens(self) -> list[str]:
|
def all_special_tokens(self) -> list[str]:
|
||||||
|
|||||||
@@ -254,10 +254,6 @@ class MistralTokenizer(TokenizerBase):
|
|||||||
|
|
||||||
# the following attributes are set to fit vLLM's design and are used
|
# the following attributes are set to fit vLLM's design and are used
|
||||||
# by the structured output backends.
|
# by the structured output backends.
|
||||||
@property
|
|
||||||
def all_special_tokens_extended(self) -> list[str]:
|
|
||||||
return self.all_special_tokens
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def all_special_tokens(self) -> list[str]:
|
def all_special_tokens(self) -> list[str]:
|
||||||
return self._special_tokens
|
return self._special_tokens
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user