Remove all_special_tokens_extended from tokenizer code (#29686)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor authored 2025-11-28 20:26:51 +00:00, committed by GitHub
parent 8d9338fae4
commit fecae12cd7
GPG Key ID: B5690EEEBB952194
6 changed files with 40 additions and 65 deletions
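
This commit removes a property that duplicated all_special_tokens: the MistralTokenizer implementation aliased it outright (see the final hunk below), and the cached Hugging Face wrapper merely snapshotted the underlying tokenizer's attribute. The migration for callers is mechanical; a hedged sketch (the helper name is illustrative, not from the diff):

    def special_tokens(tokenizer) -> list[str]:
        # Before this commit, callers could also read
        # tokenizer.all_special_tokens_extended; vLLM's interface exposed
        # the same token list either way, so only all_special_tokens
        # survives.
        return tokenizer.all_special_tokens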


@@ -31,7 +31,6 @@ def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer):
     # Cached attributes
     assert target.all_special_ids == expected.all_special_ids
     assert target.all_special_tokens == expected.all_special_tokens
-    assert target.all_special_tokens_extended == expected.all_special_tokens_extended
     assert target.get_vocab() == expected.get_vocab()
     assert len(target) == len(expected)


@@ -258,52 +258,46 @@ def mistral_tokenizer(request) -> MistralTokenizer:
 )
 class TestMistralTokenizer:
     def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer):
-        attributes = [
-            mistral_tokenizer.all_special_tokens,
-            mistral_tokenizer.all_special_tokens_extended,
-        ]
-
-        for attribute in attributes:
-            if mistral_tokenizer.is_tekken:
-                assert attribute == [
-                    "<unk>",
-                    "<s>",
-                    "</s>",
-                    "[INST]",
-                    "[/INST]",
-                    "[AVAILABLE_TOOLS]",
-                    "[/AVAILABLE_TOOLS]",
-                    "[TOOL_RESULTS]",
-                    "[/TOOL_RESULTS]",
-                    "[TOOL_CALLS]",
-                    "[IMG]",
-                    "<pad>",
-                    "[IMG_BREAK]",
-                    "[IMG_END]",
-                    "[PREFIX]",
-                    "[MIDDLE]",
-                    "[SUFFIX]",
-                    "[SYSTEM_PROMPT]",
-                    "[/SYSTEM_PROMPT]",
-                    "[TOOL_CONTENT]",
-                ] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
-                    "[ARGS]",
-                    "[CALL_ID]",
-                    "[THINK]",
-                    "[/THINK]",
-                ] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
-            else:
-                assert attribute == [
-                    "<s>",
-                    "</s>",
-                    "[INST]",
-                    "[/INST]",
-                    "[TOOL_CALLS]",
-                    "[AVAILABLE_TOOLS]",
-                    "[/AVAILABLE_TOOLS]",
-                    "[TOOL_RESULTS]",
-                    "[/TOOL_RESULTS]",
-                ] + [f"[control_{i}]" for i in range(8, 769)]
+        if mistral_tokenizer.is_tekken:
+            assert mistral_tokenizer.all_special_tokens == [
+                "<unk>",
+                "<s>",
+                "</s>",
+                "[INST]",
+                "[/INST]",
+                "[AVAILABLE_TOOLS]",
+                "[/AVAILABLE_TOOLS]",
+                "[TOOL_RESULTS]",
+                "[/TOOL_RESULTS]",
+                "[TOOL_CALLS]",
+                "[IMG]",
+                "<pad>",
+                "[IMG_BREAK]",
+                "[IMG_END]",
+                "[PREFIX]",
+                "[MIDDLE]",
+                "[SUFFIX]",
+                "[SYSTEM_PROMPT]",
+                "[/SYSTEM_PROMPT]",
+                "[TOOL_CONTENT]",
+            ] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
+                "[ARGS]",
+                "[CALL_ID]",
+                "[THINK]",
+                "[/THINK]",
+            ] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
+        else:
+            assert mistral_tokenizer.all_special_tokens == [
+                "<s>",
+                "</s>",
+                "[INST]",
+                "[/INST]",
+                "[TOOL_CALLS]",
+                "[AVAILABLE_TOOLS]",
+                "[/AVAILABLE_TOOLS]",
+                "[TOOL_RESULTS]",
+                "[/TOOL_RESULTS]",
+            ] + [f"[control_{i}]" for i in range(8, 769)]
 
     def get_vocab(self, mistral_tokenizer: MistralTokenizer):
         assert (
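
The expected lists above encode how the tekken tokenizer lays out its special-token table: named tokens occupy fixed slots, and every unused slot up to 1000 carries an <SPECIAL_{i}> placeholder. A sketch (not part of the diff) that rebuilds the tekken expectation and checks the slot arithmetic:

    named_low = [
        "<unk>", "<s>", "</s>", "[INST]", "[/INST]",
        "[AVAILABLE_TOOLS]", "[/AVAILABLE_TOOLS]",
        "[TOOL_RESULTS]", "[/TOOL_RESULTS]", "[TOOL_CALLS]",
        "[IMG]", "<pad>", "[IMG_BREAK]", "[IMG_END]",
        "[PREFIX]", "[MIDDLE]", "[SUFFIX]",
        "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]", "[TOOL_CONTENT]",
    ]
    named_mid = ["[ARGS]", "[CALL_ID]", "[THINK]", "[/THINK]"]

    expected_tekken = (
        named_low                                      # slots 0-19
        + [f"<SPECIAL_{i}>" for i in range(20, 32)]    # reserved slots
        + named_mid                                    # slots 32-35
        + [f"<SPECIAL_{i}>" for i in range(36, 1000)]  # reserved slots
    )
    assert len(expected_tekken) == 1000  # slots 0-999, one token each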


@@ -15,10 +15,6 @@ class TestTokenizer(TokenizerBase):
     def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
         return TestTokenizer()
 
-    @property
-    def all_special_tokens_extended(self) -> list[str]:
-        raise NotImplementedError()
-
     @property
     def all_special_tokens(self) -> list[str]:
         raise NotImplementedError()


@ -96,7 +96,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
tokenizer_all_special_ids = tokenizer.all_special_ids tokenizer_all_special_ids = tokenizer.all_special_ids
tokenizer_all_special_tokens = tokenizer.all_special_tokens tokenizer_all_special_tokens = tokenizer.all_special_tokens
tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended
tokenizer_vocab = tokenizer.get_vocab() tokenizer_vocab = tokenizer.get_vocab()
tokenizer_len = len(tokenizer) tokenizer_len = len(tokenizer)
@ -118,10 +117,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
def all_special_tokens(self) -> list[str]: def all_special_tokens(self) -> list[str]:
return tokenizer_all_special_tokens return tokenizer_all_special_tokens
@property
def all_special_tokens_extended(self) -> list[str]:
return tokenizer_all_special_tokens_extended
@property @property
def max_token_id(self) -> int: def max_token_id(self) -> int:
return max_token_id return max_token_id
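
These two hunks touch get_cached_tokenizer, which snapshots attributes that are expensive to recompute on every access and serves them from a dynamically created subclass; the commit simply drops one snapshot. A minimal sketch of that caching pattern, simplified from what the hunks show (the real function caches more attributes and renames the generated class):

    def get_cached_tokenizer(tokenizer):
        # Compute the hot attributes once, up front.
        cached_special_tokens = tokenizer.all_special_tokens
        cached_vocab = tokenizer.get_vocab()
        cached_len = len(tokenizer)

        class CachedTokenizer(tokenizer.__class__):
            @property
            def all_special_tokens(self) -> list[str]:
                return cached_special_tokens

            def get_vocab(self):
                return cached_vocab

            def __len__(self) -> int:
                return cached_len

        # Swap the instance's class so existing references benefit too.
        tokenizer.__class__ = CachedTokenizer
        return tokenizer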


@@ -10,11 +10,6 @@ if TYPE_CHECKING:
 class TokenizerBase(ABC):
-    @property
-    @abstractmethod
-    def all_special_tokens_extended(self) -> list[str]:
-        raise NotImplementedError()
-
     @property
     @abstractmethod
     def all_special_tokens(self) -> list[str]:
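
With the abstract property deleted from TokenizerBase, custom tokenizers have one less member to implement. A toy subclass against just the interface visible in this hunk (the real base class declares many more abstract members):

    from abc import ABC, abstractmethod

    class TokenizerBase(ABC):
        # Only the member visible in this hunk is reproduced here.
        @property
        @abstractmethod
        def all_special_tokens(self) -> list[str]:
            raise NotImplementedError()

    class ToyTokenizer(TokenizerBase):
        def __init__(self, special_tokens: list[str]) -> None:
            self._special_tokens = special_tokens

        @property
        def all_special_tokens(self) -> list[str]:
            # No all_special_tokens_extended override is needed anymore.
            return self._special_tokens

    assert ToyTokenizer(["<s>", "</s>"]).all_special_tokens == ["<s>", "</s>"]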


@@ -254,10 +254,6 @@ class MistralTokenizer(TokenizerBase):
     # the following attributes are set to fit vLLM's design and are used
     # by the structured output backends.
-    @property
-    def all_special_tokens_extended(self) -> list[str]:
-        return self.all_special_tokens
-
     @property
     def all_special_tokens(self) -> list[str]:
        return self._special_tokens
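
This last hunk shows why the removal is behavior-preserving for Mistral models: the deleted property was a pure alias. A toy demonstration of the equivalence (class name hypothetical):

    class AliasedTokenizer:
        def __init__(self, special_tokens: list[str]) -> None:
            self._special_tokens = special_tokens

        @property
        def all_special_tokens(self) -> list[str]:
            return self._special_tokens

        @property
        def all_special_tokens_extended(self) -> list[str]:
            # Exactly the alias the commit deletes from MistralTokenizer.
            return self.all_special_tokens

    t = AliasedTokenizer(["<s>", "</s>"])
    # The alias returns the very same list object, so every caller can
    # switch to all_special_tokens with no observable change.
    assert t.all_special_tokens_extended is t.all_special_tokens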