Patch Mistral Tokenizer (#28146)

Signed-off-by: Julien Denize <julien.denize@mistral.ai>
This commit is contained in:
Julien Denize 2025-11-06 07:43:16 +01:00 committed by GitHub
parent e31946f86e
commit a404e2c0f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 42 additions and 22 deletions

View File

@ -334,20 +334,20 @@ class TestMistralTokenizer:
def test_encode(self, mistral_tokenizer: MistralTokenizer): def test_encode(self, mistral_tokenizer: MistralTokenizer):
token_ids = ( token_ids = (
[1, 22177, 4304, 2662, 2] [1, 22177, 4304, 2662]
if mistral_tokenizer.is_tekken if mistral_tokenizer.is_tekken
else [1, 23325, 2294, 1686, 2] else [1, 23325, 2294, 1686]
) )
assert mistral_tokenizer.encode("Hello world !") == token_ids[:-1] assert mistral_tokenizer.encode("Hello world !") == token_ids
assert mistral_tokenizer.encode("Hello world !", max_length=3) == token_ids[:-2] assert mistral_tokenizer.encode("Hello world !", max_length=3) == token_ids[:-1]
assert ( assert (
mistral_tokenizer.encode("Hello world !", truncation=True, max_length=3) mistral_tokenizer.encode("Hello world !", truncation=True, max_length=3)
== token_ids[:-2] == token_ids[:-1]
) )
assert ( assert (
mistral_tokenizer.encode("Hello world !", truncation=False, max_length=3) mistral_tokenizer.encode("Hello world !", truncation=False, max_length=3)
== token_ids[:-1] == token_ids
) )
assert ( assert (
@ -358,7 +358,7 @@ class TestMistralTokenizer:
mistral_tokenizer.encode( mistral_tokenizer.encode(
"Hello world !", add_special_tokens=True, max_length=3 "Hello world !", add_special_tokens=True, max_length=3
) )
== token_ids[:-2] == token_ids[:-1]
) )
assert ( assert (
mistral_tokenizer.encode( mistral_tokenizer.encode(
@ -368,7 +368,7 @@ class TestMistralTokenizer:
) )
assert ( assert (
mistral_tokenizer.encode("Hello world !", add_special_tokens=False) mistral_tokenizer.encode("Hello world !", add_special_tokens=False)
== token_ids[1:-1] == token_ids[1:]
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -1088,6 +1088,19 @@ class TestMistralTokenizer:
== expected_tokens[mistral_tokenizer.is_tekken] == expected_tokens[mistral_tokenizer.is_tekken]
) )
def test_decode_int(
    self,
    mistral_tokenizer: MistralTokenizer,
):
    """Check that ``decode`` accepts a single integer token id.

    Token id ``1`` is decoded with ``skip_special_tokens=False`` and the
    result is expected to be the literal ``"<s>"`` string.
    """
    single_token_id = 1
    decoded_text = mistral_tokenizer.decode(
        single_token_id,
        skip_special_tokens=False,
    )
    assert decoded_text == "<s>"
def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer): def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer):
tokens = ( tokens = (
[ [

View File

@ -165,6 +165,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
class MistralTokenizer(TokenizerBase): class MistralTokenizer(TokenizerBase):
def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None: def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
from mistral_common.protocol.instruct.validator import ValidationMode
from mistral_common.tokens.tokenizers.sentencepiece import ( from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer, SentencePieceTokenizer,
) )
@ -175,6 +176,14 @@ class MistralTokenizer(TokenizerBase):
self.instruct = self.mistral.instruct_tokenizer self.instruct = self.mistral.instruct_tokenizer
self.tokenizer = self.instruct.tokenizer self.tokenizer = self.instruct.tokenizer
mode = self.mistral._chat_completion_request_validator._mode
if mode != ValidationMode.test:
raise ValueError(
"Mistral tokenizer must be in test mode. Make sure to "
"set `mode='ValidationMode.test'` when creating the "
"Mistral tokenizer."
)
_mistral_version_str = str(self.tokenizer.version.value) _mistral_version_str = str(self.tokenizer.version.value)
self.version: int = int(_mistral_version_str.split("v")[-1]) self.version: int = int(_mistral_version_str.split("v")[-1])
@ -205,6 +214,7 @@ class MistralTokenizer(TokenizerBase):
def from_pretrained( def from_pretrained(
cls, path_or_repo_id: str, *, revision: str | None = None cls, path_or_repo_id: str, *, revision: str | None = None
) -> "MistralTokenizer": ) -> "MistralTokenizer":
from mistral_common.protocol.instruct.validator import ValidationMode
from transformers.tokenization_mistral_common import ( from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as TransformersMistralTokenizer, MistralCommonTokenizer as TransformersMistralTokenizer,
) )
@ -212,7 +222,7 @@ class MistralTokenizer(TokenizerBase):
str_revision = "main" if revision is None else revision str_revision = "main" if revision is None else revision
return cls( return cls(
TransformersMistralTokenizer.from_pretrained( TransformersMistralTokenizer.from_pretrained(
path_or_repo_id, revision=str_revision path_or_repo_id, revision=str_revision, mode=ValidationMode.test
) )
) )
@ -339,15 +349,9 @@ class MistralTokenizer(TokenizerBase):
max_length: int | None = None, max_length: int | None = None,
add_special_tokens: bool | None = None, add_special_tokens: bool | None = None,
) -> list[int]: ) -> list[int]:
if add_special_tokens is not None: encoded = self.tokenizer.encode(
return self.transformers_tokenizer.encode( text, bos=add_special_tokens is not False, eos=False
text,
truncation=truncation,
max_length=max_length,
add_special_tokens=add_special_tokens,
) )
else:
encoded = self.tokenizer.encode(text, bos=True, eos=False)
if truncation is not False and max_length is not None: if truncation is not False and max_length is not None:
return encoded[:max_length] return encoded[:max_length]
@ -383,6 +387,9 @@ class MistralTokenizer(TokenizerBase):
) )
def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str: def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
if isinstance(ids, int):
ids = [ids]
return self.transformers_tokenizer.decode( return self.transformers_tokenizer.decode(
ids, skip_special_tokens=skip_special_tokens ids, skip_special_tokens=skip_special_tokens
) )