[Bugfix] The special_tokens in tokenizer should also be controlled by do_lower_case in encoder_config. (#20750)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi 2025-07-18 17:10:47 +08:00 committed by GitHub
parent ca4eb82bcb
commit 5895afd780
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 32 additions and 0 deletions

View File

@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
TOKENIZER_NAMES = ["BAAI/bge-base-en"]
@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
@pytest.mark.parametrize("n_tokens", [510])
def test_special_tokens(tokenizer_name: str, n_tokens: int):
tokenizer = get_tokenizer(tokenizer_name, revision="main")
prompts = '[UNK]' * n_tokens
prompt_token_ids = tokenizer.encode(prompts)
assert len(prompt_token_ids) == n_tokens + 2

View File

@ -16,6 +16,8 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
from vllm import envs
from vllm.logger import init_logger
from vllm.transformers_utils.config import (
get_sentence_transformer_tokenizer_config)
from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import make_async
@ -256,6 +258,18 @@ def get_tokenizer(
else:
raise e
# The special_tokens in tokenizer should also be
# controlled by do_lower_case in encoder_config
encoder_config = get_sentence_transformer_tokenizer_config(
tokenizer_name, revision)
if isinstance(encoder_config, dict) and encoder_config.get(
"do_lower_case", False):
special_tokens_map = {
k: v.lower()
for k, v in tokenizer.special_tokens_map.items()
}
tokenizer.add_special_tokens(special_tokens_map)
# NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
if type(tokenizer).__name__ in ("ChatGLMTokenizer",
"ChatGLM4Tokenizer"):