vllm/tests/tokenizers_/test_hf.py
majiayu000 abd1dbc548 [Bugfix] Preserve original tokenizer class name in CachedTokenizer
HuggingFace transformers processor validates tokenizer type by checking
the class name. When vLLM creates a CachedTokenizer with a modified
class name (e.g., 'CachedQwen2TokenizerFast'), the processor type check
fails with TypeError.

This fix preserves the original tokenizer class name and qualname in
CachedTokenizer, ensuring compatibility with HuggingFace transformers
processor type checking.

Fixes #31080

Signed-off-by: Claude <noreply@anthropic.com>
Signed-off-by: majiayu000 <1835304752@qq.com>
2025-12-24 16:02:48 +08:00

61 lines
2.2 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle
from copy import deepcopy
import pytest
from transformers import AutoTokenizer
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import get_cached_tokenizer
@pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"])
def test_cached_tokenizer(model_id: str):
    """Verify the cached tokenizer mirrors its reference tokenizer, both
    directly and after a pickle round-trip."""
    reference = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Mutate the reference first so the cache must reflect post-creation state.
    reference.add_special_tokens({"cls_token": "<CLS>"})
    reference.add_special_tokens({"additional_special_tokens": ["<SEP>"]})

    cached = get_cached_tokenizer(deepcopy(reference))
    _check_consistency(cached, reference)

    # The cached wrapper must survive serialization unchanged.
    roundtripped = pickle.loads(pickle.dumps(cached))
    _check_consistency(roundtripped, reference)
def _check_consistency(target: TokenizerLike, expected: TokenizerLike):
    """Assert that ``target`` exposes the same state and behavior as
    ``expected``."""
    assert isinstance(target, type(expected))

    # Attributes that get_cached_tokenizer precomputes and caches.
    for attr in ("all_special_ids", "all_special_tokens"):
        assert getattr(target, attr) == getattr(expected, attr)
    assert target.get_vocab() == expected.get_vocab()
    assert len(target) == len(expected)

    # Attributes delegated straight through to the wrapped tokenizer.
    assert getattr(target, "padding_side", None) == getattr(
        expected, "padding_side", None
    )
    assert target.encode("prompt") == expected.encode("prompt")
def test_cached_tokenizer_preserves_class_name():
    """The cached wrapper must keep the original tokenizer's class name.

    HuggingFace transformers processors validate tokenizers by class name,
    so a renamed wrapper class would fail that type check.
    See: https://github.com/vllm-project/vllm/issues/31080
    """
    original = AutoTokenizer.from_pretrained("gpt2")
    expected_name = original.__class__.__name__
    expected_qualname = original.__class__.__qualname__

    cached = get_cached_tokenizer(original)

    # Both the short and qualified names must be preserved verbatim.
    assert cached.__class__.__name__ == expected_name
    assert cached.__class__.__qualname__ == expected_qualname