Fix Max Token ID for Qwen-VL-Chat (#11980)

Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
Author: Alex Brooks <Alex.brooks@ibm.com>
Date:   2025-01-13 01:37:48 -07:00
parent 89ce62a316
commit 5340a30d01


@@ -1,3 +1,4 @@
+import contextlib
 import os
 import warnings
 from pathlib import Path
@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
     tokenizer_len = len(tokenizer)
     max_token_id = max(tokenizer.get_vocab().values())
+    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
+    # are added and included in the implementation of the vocab_size
+    # property, but not in get_vocab(); if there is an implementation
+    # of vocab size, we should take the greater value.
+    if hasattr(tokenizer, "vocab_size"):
+        with contextlib.suppress(NotImplementedError):
+            max_token_id = max(max_token_id, tokenizer.vocab_size)
+
     class CachedTokenizer(tokenizer.__class__):  # type: ignore
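
For context, a minimal sketch of the failure mode this hunk guards against. The DummyQwenTokenizer class below is hypothetical (not part of the commit): it imitates a tokenizer whose vocab_size property counts added special tokens that get_vocab() does not expose, which is the behavior the diff comment describes for QwenTokenizer.

import contextlib


class DummyQwenTokenizer:
    """Hypothetical tokenizer whose vocab_size exceeds the ids in get_vocab()."""

    def get_vocab(self) -> dict[str, int]:
        # Base vocabulary only; the added special tokens are missing here.
        return {"hello": 0, "world": 1}

    @property
    def vocab_size(self) -> int:
        # Counts the base vocab plus the added special tokens.
        return 5


tokenizer = DummyQwenTokenizer()
max_token_id = max(tokenizer.get_vocab().values())  # 1 -- special ids look invalid
# The fix above: take the larger bound when vocab_size reports one.
if hasattr(tokenizer, "vocab_size"):
    with contextlib.suppress(NotImplementedError):
        max_token_id = max(max_token_id, tokenizer.vocab_size)
print(max_token_id)  # 5

The contextlib.suppress(NotImplementedError) guard in the fix reflects that some tokenizer classes leave the vocab_size property unimplemented and raise NotImplementedError when it is accessed.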