[Bugfix] Cache added_vocab to avoid per-token overhead (#30351)

Signed-off-by: limingliang <limingliang@stepfun.com>
Co-authored-by: limingliang <limingliang@stepfun.com>
Authored by Mingliang Li on 2025-12-10 12:05:51 +08:00, committed by GitHub
parent 3bdd426636
commit d007387aa7

@@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
         self.name_or_path = (
             tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
         )
+        self._added_vocab = self.tokenizer.get_added_vocab()
+        self._added_vocab_size = len(self._added_vocab)
 
     @classmethod
     def from_pretrained(
@@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
 
     def __len__(self) -> int:
         # </think> is an added token in DeepseekV32 tokenizer
-        return self.vocab_size + len(self.get_added_vocab())
+        return self.vocab_size + self._added_vocab_size
 
     def __call__(
         self,
@@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
         return self.tokenizer.get_vocab()
 
     def get_added_vocab(self) -> dict[str, int]:
-        return self.tokenizer.get_added_vocab()
+        return self._added_vocab.copy()
 
     def encode(
         self,
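
For reference, below is a minimal, self-contained sketch of the caching pattern this commit applies: the added vocabulary is queried from the underlying Hugging Face tokenizer once in __init__ and reused by __len__ and get_added_vocab(), instead of rebuilding the added-vocab dict on every call. The FakeHFTokenizer stub and CachedWrapper class are hypothetical stand-ins for illustration only; the real change lives in DeepseekV32Tokenizer as shown in the diff above.

import time


class FakeHFTokenizer:
    """Hypothetical stand-in for a HF tokenizer whose get_added_vocab() is non-trivial."""

    vocab_size = 128_000

    def get_added_vocab(self) -> dict[str, int]:
        # Rebuilding this dict on every call is the per-token overhead
        # the commit avoids.
        return {f"<extra_{i}>": self.vocab_size + i for i in range(1000)}


class CachedWrapper:
    def __init__(self, tokenizer: FakeHFTokenizer) -> None:
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size
        # Cache once at construction time, mirroring the change above.
        self._added_vocab = tokenizer.get_added_vocab()
        self._added_vocab_size = len(self._added_vocab)

    def __len__(self) -> int:
        # No call into the underlying tokenizer on the per-token path.
        return self.vocab_size + self._added_vocab_size

    def get_added_vocab(self) -> dict[str, int]:
        # Return a copy so callers cannot mutate the cached dict.
        return self._added_vocab.copy()


if __name__ == "__main__":
    tok = CachedWrapper(FakeHFTokenizer())
    start = time.perf_counter()
    for _ in range(100_000):  # e.g. once per generated token
        _ = len(tok)
    print(f"100k len() calls: {time.perf_counter() - start:.3f}s")

Returning a copy from get_added_vocab() keeps the cached dict safe from mutation by callers, which matches the behavior change in the last hunk of the diff.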