[Bugfix] Cache added_vocab to avoid per-token overhead (#30351)

Signed-off-by: limingliang <limingliang@stepfun.com>
Co-authored-by: limingliang <limingliang@stepfun.com>
commit d007387aa7
parent 3bdd426636
Author: Mingliang Li <limingliang@stepfun.com>
Date:   2025-12-10 12:05:51 +08:00


@@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
         self.name_or_path = (
             tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
         )
+        self._added_vocab = self.tokenizer.get_added_vocab()
+        self._added_vocab_size = len(self._added_vocab)

     @classmethod
     def from_pretrained(
@@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):

     def __len__(self) -> int:
         # </think> is an added token in DeepseekV32 tokenizer
-        return self.vocab_size + len(self.get_added_vocab())
+        return self.vocab_size + self._added_vocab_size

     def __call__(
         self,
@@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
         return self.tokenizer.get_vocab()

     def get_added_vocab(self) -> dict[str, int]:
-        return self.tokenizer.get_added_vocab()
+        return self._added_vocab.copy()

     def encode(
         self,
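For context, here is a minimal standalone sketch of the pattern this diff applies, using a stock Hugging Face tokenizer. The "gpt2" checkpoint, the loop count, and the timing harness are illustrative and not part of this commit; the point is that get_added_vocab() rebuilds its dict on every call, so snapshotting it once at construction turns a per-token dict rebuild into a cached attribute read.

import time

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Uncached: get_added_vocab() reconstructs its mapping on every call,
# which is avoidable overhead on hot paths such as __len__.
start = time.perf_counter()
for _ in range(10_000):
    size = tokenizer.vocab_size + len(tokenizer.get_added_vocab())
uncached = time.perf_counter() - start

# Cached: the added vocab is fixed after construction (the assumption the
# commit relies on), so snapshot it once and reuse the stored length.
added_vocab = tokenizer.get_added_vocab()
added_vocab_size = len(added_vocab)

start = time.perf_counter()
for _ in range(10_000):
    size = tokenizer.vocab_size + added_vocab_size
cached = time.perf_counter() - start

print(f"uncached: {uncached:.3f}s  cached: {cached:.3f}s")

Note the design choice in the last hunk: get_added_vocab() returns self._added_vocab.copy() rather than the cached dict itself, so a caller that mutates the returned mapping cannot corrupt the snapshot that __len__ depends on.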