[Perf] Speed up function _convert_tokens_to_string_with_added_encoders by 13.7x (#20413)

Signed-off-by: Saurabh Misra <misra.saurabh1@gmail.com>
Signed-off-by: Aseem Saxena <aseem.bits@gmail.com>
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
Co-authored-by: Aseem Saxena <aseem.bits@gmail.com>
Authored by Saurabh Misra on 2025-08-20 13:17:11 -07:00; committed by GitHub
parent b95697d731
commit bf7c99dfc4

@@ -23,26 +23,31 @@ def _convert_tokens_to_string_with_added_encoders(
     # NOTE(woosuk): The following code is slow because it runs a for loop over
     # the output_tokens. In Python, running a for loop over a list can be slow
     # even when the loop body is very simple.
+    # Performance improvements: avoid repeated attribute and function lookups;
+    # localize frequently used objects;
     sub_texts: list[str] = []
     current_sub_text: list[str] = []
-    all_special_tokens = set(tokenizer.all_special_tokens)
+    convert_tokens_to_string = tokenizer.convert_tokens_to_string
+    added_vocab_set = set(tokenizer.get_added_vocab())
+    all_special_tokens = set(
+        tokenizer.all_special_tokens) if skip_special_tokens else ()
     for token in output_tokens:
-        if skip_special_tokens and token in all_special_tokens:
+        # Use precomputed set for skip-special check
+        if token in all_special_tokens:
             continue
-        if token in tokenizer.get_added_vocab():
+        if token in added_vocab_set:
             if current_sub_text:
-                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
-                sub_texts.append(sub_text)
-                current_sub_text = []
+                sub_texts.append(convert_tokens_to_string(current_sub_text))
+                current_sub_text.clear()
             sub_texts.append(token)
         else:
             current_sub_text.append(token)
     if current_sub_text:
-        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
-        sub_texts.append(sub_text)
+        sub_texts.append(convert_tokens_to_string(current_sub_text))
     if spaces_between_special_tokens:
         return " ".join(sub_texts)
     else:
         return "".join(sub_texts)