fix: don't skip first special token. (#1497)

This commit is contained in:
Ricardo Lu 2023-10-29 19:26:36 +08:00 committed by GitHub
parent 28b47d1e49
commit beac8dd461
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -120,7 +120,11 @@ def detokenize_incrementally(
# tokenizers (bigger = more conservative).
# Subtract 1 extra to account for the generated token.
prefix_offset = max(len(output_tokens) - 6, 0)
read_offset = max(len(output_tokens) - 1, 0)
# If the first new token is a special token, we can't skip 1 extra token
if skip_special_tokens and new_token_id in tokenizer.all_special_ids:
read_offset = max(len(output_tokens), 0)
else:
read_offset = max(len(output_tokens) - 1, 0)
else:
# Put new_token_id in a list so skip_special_tokens is respected
new_tokens = tokenizer.convert_ids_to_tokens(