[Bugfix]: Fix messy code when using logprobs (#19209)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Chauncey 2025-07-08 11:02:15 +08:00 committed by GitHub
parent af107d5a0e
commit 93b9d9f499
3 changed files with 25 additions and 3 deletions
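
Background: per the commit title, the garbled strings showed up when returning logprobs. The old `convert_ids_list_to_tokens` used `tokenizer.convert_ids_to_tokens`, which for byte-level BPE vocabularies returns raw vocabulary entries such as 'Ġworld'; the fix decodes each id individually instead. A minimal sketch of the difference, mirroring the new test below (assumes the `transformers` package and the Qwen/Qwen2.5-1.5B-Instruct tokenizer are available locally):

# Sketch only: why per-id decode() gives cleaner strings than
# convert_ids_to_tokens() for a byte-level BPE tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
token_ids = tokenizer.encode("Hello, world!")  # [9707, 11, 1879, 0]

# Raw vocabulary tokens keep byte-level markers ('Ġ' encodes a leading space).
print(tokenizer.convert_ids_to_tokens(token_ids))       # ['Hello', ',', 'Ġworld', '!']

# Decoding each id on its own yields human-readable strings.
print([tokenizer.decode([tid]) for tid in token_ids])   # ['Hello', ',', ' world', '!']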


@@ -14,9 +14,12 @@ from unittest.mock import patch
 import pytest
 import torch
 import zmq
+from transformers import AutoTokenizer
 from vllm_test_utils.monitor import monitor
 
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.transformers_utils.detokenizer_utils import (
+    convert_ids_list_to_tokens)
 from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
                         MemorySnapshot, PlaceholderModule, StoreBoolean,
                         bind_kv_cache, common_broadcastable_dtype,
@@ -918,3 +921,14 @@ def test_split_host_port():
 def test_join_host_port():
     assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
     assert join_host_port("::1", 5555) == "[::1]:5555"
+
+
+def test_convert_ids_list_to_tokens():
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+    token_ids = tokenizer.encode("Hello, world!")
+    # token_ids = [9707, 11, 1879, 0]
+    assert tokenizer.convert_ids_to_tokens(token_ids) == [
+        'Hello', ',', 'Ġworld', '!'
+    ]
+    tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
+    assert tokens == ['Hello', ',', ' world', '!']


@@ -35,7 +35,7 @@ def _ref_convert_id_to_token(
     Returns:
         String representation of input token id
     """
-    return tokenizer.convert_ids_to_tokens(token_id) or ""
+    return tokenizer.decode([token_id]) or ""
 
 
 @pytest.mark.parametrize(


@@ -78,6 +78,7 @@ def convert_prompt_ids_to_tokens(
 def convert_ids_list_to_tokens(
     tokenizer: AnyTokenizer,
     token_ids: list[int],
+    skip_special_tokens: bool = False,
 ) -> list[str]:
     """Detokenize the input ids individually.
 
@@ -89,8 +90,15 @@ def convert_ids_list_to_tokens(
         Python list of token string representations
     """
-    token_str_lst = tokenizer.convert_ids_to_tokens(token_ids)
-    _replace_none_with_empty(token_str_lst)  # type: ignore
+    token_str_lst = []
+    for token_id in token_ids:
+        token_str = tokenizer.decode(
+            [token_id],
+            skip_special_tokens=skip_special_tokens,
+        )
+        if token_str is None:
+            token_str = ""
+        token_str_lst.append(token_str)
     return token_str_lst
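
Usage note (a hypothetical caller, not part of this commit): the helper now accepts `skip_special_tokens`, which is forwarded to each per-id `decode` call, so callers can drop special tokens from the returned strings.

# Hypothetical usage sketch of the updated helper; the tokenizer and input
# text are illustration-only assumptions, not part of this commit.
from transformers import AutoTokenizer

from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
ids = tokenizer.encode("Hello, world!")

# Default keeps special tokens (if any) in the per-id strings.
print(convert_ids_list_to_tokens(tokenizer, ids))
# With skip_special_tokens=True, special-token ids decode to empty strings.
print(convert_ids_list_to_tokens(tokenizer, ids, skip_special_tokens=True))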