[Bugfix]: Fix messy code when using logprobs (#19209)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Author: Chauncey, 2025-07-08 11:02:15 +08:00 (committed by GitHub)
parent af107d5a0e
commit 93b9d9f499
3 changed files with 25 additions and 3 deletions

File 1 of 3 (test module for vllm.utils):

@@ -14,9 +14,12 @@ from unittest.mock import patch
 import pytest
 import torch
 import zmq
+from transformers import AutoTokenizer
 from vllm_test_utils.monitor import monitor
 
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.transformers_utils.detokenizer_utils import (
+    convert_ids_list_to_tokens)
 from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
                         MemorySnapshot, PlaceholderModule, StoreBoolean,
                         bind_kv_cache, common_broadcastable_dtype,
@@ -918,3 +921,14 @@ def test_split_host_port():
 def test_join_host_port():
     assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
     assert join_host_port("::1", 5555) == "[::1]:5555"
+
+
+def test_convert_ids_list_to_tokens():
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+    token_ids = tokenizer.encode("Hello, world!")
+    # token_ids = [9707, 11, 1879, 0]
+    assert tokenizer.convert_ids_to_tokens(token_ids) == [
+        'Hello', ',', 'Ġworld', '!'
+    ]
+    tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
+    assert tokens == ['Hello', ',', ' world', '!']
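
The new test pins down the behavior difference that motivated this fix: looking up raw vocabulary strings with convert_ids_to_tokens leaks byte-level BPE markers (such as 'Ġ' for a leading space) into logprobs output, while decoding each id individually yields readable text. A minimal standalone sketch of that difference, assuming the transformers package and the same Qwen/Qwen2.5-1.5B-Instruct tokenizer used by the test (this snippet is illustration, not part of the diff):

# Standalone repro of the "messy" token strings.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
token_ids = tokenizer.encode("Hello, world!")  # [9707, 11, 1879, 0]

# Raw vocab lookup keeps byte-level BPE markers, e.g. 'Ġ' for a space.
print(tokenizer.convert_ids_to_tokens(token_ids))
# ['Hello', ',', 'Ġworld', '!']

# Decoding each id on its own returns human-readable text instead.
print([tokenizer.decode([tid]) for tid in token_ids])
# ['Hello', ',', ' world', '!']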

File 2 of 3 (logprobs detokenization test):

@@ -35,7 +35,7 @@ def _ref_convert_id_to_token(
     Returns:
         String representation of input token id
     """
-    return tokenizer.convert_ids_to_tokens(token_id) or ""
+    return tokenizer.decode([token_id]) or ""
 
 
 @pytest.mark.parametrize(
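
This test change mirrors the production fix: the reference detokenizer now decodes each id instead of returning the raw vocabulary string, keeping the expected values in sync with the new implementation below.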

File 3 of 3 (vllm/transformers_utils/detokenizer_utils.py):

@@ -78,6 +78,7 @@ def convert_prompt_ids_to_tokens(
 
 def convert_ids_list_to_tokens(
     tokenizer: AnyTokenizer,
     token_ids: list[int],
+    skip_special_tokens: bool = False,
 ) -> list[str]:
     """Detokenize the input ids individually.
@@ -89,8 +90,15 @@ def convert_ids_list_to_tokens(
         Python list of token string representations
     """
-    token_str_lst = tokenizer.convert_ids_to_tokens(token_ids)
-    _replace_none_with_empty(token_str_lst)  # type: ignore
+    token_str_lst = []
+    for token_id in token_ids:
+        token_str = tokenizer.decode(
+            [token_id],
+            skip_special_tokens=skip_special_tokens,
+        )
+        if token_str is None:
+            token_str = ""
+        token_str_lst.append(token_str)
     return token_str_lst
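
For illustration, a usage sketch of the updated helper and its new flag (hypothetical call site; assumes a HuggingFace tokenizer, which satisfies AnyTokenizer, and the same Qwen checkpoint as the test above):

from transformers import AutoTokenizer

from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
ids = tokenizer.encode("Hello, world!") + [tokenizer.eos_token_id]

# Default keeps special tokens as their literal text.
print(convert_ids_list_to_tokens(tokenizer, ids))
# e.g. ['Hello', ',', ' world', '!', '<|im_end|>']

# skip_special_tokens=True decodes special ids to empty strings.
print(convert_ids_list_to_tokens(tokenizer, ids, skip_special_tokens=True))
# e.g. ['Hello', ',', ' world', '!', '']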