diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index b4231c6d10c4..12349234c320 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -50,6 +50,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls,
                                                 truncate_tool_call_ids,
                                                 validate_request_params)
+from vllm.utils import as_list
 
 logger = init_logger(__name__)
 
@@ -670,10 +671,10 @@
 
                 # avoid the None + list error.
                 if previous_token_ids:
-                    current_token_ids = previous_token_ids + list(
+                    current_token_ids = previous_token_ids + as_list(
                         output.token_ids)
                 else:
-                    current_token_ids = list(output.token_ids)
+                    current_token_ids = as_list(output.token_ids)
 
                 if self.use_harmony:
                     if is_final:
@@ -703,11 +704,10 @@
                         # set reasoning status to end.
                         # Only keep 'content', remove 'reasoning_content'.
                         if reasoning_parser.is_reasoning_end(
-                                list(output.token_ids)) or \
-                                (res.prompt_token_ids and
-                                 reasoning_parser.is_reasoning_end(
-                                     list(res.prompt_token_ids)
-                                 )):
+                                as_list(output.token_ids)) or (
+                                    res.prompt_token_ids
+                                    and reasoning_parser.is_reasoning_end(
+                                        res.prompt_token_ids)):
                             reasoning_end_arr[i] = True
                             if delta_message and delta_message.content:
                                 # This need to be added to next `delta_text`
@@ -771,6 +771,7 @@
                     assert reasoning_parser is not None
                     assert added_content_delta_arr is not None
                     assert reasoning_end_arr is not None
+                    output_token_ids = as_list(output.token_ids)
                     if not reasoning_end_arr[i]:
                         delta_message = (
                             reasoning_parser.
@@ -780,7 +781,7 @@
                                 delta_text,
                                 previous_token_ids,
                                 current_token_ids,
-                                output.token_ids,
+                                output_token_ids,
                             ))
                         # When encountering think end id in prompt_token_ids
                         # i.e {"enable_thinking": False},
@@ -789,9 +790,9 @@
                         # to 'reasoning_content'.
                         if res.prompt_token_ids and \
                             reasoning_parser.is_reasoning_end(
-                                list(res.prompt_token_ids)):
+                                res.prompt_token_ids):
                             reasoning_end_arr[i] = True
-                            current_token_ids = list(output.token_ids)
+                            current_token_ids = output_token_ids
                             if delta_message and delta_message.content:
                                 current_text = delta_message.content
                                 delta_message.content = None
@@ -802,11 +803,11 @@
                         # Remove the text and token ids related
                         # to 'reasoning_content'.
                         if reasoning_parser.is_reasoning_end(
-                                list(output.token_ids)):
+                                output_token_ids):
                             reasoning_end_arr[i] = True
                             current_token_ids = \
                                 reasoning_parser.extract_content_ids(
-                                    list(output.token_ids))
+                                    output_token_ids)
                             if delta_message and delta_message.content:
                                 current_text = delta_message.content
                                 delta_message.content = None
@@ -815,7 +816,7 @@
 
                     # handle tool calls only after reasoning is done,
                     else:
-                        delta_token_ids = list(output.token_ids)
+                        delta_token_ids = output_token_ids
                         # First time to tool call,
                         # add the remaining text and token ids
                         # to delta from previous
@@ -899,7 +900,7 @@
                 self.request_logger.log_outputs(
                     request_id=request_id,
                     outputs=delta_content,
-                    output_token_ids=list(output.token_ids),
+                    output_token_ids=as_list(output.token_ids),
                     finish_reason=output.finish_reason,
                     is_streaming=True,
                     delta=True,
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 4f4522d726e8..df9e84163f16 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -44,7 +44,7 @@ class ReasoningParser:
         return self.model_tokenizer.get_vocab()
 
     @abstractmethod
-    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
         """
         Check if the reasoning content ends in the input_ids.
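For context on what this refactor buys: the diff swaps unconditional `list(...)` calls for `as_list(...)` from `vllm.utils`, and hoists a single `output_token_ids = as_list(output.token_ids)` so the conversion happens once per delta instead of at each call site. The body of `as_list` is not part of this diff; the sketch below is a minimal illustration of the idiom, assuming the helper's only job is to skip the copy that `list(...)` always performs when the input is already a `list` (the actual implementation in `vllm/utils` may differ):

```python
from collections.abc import Sequence
from typing import TypeVar

T = TypeVar("T")


def as_list(seq: Sequence[T]) -> list[T]:
    """Return ``seq`` itself if it is already a list, else copy it once.

    ``list(x)`` always allocates a new list, even when ``x`` is already
    one. In the streaming path, ``output.token_ids`` is frequently
    already a list, so the unconditional copy is pure overhead repeated
    on every streamed chunk.
    """
    return seq if isinstance(seq, list) else list(seq)
```

Read this way, the signature change in `abs_reasoning_parsers.py` follows naturally: once every call site passes a real `list[int]` (via `as_list` or the cached `output_token_ids`), `is_reasoning_end` can narrow its parameter from `Sequence[int]` to `list[int]`, and implementations no longer need to defensively re-wrap their input.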