diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 6833f8d96d1c..f6453af89ff7 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -194,11 +194,19 @@ async def test_function_tool_use(
         )
 
         output = []
+        reasoning = []
         async for chunk in output_stream:
-            if chunk.choices and chunk.choices[0].delta.tool_calls:
-                output.extend(chunk.choices[0].delta.tool_calls)
+            if chunk.choices:
+                if enable_thinking and getattr(
+                    chunk.choices[0].delta, "reasoning_content", None
+                ):
+                    reasoning.append(chunk.choices[0].delta.reasoning_content)
+                if chunk.choices[0].delta.tool_calls:
+                    output.extend(chunk.choices[0].delta.tool_calls)
 
         assert len(output) > 0
+        if enable_thinking:
+            assert len(reasoning) > 0
 
 
 @pytest.fixture(scope="module")
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 32e6b1d96ce2..3bf887c659dc 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -563,8 +563,6 @@ class OpenAIServingChat(OpenAIServing):
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
-        elif request.tool_choice == "required":
-            all_previous_token_ids = None
         else:
             all_previous_token_ids = None
 
@@ -880,29 +878,56 @@ class OpenAIServingChat(OpenAIServing):
                     previous_text = previous_texts[i]
                     current_text = previous_text + delta_text
                     fn_name_returned = function_name_returned[i]
+                    output_token_ids = as_list(output.token_ids)
 
-                    if self.reasoning_parser:
-                        _, content = reasoning_parser.extract_reasoning_content(
-                            current_text, request
-                        )
-                    else:
-                        content = current_text
-                    delta_message, function_name_returned[i] = (
-                        self.extract_tool_call_required_streaming(
-                            previous_text=previous_text,
-                            current_text=content,
-                            delta_text=delta_text,
-                            function_name_returned=fn_name_returned,
-                            tool_call_idx=history_tool_call_cnt,
-                        )
-                    )
                     if (
-                        delta_message
-                        and delta_message.tool_calls
-                        and delta_message.tool_calls[0].id is not None
+                        self.reasoning_parser is not None
+                        and not reasoning_end_arr[i]
+                        and res.prompt_token_ids
+                        and reasoning_parser.is_reasoning_end(res.prompt_token_ids)
                     ):
-                        history_tool_call_cnt += 1
-                        tools_streamed[i] = True
+                        reasoning_end_arr[i] = True
+
+                    if self.reasoning_parser and not reasoning_end_arr[i]:
+                        delta_message = (
+                            reasoning_parser.extract_reasoning_content_streaming(
+                                previous_text,
+                                current_text,
+                                delta_text,
+                                previous_token_ids,
+                                current_token_ids,
+                                output_token_ids,
+                            )
+                        )
+                        if reasoning_parser.is_reasoning_end(output_token_ids):
+                            reasoning_end_arr[i] = True
+                            if delta_message and delta_message.content:
+                                current_text = delta_message.content
+                                delta_message.content = None
+                            else:
+                                # reasoning ended
+                                current_text = ""
+
+                    else:
+                        # either finished reasoning or no reasoning at all
+                        content = current_text
+
+                        delta_message, function_name_returned[i] = (
+                            self.extract_tool_call_required_streaming(
+                                previous_text=previous_text,
+                                current_text=content,
+                                delta_text=delta_text,
+                                function_name_returned=fn_name_returned,
+                                tool_call_idx=history_tool_call_cnt,
+                            )
+                        )
+                        if (
+                            delta_message
+                            and delta_message.tool_calls
+                            and delta_message.tool_calls[0].id is not None
+                        ):
+                            history_tool_call_cnt += 1
+                            tools_streamed[i] = True
 
                 # handle streaming deltas for tools with "auto" tool choice
                 # and reasoning parser