fixed reasoning streaming with tool_choice="required" (#24108)

Signed-off-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Signed-off-by: ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
Co-authored-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-12-16 05:15:00 +08:00 · 2025-10-22 11:42:55 +02:00 · 2025-10-22 11:42:55 +02:00 · a4c29e6e82
commit a4c29e6e82
parent 8f18feb191
2 changed files with 57 additions and 24 deletions
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -194,11 +194,19 @@ async def test_function_tool_use(
        )
        output = []
        reasoning = []
        async for chunk in output_stream:
-            if chunk.choices and chunk.choices[0].delta.tool_calls:
+            if chunk.choices:
                if enable_thinking and getattr(
                    chunk.choices[0].delta, "reasoning_content", None
                ):
                    reasoning.append(chunk.choices[0].delta.reasoning_content)
                if chunk.choices[0].delta.tool_calls:
                    output.extend(chunk.choices[0].delta.tool_calls)
        assert len(output) > 0
        if enable_thinking:
            assert len(reasoning) > 0
@pytest.fixture(scope="module")
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -563,8 +563,6 @@ class OpenAIServingChat(OpenAIServing):
            # For reasoning parser and tool call all enabled
            added_content_delta_arr = [False] * num_choices
            reasoning_end_arr = [False] * num_choices
        elif request.tool_choice == "required":
            all_previous_token_ids = None
        else:
            all_previous_token_ids = None
@@ -880,13 +878,40 @@ class OpenAIServingChat(OpenAIServing):
                        previous_text = previous_texts[i]
                        current_text = previous_text + delta_text
                        fn_name_returned = function_name_returned[i]
                        output_token_ids = as_list(output.token_ids)
-                        if self.reasoning_parser:
+                        if (
-                            _, content = reasoning_parser.extract_reasoning_content(
+                            self.reasoning_parser is not None
-                                current_text, request
+                            and not reasoning_end_arr[i]
                            and res.prompt_token_ids
                            and reasoning_parser.is_reasoning_end(res.prompt_token_ids)
                        ):
                            reasoning_end_arr[i] = True
                        if self.reasoning_parser and not reasoning_end_arr[i]:
                            delta_message = (
                                reasoning_parser.extract_reasoning_content_streaming(
                                    previous_text,
                                    current_text,
                                    delta_text,
                                    previous_token_ids,
                                    current_token_ids,
                                    output_token_ids,
                                )
                            )
                            if reasoning_parser.is_reasoning_end(output_token_ids):
                                reasoning_end_arr[i] = True
                                if delta_message and delta_message.content:
                                    current_text = delta_message.content
                                    delta_message.content = None
                                else:
                                    # reasoning ended
                                    current_text = ""
                        else:
                            # either finished reasoning or no reasoning at all
                            content = current_text
                            delta_message, function_name_returned[i] = (
                                self.extract_tool_call_required_streaming(
                                    previous_text=previous_text,