[Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled (#19135)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Chauncey 2025-06-05 14:24:24 +08:00 committed by GitHub
parent af7fc84fd2
commit 8fc57501d3
2 changed files with 98 additions and 28 deletions
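
For context, the bug surfaces when a client streams a chat completion while disabling Qwen3's thinking mode through chat_template_kwargs. Below is a minimal sketch of that request shape, assuming a vLLM OpenAI-compatible server already running at http://localhost:8000/v1 with --tool-call-parser hermes and --reasoning-parser qwen3; the model name and the weather tool schema are illustrative placeholders, not taken from the test.

# Minimal reproduction sketch. Assumptions: server URL, model name, and the
# tool schema are placeholders; the actual test is in the diff below.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_forecast",  # hypothetical tool for illustration
            "description": "Get a weather forecast for a city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                },
                "required": ["city"],
            },
        },
    }]
    stream = await client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": "What is the forecast in Dallas, in fahrenheit?"
        }],
        model="Qwen/Qwen3-0.6B",  # placeholder model name
        tools=tools,
        tool_choice="auto",
        stream=True,
        # Disabling thinking via the chat template while streaming is the
        # combination that previously broke tool-call output.
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            print(chunk.choices[0].delta.tool_calls)


asyncio.run(main())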

File 1/2 — OpenAI-compatible tool-use test (file path not captured in this view):

@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import NamedTuple
+
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
@@ -22,7 +24,9 @@ def server():  # noqa: F811
         "--guided-decoding-backend",
         "xgrammar",
         "--tool-call-parser",
-        "hermes"
+        "hermes",
+        "--reasoning-parser",
+        "qwen3",
     ]

     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -35,9 +39,53 @@ async def client(server):
         yield async_client


+class TestCase(NamedTuple):
+    model_name: str
+    stream: bool
+    tool_choice: str
+    enable_thinking: bool
+
+
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=True),
+    ],
+)
+async def test_function_tool_use(client: openai.AsyncOpenAI,
+                                 test_case: TestCase):
     tools = [
         {
             "type": "function",
@@ -126,30 +174,38 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
             "forecast for the next 5 days, in fahrenheit?",
         },
     ]

-    # Non-streaming test
-    chat_completion = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-    )
-
-    assert chat_completion.choices[0].message.tool_calls is not None
-    assert len(chat_completion.choices[0].message.tool_calls) > 0
-
-    # Streaming test
-    stream = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-        stream=True,
-    )
-    output = []
-    async for chunk in stream:
-        if chunk.choices and chunk.choices[0].delta.tool_calls:
-            output.extend(chunk.choices[0].delta.tool_calls)
-
-    assert len(output) > 0
+    if not test_case.stream:
+        # Non-streaming test
+        chat_completion = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        assert chat_completion.choices[0].message.tool_calls is not None
+        assert len(chat_completion.choices[0].message.tool_calls) > 0
+    else:
+        # Streaming test
+        stream = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            stream=True,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+        output = []
+        async for chunk in stream:
+            if chunk.choices and chunk.choices[0].delta.tool_calls:
+                output.extend(chunk.choices[0].delta.tool_calls)
+
+        assert len(output) > 0
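
Note that the eight TestCase entries above are exactly the cross-product of stream × tool_choice × enable_thinking. An equivalent, more compact parametrization (a sketch only; the PR lists the cases explicitly, which arguably keeps the generated pytest test IDs readable) could generate them:

import itertools
from typing import NamedTuple

MODEL_NAME = "Qwen/Qwen3-0.6B"  # placeholder for the test module's constant


class TestCase(NamedTuple):
    model_name: str
    stream: bool
    tool_choice: str
    enable_thinking: bool


# Same eight cases as the explicit list in the diff.
CASES = [
    TestCase(MODEL_NAME, stream, tool_choice, enable_thinking)
    for stream, tool_choice, enable_thinking in itertools.product(
        [True, False], ["auto", "required"], [False, True])
]
assert len(CASES) == 8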

File 2/2 — OpenAIServingChat streaming handler (file path not captured in this view):

@@ -689,7 +689,21 @@ class OpenAIServingChat(OpenAIServing):
                                 current_token_ids,
                                 output.token_ids,
                             ))
+                        # When encountering think end id in prompt_token_ids,
+                        # i.e. {"enable_thinking": False},
+                        # set reasoning status to end.
+                        # Remove the text and token ids related
+                        # to 'reasoning_content'.
+                        if res.prompt_token_ids and \
+                            reasoning_parser.is_reasoning_end(
+                                list(res.prompt_token_ids)):
+                            reasoning_end_arr[i] = True
+                            current_token_ids = list(output.token_ids)
+                            if delta_message and delta_message.content:
+                                current_text = delta_message.content
+                                delta_message.content = None
+                            else:
+                                current_text = ""
                         # When encountering think end id in delta_token_ids,
                         # set reasoning status to end.
                         # Remove the text and token ids related
                         # to 'reasoning_content'.
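
The core of the fix: with {"enable_thinking": False}, the Qwen3 chat template emits an empty think block directly in the prompt, so the think-end token already appears in prompt_token_ids and no reasoning content will ever arrive in the output stream. The new branch detects this up front and routes all generated text as regular content. A conceptual sketch of that check follows; the token ids are assumed placeholders (in vLLM the real check is the reasoning parser's is_reasoning_end, which gets the id from the model's tokenizer).

# Conceptual sketch, not vLLM's actual parser class.
THINK_END_TOKEN_ID = 151668  # assumed id of "</think>"; tokenizer-dependent


def is_reasoning_end(token_ids: list[int]) -> bool:
    """True once the think-end token has been seen."""
    return THINK_END_TOKEN_ID in token_ids


# With enable_thinking=False the template injects an empty <think></think>
# into the prompt itself, so the check already passes on the prompt ids.
prompt_token_ids = [101, 151667, 151668, 2054]  # made-up ids around the block
if is_reasoning_end(prompt_token_ids):
    # Streaming path: treat every generated token as normal content or
    # tool-call text, never as reasoning_content.
    pass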