[Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled (#19135)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Chauncey 2025-06-05 14:24:24 +08:00 committed by GitHub
parent af7fc84fd2
commit 8fc57501d3
2 changed files with 98 additions and 28 deletions
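
For context, the bug surfaces when a client streams a chat completion while disabling Qwen3's thinking mode through chat_template_kwargs. Below is a minimal sketch of that request shape, assuming a vLLM OpenAI-compatible server already running at http://localhost:8000/v1 with --tool-call-parser hermes and --reasoning-parser qwen3; the model name and the weather tool schema are illustrative placeholders, not taken from the test.

# Minimal reproduction sketch. Assumptions: server URL, model name, and the
# tool schema are placeholders; the actual test is in the diff below.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_forecast",  # hypothetical tool for illustration
            "description": "Get a weather forecast for a city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                },
                "required": ["city"],
            },
        },
    }]
    stream = await client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": "What is the forecast in Dallas, in fahrenheit?"
        }],
        model="Qwen/Qwen3-0.6B",  # placeholder model name
        tools=tools,
        tool_choice="auto",
        stream=True,
        # Disabling thinking via the chat template while streaming is the
        # combination that previously broke tool-call output.
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            print(chunk.choices[0].delta.tool_calls)


asyncio.run(main())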

File 1/2 — OpenAI-compatible tool-use test (file path not captured in this view):

@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import NamedTuple
+
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
@@ -22,7 +24,9 @@ def server():  # noqa: F811
         "--guided-decoding-backend",
         "xgrammar",
         "--tool-call-parser",
-        "hermes"
+        "hermes",
+        "--reasoning-parser",
+        "qwen3",
     ]

     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -35,9 +39,53 @@ async def client(server):
         yield async_client


+class TestCase(NamedTuple):
+    model_name: str
+    stream: bool
+    tool_choice: str
+    enable_thinking: bool
+
+
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=True),
+    ],
+)
+async def test_function_tool_use(client: openai.AsyncOpenAI,
+                                 test_case: TestCase):
     tools = [
         {
             "type": "function",
@@ -126,30 +174,38 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
             "forecast for the next 5 days, in fahrenheit?",
         },
     ]

-    # Non-streaming test
-    chat_completion = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-    )
-
-    assert chat_completion.choices[0].message.tool_calls is not None
-    assert len(chat_completion.choices[0].message.tool_calls) > 0
-
-    # Streaming test
-    stream = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-        stream=True,
-    )
-    output = []
-    async for chunk in stream:
-        if chunk.choices and chunk.choices[0].delta.tool_calls:
-            output.extend(chunk.choices[0].delta.tool_calls)
-
-    assert len(output) > 0
+    if not test_case.stream:
+        # Non-streaming test
+        chat_completion = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        assert chat_completion.choices[0].message.tool_calls is not None
+        assert len(chat_completion.choices[0].message.tool_calls) > 0
+    else:
+        # Streaming test
+        stream = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            stream=True,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+        output = []
+        async for chunk in stream:
+            if chunk.choices and chunk.choices[0].delta.tool_calls:
+                output.extend(chunk.choices[0].delta.tool_calls)
+
+        assert len(output) > 0
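
Note that the eight TestCase entries above are exactly the cross-product of stream × tool_choice × enable_thinking. An equivalent, more compact parametrization (a sketch only; the PR lists the cases explicitly, which arguably keeps the generated pytest test IDs readable) could generate them:

import itertools
from typing import NamedTuple

MODEL_NAME = "Qwen/Qwen3-0.6B"  # placeholder for the test module's constant


class TestCase(NamedTuple):
    model_name: str
    stream: bool
    tool_choice: str
    enable_thinking: bool


# Same eight cases as the explicit list in the diff.
CASES = [
    TestCase(MODEL_NAME, stream, tool_choice, enable_thinking)
    for stream, tool_choice, enable_thinking in itertools.product(
        [True, False], ["auto", "required"], [False, True])
]
assert len(CASES) == 8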

File 2/2 — OpenAIServingChat streaming handler (file path not captured in this view):

@@ -689,7 +689,21 @@ class OpenAIServingChat(OpenAIServing):
                                 current_token_ids,
                                 output.token_ids,
                             ))
+                        # When encountering think end id in prompt_token_ids,
+                        # i.e. {"enable_thinking": False},
+                        # set reasoning status to end.
+                        # Remove the text and token ids related
+                        # to 'reasoning_content'.
+                        if res.prompt_token_ids and \
+                            reasoning_parser.is_reasoning_end(
+                                list(res.prompt_token_ids)):
+                            reasoning_end_arr[i] = True
+                            current_token_ids = list(output.token_ids)
+                            if delta_message and delta_message.content:
+                                current_text = delta_message.content
+                                delta_message.content = None
+                            else:
+                                current_text = ""
                         # When encountering think end id in delta_token_ids,
                         # set reasoning status to end.
                         # Remove the text and token ids related
                         # to 'reasoning_content'.
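
The core of the fix: with {"enable_thinking": False}, the Qwen3 chat template emits an empty think block directly in the prompt, so the think-end token already appears in prompt_token_ids and no reasoning content will ever arrive in the output stream. The new branch detects this up front and routes all generated text as regular content. A conceptual sketch of that check follows; the token ids are assumed placeholders (in vLLM the real check is the reasoning parser's is_reasoning_end, which gets the id from the model's tokenizer).

# Conceptual sketch, not vLLM's actual parser class.
THINK_END_TOKEN_ID = 151668  # assumed id of "</think>"; tokenizer-dependent


def is_reasoning_end(token_ids: list[int]) -> bool:
    """True once the think-end token has been seen."""
    return THINK_END_TOKEN_ID in token_ids


# With enable_thinking=False the template injects an empty <think></think>
# into the prompt itself, so the check already passes on the prompt ids.
prompt_token_ids = [101, 151667, 151668, 2054]  # made-up ids around the block
if is_reasoning_end(prompt_token_ids):
    # Streaming path: treat every generated token as normal content or
    # tool-call text, never as reasoning_content.
    pass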