Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 01:45:01 +08:00)
[Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled (#19135)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
commit 8fc57501d3 (parent af7fc84fd2)
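For orientation, here is a minimal client-side sketch of the scenario this commit targets, reconstructed from the test changes below. The model name, base URL, prompt, and tool schema are placeholders, and the pre-fix failure mode (streamed tool-call deltas being held back while thinking is disabled) is inferred from the serving_chat change in the last hunk.

# Sketch only (not part of the commit): a streaming tool-call request with
# thinking disabled via chat_template_kwargs, mirroring the request shape
# used by the updated test.  Model name, URL, prompt, and tool schema are
# placeholders.
import asyncio

import openai


async def main():
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",  # illustrative tool, not the test's exact schema
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }]
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-0.6B",  # placeholder model
        messages=[{"role": "user", "content": "What's the weather in Dallas?"}],
        tools=tools,
        tool_choice="auto",
        stream=True,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    tool_call_deltas = []
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            tool_call_deltas.extend(chunk.choices[0].delta.tool_calls)
    # With this fix, tool-call deltas arrive even though the chat template
    # already closed the reasoning block before generation started.
    assert tool_call_deltas


asyncio.run(main())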
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import NamedTuple
+
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
@@ -22,7 +24,9 @@ def server():  # noqa: F811
         "--guided-decoding-backend",
         "xgrammar",
         "--tool-call-parser",
-        "hermes"
+        "hermes",
+        "--reasoning-parser",
+        "qwen3",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
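To reproduce this server setup outside the fixture, a rough sketch is below. The import path and MODEL_NAME are assumptions, the real fixture passes more flags than the ones visible in this hunk, and get_async_client() is the helper the client fixture appears to rely on.

# Sketch only: start the test server with the newly added reasoning parser
# and grab an async OpenAI client, as the server/client fixtures do.
# Import path and MODEL_NAME are assumptions; the fixture's full arg list
# is longer than what this hunk shows.
from tests.utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen3-0.6B"  # placeholder

args = [
    "--guided-decoding-backend",
    "xgrammar",
    "--tool-call-parser",
    "hermes",
    "--reasoning-parser",
    "qwen3",
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
    async_client = remote_server.get_async_client()
    # issue chat.completions requests against async_client here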
@@ -35,9 +39,53 @@ async def client(server):
         yield async_client
 
 
+class TestCase(NamedTuple):
+    model_name: str
+    stream: bool
+    tool_choice: str
+    enable_thinking: bool
+
+
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=True),
+    ],
+)
+async def test_function_tool_use(client: openai.AsyncOpenAI,
+                                 test_case: TestCase):
     tools = [
         {
             "type": "function",
@@ -126,30 +174,38 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
             "forecast for the next 5 days, in fahrenheit?",
         },
     ]
-
-    # Non-streaming test
-    chat_completion = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-    )
-
-    assert chat_completion.choices[0].message.tool_calls is not None
-    assert len(chat_completion.choices[0].message.tool_calls) > 0
-
-    # Streaming test
-    stream = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-        stream=True,
-    )
-
-    output = []
-    async for chunk in stream:
-        if chunk.choices and chunk.choices[0].delta.tool_calls:
-            output.extend(chunk.choices[0].delta.tool_calls)
-
-    assert len(output) > 0
+    if not test_case.stream:
+        # Non-streaming test
+        chat_completion = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        assert chat_completion.choices[0].message.tool_calls is not None
+        assert len(chat_completion.choices[0].message.tool_calls) > 0
+    else:
+        # Streaming test
+        stream = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            stream=True,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        output = []
+        async for chunk in stream:
+            if chunk.choices and chunk.choices[0].delta.tool_calls:
+                output.extend(chunk.choices[0].delta.tool_calls)
+
+        assert len(output) > 0
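One detail worth spelling out: the openai client merges extra_body into the top-level JSON payload, so chat_template_kwargs reaches vLLM's /v1/chat/completions handler as a first-class request field that is forwarded to the chat template. A hedged sketch of the equivalent raw request (URL and model name are placeholders):

# Sketch only: the raw payload equivalent of passing
# extra_body={"chat_template_kwargs": {"enable_thinking": False}} through
# the openai client.  URL and model name are placeholders.
import requests

payload = {
    "model": "Qwen/Qwen3-0.6B",
    "messages": [{"role": "user", "content": "What's the weather like?"}],
    "stream": True,
    # Forwarded to the chat template; for Qwen3 this disables the thinking block.
    "chat_template_kwargs": {"enable_thinking": False},
}
with requests.post("http://localhost:8000/v1/chat/completions",
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if line:
            print(line.decode())  # SSE "data: {...}" chunks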
@@ -689,7 +689,21 @@ class OpenAIServingChat(OpenAIServing):
                             current_token_ids,
                             output.token_ids,
                         ))
+                    # When encountering think end id in prompt_token_ids
+                    # i.e {"enable_thinking": False},
+                    # set reasoning status to end.
+                    # Remove the text and token ids related
+                    # to 'reasoning_content'.
+                    if res.prompt_token_ids and \
+                        reasoning_parser.is_reasoning_end(
+                            list(res.prompt_token_ids)):
+                        reasoning_end_arr[i] = True
+                        current_token_ids = list(output.token_ids)
+                        if delta_message and delta_message.content:
+                            current_text = delta_message.content
+                            delta_message.content = None
+                        else:
+                            current_text = ""
                     # When encountering think end id in delta_token_ids,
                     # set reasoning status to end.
                     # Remove the text and token ids related
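To make the intent of this hunk explicit: with {"enable_thinking": False}, the Qwen3 chat template (as far as I can tell) renders an already-closed think block into the prompt, so the think-end token appears in res.prompt_token_ids rather than in the generated stream. The added check marks reasoning as finished up front, so streamed deltas are routed to content and tool calls instead of being withheld as reasoning_content. A toy illustration follows; the token id and the standalone helper are made up, and only the is_reasoning_end name comes from the diff.

# Toy illustration of the prompt-side check added above.  THINK_END_ID and
# this standalone helper are hypothetical; only the is_reasoning_end name
# mirrors the reasoning-parser method used in the diff.
THINK_END_ID = 151668  # placeholder id for the "</think>" token


def is_reasoning_end(token_ids: list[int]) -> bool:
    return THINK_END_ID in token_ids


# enable_thinking=False: the template pre-closes the think block, so the
# end marker is already part of the prompt, not of the model output.
prompt_token_ids = [101, 102, THINK_END_ID, 103]  # made-up ids
assert is_reasoning_end(prompt_token_ids)

# The streaming handler can therefore treat reasoning as finished from the
# first delta onward, instead of waiting for a "</think>" that will never
# be generated.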