[Frontend] supports interleaved thinking (#28531)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>

parent fa183e9271 · commit 5c9ad138d5

docs/features/interleaved_thinking.md (new file, +118 lines)

@@ -0,0 +1,118 @@

# Interleaved Thinking

## Introduction

Interleaved thinking allows a model to reason between tool calls: after receiving a tool result, it can think about what that result means before deciding what to do next. This lets the model chain multiple tool calls with reasoning steps in between and make more nuanced decisions based on intermediate results.

!!! warning
    Interleaved thinking increases token usage and response latency. Weigh your token budget and latency requirements before enabling this feature.

## How Interleaved Thinking Works

With interleaved thinking, the model can:

- Reason about the results of a tool call before deciding what to do next
- Chain multiple tool calls with reasoning steps in between
- Make more nuanced decisions based on intermediate results
- Provide transparent reasoning for its tool selection process
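
In practice this means a single message list alternates reasoning, tool calls, and tool results. The sketch below is illustrative only (the runnable version is in the Example Usage section); it shows the shape of such a history, with the assistant's `reasoning` passed back alongside its `tool_calls`:

```python
# Illustrative message history for one interleaved-thinking turn.
# The assistant message carries both its reasoning and its tool calls,
# and the tool result follows before the model reasons again.
messages = [
    {"role": "user", "content": "What's the weather in San Francisco?"},
    {
        "role": "assistant",
        "reasoning": "I need live data, so I should call the weather tool.",
        "tool_calls": [],  # tool call objects emitted by the model
    },
    {"role": "tool", "tool_call_id": "call_0", "name": "get_weather", "content": "22°C"},
    # The next completion can now reason about the 22°C result
    # before answering or calling another tool.
]
```
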
## Supported Models

vLLM currently supports interleaved thinking for the following models:

| Model Series | Reasoning Parser Name |
|--------------|-----------------------|
| moonshotai/Kimi-K2-Thinking | kimi_k2 |
| MiniMaxAI/MiniMax-M2 | minimax_m2 |

## Example Usage

To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example:

??? code

    ```python
    """
    Launch the server first, e.g.:

    vllm serve MiniMaxAI/MiniMax-M2 \
        --tensor-parallel-size 4 \
        --tool-call-parser minimax_m2 \
        --reasoning-parser minimax_m2 \
        --enable-auto-tool-choice
    """

    import json

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")


    def get_current_weather(location: str, unit: str):
        """Get the current weather in a given location"""
        if unit == "celsius":
            return f"The current temperature in {location} is 22°C."
        else:
            return f"The current temperature in {location} is 72°F."


    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g., 'San Francisco, CA'",
                        },
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["location", "unit"],
                },
            },
        }
    ]

    messages = [{"role": "user", "content": "What's the weather in Fahrenheit like in San Francisco?"}]

    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )

    # Append the assistant turn, including its reasoning, so the model
    # can keep thinking across the tool call.
    messages.append(
        {
            "role": "assistant",
            "tool_calls": response.choices[0].message.tool_calls,
            "reasoning": response.choices[0].message.reasoning,  # append reasoning
        }
    )

    # Simulate tool execution
    available_tools = {"get_weather": get_current_weather}

    completion_tool_calls = response.choices[0].message.tool_calls
    for call in completion_tool_calls:
        tool_to_call = available_tools[call.function.name]
        args = json.loads(call.function.arguments)
        result = tool_to_call(**args)
        messages.append(
            {
                "role": "tool",
                "content": result,
                "tool_call_id": call.id,
                "name": call.function.name,
            }
        )

    response_2 = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )
    print(response_2.choices[0].message.content)
    ```

This example demonstrates how to set up interleaved thinking with tool calls using a weather-retrieval function. The model reasons about the tool results before generating the final response.
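
The reasoning surfaced by the server can also be inspected directly. The helper below is a hypothetical convenience (not part of vLLM or the OpenAI SDK); it reads the new `reasoning` field and falls back to the older `reasoning_content` name, mirroring the compatibility handling in the schema changes below:

```python
# Hypothetical helper: read reasoning off a response message, tolerating
# both the new `reasoning` field and the deprecated `reasoning_content`.
def get_reasoning(message) -> str | None:
    return getattr(message, "reasoning", None) or getattr(
        message, "reasoning_content", None
    )

# Usage with the example above: print the thinking emitted at each step.
for step, resp in enumerate((response, response_2), start=1):
    reasoning = get_reasoning(resp.choices[0].message)
    if reasoning:
        print(f"--- reasoning at step {step} ---\n{reasoning}")
```
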
@@ -161,6 +161,7 @@ def main():
         {
             "role": "assistant",
             "tool_calls": chat_completion.choices[0].message.tool_calls,
+            "reasoning": chat_completion.choices[0].message.reasoning,
         }
     )

@@ -240,6 +240,9 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
     tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
     """The tool calls generated by the model, such as function calls."""
 
+    reasoning: str | None
+    """The reasoning content for interleaved thinking."""
+
 
 ChatCompletionMessageParam: TypeAlias = (
     OpenAIChatCompletionMessageParam
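
For illustration, here is a sketch (not from the patch) of an assistant message dict that matches the extended schema. Because the TypedDict is declared with `total=False`, the new key is optional, so clients that never send `reasoning` keep working unchanged:

```python
# Sketch: an assistant turn as a plain dict under the extended schema.
# `reasoning` is optional; omitting it is still a valid message.
assistant_msg = {
    "role": "assistant",
    "tool_calls": [],  # tool calls carried over from the previous completion
    "reasoning": "The user asked for Fahrenheit, so call get_weather.",
}
```
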
@@ -265,6 +268,12 @@ class ConversationMessage(TypedDict, total=False):
     tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
     """The tool calls generated by the model, such as function calls."""
 
+    reasoning: str | None
+    """The reasoning content for interleaved thinking."""
+
+    reasoning_content: str | None
+    """Deprecated: The reasoning content for interleaved thinking."""
+
 
 # Passed in by user
 ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
@@ -1374,7 +1383,7 @@ def _parse_chat_message_content(
 ) -> list[ConversationMessage]:
     role = message["role"]
     content = message.get("content")
+    reasoning = message.get("reasoning") or message.get("reasoning_content")
     if content is None:
         content = []
     elif isinstance(content, str):
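
Note the `or` in the added fallback: a truthy `reasoning` wins, and anything falsy (missing or empty) falls through to the deprecated `reasoning_content`. A standalone sketch of that precedence:

```python
# Same fallback expression as in the patch, shown in isolation.
def pick_reasoning(message: dict) -> str | None:
    return message.get("reasoning") or message.get("reasoning_content")

assert pick_reasoning({"reasoning": "new"}) == "new"
assert pick_reasoning({"reasoning_content": "old"}) == "old"
# Falsy values fall through: an empty `reasoning` string yields the
# deprecated field rather than "".
assert pick_reasoning({"reasoning": "", "reasoning_content": "old"}) == "old"
```
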
@@ -1396,6 +1405,12 @@ def _parse_chat_message_content(
         # follow the OpenAI spec.
         if "tool_calls" in parsed_msg and parsed_msg["tool_calls"] is not None:
             result_msg["tool_calls"] = list(parsed_msg["tool_calls"])
+        # Include reasoning if present for interleaved thinking.
+        if reasoning is not None:
+            result_msg["reasoning"] = cast(str, reasoning)
+            result_msg["reasoning_content"] = cast(
+                str, reasoning
+            )  # keep compatibility
     elif role == "tool":
         parsed_msg = _ToolParser(message)
         if "tool_call_id" in parsed_msg: