[Frontend] supports interleaved thinking (#28531)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-12-09 21:35:01 +08:00 · 2025-11-13 16:14:13 +08:00 · 2025-11-13 16:14:13 +08:00 · 5c9ad138d5
commit 5c9ad138d5
parent fa183e9271
3 changed files with 135 additions and 1 deletions
--- a/docs/features/interleaved_thinking.md
+++ b/docs/features/interleaved_thinking.md
@ -0,0 +1,118 @@
+# Interleaved Thinking
+
+## Introduction
+
+Interleaved thinking allows models to reason between tool calls, enabling more sophisticated decision-making after receiving tool results. This feature helps models chain multiple tool calls with reasoning steps in between and make nuanced decisions based on intermediate results.
+
+Important: Interleaved thinking increases token usage and response latency. Consider your budget and performance requirements when enabling this feature.
+
+## How Interleaved Thinking Works
+
+With interleaved thinking, the model can:
+
+- Reason about the results of a tool call before deciding what to do next
+- Chain multiple tool calls with reasoning steps in between
+- Make more nuanced decisions based on intermediate results
+- Provide transparent reasoning for its tool selection process
+
+## Supported Models
+
+vLLM currently supports the following interleaved thinking models:
+
+| Model Series | Reasoning Parser Name |
+|--------------|-----------------------|
+| moonshotai/Kimi-K2-Thinking    |  kimi_k2  |
+| MiniMaxAI/MiniMax-M2           |  minimax_m2  |
+
+## Example Usage
+
+To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example:
+
+??? code
+
+    ```python
+    """
+    vllm serve MiniMaxAI/MiniMax-M2 \
+      --tensor-parallel-size 4 \
+      --tool-call-parser minimax_m2 \
+      --reasoning-parser minimax_m2 \
+      --enable-auto-tool-choice
+    """
+    import json
+    
+    from openai import OpenAI
+    
+    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
+    
+    
+    def get_current_weather(location: str, unit: str):
+        """Get the current weather in a given location"""
+        if unit == "celsius":
+            return f"The current temperature in {location} is 22°C."
+        else:
+            return f"The current temperature in {location} is 72°F."
+    
+    
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "City and state, e.g., 'San Francisco, CA'",
+                        },
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                    },
+                    "required": ["location", "unit"],
+                },
+            },
+        }
+    ]
+    messages = [{"role": "user", "content": "What's the weather in Fahrenheit like in San Francisco?"}]
+    response = client.chat.completions.create(
+        model=client.models.list().data[0].id,
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+    )
+    
+    tool_call = response.choices[0].message.tool_calls[0].function
+    
+    messages.append(
+        {
+            "role": "assistant",
+            "tool_calls": response.choices[0].message.tool_calls,
+            "reasoning": response.choices[0].message.reasoning,  # send the reasoning back with the tool calls
+        }
+    )
+    
+    # Simulate tool execution
+    available_tools = {"get_weather": get_current_weather}
+    
+    completion_tool_calls = response.choices[0].message.tool_calls
+    for call in completion_tool_calls:
+        tool_to_call = available_tools[call.function.name]
+        args = json.loads(call.function.arguments)
+        result = tool_to_call(**args)
+        messages.append(
+            {
+                "role": "tool",
+                "content": result,
+                "tool_call_id": call.id,
+                "name": call.function.name,
+            }
+        )
+    response_2 = client.chat.completions.create(
+        model=client.models.list().data[0].id,
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+    )
+    print(response_2.choices[0].message.content)
+    ```
+
+This example demonstrates how to set up interleaved thinking with tool calls using a weather retrieval function. The model reasons about the tool results before generating the final response.
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@ -161,6 +161,7 @@ def main():
        {
            "role": "assistant",
            "tool_calls": chat_completion.choices[0].message.tool_calls,
+            "reasoning": chat_completion.choices[0].message.reasoning,
        }
    )

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@ -240,6 +240,9 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
    tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
    """The tool calls generated by the model, such as function calls."""

+    reasoning: str | None
+    """The reasoning content for interleaved thinking."""
+

 ChatCompletionMessageParam: TypeAlias = (
    OpenAIChatCompletionMessageParam
@ -265,6 +268,12 @@ class ConversationMessage(TypedDict, total=False):
    tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
    """The tool calls generated by the model, such as function calls."""

+    reasoning: str | None
+    """The reasoning content for interleaved thinking."""
+
+    reasoning_content: str | None
+    """Deprecated: The reasoning content for interleaved thinking."""
+

 # Passed in by user
 ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
@ -1374,7 +1383,7 @@ def _parse_chat_message_content(
 ) -> list[ConversationMessage]:
    role = message["role"]
    content = message.get("content")
-
+    reasoning = message.get("reasoning") or message.get("reasoning_content")
    if content is None:
        content = []
    elif isinstance(content, str):
@ -1396,6 +1405,12 @@ def _parse_chat_message_content(
            # follow the OpenAI spec.
            if "tool_calls" in parsed_msg and parsed_msg["tool_calls"] is not None:
                result_msg["tool_calls"] = list(parsed_msg["tool_calls"])
+            # Include reasoning if present for interleaved thinking.
+            if reasoning is not None:
+                result_msg["reasoning"] = cast(str, reasoning)
+                result_msg["reasoning_content"] = cast(
+                    str, reasoning
+                )  # keep compatibility
        elif role == "tool":
            parsed_msg = _ToolParser(message)
            if "tool_call_id" in parsed_msg: