[IMPROVEMENT] Change MistralReasoningParser behavior (#30391)

Signed-off-by: juliendenize <julien.denize@mistral.ai>
Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Julien Denize 2025-12-11 17:53:26 +01:00 committed by GitHub
parent 305b168a9f
commit aa3c250c48
2 changed files with 192 additions and 70 deletions

View File

@@ -18,47 +18,53 @@ def mistral_tokenizer():
     return mistral_tokenizer
 
 
-SIMPLE_REASONING = {
+INVALID_SIMPLE_REASONING = {
     "output": "This is a reasoning section[/THINK]This is the rest",
-    "reasoning": "This is a reasoning section",
-    "content": "This is the rest",
-    "is_reasoning_end": True,
+    "reasoning": None,
+    "content": "This is a reasoning sectionThis is the rest",
+    "is_reasoning_end": False,
 }
-COMPLETE_REASONING = {
+INVALID_COMPLETE_REASONING = {
     "output": "This is a reasoning section[/THINK]",
-    "reasoning": "This is a reasoning section",
-    "content": None,
-    "is_reasoning_end": True,
+    "reasoning": None,
+    "content": "This is a reasoning section",
+    "is_reasoning_end": False,
 }
 NO_CONTENT = {
-    "output": "This is content",
-    "reasoning": "This is content",
+    "output": "[THINK]This is reasoning",
+    "reasoning": "This is reasoning",
     "content": None,
     "is_reasoning_end": False,
 }
+NO_REASONING = {
+    "output": "This is content",
+    "reasoning": None,
+    "content": "This is content",
+    "is_reasoning_end": False,
+}
 NO_REASONING_STREAMING = {
     "output": "This is a reasoning section",
-    "reasoning": "This is a reasoning section",
-    "content": None,
+    "reasoning": None,
+    "content": "This is a reasoning section",
     "is_reasoning_end": False,
 }
-MULTIPLE_LINES = {
+INVALID_MULTIPLE_LINES = {
     "output": "This\nThat[/THINK]This is the rest\nThat",
-    "reasoning": "This\nThat",
-    "content": "This is the rest\nThat",
-    "is_reasoning_end": True,
+    "reasoning": None,
+    "content": "This\nThatThis is the rest\nThat",
+    "is_reasoning_end": False,
 }
-SHORTEST_REASONING_NO_STREAMING = {
-    "output": "[/THINK]This is the rest",
-    "reasoning": "",
-    "content": "This is the rest",
-    "is_reasoning_end": True,
-}
-SHORTEST_REASONING = {
+INVALID_SHORTEST_REASONING_NO_STREAMING = {
     "output": "[/THINK]This is the rest",
     "reasoning": None,
     "content": "This is the rest",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
+}
+INVALID_SHORTEST_REASONING = {
+    "output": "[/THINK]This is the rest",
+    "reasoning": None,
+    "content": "This is the rest",
+    "is_reasoning_end": False,
 }
 REASONING_WITH_THINK = {
     "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
@@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = {
     "content": "This is the rest\nThat",
     "is_reasoning_end": True,
 }
-SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
-    "output": "[/THINK]This is the rest",
-    "reasoning": "",
-    "content": "This is the rest",
-    "is_reasoning_end": True,
-}
-SHORTEST_REASONING_WITH_THINK = {
+INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
     "output": "[/THINK]This is the rest",
     "reasoning": None,
     "content": "This is the rest",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
+}
+INVALID_SHORTEST_REASONING_WITH_THINK = {
+    "output": "[/THINK]This is the rest",
+    "reasoning": None,
+    "content": "This is the rest",
+    "is_reasoning_end": False,
 }
 THINK_NO_END = {
     "output": "[THINK]This is a reasoning section",
@@ -98,8 +104,8 @@ THINK_NO_END = {
 }
 EMPTY = {
     "output": "",
-    "reasoning": "",
-    "content": None,
+    "reasoning": None,
+    "content": "",
     "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
@@ -109,47 +115,48 @@ EMPTY_STREAMING = {
     "is_reasoning_end": False,
 }
 NEW_LINE = {
-    "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
+    "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
     "reasoning": "This is a reasoning section",
-    "content": "\nThis is the rest",
+    "content": "Before\n\nThis is the rest",
     "is_reasoning_end": True,
 }
-# Streaming cannot handle new lines at the beginning of the output
-# because we need to support [THINK]...[/THINK] and [/THINK]...
-# We cannot know if the text before [THINK] is reasoning content
-# or not.
 NEW_LINE_STREAMING = {
-    "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning": "\nThis is a reasoning section",
-    "content": "\nThis is the rest",
+    "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "Before\n\nThis is the rest",
     "is_reasoning_end": True,
 }
 
 TEST_CASES = [
     pytest.param(
         False,
-        SIMPLE_REASONING,
-        id="simple_reasoning",
+        INVALID_SIMPLE_REASONING,
+        id="invalid_simple_reasoning",
     ),
     pytest.param(
         True,
-        SIMPLE_REASONING,
-        id="simple_reasoning_streaming",
+        INVALID_SIMPLE_REASONING,
+        id="invalid_simple_reasoning_streaming",
     ),
     pytest.param(
         False,
-        COMPLETE_REASONING,
-        id="complete_reasoning",
+        INVALID_COMPLETE_REASONING,
+        id="invalid_complete_reasoning",
     ),
     pytest.param(
         True,
-        COMPLETE_REASONING,
-        id="complete_reasoning_streaming",
+        INVALID_COMPLETE_REASONING,
+        id="invalid_complete_reasoning_streaming",
     ),
     pytest.param(
         False,
         NO_CONTENT,
-        id="no_content_token",
+        id="no_content",
+    ),
+    pytest.param(
+        False,
+        NO_REASONING,
+        id="no_reasoning",
     ),
     pytest.param(
         True,
@@ -158,23 +165,23 @@ TEST_CASES = [
     ),
     pytest.param(
         False,
-        MULTIPLE_LINES,
-        id="multiple_lines",
+        INVALID_MULTIPLE_LINES,
+        id="invalid_multiple_lines",
     ),
     pytest.param(
         True,
-        MULTIPLE_LINES,
-        id="multiple_lines_streaming",
+        INVALID_MULTIPLE_LINES,
+        id="invalid_multiple_lines_streaming",
     ),
     pytest.param(
         True,
-        SHORTEST_REASONING,
-        id="shortest",
+        INVALID_SHORTEST_REASONING,
+        id="invalid_shortest",
     ),
     pytest.param(
         False,
-        SHORTEST_REASONING_NO_STREAMING,
-        id="shortest_streaming",
+        INVALID_SHORTEST_REASONING_NO_STREAMING,
+        id="invalid_shortest_streaming",
     ),
     pytest.param(
         False,
@@ -208,13 +215,13 @@ TEST_CASES = [
     ),
     pytest.param(
         False,
-        SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
-        id="shortest_with_think",
+        INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
+        id="invalid_shortest_with_think",
     ),
     pytest.param(
         True,
-        SHORTEST_REASONING_WITH_THINK,
-        id="shortest_with_think_streaming",
+        INVALID_SHORTEST_REASONING_WITH_THINK,
+        id="invalid_shortest_with_think_streaming",
     ),
     pytest.param(
         False,
@@ -316,10 +323,26 @@ def test_mistral_reasoning(
 
     # Test extract_content
     if param_dict["content"] is not None:
-        content = parser.extract_content_ids(output_tokens)
-        assert content == mistral_tokenizer.tokenizer.encode(
-            param_dict["content"], bos=False, eos=False
-        )
+        # Handle the case where there are tokens outputted before Thinking.
+        # This should not occur if the model is well trained and prompted.
+        if "[THINK]" in param_dict["output"] and not param_dict["output"].startswith(
+            "[THINK]"
+        ):
+            before_content = param_dict["output"].split("[THINK]")[0]
+            before_token_ids = mistral_tokenizer.tokenizer.encode(
+                before_content, bos=False, eos=False
+            )
+            left_to_encode = param_dict["content"][len(before_content) :]
+        # Normal situation.
+        else:
+            before_token_ids = []
+            left_to_encode = param_dict["content"]
+
+        content_tokens = parser.extract_content_ids(output_tokens)
+        expected_token_ids = before_token_ids + mistral_tokenizer.tokenizer.encode(
+            left_to_encode, bos=False, eos=False
+        )
+        assert content_tokens == expected_token_ids
     else:
         content = parser.extract_content_ids(output_tokens)
         assert content == []
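For reference, the token-level rule these updated assertions encode is: ids emitted before [THINK] and after [/THINK] count as content, the span between the two counts as reasoning, and a stray [/THINK] is dropped. A minimal sketch of that rule, using 10 and 11 as placeholder ids for [THINK] and [/THINK] (the real ids come from MistralTokenizer); this is an illustration, not the vLLM implementation:

# Illustrative sketch only; 10 and 11 stand in for the real [THINK]/[/THINK]
# token ids produced by MistralTokenizer.
def content_ids(ids: list[int], bot: int = 10, eot: int = 11) -> list[int]:
    bot_idx = eot_idx = None
    # Single pass: remember the first [THINK], stop at the first [/THINK].
    for i, tok in enumerate(ids):
        if tok == bot and bot_idx is None:
            bot_idx = i
        elif tok == eot:
            eot_idx = i
            break
    if bot_idx is not None and eot_idx is None:
        return ids[:bot_idx]                       # only [THINK]: keep what came before it
    if bot_idx is None and eot_idx is None:
        return ids                                 # no markers: everything is content
    if bot_idx is not None and eot_idx is not None:
        return ids[:bot_idx] + ids[eot_idx + 1 :]  # keep text around the reasoning block
    return ids[:eot_idx] + ids[eot_idx + 1 :]      # stray [/THINK]: drop it, keep the rest

assert content_ids([1, 2, 10, 3, 4, 11, 5]) == [1, 2, 5]
assert content_ids([11, 5, 6]) == [5, 6]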

View File

@@ -3,20 +3,29 @@
 from functools import cached_property
 
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ResponsesRequest,
+)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
-from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 from vllm.tokenizers import MistralTokenizer
 
 logger = init_logger(__name__)
 
 
-class MistralReasoningParser(DeepSeekR1ReasoningParser):
+class MistralReasoningParser(BaseThinkingReasoningParser):
     """
     Reasoning parser for Mistral models.
 
-    The Mistral models uses [THINK]...[/THINK] tokens to denote reasoning
+    The Mistral models uses `[THINK]`...`[/THINK]` tokens to denote reasoning
     text. This parser extracts the reasoning content from the model output.
+
+    A valid reasoning trace should always start with a `[THINK]` token and end with
+    a `[/THINK]` token.
+
+    If `[THINK]` token is not generated, then this parser only returns content.
     """
 
     def __init__(self, tokenizer: MistralTokenizer, *args, **kwargs):
@@ -53,3 +62,93 @@ class MistralReasoningParser(DeepSeekR1ReasoningParser):
         from mistral_common.tokens.tokenizers.base import SpecialTokens
 
         return SpecialTokens.end_think
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        has_eot_token = False
+        for id in input_ids[::-1]:
+            if id == self.start_token_id:
+                # Reasoning ends only if a BOT token is found before a EOT token.
+                return has_eot_token
+            elif id == self.end_token_id:
+                has_eot_token = True
+        return False
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract the content
+        """
+        has_bot_token = False
+        has_eot_token = False
+        bot_token_index = -1
+        eot_token_index = -1
+
+        # One for loop instead of multiple lookups
+        for i, token_id in enumerate(input_ids):
+            # We filter that we have multiple BOT tokens which should not
+            # happen for a well prompted trained model
+            if token_id == self.start_token_id and not has_bot_token:
+                has_bot_token = True
+                bot_token_index = i
+            elif token_id == self.end_token_id:
+                has_eot_token = True
+                eot_token_index = i
+                break
+
+        # 1. Only BOT has been outputted
+        if has_bot_token and not has_eot_token:
+            # Should be = [] if model is well prompted and trained.
+            return input_ids[:bot_token_index]
+        # 2. Neither BOT or EOT have been outputted
+        elif not has_bot_token and not has_eot_token:
+            return input_ids
+        # 3. Both BOT and EOT have been outputted.
+        elif has_bot_token and has_eot_token:
+            return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :]
+        # 4. Only EOT has been outputted => this should not have occured for a model
+        # well prompted and trained.
+        else:
+            return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :]
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
+        """
+        Extract reasoning content from the model output.
+        """
+        if not model_output:
+            return (None, "")
+
+        # Check if the start token is present in the model output, remove it
+        # if it is present.
+        prev_bot_token, bot_token, post_bot_token = model_output.partition(
+            self.start_token
+        )
+        has_bot_token = bool(bot_token)
+        # Valid EOT tokens should follow BOT token
+        has_valid_eot_token = has_bot_token and self.end_token in post_bot_token
+
+        # 1. If there is BOT token followed by EOT token
+        if has_bot_token and has_valid_eot_token:
+            prev_eot_token, _, post_eot_token = post_bot_token.partition(self.end_token)
+            # If model is well prompted and trained prev_bot_token should be ""
+            content = prev_bot_token + post_eot_token
+            return prev_eot_token, content if content else None
+        # 2. Only BOT token
+        elif has_bot_token:
+            # If model is well prompted and trained prev_bot_token should be ""
+            return post_bot_token, prev_bot_token if prev_bot_token else None
+        # 3. EOT token has been outputted without BOT or neither has been outputted
+        else:
+            has_non_valid_eot_token = self.end_token in prev_bot_token
+            # 3.a EOT token has been outputted without BOT
+            # If model is well prompted and trained `has_non_valid_eot_token` should
+            # be `False` and the parser outputs all tokens as 'content'
+            if has_non_valid_eot_token:
+                prev_eot_token, _, post_eot_token = prev_bot_token.partition(
+                    self.end_token
+                )
+                return None, prev_eot_token + post_eot_token
+            # 3.b neither BOT or EOT have been outputted
+            else:
+                return None, prev_bot_token
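At the string level, the branches above boil down to the behavior sketched below. This is a standalone re-statement for illustration, assuming literal "[THINK]"/"[/THINK]" markers; it mirrors the cases of extract_reasoning but is not the class's actual entry point, which also receives the request object:

# Behavioral sketch of the string-level rule, not the parser's real code path.
def split_reasoning(
    output: str, bot: str = "[THINK]", eot: str = "[/THINK]"
) -> tuple[str | None, str | None]:
    if not output:
        return None, ""
    before, bot_found, after = output.partition(bot)
    if bot_found and eot in after:
        # Valid [THINK]...[/THINK] pair: text around it is content.
        reasoning, _, rest = after.partition(eot)
        content = before + rest
        return reasoning, content or None
    if bot_found:
        # [THINK] opened but never closed: everything after it is reasoning.
        return after, before or None
    # No [THINK] at all: strip a stray [/THINK] and return plain content.
    head, _, tail = output.partition(eot)
    return None, head + tail

assert split_reasoning("[THINK]reasoning[/THINK]rest") == ("reasoning", "rest")
assert split_reasoning("reasoning[/THINK]rest") == (None, "reasoningrest")
assert split_reasoning("[THINK]still thinking") == ("still thinking", None)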