[New Model] Add Seed-Oss model (#23241)

Signed-off-by: jiabin.00 <jiabin.00@bytedance.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-07-25 04:07:09 +08:00 · 2025-08-22 12:58:10 +08:00 · 2025-08-22 12:58:10 +08:00 · 5964069367
commit 5964069367
parent de9c085e17
7 changed files with 1629 additions and 0 deletions
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -401,6 +401,7 @@ th {
 | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
 | `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -292,6 +292,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
    "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
    "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501
                                          trust_remote_code=True,
                                          is_available_online=False),
    "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"),
    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),  # noqa: E501
    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@ -0,0 +1,459 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
 import json
 from collections.abc import Generator
 from typing import Optional
 import pytest
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              ChatCompletionToolsParam,
                                              DeltaMessage, FunctionCall,
                                              ToolCall)
 from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser
 from vllm.transformers_utils.detokenizer import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 # Use a common model that is likely to be available
 MODEL = "ByteDance-Seed/Seed-OSS-36B-Instruct"
@pytest.fixture(scope="module")
 def seed_oss_tokenizer():
    return get_tokenizer(tokenizer_name=MODEL, trust_remote_code=True)
@pytest.fixture
 def seed_oss_tool_parser(seed_oss_tokenizer):
    return SeedOssToolParser(seed_oss_tokenizer)
@pytest.fixture
 def sample_tools():
    return [
        ChatCompletionToolsParam(
            type="function",
            function={
                "name": "get_weather",
                "description": "Get current temperature for a given location.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description":
                            "City and country e.g. Bogotá, Colombia"
                        },
                        "unit": {
                            "type": "string",
                            "description": "this is the unit of temperature"
                        }
                    },
                    "required": ["location"],
                    "additionalProperties": False
                },
                "returns": {
                    "type": "object",
                    "properties": {
                        "temperature": {
                            "type": "number",
                            "description": "temperature in celsius"
                        }
                    },
                    "required": ["temperature"],
                    "additionalProperties": False
                },
                "strict": True
            }),
    ]
 def assert_tool_calls(actual_tool_calls: list[ToolCall],
                      expected_tool_calls: list[ToolCall]):
    assert len(actual_tool_calls) == len(expected_tool_calls)
    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
                                                    expected_tool_calls):
        # Seed-OSS tool call will not generate id
        assert actual_tool_call.type == "function"
        assert actual_tool_call.function == expected_tool_call.function
        assert actual_tool_call.function.name == expected_tool_call.function.name
        assert actual_tool_call.function.arguments == expected_tool_call.function.arguments
 def test_extract_tool_calls_no_tools(seed_oss_tool_parser):
    model_output = "This is a test response without any tool calls"
    extracted_tool_calls = seed_oss_tool_parser.extract_tool_calls(
        model_output, request=None)  # type: ignore[arg-type]
    assert not extracted_tool_calls.tools_called
    assert extracted_tool_calls.tool_calls == []
    assert extracted_tool_calls.content == model_output
@pytest.mark.parametrize(
    ids=[
        "tool_call_0_thinking_budget",
        "tool_call_512_thinkg_budget",
        "tool_call_unlimited_thinking_budget",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        ("""<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
         """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n"""
         """<seed:tool_call>\n<function=get_weather>\n"""
         """<parameter=location>Barcelona, Spain</parameter>\n</function>\n</seed:tool_call>""",
         [
             ToolCall(function=FunctionCall(
                 name="get_weather",
                 arguments=json.dumps({
                     "location": "Barcelona, Spain",
                 }, ),
             ),
                      type='function')
         ],
         """<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
         """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n"""
         ),
        (
            """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
            """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
            """there\'s a get_weather function that can retrieve the current temperature for a given location. \n\nFirst, """
            """check the parameters required by get_weather: location is mandatory (needs city and country), and unit is """
            """optional. The user provided "Barcelona Spain" as the location, which fits the required format (city, """
            """country). \n<seed:cot_budget_reflect>I have used 131 tokens, and there are 381 tokens remaining for use."""
            """</seed:cot_budget_reflect>\n Since the unit isn\'t specified, the function will default to Celsius, which """
            """is fine. \n\nThere\'s no need to ask for more information because the location is clear. So I should call """
            """the get_weather function with location set to "Barcelona, Spain" (adding a comma for clarity, though the """
            """user\'s input has a space, but the function might accept either; to be safe, using the standard format """
            """with a comma).\n<seed:cot_budget_reflect>I have used 257 tokens, and there are 255 tokens remaining for """
            """use.</seed:cot_budget_reflect>\n The unit parameter can be omitted since it\'s optional.</seed:think>\n"""
            """<seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, Spain</parameter>\n</function>"""
            """\n</seed:tool_call>""",
            [
                ToolCall(function=FunctionCall(
                    name="get_weather",
                    arguments=json.dumps({
                        "location": "Barcelona, Spain",
                    }, ),
                ),
                         type='function')
            ],
            """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
            """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
            """there\'s a get_weather function that can retrieve the current temperature for a given location. \n\nFirst, """
            """check the parameters required by get_weather: location is mandatory (needs city and country), and unit is """
            """optional. The user provided "Barcelona Spain" as the location, which fits the required format (city, """
            """country). \n<seed:cot_budget_reflect>I have used 131 tokens, and there are 381 tokens remaining for use."""
            """</seed:cot_budget_reflect>\n Since the unit isn\'t specified, the function will default to Celsius, which """
            """is fine. \n\nThere\'s no need to ask for more information because the location is clear. So I should call """
            """the get_weather function with location set to "Barcelona, Spain" (adding a comma for clarity, though the """
            """user\'s input has a space, but the function might accept either; to be safe, using the standard format """
            """with a comma).\n<seed:cot_budget_reflect>I have used 257 tokens, and there are 255 tokens remaining for """
            """use.</seed:cot_budget_reflect>\n The unit parameter can be omitted since it\'s optional.</seed:think>\n""",
        ),
        (
            """<seed:think>\nGot it, let\'s see. The user asked for the weather in Barcelona, Spain. """
            """First, I need to remember the function I can use: get_weather. The function requires a """
            """location (city and country) which is "Barcelona, Spain" here, and unit is optional. Since """
            """the user didn\'t specify the unit, the default in the function is Celsius, right? Wait, """
            """let me check the function docstring again. Oh, the function says unit is optional, and """
            """returns temperature in Celsius. So I should call get_weather with location "Barcelona, """
            """Spain" and maybe omit unit or set to Celsius. Let me format the function call correctly. """
            """The format is <seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, """
            """Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>. """
            """Wait, but does the unit parameter accept "celsius"? The docstring says unit is the unit """
            """of temperature, but the return is in Celsius anyway. Maybe even if I don\'t pass unit, """
            """it\'s okay, but to be explicit, maybe pass "celsius". Let me go with that. So the function """
            """call should be as above. Then wait for the result to come back and tell the user the """
            """temperature in Celsius.</seed:think><seed:tool_call>\n<function=get_weather>\n<parameter=location>"""
            """Barcelona, Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>""",
            [
                ToolCall(function=FunctionCall(
                    name="get_weather",
                    arguments=json.dumps(
                        {
                            "location": "Barcelona, Spain",
                            "unit": "celsius",
                        }, ),
                ),
                         type='function')
            ],
            """<seed:think>\nGot it, let\'s see. The user asked for the weather in Barcelona, Spain. """
            """First, I need to remember the function I can use: get_weather. The function requires a """
            """location (city and country) which is "Barcelona, Spain" here, and unit is optional. Since """
            """the user didn\'t specify the unit, the default in the function is Celsius, right? Wait, """
            """let me check the function docstring again. Oh, the function says unit is optional, and """
            """returns temperature in Celsius. So I should call get_weather with location "Barcelona, """
            """Spain" and maybe omit unit or set to Celsius. Let me format the function call correctly. """
            """The format is <seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, """
            """Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>. """
            """Wait, but does the unit parameter accept "celsius"? The docstring says unit is the unit """
            """of temperature, but the return is in Celsius anyway. Maybe even if I don\'t pass unit, """
            """it\'s okay, but to be explicit, maybe pass "celsius". Let me go with that. So the function """
            """call should be as above. Then wait for the result to come back and tell the user the """
            """temperature in Celsius.</seed:think>""",
        ),
    ],
 )
 def test_extract_tool_calls(seed_oss_tool_parser, sample_tools, model_output,
                            expected_tool_calls, expected_content):
    request = ChatCompletionRequest(model=MODEL,
                                    messages=[],
                                    tools=sample_tools)
    extracted_tool_calls = seed_oss_tool_parser.extract_tool_calls(
        model_output, request=request)  # type: ignore[arg-type]
    assert extracted_tool_calls.tools_called
    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
    assert extracted_tool_calls.content == expected_content
 def test_streaming_tool_calls_no_tools(seed_oss_tool_parser):
    model_output = "This is a test response without any tool calls"
    result = seed_oss_tool_parser.extract_tool_calls_streaming(
        previous_text="his is a test response",
        current_text=model_output,
        delta_text=" without any tool calls.",
        previous_token_ids=[],
        current_token_ids=[],
        delta_token_ids=[],
        request=None,
    )
    # Should return the delta text as content
    assert result is not None
    assert hasattr(result, 'content')
    assert result.content == " without any tool calls."
 def stream_delta_message_generator(
    seed_oss_tool_parser: SeedOssToolParser,
    seed_oss_tokenizer: AnyTokenizer,
    model_output: str,
    request: Optional[ChatCompletionRequest] = None
 ) -> Generator[DeltaMessage, None, None]:
    all_token_ids = seed_oss_tokenizer.encode(model_output,
                                              add_special_tokens=False)
    previous_text = ""
    previous_tokens = None
    prefix_offset = 0
    read_offset = 0
    for i, delta_token in enumerate(all_token_ids):
        delta_token_ids = [delta_token]
        previous_token_ids = all_token_ids[:i]
        current_token_ids = all_token_ids[:i + 1]
        (new_tokens, delta_text, new_prefix_offset,
         new_read_offset) = detokenize_incrementally(
             tokenizer=seed_oss_tokenizer,
             all_input_ids=current_token_ids,
             prev_tokens=previous_tokens,
             prefix_offset=prefix_offset,
             read_offset=read_offset,
             skip_special_tokens=False,
             spaces_between_special_tokens=True,
         )
        current_text = previous_text + delta_text
        delta_message = seed_oss_tool_parser.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
            request=request,
        )
        if delta_message:
            yield delta_message
        previous_text = current_text
        previous_tokens = (previous_tokens +
                           new_tokens if previous_tokens else new_tokens)
        prefix_offset = new_prefix_offset
        read_offset = new_read_offset
@pytest.mark.parametrize(
    ids=[
        "tool_call_0_thinking_budget",
        "tool_call_512_thinkg_budget",
        "tool_call_unlimited_thinking_budget",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        ("""<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
         """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n"""
         """<seed:tool_call>\n<function=get_weather>\n"""
         """<parameter=location>Barcelona, Spain</parameter>\n</function>\n</seed:tool_call>""",
         [
             ToolCall(function=FunctionCall(
                 name="get_weather",
                 arguments=json.dumps({
                     "location": "Barcelona, Spain",
                 }, ),
             ),
                      type='function')
         ],
         """<seed:think>\n</seed:cot_budget_reflect>\n</seed:cot_budget_reflect>\n"""
         """The current thinking budget is 0, so I will directly start answering the question.\n</seed:think>\n"""
         ),
        (
            """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
            """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
            """there\'s a get_weather function that can retrieve the current temperature for a given location. \n\nFirst, """
            """check the parameters required by get_weather: location is mandatory (needs city and country), and unit is """
            """optional. The user provided "Barcelona Spain" as the location, which fits the required format (city, """
            """country). \n<seed:cot_budget_reflect>I have used 131 tokens, and there are 381 tokens remaining for use."""
            """</seed:cot_budget_reflect>\n Since the unit isn\'t specified, the function will default to Celsius, which """
            """is fine. \n\nThere\'s no need to ask for more information because the location is clear. So I should call """
            """the get_weather function with location set to "Barcelona, Spain" (adding a comma for clarity, though the """
            """user\'s input has a space, but the function might accept either; to be safe, using the standard format """
            """with a comma).\n<seed:cot_budget_reflect>I have used 257 tokens, and there are 255 tokens remaining for """
            """use.</seed:cot_budget_reflect>\n The unit parameter can be omitted since it\'s optional.</seed:think>\n"""
            """<seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, Spain</parameter>\n</function>"""
            """\n</seed:tool_call>""",
            [
                ToolCall(function=FunctionCall(
                    name="get_weather",
                    arguments=json.dumps({
                        "location": "Barcelona, Spain",
                    }, ),
                ),
                         type='function')
            ],
            """<seed:think>The user\'s current thinking budget is 512.</seed:cot_budget_reflect>\nLet me analyze the """
            """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """
            """there\'s a get_weather function that can retrieve the current temperature for a given location. \n\nFirst, """
            """check the parameters required by get_weather: location is mandatory (needs city and country), and unit is """
            """optional. The user provided "Barcelona Spain" as the location, which fits the required format (city, """
            """country). \n<seed:cot_budget_reflect>I have used 131 tokens, and there are 381 tokens remaining for use."""
            """</seed:cot_budget_reflect>\n Since the unit isn\'t specified, the function will default to Celsius, which """
            """is fine. \n\nThere\'s no need to ask for more information because the location is clear. So I should call """
            """the get_weather function with location set to "Barcelona, Spain" (adding a comma for clarity, though the """
            """user\'s input has a space, but the function might accept either; to be safe, using the standard format """
            """with a comma).\n<seed:cot_budget_reflect>I have used 257 tokens, and there are 255 tokens remaining for """
            """use.</seed:cot_budget_reflect>\n The unit parameter can be omitted since it\'s optional.</seed:think>\n""",
        ),
        (
            """<seed:think>\nGot it, let\'s see. The user asked for the weather in Barcelona, Spain. """
            """First, I need to remember the function I can use: get_weather. The function requires a """
            """location (city and country) which is "Barcelona, Spain" here, and unit is optional. Since """
            """the user didn\'t specify the unit, the default in the function is Celsius, right? Wait, """
            """let me check the function docstring again. Oh, the function says unit is optional, and """
            """returns temperature in Celsius. So I should call get_weather with location "Barcelona, """
            """Spain" and maybe omit unit or set to Celsius. Let me format the function call correctly. """
            """The format is <seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, """
            """Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>. """
            """Wait, but does the unit parameter accept "celsius"? The docstring says unit is the unit """
            """of temperature, but the return is in Celsius anyway. Maybe even if I don\'t pass unit, """
            """it\'s okay, but to be explicit, maybe pass "celsius". Let me go with that. So the function """
            """call should be as above. Then wait for the result to come back and tell the user the """
            """temperature in Celsius.</seed:think><seed:tool_call>\n<function=get_weather>\n<parameter=location>"""
            """Barcelona, Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>""",
            [
                ToolCall(function=FunctionCall(
                    name="get_weather",
                    arguments=json.dumps(
                        {
                            "location": "Barcelona, Spain",
                            "unit": "celsius",
                        }, ),
                ),
                         type='function')
            ],
            """<seed:think>\nGot it, let\'s see. The user asked for the weather in Barcelona, Spain. """
            """First, I need to remember the function I can use: get_weather. The function requires a """
            """location (city and country) which is "Barcelona, Spain" here, and unit is optional. Since """
            """the user didn\'t specify the unit, the default in the function is Celsius, right? Wait, """
            """let me check the function docstring again. Oh, the function says unit is optional, and """
            """returns temperature in Celsius. So I should call get_weather with location "Barcelona, """
            """Spain" and maybe omit unit or set to Celsius. Let me format the function call correctly. """
            """The format is <seed:tool_call>\n<function=get_weather>\n<parameter=location>Barcelona, """
            """Spain</parameter>\n<parameter=unit>celsius</parameter>\n</function>\n</seed:tool_call>. """
            """Wait, but does the unit parameter accept "celsius"? The docstring says unit is the unit """
            """of temperature, but the return is in Celsius anyway. Maybe even if I don\'t pass unit, """
            """it\'s okay, but to be explicit, maybe pass "celsius". Let me go with that. So the function """
            """call should be as above. Then wait for the result to come back and tell the user the """
            """temperature in Celsius.</seed:think>""",
        ),
    ],
 )
 def test_streaming_tool_calls(seed_oss_tool_parser, seed_oss_tokenizer,
                              sample_tools, model_output, expected_tool_calls,
                              expected_content):
    """Test incremental streaming behavior"""
    request = ChatCompletionRequest(model=MODEL,
                                    messages=[],
                                    tools=sample_tools)
    other_content = ''
    tool_states = {}  # Track state per tool index
    for delta_message in stream_delta_message_generator(
            seed_oss_tool_parser, seed_oss_tokenizer, model_output, request):
        # role should never be streamed from tool parser
        assert not delta_message.role
        if delta_message.content:
            other_content += delta_message.content
        if delta_message.tool_calls:
            for tool_call in delta_message.tool_calls:
                idx = tool_call.index
                # Initialize state for new tool
                if idx not in tool_states:
                    tool_states[idx] = {
                        "id": None,
                        "name": None,
                        "arguments": "",
                        "type": None
                    }
                # First chunk should have id, name, and type
                if tool_call.id:
                    tool_states[idx]["id"] = tool_call.id
                if tool_call.type:
                    assert tool_call.type == "function"
                    tool_states[idx]["type"] = tool_call.type
                if tool_call.function:
                    if tool_call.function.name:
                        # Should only be set once
                        assert tool_states[idx]["name"] is None
                        tool_states[idx]["name"] = tool_call.function.name
                    if tool_call.function.arguments is not None:
                        # Accumulate arguments incrementally
                        tool_states[idx][
                            "arguments"] += tool_call.function.arguments
    # Verify final content
    assert other_content == expected_content
    # Verify we got all expected tool calls
    assert len(tool_states) == len(expected_tool_calls)
    # Verify each tool call
    for idx, expected_tool in enumerate(expected_tool_calls):
        state = tool_states[idx]
        assert state["id"] is not None
        assert state["type"] == "function"
        assert state["name"] == expected_tool.function.name
        # Parse accumulated arguments
        arguments_str = state["arguments"]
        assert arguments_str is not None
        actual_args = json.loads(arguments_str)
        expected_args = json.loads(expected_tool.function.arguments)
        assert actual_args == expected_args
--- a/vllm/entrypoints/openai/tool_parsers/init.py
+++ b/vllm/entrypoints/openai/tool_parsers/init.py
@ -18,6 +18,7 @@ from .mistral_tool_parser import MistralToolParser
 from .phi4mini_tool_parser import Phi4MiniJsonToolParser
 from .pythonic_tool_parser import PythonicToolParser
 from .qwen3coder_tool_parser import Qwen3CoderToolParser
 from .seed_oss_tool_parser import SeedOssToolParser
 from .step3_tool_parser import Step3ToolParser
 from .xlam_tool_parser import xLAMToolParser
@ -41,5 +42,6 @@ __all__ = [
    "HunyuanA13BToolParser",
    "Glm4MoeModelToolParser",
    "Qwen3CoderToolParser",
    "SeedOssToolParser",
    "Step3ToolParser",
 ]
--- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
@ -0,0 +1,676 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from qwen3coder xml parser, All rights reserved.
 # ruff: noqa: E501
 import ast
 import json
 import uuid
 from collections.abc import Sequence
 from typing import Any, Optional, Union
 import regex as re
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              ChatCompletionToolsParam,
                                              DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall,
                                              ExtractedToolCallInformation,
                                              FunctionCall, ToolCall)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser, ToolParserManager)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 logger = init_logger(__name__)
@ToolParserManager.register_module("seed_oss")
 class SeedOssToolParser(ToolParser):
    TOOL_CALL_START = "<seed:tool_call>"
    TOOL_CALL_END = "</seed:tool_call>"
    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)
        # --- streaming state ---
        self._reset_streaming_state()
        self.prev_tool_call_arr: list[dict] = []
        self.tool_call_start_token: str = self.TOOL_CALL_START
        self.tool_call_end_token: str = self.TOOL_CALL_END
        # Sentinel tokens for streaming mode
        self.tool_call_prefix: str = "<function="
        self.function_end_token: str = "</function>"
        self.parameter_prefix: str = "<parameter="
        self.parameter_end_token: str = "</parameter>"
        self.think_start_token: str = "<seed:think>"
        self.think_end_token: str = "</seed:think>"
        self.is_tool_call_started: bool = False
        self.is_thinking_end: bool = False
        self.failed_count: int = 0
        self._reset_streaming_state()
        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
        self.think_end_token_id = self.vocab.get(self.think_end_token)
        if (self.tool_call_start_token_id is None
                or self.tool_call_end_token_id is None):
            raise RuntimeError(
                "Seed_Oss XML parser: tokenizer did not include "
                "<seed:tool_call> or its closing tag.")
        tool_start_re = re.escape(self.tool_call_start_token)
        tool_end_re = re.escape(self.tool_call_end_token)
        self.tool_call_complete_regex = re.compile(
            rf"{tool_start_re}(.*?){tool_end_re}", re.DOTALL)
        self.tool_call_regex = re.compile(
            rf"{tool_start_re}(.*?){tool_end_re}|{tool_start_re}(.*?)$",
            re.DOTALL)
        self.tool_call_function_regex = re.compile(
            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
        self.tool_call_parameter_regex = re.compile(
            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL)
        logger.info("vLLM Seed-Oss XML tool parser loaded (%s).",
                    self.__class__.__name__)
    def _generate_tool_call_id(self) -> str:
        """Generate a unique tool call ID."""
        return f"call_{uuid.uuid4().hex[:24]}"
    def _reset_streaming_state(self):
        """Reset all streaming state."""
        self.current_tool_index = 0
        self.is_tool_call_started = False
        self.header_sent = False
        self.current_tool_id = -1
        self.current_function_name = None
        self.current_param_name = None
        self.current_param_value = ""
        self.param_count = 0
        self.in_param = False
        self.in_function = False
        self.accumulated_text = ""
        self.json_started = False
        self.json_closed = False
    def _parse_xml_function_call(
            self, function_call_str: str,
            tools: Optional[list[ChatCompletionToolsParam]]
    ) -> Optional[ToolCall]:
        def get_arguments_config(func_name: str) -> dict:
            if tools is None:
                return {}
            for config in tools:
                if not hasattr(config, "type") or not (
                        hasattr(config, "function")
                        and hasattr(config.function, "name")):
                    continue
                if (config.type == "function"
                        and config.function.name == func_name):
                    if not hasattr(config.function, "parameters"):
                        return {}
                    params = config.function.parameters
                    if isinstance(params, dict) and "properties" in params:
                        return params["properties"]
                    elif isinstance(params, dict):
                        return params
                    else:
                        return {}
            logger.warning("Tool '%s' is not defined in the tools list.",
                           func_name)
            return {}
        def convert_param_value(param_value: str, param_name: str,
                                param_config: dict, func_name: str) -> Any:
            # Handle null value for any type
            if param_value.lower() == "null":
                return None
            if param_name not in param_config:
                if param_config != {}:
                    logger.warning(
                        "Parsed parameter '%s' is not defined in "
                        "the tool parameters for tool '%s', "
                        "directly returning the string value.", param_name,
                        func_name)
                return param_value
            if (isinstance(param_config[param_name], dict)
                    and "type" in param_config[param_name]):
                param_type = str(
                    param_config[param_name]["type"]).strip().lower()
            else:
                param_type = "string"
            if param_type in [
                    "string", "str", "text", "varchar", "char", "enum"
            ]:
                return param_value
            elif (param_type.startswith("int") or param_type.startswith("uint")
                  or param_type.startswith("long")
                  or param_type.startswith("short")
                  or param_type.startswith("unsigned")):
                try:
                    param_value = int(param_value)  # type: ignore
                except (ValueError, TypeError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not an integer in tool "
                        "'%s', degenerating to string.", param_value,
                        param_name, func_name)
                return param_value
            elif param_type.startswith("num") or param_type.startswith(
                    "float"):
                try:
                    float_param_value = float(param_value)
                    param_value = float_param_value if float_param_value - int(
                        float_param_value) != 0 else int(
                            float_param_value)  # type: ignore
                except (ValueError, TypeError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not a float in tool "
                        "'%s', degenerating to string.", param_value,
                        param_name, func_name)
                return param_value
            elif param_type in ["boolean", "bool", "binary"]:
                param_value = param_value.lower()
                if param_value not in ["true", "false"]:
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not a boolean "
                        "(`true` of `false`) in tool '%s', degenerating to false.",
                        param_value, param_name, func_name)
                return param_value == "true"
            else:
                if param_type == "object" or param_type.startswith("dict"):
                    try:
                        param_value = json.loads(param_value)
                        return param_value
                    except (ValueError, TypeError, json.JSONDecodeError):
                        logger.warning(
                            "Parsed value '%s' of parameter '%s' is not a valid JSON "
                            "object in tool '%s', will try other methods to parse it.",
                            param_value, param_name, func_name)
                try:
                    param_value = ast.literal_eval(param_value)
                except (ValueError, SyntaxError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' cannot be converted via "
                        "Python `ast.literal_eval()` in tool '%s', degenerating to string.",
                        param_value, param_name, func_name)
                return param_value
        # Extract function name
        end_index = function_call_str.index(">")
        function_name = function_call_str[:end_index]
        param_config = get_arguments_config(function_name)
        parameters = function_call_str[end_index + 1:]
        param_dict = {}
        for match in self.tool_call_parameter_regex.findall(parameters):
            match_text = match[0] if match[0] else match[1]
            idx = match_text.index(">")
            param_name = match_text[:idx]
            param_value = str(match_text[idx + 1:])
            # Remove prefix and trailing \n
            if param_value.startswith("\n"):
                param_value = param_value[1:]
            if param_value.endswith("\n"):
                param_value = param_value[:-1]
            param_dict[param_name] = convert_param_value(
                param_value, param_name, param_config, function_name)
        return ToolCall(
            type="function",
            function=FunctionCall(name=function_name,
                                  arguments=json.dumps(param_dict,
                                                       ensure_ascii=False)),
        )
    def _get_function_calls(self, model_output: str) -> list[str]:
        # Find all tool calls
        matched_ranges = self.tool_call_regex.findall(model_output)
        raw_tool_calls = [
            match[0] if match[0] else match[1] for match in matched_ranges
        ]
        # Back-off strategy if no tool_call tags found
        if len(raw_tool_calls) == 0:
            raw_tool_calls = [model_output]
        raw_function_calls = []
        for tool_call in raw_tool_calls:
            raw_function_calls.extend(
                self.tool_call_function_regex.findall(tool_call))
        function_calls = [
            match[0] if match[0] else match[1] for match in raw_function_calls
        ]
        return function_calls
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        # Quick check to avoid unnecessary processing
        if self.tool_call_prefix not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)
        # Check if both think start and end tokens are present
        if (self.think_start_token in model_output
                and self.think_end_token in model_output):
            # Find the position of think end token
            think_end_index = model_output.find(self.think_end_token) + len(
                self.think_end_token)
            # Extract content after think end token
            result_content = model_output[think_end_index:]
            thinking_content = model_output[:think_end_index]
        try:
            function_calls = self._get_function_calls(result_content)
            if len(function_calls) == 0:
                return ExtractedToolCallInformation(tools_called=False,
                                                    tool_calls=[],
                                                    content=model_output)
            tool_calls = [
                self._parse_xml_function_call(function_call_str, request.tools)
                for function_call_str in function_calls
            ]
            # Populate prev_tool_call_arr for serving layer to set finish_reason
            self.prev_tool_call_arr.clear()  # Clear previous calls
            for tool_call in tool_calls:
                if tool_call:
                    self.prev_tool_call_arr.append({
                        "name":
                        tool_call.function.name,
                        "arguments":
                        tool_call.function.arguments,
                    })
            # Extract content before tool calls
            tool_call_start_index = result_content.find(
                self.tool_call_start_token)
            tool_call_start_index = (
                tool_call_start_index if tool_call_start_index >= 0 else
                result_content.find(self.tool_call_prefix))
            content = thinking_content + result_content[:tool_call_start_index]
            return ExtractedToolCallInformation(
                tools_called=(len(tool_calls) > 0),
                tool_calls=tool_calls,
                content=content if content else None,
            )
        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)
    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        # If no delta text, return None unless
        # it's an EOS token after tool calls
        if not delta_text:
            # Check if this is an EOS token after all tool calls are complete
            # We check for tool calls in the text even if is_tool_call_started
            # is False because it might have been reset after processing all tools
            if (delta_token_ids
                    and self.tool_call_end_token_id not in delta_token_ids):
                # Count complete tool calls
                complete_calls = len(
                    self.tool_call_complete_regex.findall(current_text))
                # If we have completed tool calls and populated prev_tool_call_arr
                if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                    # Check if all tool calls are closed
                    open_calls = current_text.count(
                        self.tool_call_start_token) - current_text.count(
                            self.tool_call_end_token)
                    if open_calls == 0:
                        # Return empty delta message to allow finish_reason processing
                        return DeltaMessage(content="")
                elif not self.is_tool_call_started and current_text:
                    # This is a regular content response that's now complete
                    return DeltaMessage(content="")
            return None
        # Check if this is the first call (reset state if needed)
        if not previous_text:
            self._reset_streaming_state()
        # Update accumulated text
        self.accumulated_text = current_text
        # Check if we need to advance to next tool
        if self.json_closed and not self.in_function:
            # Check if this tool call has ended
            tool_ends = current_text.count(self.tool_call_end_token)
            if tool_ends > self.current_tool_index:
                # This tool has ended, advance to next
                self.current_tool_index += 1
                self.header_sent = False
                self.param_count = 0
                self.json_started = False
                self.json_closed = False
                # Check if there are more tool calls
                if self.current_tool_index >= current_text.count(
                        self.tool_call_start_token):
                    # No more tool calls
                    self.is_tool_call_started = False
                # Continue processing next tool
                return None
        # Check if end thinking
        if (not self.is_thinking_end
                and (self.think_end_token_id in delta_token_ids
                     or self.think_end_token in delta_text)):
            self.is_thinking_end = True
        # If thinking hasn't ended yet, don't process any tool calls
        if not self.is_thinking_end:
            return DeltaMessage(content=delta_text)
        # Handle normal content before tool calls
        if not self.is_tool_call_started:
            # Check if tool call is starting
            if (self.tool_call_start_token_id in delta_token_ids
                    or self.tool_call_start_token in delta_text):
                self.is_tool_call_started = True
                # Return any content before the tool call
                if self.tool_call_start_token in delta_text:
                    content_before = delta_text[:delta_text.index(
                        self.tool_call_start_token)]
                    if content_before:
                        return DeltaMessage(content=content_before)
                return None
            else:
                # Check if we're between tool calls - skip whitespace
                if (current_text.rstrip().endswith(self.tool_call_end_token)
                        and delta_text.strip() == ""):
                    # We just ended a tool call, skip whitespace
                    return None
                # Normal content, no tool call
                return DeltaMessage(content=delta_text)
        # Check if we're between tool calls (waiting for next one)
        # Count tool calls we've seen vs processed
        tool_starts_count = current_text.count(self.tool_call_start_token)
        if self.current_tool_index >= tool_starts_count:
            # We're past all tool calls, shouldn't be here
            return None
        # We're in a tool call, find the current tool call portion
        # Need to find the correct tool call based on current_tool_index
        # Only process tool calls after think_end_token
        think_end_index = current_text.find(self.think_end_token) + len(
            self.think_end_token
        ) if self.think_end_token in current_text else 0
        tool_starts: list[int] = []
        idx = think_end_index
        while True:
            idx = current_text.find(self.tool_call_start_token, idx)
            if idx == -1:
                break
            tool_starts.append(idx)
            idx += len(self.tool_call_start_token)
        if self.current_tool_index >= len(tool_starts):
            # No more tool calls to process yet
            return None
        tool_start_idx = tool_starts[self.current_tool_index]
        # Find where this tool call ends (or current position if not ended yet)
        tool_end_idx = current_text.find(self.tool_call_end_token,
                                         tool_start_idx)
        if tool_end_idx == -1:
            tool_text = current_text[tool_start_idx:]
        else:
            tool_text = current_text[tool_start_idx:tool_end_idx +
                                     len(self.tool_call_end_token)]
        # Looking for function header
        if not self.header_sent:
            if self.tool_call_prefix in tool_text:
                func_start = tool_text.find(self.tool_call_prefix) + len(
                    self.tool_call_prefix)
                func_end = tool_text.find(">", func_start)
                if func_end != -1:
                    # Found complete function name
                    self.current_function_name = tool_text[func_start:func_end]
                    self.current_tool_id = self._generate_tool_call_id(
                    )  # type: ignore
                    self.header_sent = True
                    self.in_function = True
                    # IMPORTANT: Add to prev_tool_call_arr immediately when we detect a tool call
                    # This ensures finish_reason="tool_calls" even if parsing isn't complete
                    already_added = any(
                        tool.get("name") == self.current_function_name
                        for tool in self.prev_tool_call_arr)
                    if not already_added:
                        self.prev_tool_call_arr.append({
                            "name": self.current_function_name,
                            "arguments":
                            "{}",  # Placeholder, will be updated later
                        })
                    # Send header with function info
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            id=self.current_tool_id,
                            function=DeltaFunctionCall(
                                name=self.current_function_name, arguments=""),
                            type="function",
                        )
                    ])
            return None
        # We've sent header, now handle function body
        if self.in_function:
            # Send opening brace if not sent yet
            if (not self.json_started
                    and self.parameter_prefix not in delta_text):
                self.json_started = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(arguments="{"),
                    )
                ])
            # Make sure json_started is set if we're processing parameters
            if not self.json_started:
                self.json_started = True
            # Check for function end in accumulated text
            if not self.json_closed and self.function_end_token in tool_text:
                # Close JSON
                self.json_closed = True
                # Extract the complete tool call to update prev_tool_call_arr with final arguments
                # Find the function content
                func_start = tool_text.find(self.tool_call_prefix) + len(
                    self.tool_call_prefix)
                func_content_end = tool_text.find(self.function_end_token,
                                                  func_start)
                if func_content_end != -1:
                    func_content = tool_text[func_start:func_content_end]
                    # Parse to get the complete arguments
                    try:
                        parsed_tool = self._parse_xml_function_call(
                            func_content, request.tools if request else None)
                        if parsed_tool:
                            # Update existing entry in prev_tool_call_arr with complete arguments
                            for i, tool in enumerate(self.prev_tool_call_arr):
                                if tool.get(
                                        "name") == parsed_tool.function.name:
                                    self.prev_tool_call_arr[i]["arguments"] = (
                                        parsed_tool.function.arguments)
                                    break
                    except Exception:
                        logger.warning(
                            "Failed to parse tool arguments during streaming.",
                            exc_info=True)
                result = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(arguments="}"),
                    )
                ])
                # Reset state for next tool
                self.in_function = False
                self.json_closed = True
                return result
            # Look for parameters
            # Count how many complete parameters we have processed
            complete_params = tool_text.count(self.parameter_end_token)
            # Check if we should start a new parameter
            if not self.in_param and self.param_count < complete_params:
                # Find the unprocessed parameter
                # Count parameter starts
                param_starts = []
                idx = 0
                while True:
                    idx = tool_text.find(self.parameter_prefix, idx)
                    if idx == -1:
                        break
                    param_starts.append(idx)
                    idx += len(self.parameter_prefix)
                if len(param_starts) > self.param_count:
                    # Process the next parameter
                    param_idx = param_starts[self.param_count]
                    param_start = param_idx + len(self.parameter_prefix)
                    remaining = tool_text[param_start:]
                    if ">" in remaining:
                        # We have the complete parameter name
                        name_end = remaining.find(">")
                        self.current_param_name = remaining[:name_end]
                        # Find the parameter value
                        value_start = param_start + name_end + 1
                        value_text = tool_text[value_start:]
                        if value_text.startswith("\n"):
                            value_text = value_text[1:]
                        # Find where this parameter ends
                        param_end_idx = value_text.find(
                            self.parameter_end_token)
                        if param_end_idx != -1:
                            # Complete parameter found
                            param_value = value_text[:param_end_idx]
                            if param_value.endswith("\n"):
                                param_value = param_value[:-1]
                            # Build complete JSON fragment for this parameter
                            if self.param_count == 0:
                                json_fragment = (
                                    '"' + self.current_param_name + '": "' +
                                    json.dumps(param_value)[1:-1] + '"')
                            else:
                                json_fragment = (
                                    ', "' + self.current_param_name + '": "' +
                                    json.dumps(param_value)[1:-1] + '"')
                            self.param_count += 1
                            return DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_index,
                                    function=DeltaFunctionCall(
                                        arguments=json_fragment),
                                )
                            ])
            # Continue parameter value
            if self.in_param:
                if self.parameter_end_token in delta_text:
                    # End of parameter
                    end_idx = delta_text.find(self.parameter_end_token)
                    value_chunk = delta_text[:end_idx]
                    # Skip past > if at start
                    if not self.current_param_value and ">" in value_chunk:
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]
                    if not self.current_param_value and value_chunk.startswith(
                            "\n"):
                        value_chunk = value_chunk[1:]
                    # Calculate incremental JSON
                    full_value = self.current_param_value + value_chunk
                    prev_escaped = (json.dumps(self.current_param_value)[1:-1]
                                    if self.current_param_value else "")
                    full_escaped = json.dumps(full_value)[1:-1]
                    delta_escaped = full_escaped[len(prev_escaped):]
                    self.in_param = False
                    self.current_param_value = ""
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            function=DeltaFunctionCall(
                                arguments=delta_escaped + '"'),
                        )
                    ])
                else:
                    # Continue accumulating value
                    value_chunk = delta_text
                    # Handle first chunk after param name
                    if not self.current_param_value and ">" in value_chunk:
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]
                    if not self.current_param_value and value_chunk.startswith(
                            "\n"):
                        value_chunk = value_chunk[1:]
                    if value_chunk:
                        # Stream the escaped delta
                        prev_escaped = (json.dumps(
                            self.current_param_value)[1:-1]
                                        if self.current_param_value else "")
                        self.current_param_value += value_chunk
                        full_escaped = json.dumps(
                            self.current_param_value)[1:-1]
                        delta_escaped = full_escaped[len(prev_escaped):]
                        if delta_escaped:
                            return DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_index,
                                    function=DeltaFunctionCall(
                                        arguments=delta_escaped),
                                )
                            ])
        return None
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@ -130,6 +130,7 @@ _TEXT_GENERATION_MODELS = {
    "Qwen3ForCausalLM": ("qwen3", "Qwen3ForCausalLM"),
    "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"),
    "RWForCausalLM": ("falcon", "FalconForCausalLM"),
    "SeedOssForCausalLM": ("seed_oss", "SeedOssForCausalLM"),
    "Step3TextForCausalLM": ("step3_text", "Step3TextForCausalLM"),
    "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@ -0,0 +1,487 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright 2025 The Seed team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
 # original forms to accommodate minor architectural differences compared
 # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only SeedOss model compatible with HuggingFace weights."""
 from collections.abc import Iterable
 from typing import Optional, Union
 import torch
 from torch import nn
 from transformers import PretrainedConfig as SeedOssConfig
 from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
 logger = init_logger(__name__)
 class SeedOssMLP(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            [intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.down_proj",
        )
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()
    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x
 class SeedOssAttention(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        head_dim: int,
        max_position: int = 4096 * 32,
        rope_theta: float = 10000,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        rope_scaling: Optional[tuple] = None,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position,
            base=self.rope_theta,
            rope_scaling=rope_scaling,
        )
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            attn_type=attn_type,
            prefix=f"{prefix}.attn",
        )
    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.o_proj(attn_output)
        return output
 class SeedOssDecoderLayer(nn.Module):
    def __init__(
        self,
        config: SeedOssConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        # Requires transformers > 4.32.0
        rope_theta = getattr(config, "rope_theta", 1000000)
        rope_scaling = getattr(config, "rope_scaling", None)
        # By default, SeedOss uses causal attention as it is a
        # decoder-only model.
        # You can override the HF config with `is_causal=False` to enable
        # bidirectional attention, which is used in some embedding models
        if getattr(config, "is_causal", True):
            attn_type = AttentionType.DECODER
        else:
            attn_type = AttentionType.ENCODER_ONLY
        self.self_attn = SeedOssAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            max_position=config.max_position_embeddings,
            num_kv_heads=config.num_key_value_heads,
            head_dim=config.head_dim,
            rope_theta=rope_theta,
            cache_config=cache_config,
            quant_config=quant_config,
            rope_scaling=rope_scaling,
            prefix=f"{prefix}.self_attn",
            attn_type=attn_type,
        )
        self.mlp = SeedOssMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
            prefix=f"{prefix}.mlp",
        )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)
    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Self Attention
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
        )
        # Fully Connected
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        return hidden_states, residual
@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
    })
 class SeedOssModel(nn.Module):
    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = "",
                 decoder_layer_type: type[nn.Module] = SeedOssDecoderLayer):
        super().__init__()
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        # TODO (@robertgshaw2): see if this can be moved out
        if (cache_config.sliding_window is not None
                and hasattr(config, "max_window_layers")):
            assert config.max_window_layers == config.num_hidden_layers, (
                "Sliding window for some but all layers is not supported. "
                "This model uses sliding window but `max_window_layers` = {} "
                "is less than `num_hidden_layers` = {}. Please open an issue "
                "to discuss this feature.".format(
                    config.max_window_layers,
                    config.num_hidden_layers,
                ))
        self.config = config
        self.quant_config = quant_config
        self.vocab_size = config.vocab_size
        if get_pp_group().is_first_rank or (config.tie_word_embeddings
                                            and get_pp_group().is_last_rank):
            self.embed_tokens = VocabParallelEmbedding(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=f"{prefix}.embed_tokens",
            )
        else:
            self.embed_tokens = PPMissingLayer()
        # Use the provided decoder layer type or default to SeedDecoderLayer
        decoder_layer_type = decoder_layer_type or SeedOssDecoderLayer
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: decoder_layer_type(config=config,
                                              cache_config=cache_config,
                                              quant_config=quant_config,
                                              prefix=prefix),
            prefix=f"{prefix}.layers",
        )
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))
        if get_pp_group().is_last_rank:
            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        else:
            self.norm = PPMissingLayer()
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        for layer in self.layers[self.start_layer:self.end_layer]:
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
            )
        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if (self.quant_config is not None and
                (scale_name := self.quant_config.get_cache_scale(name))):
                # Loading kv cache quantization scales
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
                                 loaded_weight[0])
                weight_loader(param, loaded_weight)
                loaded_params.add(scale_name)
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
 class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config
        self.config = config
        self.lora_config = lora_config
        self.quant_config = quant_config
        self.model = SeedOssModel(vllm_config=vllm_config,
                                  prefix=maybe_prefix(prefix, "model"))
        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                self.lm_head = self.model.embed_tokens
            else:
                self.lm_head = ParallelLMHead(config.vocab_size,
                                              config.hidden_size,
                                              quant_config=quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "lm_head"))
        else:
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits
    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)