[Frontend] refactor harmony utils output message parsing (#29820)

Signed-off-by: Daniel Salib <danielsalib@meta.com>
2026-06-08 06:09:10 +08:00 · 2025-12-03 23:36:57 -08:00 · 2025-12-03 23:36:57 -08:00 · 404fc4bfc0
commit 404fc4bfc0
parent 82a64b3d8f
1 changed files with 117 additions and 99 deletions
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@ -328,6 +328,105 @@ def render_for_completion(messages: list[Message]) -> list[int]:
    return token_ids
 def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem:
    """Parse a browser tool call (search, open, find) into a web search item.

    Args:
        message: Harmony message addressed to a browser tool. It must carry
            exactly one content entry whose ``text`` holds the JSON-encoded
            tool arguments.
        recipient: The message recipient; one of ``browser.search``,
            ``browser.open`` or ``browser.find``.

    Returns:
        A ``ResponseFunctionWebSearch`` item with status ``completed`` that
        wraps the action matching ``recipient``.

    Raises:
        ValueError: If the message does not have exactly one content entry,
            if the arguments decode to something other than a JSON object,
            or if ``recipient`` is not a known browser action.
    """
    if len(message.content) != 1:
        raise ValueError("Invalid number of contents in browser message")
    content = message.content[0]

    try:
        browser_call = json.loads(content.text)
    except json.JSONDecodeError:
        # The arguments are not valid JSON. Put a note carrying the raw text
        # into every field the actions below read, so the problem is visible
        # in the resulting item instead of being silently dropped.
        json_retry_output_message = (
            f"Invalid JSON args, caught and retried: {content.text}"
        )
        browser_call = {
            "query": json_retry_output_message,
            "url": json_retry_output_message,
            "pattern": json_retry_output_message,
        }

    # json.loads also accepts arrays, strings, numbers and null. The lookups
    # below need a mapping, so reject anything else explicitly rather than
    # failing later with an opaque AttributeError on ``.get``.
    if not isinstance(browser_call, dict):
        raise ValueError(
            "Browser tool arguments must be a JSON object, got "
            f"{type(browser_call).__name__}: {content.text}"
        )

    # Create the appropriate action based on the recipient.
    if recipient == "browser.search":
        action = ActionSearch(
            query=f"cursor:{browser_call.get('query', '')}", type="search"
        )
    elif recipient == "browser.open":
        action = ActionOpenPage(
            url=f"cursor:{browser_call.get('url', '')}", type="open_page"
        )
    elif recipient == "browser.find":
        action = ActionFind(
            pattern=browser_call.get("pattern", ""),
            url=f"cursor:{browser_call.get('url', '')}",
            type="find",
        )
    else:
        raise ValueError(f"Unknown browser action: {recipient}")

    return ResponseFunctionWebSearch(
        id=f"ws_{random_uuid()}",
        action=action,
        status="completed",
        type="web_search_call",
    )
 def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
    """Build one function tool call item per content entry of ``message``.

    The function name is the part of ``recipient`` after its last ``.`` (the
    whole string when it contains none). Each entry's text is passed through
    unchanged as the call arguments, and the ``call_id`` and ``id`` of an
    item share the same freshly generated suffix.
    """
    _, _, name = recipient.rpartition(".")

    def _to_tool_call(part):
        # One identifier per entry, reused for both the call id and item id.
        suffix = random_uuid()
        return ResponseFunctionToolCall(
            arguments=part.text,
            call_id=f"call_{suffix}",
            type="function_call",
            name=name,
            id=f"fc_{suffix}",
        )

    return [_to_tool_call(part) for part in message.content]
 def _parse_reasoning_content(message: Message) -> list[ResponseOutputItem]:
    """Turn every content entry of ``message`` into its own reasoning item.

    Each item gets a fresh ``rs_``-prefixed id, an empty summary, no status,
    and a single reasoning-text content holding the entry's text.
    """
    return [
        ResponseReasoningItem(
            id=f"rs_{random_uuid()}",
            summary=[],
            type="reasoning",
            content=[
                ResponseReasoningTextContent(text=part.text, type="reasoning_text")
            ],
            status=None,
        )
        for part in message.content
    ]
 def _parse_final_message(message: Message) -> ResponseOutputItem:
    """Wrap all content entries of a final-channel message in one output message.

    Every entry becomes an output-text part (annotations and logprobs are
    not populated yet). The resulting message is marked ``completed`` and
    takes its role from the message author.
    """
    text_parts = [
        ResponseOutputText(
            text=part.text,
            annotations=[],  # TODO
            type="output_text",
            logprobs=None,  # TODO
        )
        for part in message.content
    ]
    return ResponseOutputMessage(
        id=f"msg_{random_uuid()}",
        content=text_parts,
        role=message.author.role,
        status="completed",
        type="message",
    )
 def parse_output_message(message: Message) -> list[ResponseOutputItem]:
    """
    Parse a Harmony message into a list of output response items.
@ -340,119 +439,38 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
    output_items: list[ResponseOutputItem] = []
    recipient = message.recipient
    # Browser tool calls
    if recipient is not None and recipient.startswith("browser."):
-        if len(message.content) != 1:
+        output_items.append(_parse_browser_tool_call(message, recipient))
-            raise ValueError("Invalid number of contents in browser message")
+
-        content = message.content[0]
+    # Analysis channel (reasoning/chain-of-thought)
        # We do not need to check the VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY
        # env variable since if it is not set, we are certain the json is valid
        # The use of Actions for web search will be removed entirely in
        # the future, so this is only necessary temporarily
        try:
            browser_call = json.loads(content.text)
        except json.JSONDecodeError:
            # If the content is not valid JSON, then it was
            # caught and retried by vLLM, which means we
            # need to make note of that so the user is aware
            json_retry_output_message = (
                f"Invalid JSON args, caught and retried: {content.text}"
            )
            browser_call = {
                "query": json_retry_output_message,
                "url": json_retry_output_message,
                "pattern": json_retry_output_message,
            }
        # TODO: translate to url properly!
        if recipient == "browser.search":
            action = ActionSearch(
                query=f"cursor:{browser_call.get('query', '')}", type="search"
            )
        elif recipient == "browser.open":
            action = ActionOpenPage(
                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
            )
        elif recipient == "browser.find":
            action = ActionFind(
                pattern=browser_call["pattern"],
                url=f"cursor:{browser_call.get('url', '')}",
                type="find",
            )
        else:
            raise ValueError(f"Unknown browser action: {recipient}")
        web_search_item = ResponseFunctionWebSearch(
            id=f"ws_{random_uuid()}",
            action=action,
            status="completed",
            type="web_search_call",
        )
        output_items.append(web_search_item)
    elif message.channel == "analysis":
-        for content in message.content:
+        output_items.extend(_parse_reasoning_content(message))
-            reasoning_item = ResponseReasoningItem(
+
-                id=f"rs_{random_uuid()}",
+    # Commentary channel
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(
                        text=content.text, type="reasoning_text"
                    )
                ],
                status=None,
            )
            output_items.append(reasoning_item)
    elif message.channel == "commentary":
        # Function calls
        if recipient is not None and recipient.startswith("functions."):
-            function_name = recipient.split(".")[-1]
+            output_items.extend(_parse_function_call(message, recipient))
-            for content in message.content:
+
-                random_id = random_uuid()
+        # Built-in tools on commentary channel are treated as reasoning for now
                response_item = ResponseFunctionToolCall(
                    arguments=content.text,
                    call_id=f"call_{random_id}",
                    type="function_call",
                    name=function_name,
                    id=f"fc_{random_id}",
                )
                output_items.append(response_item)
        elif recipient is not None and (
            recipient.startswith("python")
            or recipient.startswith("browser")
            or recipient.startswith("container")
        ):
-            for content in message.content:
+            output_items.extend(_parse_reasoning_content(message))
                reasoning_item = ResponseReasoningItem(
                    id=f"rs_{random_uuid()}",
                    summary=[],
                    type="reasoning",
                    content=[
                        ResponseReasoningTextContent(
                            text=content.text, type="reasoning_text"
                        )
                    ],
                    status=None,
                )
                output_items.append(reasoning_item)
        else:
            raise ValueError(f"Unknown recipient: {recipient}")
    # Final output message
    elif message.channel == "final":
-        contents = []
+        output_items.append(_parse_final_message(message))
-        for content in message.content:
+
            output_text = ResponseOutputText(
                text=content.text,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            contents.append(output_text)
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=contents,
            role=message.author.role,
            status="completed",
            type="message",
        )
        output_items.append(text_item)
    else:
        raise ValueError(f"Unknown channel: {message.channel}")
    return output_items