[Frontend] Add OpenAI API support for input_audio (#11027)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>

parent: 0064f697d3
commit: 66d4b16724
@@ -34,11 +34,6 @@ We currently support the following OpenAI APIs:
   - *Note: `suffix` parameter is not supported.*
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
   - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template).
-  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
-    - *Note: `image_url.detail` parameter is not supported.*
-  - We also support `audio_url` content type for audio files.
-    - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
-    - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](#embeddings-api) (`/v1/embeddings`)
   - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`).
@@ -209,6 +204,11 @@ The following extra parameters are supported:
 
 Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details.
 
+We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
+[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
+see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information.
+- *Note: `image_url.detail` parameter is not supported.*
+
 #### Extra parameters
 
 The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
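For orientation before the next hunk: after this commit the Chat Completions frontend accepts two audio content-part shapes, the OpenAI-style `input_audio` part added here and vLLM's pre-existing `audio_url` extension. A minimal sketch of both payloads (values are placeholders):

```python
# OpenAI-compatible part introduced by this commit: base64 payload + format.
input_audio_part = {
    "type": "input_audio",
    "input_audio": {"data": "<base64-encoded audio>", "format": "wav"},
}

# Pre-existing vLLM extension: any URL (http, or data:audio/...;base64,...).
audio_url_part = {
    "type": "audio_url",
    "audio_url": {"url": "https://example.com/audio.wav"},
}
```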
@@ -315,7 +315,95 @@ You can use `these tests <https://github.com/vllm-project/vllm/blob/main/tests/e
 Audio
 ^^^^^
 
-Instead of :code:`image_url`, you can pass an audio file via :code:`audio_url`.
+Audio input is supported according to `OpenAI Audio API <https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in>`_.
+Here is a simple example using Ultravox-v0.3.
+
+First, launch the OpenAI-compatible server:
+
+.. code-block:: bash
+
+    vllm serve fixie-ai/ultravox-v0_3
+
+Then, you can use the OpenAI client as follows:
+
+.. code-block:: python
+
+    import base64
+    import requests
+    from openai import OpenAI
+    from vllm.assets.audio import AudioAsset
+
+    def encode_base64_content_from_url(content_url: str) -> str:
+        """Encode a content retrieved from a remote url to base64 format."""
+
+        with requests.get(content_url) as response:
+            response.raise_for_status()
+            result = base64.b64encode(response.content).decode('utf-8')
+
+        return result
+
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    # Any format supported by librosa is supported
+    audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
+
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": audio_base64,
+                        "format": "wav"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input:
+
+.. code-block:: python
+
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": audio_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
+
 A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.
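One caveat for readers copying the docs snippet above: it references a `model` variable that the snippet itself never defines. A minimal completion, assuming you serve the same Ultravox checkpoint as in the `vllm serve` line:

```python
# Assumed definition (not part of the diff): the model name must match the
# one passed to `vllm serve` above.
model = "fixie-ai/ultravox-v0_3"
```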
@@ -153,10 +153,37 @@ def run_multi_image() -> None:
 
 # Audio input inference
 def run_audio() -> None:
-    # Any format supported by librosa is supported
     audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
 
-    # Use audio url in the payload
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        # Any format supported by librosa is supported
+                        "data": audio_base64,
+                        "format": "wav"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+    # HTTP URL
     chat_completion_from_url = client.chat.completions.create(
         messages=[{
             "role":
@@ -169,6 +196,7 @@ def run_audio() -> None:
             {
                 "type": "audio_url",
                 "audio_url": {
+                    # Any format supported by librosa is supported
                     "url": audio_url
                 },
             },
@@ -181,7 +209,7 @@ def run_audio() -> None:
     result = chat_completion_from_url.choices[0].message.content
     print("Chat completion output from audio url:", result)
 
-    audio_base64 = encode_base64_content_from_url(audio_url)
+    # base64 URL
     chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role":
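The `# base64 URL` branch that continues past this hunk reuses `audio_base64` in a data URL passed through the existing `audio_url` part. A sketch of that pattern, with the MIME subtype as an assumption:

```python
# Sketch (subtype assumed): embed the base64 payload in a data URL so it can
# travel through the audio_url content part instead of input_audio.
audio_data_url = f"data:audio/ogg;base64,{audio_base64}"
```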
@@ -155,6 +155,61 @@ async def test_single_chat_session_audio_base64encoded(
     assert message.content is not None and len(message.content) >= 0
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_single_chat_session_input_audio(
+        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
+        base64_encoded_audio: Dict[str, str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        logprobs=True,
+        top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
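The new test indexes a `base64_encoded_audio` fixture that the diff does not show. A plausible sketch of its shape, assuming a session-scoped fixture keyed by URL (the fetching helper here is hypothetical):

```python
import base64
from typing import Dict

import pytest
import requests


@pytest.fixture(scope="session")
def base64_encoded_audio() -> Dict[str, str]:
    # Map each test URL to its base64-encoded bytes, matching how the tests
    # index the fixture: base64_encoded_audio[audio_url].
    encoded: Dict[str, str] = {}
    for audio_url in TEST_AUDIO_URLS:  # defined elsewhere in the test module
        resp = requests.get(audio_url)
        resp.raise_for_status()
        encoded[audio_url] = base64.b64encode(resp.content).decode("utf-8")
    return encoded
```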
@@ -212,11 +267,72 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
     assert "".join(chunks) == output
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
+                                          model_name: str, audio_url: str,
+                                          base64_encoded_audio: Dict[str,
+                                                                     str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks: List[str] = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
-                                 audio_url: str):
+                                 audio_url: str,
+                                 base64_encoded_audio: Dict[str, str]):
 
     messages = [{
         "role":
@@ -229,9 +345,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                 }
             },
             {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
                 }
             },
             {
@@ -13,7 +13,8 @@ import transformers.utils.chat_template_utils as hf_chat_utils
 # yapf conflicts with isort for this block
 # yapf: disable
 from openai.types.chat import (ChatCompletionAssistantMessageParam,
-                               ChatCompletionContentPartImageParam)
+                               ChatCompletionContentPartImageParam,
+                               ChatCompletionContentPartInputAudioParam)
 from openai.types.chat import (
     ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam)
 from openai.types.chat import (ChatCompletionContentPartRefusalParam,
@@ -105,6 +106,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
 
 ChatCompletionContentPartParam: TypeAlias = Union[
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
+    ChatCompletionContentPartInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentSimpleImageParam,
     CustomChatCompletionContentSimpleAudioParam,
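For reference, the `ChatCompletionContentPartInputAudioParam` joining the union here is defined in openai-python v1.52.2 roughly as follows (paraphrased for context; not part of the diff):

```python
from typing_extensions import Literal, Required, TypedDict


class InputAudio(TypedDict, total=False):
    data: Required[str]
    """Base64-encoded audio data."""

    format: Required[Literal["wav", "mp3"]]
    """The format of the encoded audio data."""


class ChatCompletionContentPartInputAudioParam(TypedDict, total=False):
    input_audio: Required[InputAudio]
    type: Required[Literal["input_audio"]]
```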
@@ -519,6 +521,10 @@ class BaseMultiModalContentParser(ABC):
     def parse_audio(self, audio_url: str) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_video(self, video_url: str) -> None:
         raise NotImplementedError
@@ -545,6 +551,15 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         placeholder = self._tracker.add("audio", audio)
         self._add_placeholder(placeholder)
 
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        input_audio_data = input_audio.get("data","")
+        input_audio_format = input_audio.get("format","")
+        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
+        audio = get_and_parse_audio(audio_url)
+
+        placeholder = self._tracker.add("audio", audio)
+        self._add_placeholder(placeholder)
+
     def parse_video(self, video_url: str) -> None:
         video = get_and_parse_video(video_url)
 
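The parser above funnels `input_audio` into the existing `audio_url` path by synthesizing a data URL. A tiny illustration with a placeholder payload:

```python
# Illustration (placeholder data): what parse_input_audio builds before
# handing off to get_and_parse_audio().
input_audio = {"data": "UklGRgAAAABXQVZF", "format": "wav"}
audio_url = f"data:audio/{input_audio['format']};base64,{input_audio['data']}"
# -> "data:audio/wav;base64,UklGRgAAAABXQVZF"
```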
@@ -574,6 +589,15 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         placeholder = self._tracker.add("audio", audio_coro)
         self._add_placeholder(placeholder)
 
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        input_audio_data = input_audio.get("data","")
+        input_audio_format = input_audio.get("format","")
+        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
+        audio_coro = async_get_and_parse_audio(audio_url)
+
+        placeholder = self._tracker.add("audio", audio_coro)
+        self._add_placeholder(placeholder)
+
     def parse_video(self, video_url: str) -> None:
         video = async_get_and_parse_video(video_url)
 
@@ -667,17 +691,22 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageParser = partial(cast, ChatCompletionContentPartImageParam)
 _AudioParser = partial(cast, ChatCompletionContentPartAudioParam)
+_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
 
 # Define a mapping from part types to their corresponding parsing functions.
-MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = {
+MM_PARSER_MAP: Dict[str,
+                    Callable[[ChatCompletionContentPartParam],
+                             Union[str, Dict[str,str]]]] = {
     "text":
     lambda part: _TextParser(part).get("text", ""),
     "image_url":
     lambda part: _ImageParser(part).get("image_url", {}).get("url", ""),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""),
+    "input_audio":
+    lambda part: _InputAudioParser(part).get("input_audio", {}),
     "refusal":
     lambda part: _RefusalParser(part).get("refusal", ""),
     "video_url":
|
|||||||
|
|
||||||
|
|
||||||
def _parse_chat_message_content_mm_part(
|
def _parse_chat_message_content_mm_part(
|
||||||
part: ChatCompletionContentPartParam) -> Tuple[str, str]:
|
part: ChatCompletionContentPartParam) -> Tuple[str,
|
||||||
|
Union[str, Dict[str, str]]]:
|
||||||
"""
|
"""
|
||||||
Parses a given multi-modal content part based on its type.
|
Parses a given multi-modal content part based on its type.
|
||||||
|
|
||||||
@@ -717,6 +747,7 @@ def _parse_chat_message_content_mm_part(
             return part_type, content
 
     # Handle missing 'type' but provided direct URL fields.
+    # 'type' is required field by pydantic
     if part_type is None:
         if part.get("image_url") is not None:
             image_params = cast(CustomChatCompletionContentSimpleImageParam,
@@ -726,6 +757,9 @@ def _parse_chat_message_content_mm_part(
             audio_params = cast(CustomChatCompletionContentSimpleAudioParam,
                                 part)
             return "audio_url", audio_params.get("audio_url", "")
+        if part.get("input_audio") is not None:
+            input_audio_params = cast(Dict[str, str], part)
+            return "input_audio", input_audio_params
         if part.get("video_url") is not None:
             video_params = cast(CustomChatCompletionContentSimpleVideoParam,
                                 part)
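This keeps the legacy no-`type` fallback working for the new part kind; a part that omits `type` but carries an `input_audio` key is still classified, as sketched below:

```python
# Hypothetical payload exercising the fallback branch above.
part = {"input_audio": {"data": "<base64>", "format": "wav"}}
part_type, content = _parse_chat_message_content_mm_part(part)
# part_type == "input_audio"; note that content here is the whole part dict
# (cast to Dict[str, str] by the new branch), not part["input_audio"].
```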
@@ -739,7 +773,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "audio_url", "video_url")
+                                       "audio_url", "input_audio", "video_url")
 
 
 def _parse_chat_message_content_parts(
@@ -795,7 +829,7 @@ def _parse_chat_message_content_part(
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
 
-    # if part_type is text/refusal/image_url/audio_url/video_url but
+    # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
     # content is empty, log a warning and skip
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
         logger.warning(
@@ -804,18 +838,30 @@ def _parse_chat_message_content_part(
         return None
 
     if part_type in ("text", "refusal"):
-        return {'type': 'text', 'text': content} if wrap_dicts else content
+        str_content = cast(str, content)
+        if wrap_dicts:
+            return {'type': 'text', 'text': str_content}
+        else:
+            return str_content
 
     if part_type == "image_url":
-        mm_parser.parse_image(content)
+        str_content = cast(str, content)
+        mm_parser.parse_image(str_content)
         return {'type': 'image'} if wrap_dicts else None
 
     if part_type == "audio_url":
-        mm_parser.parse_audio(content)
+        str_content = cast(str, content)
+        mm_parser.parse_audio(str_content)
+        return {'type': 'audio'} if wrap_dicts else None
+
+    if part_type == "input_audio":
+        dict_content = cast(Dict[str, str], content)
+        mm_parser.parse_input_audio(dict_content)
         return {'type': 'audio'} if wrap_dicts else None
 
     if part_type == "video_url":
-        mm_parser.parse_video(content)
+        str_content = cast(str, content)
+        mm_parser.parse_video(str_content)
         return {'type': 'video'} if wrap_dicts else None
 
     raise NotImplementedError(f"Unknown part type: {part_type}")
@@ -840,7 +886,6 @@ def _parse_chat_message_content(
         content = [
             ChatCompletionContentPartTextParam(type="text", text=content)
         ]
-
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore