mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-25 12:57:54 +08:00
[Frontend][Multimodal] Allow skipping media data when UUIDs are provided. (#23950)
Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Signed-off-by: Roger Wang <hey@rogerw.me> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me>
This commit is contained in:
parent
4fdd6f5cbf
commit
7f2ea7074e
@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t
|
|||||||
print(o.outputs[0].text)
|
print(o.outputs[0].text)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
With UUIDs, you can also skip sending the media data entirely for items that you expect to hit the cache. Note that the request will fail if a skipped media item doesn't have a corresponding UUID, or if its UUID fails to hit the cache.
|
||||||
|
|
||||||
|
??? code
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
# Qwen2.5-VL example with two images
|
||||||
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
|
||||||
|
|
||||||
|
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
|
||||||
|
img_b = Image.open("/path/to/b.jpg")
|
||||||
|
|
||||||
|
outputs = llm.generate({
|
||||||
|
"prompt": prompt,
|
||||||
|
"multi_modal_data": {"image": [None, img_b]},
|
||||||
|
# Since img_a is expected to be cached, we can skip sending the actual
|
||||||
|
# image entirely.
|
||||||
|
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
|
||||||
|
})
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
print(o.outputs[0].text)
|
||||||
|
```
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
|
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
|
||||||
|
|
||||||
@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
For online serving, you can also skip sending media if you expect cache hits for the provided UUIDs. To do so, set the media field to `None` and pass only the `uuid`, like this:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Image/video/audio URL:
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": image_uuid,
|
||||||
|
},
|
||||||
|
|
||||||
|
# image_embeds
|
||||||
|
{
|
||||||
|
"type": "image_embeds",
|
||||||
|
"image_embeds": None,
|
||||||
|
"uuid": image_uuid
|
||||||
|
},
|
||||||
|
|
||||||
|
# input_audio:
|
||||||
|
{
|
||||||
|
"type": "input_audio",
|
||||||
|
"input_audio": None,
|
||||||
|
"uuid": audio_uuid
|
||||||
|
},
|
||||||
|
|
||||||
|
# PIL Image:
|
||||||
|
{
|
||||||
|
"type": "image_pil",
|
||||||
|
"image_pil": None,
|
||||||
|
"uuid": image_uuid
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Only one message can contain `{"type": "image_embeds"}`.
|
Only one message can contain `{"type": "image_embeds"}`.
|
||||||
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
|
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
|
||||||
|
|||||||
@ -1764,6 +1764,7 @@ def apply_image_repeat(
|
|||||||
probs = [1.0 - image_repeat_prob, image_repeat_prob]
|
probs = [1.0 - image_repeat_prob, image_repeat_prob]
|
||||||
|
|
||||||
inputs = []
|
inputs = []
|
||||||
|
inputs_with_empty_media = []
|
||||||
cur_image = data
|
cur_image = data
|
||||||
for i in range(num_prompts):
|
for i in range(num_prompts):
|
||||||
if image_repeat_prob is not None:
|
if image_repeat_prob is not None:
|
||||||
@ -1774,14 +1775,25 @@ def apply_image_repeat(
|
|||||||
new_val = (i // 256 // 256, i // 256, i % 256)
|
new_val = (i // 256 // 256, i // 256, i % 256)
|
||||||
cur_image.putpixel((0, 0), new_val)
|
cur_image.putpixel((0, 0), new_val)
|
||||||
|
|
||||||
|
uuid = "uuid_{}".format(i)
|
||||||
|
|
||||||
inputs.append(
|
inputs.append(
|
||||||
{
|
{
|
||||||
"prompt": prompts[i % len(prompts)],
|
"prompt": prompts[i % len(prompts)],
|
||||||
"multi_modal_data": {modality: cur_image},
|
"multi_modal_data": {modality: cur_image},
|
||||||
|
"multi_modal_uuids": {modality: uuid},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return inputs
|
inputs_with_empty_media.append(
|
||||||
|
{
|
||||||
|
"prompt": prompts[i % len(prompts)],
|
||||||
|
"multi_modal_data": {modality: None},
|
||||||
|
"multi_modal_uuids": {modality: uuid},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return inputs, inputs_with_empty_media
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
@ -1860,6 +1872,13 @@ def parse_args():
|
|||||||
help="If True, then use different prompt (with the same multi-modal "
|
help="If True, then use different prompt (with the same multi-modal "
|
||||||
"data) for each request.",
|
"data) for each request.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--verify-mm-cache-hit-with-uuids",
|
||||||
|
action="store_true",
|
||||||
|
help="If True, will send all requests in a second batch with empty mm "
|
||||||
|
"data to verify cache hits with UUIDs.",
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -1903,26 +1922,48 @@ def main(args):
|
|||||||
assert args.num_prompts > 0
|
assert args.num_prompts > 0
|
||||||
if args.num_prompts == 1:
|
if args.num_prompts == 1:
|
||||||
# Single inference
|
# Single inference
|
||||||
|
uuid = "uuid_0"
|
||||||
inputs = {
|
inputs = {
|
||||||
"prompt": prompts[0],
|
"prompt": prompts[0],
|
||||||
"multi_modal_data": {modality: data},
|
"multi_modal_data": {modality: data},
|
||||||
|
"multi_modal_uuids": {modality: uuid},
|
||||||
|
}
|
||||||
|
inputs_with_empty_media = {
|
||||||
|
"prompt": prompts[0],
|
||||||
|
"multi_modal_data": {modality: None},
|
||||||
|
"multi_modal_uuids": {modality: uuid},
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
# Batch inference
|
# Batch inference
|
||||||
if args.image_repeat_prob is not None:
|
if args.image_repeat_prob is not None:
|
||||||
# Repeat images with specified probability of "image_repeat_prob"
|
# Repeat images with specified probability of "image_repeat_prob"
|
||||||
inputs = apply_image_repeat(
|
inputs, inputs_with_empty_media = apply_image_repeat(
|
||||||
args.image_repeat_prob, args.num_prompts, data, prompts, modality
|
args.image_repeat_prob,
|
||||||
|
args.num_prompts,
|
||||||
|
data,
|
||||||
|
prompts,
|
||||||
|
modality,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Use the same image for all prompts
|
# Use the same image for all prompts
|
||||||
inputs = [
|
inputs = []
|
||||||
{
|
inputs_with_empty_media = []
|
||||||
"prompt": prompts[i % len(prompts)],
|
for i in range(args.num_prompts):
|
||||||
"multi_modal_data": {modality: data},
|
uuid = "uuid_{}".format(i)
|
||||||
}
|
inputs.append(
|
||||||
for i in range(args.num_prompts)
|
{
|
||||||
]
|
"prompt": prompts[i % len(prompts)],
|
||||||
|
"multi_modal_data": {modality: data},
|
||||||
|
"multi_modal_uuids": {modality: uuid},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
inputs_with_empty_media.append(
|
||||||
|
{
|
||||||
|
"prompt": prompts[i % len(prompts)],
|
||||||
|
"multi_modal_data": {modality: None},
|
||||||
|
"multi_modal_uuids": {modality: uuid},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Add LoRA request if applicable
|
# Add LoRA request if applicable
|
||||||
lora_request = (
|
lora_request = (
|
||||||
@ -1942,6 +1983,26 @@ def main(args):
|
|||||||
print(generated_text)
|
print(generated_text)
|
||||||
print("-" * 50)
|
print("-" * 50)
|
||||||
|
|
||||||
|
if args.verify_mm_cache_hit_with_uuids:
|
||||||
|
try:
|
||||||
|
# Verify cache hits with UUIDs
|
||||||
|
print(
|
||||||
|
"Sending a second batch of requests with empty media"
|
||||||
|
" and matching UUIDs."
|
||||||
|
)
|
||||||
|
outputs = llm.generate(
|
||||||
|
inputs_with_empty_media,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
lora_request=lora_request,
|
||||||
|
)
|
||||||
|
print("-" * 50)
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
print("-" * 50)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Failed to verify cache hits with UUIDs. Error: {e}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|||||||
@ -522,6 +522,71 @@ async def test_completions_with_image_with_uuid(
|
|||||||
assert isinstance(chat_completion.choices[0].message.content, str)
|
assert isinstance(chat_completion.choices[0].message.content, str)
|
||||||
assert len(chat_completion.choices[0].message.content) > 0
|
assert len(chat_completion.choices[0].message.content) > 0
|
||||||
|
|
||||||
|
# Second request, with empty image but the same uuid.
|
||||||
|
chat_completion_with_empty_image = await client.chat.completions.create(
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are a helpful assistant."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "Describe this image.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {},
|
||||||
|
"uuid": image_url
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
model=model_name,
|
||||||
|
)
|
||||||
|
assert chat_completion_with_empty_image.choices[
|
||||||
|
0].message.content is not None
|
||||||
|
assert isinstance(
|
||||||
|
chat_completion_with_empty_image.choices[0].message.content, str)
|
||||||
|
assert len(
|
||||||
|
chat_completion_with_empty_image.choices[0].message.content) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
|
async def test_completions_with_empty_image_with_uuid_without_cache_hit(
|
||||||
|
client: openai.AsyncOpenAI,
|
||||||
|
model_name: str,
|
||||||
|
):
|
||||||
|
with pytest.raises(openai.BadRequestError):
|
||||||
|
_ = await client.chat.completions.create(
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are a helpful assistant."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "Describe this image.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {},
|
||||||
|
"uuid": "uuid_not_previously_seen"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
model=model_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
|
|||||||
@ -79,6 +79,28 @@ def phi3v_tokenizer():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="function")
|
||||||
|
def qwen2_audio_model_config():
|
||||||
|
return ModelConfig(
|
||||||
|
QWEN2AUDIO_MODEL_ID,
|
||||||
|
runner="generate",
|
||||||
|
trust_remote_code=True,
|
||||||
|
limit_mm_per_prompt={
|
||||||
|
"audio": 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def qwen2_audio_tokenizer():
|
||||||
|
return TokenizerGroup(
|
||||||
|
tokenizer_id=QWEN2AUDIO_MODEL_ID,
|
||||||
|
enable_lora=False,
|
||||||
|
max_num_seqs=5,
|
||||||
|
max_input_length=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def qwen25omni_model_config_mm_interleaved():
|
def qwen25omni_model_config_mm_interleaved():
|
||||||
return ModelConfig(
|
return ModelConfig(
|
||||||
@ -169,6 +191,7 @@ def audio_url():
|
|||||||
def _assert_mm_data_is_image_input(
|
def _assert_mm_data_is_image_input(
|
||||||
mm_data: Optional[MultiModalDataDict],
|
mm_data: Optional[MultiModalDataDict],
|
||||||
image_count: int,
|
image_count: int,
|
||||||
|
skipped_image_indices: Optional[list] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
assert mm_data is not None
|
assert mm_data is not None
|
||||||
assert set(mm_data.keys()) == {"image"}
|
assert set(mm_data.keys()) == {"image"}
|
||||||
@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input(
|
|||||||
assert image_data is not None
|
assert image_data is not None
|
||||||
|
|
||||||
assert isinstance(image_data, list) and len(image_data) == image_count
|
assert isinstance(image_data, list) and len(image_data) == image_count
|
||||||
|
if skipped_image_indices is not None:
|
||||||
|
for i in skipped_image_indices:
|
||||||
|
assert image_data[i] is None
|
||||||
|
|
||||||
|
|
||||||
def _assert_mm_uuids(
|
def _assert_mm_uuids(
|
||||||
@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int]
|
|||||||
|
|
||||||
|
|
||||||
def _assert_mm_data_inputs(
|
def _assert_mm_data_inputs(
|
||||||
mm_data: Optional[MultiModalDataDict],
|
mm_data: Optional[MultiModalDataDict],
|
||||||
data_count: MultiModalDataCounts,
|
data_count: MultiModalDataCounts,
|
||||||
|
skipped_media_indices: Optional[dict[
|
||||||
|
str, list]] = None, # modality -> list[int]
|
||||||
) -> None:
|
) -> None:
|
||||||
assert mm_data is not None
|
assert mm_data is not None
|
||||||
assert set(data_count.keys()) == (set(mm_data.keys()))
|
assert set(data_count.keys()) == (set(mm_data.keys()))
|
||||||
@ -216,6 +244,13 @@ def _assert_mm_data_inputs(
|
|||||||
assert modality_data is not None
|
assert modality_data is not None
|
||||||
assert isinstance(modality_data, list) and len(modality_data) == n
|
assert isinstance(modality_data, list) and len(modality_data) == n
|
||||||
|
|
||||||
|
if skipped_media_indices is not None:
|
||||||
|
skipped_media_indices_for_modality = skipped_media_indices.get(
|
||||||
|
modality)
|
||||||
|
assert skipped_media_indices_for_modality is not None
|
||||||
|
for i in skipped_media_indices_for_modality:
|
||||||
|
assert modality_data[i] is None
|
||||||
|
|
||||||
|
|
||||||
def test_parse_chat_messages_single_image(
|
def test_parse_chat_messages_single_image(
|
||||||
phi3v_model_config,
|
phi3v_model_config,
|
||||||
@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid(
|
|||||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_chat_messages_single_empty_image_with_uuid(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
image_url,
|
||||||
|
):
|
||||||
|
image_uuid = str(hash(image_url))
|
||||||
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": image_uuid,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in the image?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "<|image_1|>\nWhat's in the image?"
|
||||||
|
}]
|
||||||
|
_assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
|
||||||
|
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||||
|
|
||||||
|
|
||||||
def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
||||||
phi3v_model_config,
|
phi3v_model_config,
|
||||||
phi3v_tokenizer,
|
phi3v_tokenizer,
|
||||||
@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids(
|
|||||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_chat_messages_multiple_empty_images_with_uuids(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
image_url,
|
||||||
|
):
|
||||||
|
image_uuid1 = "my_uuid_1"
|
||||||
|
image_uuid2 = "my_uuid_2"
|
||||||
|
|
||||||
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": image_uuid1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": image_uuid2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in the image?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content":
|
||||||
|
"<|image_1|>\n<|image_2|>\nWhat's in the image?",
|
||||||
|
}]
|
||||||
|
_assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
|
||||||
|
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_chat_messages_mixed_empty_images_with_uuids(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
image_url,
|
||||||
|
):
|
||||||
|
image_uuid1 = "my_uuid_1"
|
||||||
|
image_uuid2 = "my_uuid_2"
|
||||||
|
|
||||||
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": image_url,
|
||||||
|
},
|
||||||
|
"uuid": image_uuid1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": image_uuid2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in the image?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content":
|
||||||
|
"<|image_1|>\n<|image_2|>\nWhat's in the image?",
|
||||||
|
}]
|
||||||
|
_assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1])
|
||||||
|
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_chat_messages_single_image_with_uuid_async(
|
async def test_parse_chat_messages_single_image_with_uuid_async(
|
||||||
phi3v_model_config,
|
phi3v_model_config,
|
||||||
@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
|
|||||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_parse_chat_messages_empty_image_with_uuid_async(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
image_url,
|
||||||
|
):
|
||||||
|
image_uuid = str(hash(image_url))
|
||||||
|
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": image_uuid,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in the image?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "<|image_1|>\nWhat's in the image?"
|
||||||
|
}]
|
||||||
|
_assert_mm_data_is_image_input(await mm_future,
|
||||||
|
1,
|
||||||
|
skipped_image_indices=[0])
|
||||||
|
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
||||||
phi3v_model_config,
|
phi3v_model_config,
|
||||||
@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
|||||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
image_url,
|
||||||
|
):
|
||||||
|
image_uuid1 = "my_uuid_1"
|
||||||
|
image_uuid2 = "my_uuid_2"
|
||||||
|
|
||||||
|
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": image_uuid1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_pil",
|
||||||
|
"image_pil": None,
|
||||||
|
"uuid": image_uuid2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in these images?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content":
|
||||||
|
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||||
|
}]
|
||||||
|
_assert_mm_data_is_image_input(await mm_future,
|
||||||
|
2,
|
||||||
|
skipped_image_indices=[0, 1])
|
||||||
|
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
||||||
phi3v_model_config,
|
phi3v_model_config,
|
||||||
@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images(
|
|||||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_chat_messages_empty_pil_image_with_uuid(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
):
|
||||||
|
uuid = "abcd"
|
||||||
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_pil",
|
||||||
|
"image_pil": None,
|
||||||
|
"uuid": uuid
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this image?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "<|image_1|>\nWhat's in this image?",
|
||||||
|
}]
|
||||||
|
_assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
|
||||||
|
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_chat_messages_empty_image_embeds_with_uuid(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
):
|
||||||
|
uuid = "abcd"
|
||||||
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_embeds",
|
||||||
|
"image_embeds": None,
|
||||||
|
"uuid": uuid
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this image?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "<|image_1|>\nWhat's in this image?",
|
||||||
|
}]
|
||||||
|
assert mm_data is not None
|
||||||
|
assert "image" in mm_data
|
||||||
|
assert mm_data["image"] is None
|
||||||
|
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
):
|
||||||
|
uuid = "abcd"
|
||||||
|
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_embeds",
|
||||||
|
"image_embeds": None,
|
||||||
|
"uuid": uuid
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this image?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
phi3v_model_config,
|
||||||
|
phi3v_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role": "user",
|
||||||
|
"content": "<|image_1|>\nWhat's in this image?",
|
||||||
|
}]
|
||||||
|
mm_data = await mm_future
|
||||||
|
assert mm_data is not None
|
||||||
|
assert "image" in mm_data
|
||||||
|
assert mm_data["image"] is None
|
||||||
|
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_chat_messages_multiple_images_async(
|
async def test_parse_chat_messages_multiple_images_async(
|
||||||
phi3v_model_config,
|
phi3v_model_config,
|
||||||
@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
|
|||||||
expected_uuids=["audio_123"])
|
expected_uuids=["audio_123"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501
|
||||||
|
qwen25omni_model_config_mm_interleaved,
|
||||||
|
qwen25omni_tokenizer,
|
||||||
|
image_url,
|
||||||
|
video_url,
|
||||||
|
audio_url,
|
||||||
|
):
|
||||||
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's on this image?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": "image_123",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "Now listen to this audio"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "audio_url",
|
||||||
|
"audio_url": None,
|
||||||
|
"uuid": "audio_123",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "Some stuff."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's on this image?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": None,
|
||||||
|
"uuid": "image_123",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "And what's in the video?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "video_url",
|
||||||
|
"video_url": None,
|
||||||
|
"uuid": "video_123",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
qwen25omni_model_config_mm_interleaved,
|
||||||
|
qwen25omni_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content":
|
||||||
|
"What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||||
|
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "Some stuff."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content":
|
||||||
|
"What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||||
|
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
_assert_mm_data_inputs(mm_data, {
|
||||||
|
"image": 2,
|
||||||
|
"video": 1,
|
||||||
|
"audio": 1
|
||||||
|
},
|
||||||
|
skipped_media_indices={
|
||||||
|
"image": [0, 1],
|
||||||
|
"video": [0],
|
||||||
|
"audio": [0]
|
||||||
|
})
|
||||||
|
_assert_mm_uuids(mm_uuids,
|
||||||
|
2,
|
||||||
|
modality="image",
|
||||||
|
expected_uuids=["image_123", "image_123"])
|
||||||
|
_assert_mm_uuids(mm_uuids,
|
||||||
|
1,
|
||||||
|
modality="video",
|
||||||
|
expected_uuids=["video_123"])
|
||||||
|
_assert_mm_uuids(mm_uuids,
|
||||||
|
1,
|
||||||
|
modality="audio",
|
||||||
|
expected_uuids=["audio_123"])
|
||||||
|
|
||||||
|
|
||||||
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501
|
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501
|
||||||
qwen25omni_model_config_mm_interleaved,
|
qwen25omni_model_config_mm_interleaved,
|
||||||
qwen25omni_tokenizer,
|
qwen25omni_tokenizer,
|
||||||
@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk():
|
|||||||
r"[INST]Thanks, what is 3+3?[/INST]")
|
r"[INST]Thanks, what is 3+3?[/INST]")
|
||||||
|
|
||||||
assert string_tokens == expected_tokens
|
assert string_tokens == expected_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_chat_messages_single_empty_audio_with_uuid(
|
||||||
|
qwen2_audio_model_config,
|
||||||
|
qwen2_audio_tokenizer,
|
||||||
|
):
|
||||||
|
audio_uuid = "abcd"
|
||||||
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "input_audio",
|
||||||
|
"input_audio": {},
|
||||||
|
"uuid": audio_uuid,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What does the audio say?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
qwen2_audio_model_config,
|
||||||
|
qwen2_audio_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content":
|
||||||
|
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
|
||||||
|
}]
|
||||||
|
_assert_mm_data_inputs(mm_data, {"audio": 1})
|
||||||
|
_assert_mm_uuids(mm_uuids,
|
||||||
|
1,
|
||||||
|
modality="audio",
|
||||||
|
expected_uuids=[audio_uuid])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||||
|
qwen2_audio_model_config,
|
||||||
|
qwen2_audio_tokenizer,
|
||||||
|
):
|
||||||
|
audio_uuid = "abcd"
|
||||||
|
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||||
|
[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "input_audio",
|
||||||
|
"input_audio": {},
|
||||||
|
"uuid": audio_uuid,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What does the audio say?"
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
qwen2_audio_model_config,
|
||||||
|
qwen2_audio_tokenizer,
|
||||||
|
content_format="string",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert conversation == [{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content":
|
||||||
|
"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
|
||||||
|
}]
|
||||||
|
_assert_mm_data_inputs(await mm_future, {"audio": 1})
|
||||||
|
_assert_mm_uuids(mm_uuids,
|
||||||
|
1,
|
||||||
|
modality="audio",
|
||||||
|
expected_uuids=[audio_uuid])
|
||||||
|
|||||||
@ -73,15 +73,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
|
|||||||
|
|
||||||
type: Required[Literal["audio_url"]]
|
type: Required[Literal["audio_url"]]
|
||||||
"""The type of the content part."""
|
"""The type of the content part."""
|
||||||
uuid: Optional[str]
|
|
||||||
"""
|
|
||||||
User-provided UUID of a media. User must guarantee that it is properly
|
|
||||||
generated and unique for different medias.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
|
class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
|
||||||
image_embeds: Required[Union[str, dict[str, str]]]
|
image_embeds: Optional[Union[str, dict[str, str]]]
|
||||||
"""
|
"""
|
||||||
The image embeddings. It can be either:
|
The image embeddings. It can be either:
|
||||||
- A single base64 string.
|
- A single base64 string.
|
||||||
@ -108,11 +103,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
|
|||||||
|
|
||||||
type: Required[Literal["video_url"]]
|
type: Required[Literal["video_url"]]
|
||||||
"""The type of the content part."""
|
"""The type of the content part."""
|
||||||
uuid: Optional[str]
|
|
||||||
"""
|
|
||||||
User-provided UUID of a media. User must guarantee that it is properly
|
|
||||||
generated and unique for different medias.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class PILImage(BaseModel):
|
class PILImage(BaseModel):
|
||||||
@ -133,7 +123,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
image_pil: Required[PILImage]
|
image_pil: Optional[PILImage]
|
||||||
uuid: Optional[str]
|
uuid: Optional[str]
|
||||||
"""
|
"""
|
||||||
User-provided UUID of a media. User must guarantee that it is properly
|
User-provided UUID of a media. User must guarantee that it is properly
|
||||||
@ -151,7 +141,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
image_url: Required[str]
|
image_url: Optional[str]
|
||||||
uuid: Optional[str]
|
uuid: Optional[str]
|
||||||
"""
|
"""
|
||||||
User-provided UUID of a media. User must guarantee that it is properly
|
User-provided UUID of a media. User must guarantee that it is properly
|
||||||
@ -168,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
audio_url: Required[str]
|
audio_url: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
|
class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
|
||||||
@ -180,7 +170,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
video_url: Required[str]
|
video_url: Optional[str]
|
||||||
uuid: Optional[str]
|
uuid: Optional[str]
|
||||||
"""
|
"""
|
||||||
User-provided UUID of a media. User must guarantee that it is properly
|
User-provided UUID of a media. User must guarantee that it is properly
|
||||||
@ -597,7 +587,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
|||||||
self._model_config = model_config
|
self._model_config = model_config
|
||||||
self._tokenizer = tokenizer
|
self._tokenizer = tokenizer
|
||||||
|
|
||||||
self._items_by_modality = defaultdict[str, list[_T]](list)
|
self._items_by_modality = defaultdict[str, list[Optional[_T]]](list)
|
||||||
self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)
|
self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -624,7 +614,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
|||||||
return self.mm_registry.create_processor(self.model_config)
|
return self.mm_registry.create_processor(self.model_config)
|
||||||
|
|
||||||
def add(
|
def add(
|
||||||
self, modality: ModalityStr, item: _T, uuid: Optional[str] = None
|
self,
|
||||||
|
modality: ModalityStr,
|
||||||
|
item: Optional[_T],
|
||||||
|
uuid: Optional[str] = None,
|
||||||
) -> Optional[str]:
|
) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Add a multi-modal item to the current prompt and returns the
|
Add a multi-modal item to the current prompt and returns the
|
||||||
@ -708,10 +701,15 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
|
|||||||
if not self._items_by_modality:
|
if not self._items_by_modality:
|
||||||
return None
|
return None
|
||||||
mm_inputs = {}
|
mm_inputs = {}
|
||||||
items_by_modality = {
|
items_by_modality = {}
|
||||||
modality: await asyncio.gather(*items)
|
for modality, items in self._items_by_modality.items():
|
||||||
for modality, items in self._items_by_modality.items()
|
coros = []
|
||||||
}
|
for item in items:
|
||||||
|
if item is not None:
|
||||||
|
coros.append(item)
|
||||||
|
else:
|
||||||
|
coros.append(asyncio.sleep(0))
|
||||||
|
items_by_modality[modality] = await asyncio.gather(*coros)
|
||||||
|
|
||||||
if "image" in items_by_modality and "image_embeds" in items_by_modality:
|
if "image" in items_by_modality and "image_embeds" in items_by_modality:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -760,35 +758,40 @@ class BaseMultiModalContentParser(ABC):
|
|||||||
return dict(self._placeholder_storage)
|
return dict(self._placeholder_storage)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
|
def parse_image(
|
||||||
|
self, image_url: Optional[str], uuid: Optional[str] = None) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def parse_image_embeds(
|
def parse_image_embeds(
|
||||||
self,
|
self,
|
||||||
image_embeds: Union[str, dict[str, str]],
|
image_embeds: Union[str, dict[str, str], None],
|
||||||
uuid: Optional[str] = None,
|
uuid: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def parse_image_pil(
|
def parse_image_pil(
|
||||||
self, image_pil: Image.Image, uuid: Optional[str] = None
|
self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
|
def parse_audio(
|
||||||
|
self, audio_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def parse_input_audio(
|
def parse_input_audio(
|
||||||
self, input_audio: InputAudio, uuid: Optional[str] = None
|
self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
|
def parse_video(
|
||||||
|
self, video_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
@ -803,15 +806,17 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
|
def parse_image(
|
||||||
image = self._connector.fetch_image(image_url)
|
self, image_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
image = self._connector.fetch_image(image_url) if image_url else None
|
||||||
|
|
||||||
placeholder = self._tracker.add("image", image, uuid)
|
placeholder = self._tracker.add("image", image, uuid)
|
||||||
self._add_placeholder("image", placeholder)
|
self._add_placeholder("image", placeholder)
|
||||||
|
|
||||||
def parse_image_embeds(
|
def parse_image_embeds(
|
||||||
self,
|
self,
|
||||||
image_embeds: Union[str, dict[str, str]],
|
image_embeds: Union[str, dict[str, str], None],
|
||||||
uuid: Optional[str] = None,
|
uuid: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
if isinstance(image_embeds, dict):
|
if isinstance(image_embeds, dict):
|
||||||
@ -825,31 +830,49 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
embedding = self._connector.fetch_image_embedding(image_embeds)
|
embedding = self._connector.fetch_image_embedding(image_embeds)
|
||||||
placeholder = self._tracker.add("image_embeds", embedding, uuid)
|
placeholder = self._tracker.add("image_embeds", embedding, uuid)
|
||||||
|
|
||||||
|
if image_embeds is None:
|
||||||
|
placeholder = self._tracker.add("image_embeds", None, uuid)
|
||||||
|
|
||||||
self._add_placeholder("image", placeholder)
|
self._add_placeholder("image", placeholder)
|
||||||
|
|
||||||
def parse_image_pil(
|
def parse_image_pil(
|
||||||
self, image_pil: Image.Image, uuid: Optional[str] = None
|
self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
placeholder = self._tracker.add("image", image_pil, uuid)
|
placeholder = self._tracker.add("image", image_pil, uuid)
|
||||||
self._add_placeholder("image", placeholder)
|
self._add_placeholder("image", placeholder)
|
||||||
|
|
||||||
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
|
def parse_audio(
|
||||||
audio = self._connector.fetch_audio(audio_url)
|
self, audio_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
audio = self._connector.fetch_audio(audio_url) if audio_url else None
|
||||||
|
|
||||||
placeholder = self._tracker.add("audio", audio, uuid)
|
placeholder = self._tracker.add("audio", audio, uuid)
|
||||||
self._add_placeholder("audio", placeholder)
|
self._add_placeholder("audio", placeholder)
|
||||||
|
|
||||||
def parse_input_audio(
|
def parse_input_audio(
|
||||||
self, input_audio: InputAudio, uuid: Optional[str] = None
|
self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
audio_data = input_audio.get("data", "")
|
if input_audio:
|
||||||
audio_format = input_audio.get("format", "")
|
audio_data = input_audio.get("data", "")
|
||||||
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
|
audio_format = input_audio.get("format", "")
|
||||||
|
if audio_data:
|
||||||
|
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
|
||||||
|
else:
|
||||||
|
# If a UUID is provided, audio data may be empty.
|
||||||
|
audio_url = None
|
||||||
|
else:
|
||||||
|
audio_url = None
|
||||||
|
|
||||||
return self.parse_audio(audio_url, uuid)
|
return self.parse_audio(audio_url, uuid)
|
||||||
|
|
||||||
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
|
def parse_video(
|
||||||
video = self._connector.fetch_video(video_url=video_url)
|
self, video_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
video = (
|
||||||
|
self._connector.fetch_video(video_url=video_url)
|
||||||
|
if video_url
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
placeholder = self._tracker.add("video", video, uuid)
|
placeholder = self._tracker.add("video", video, uuid)
|
||||||
self._add_placeholder("video", placeholder)
|
self._add_placeholder("video", placeholder)
|
||||||
@ -865,18 +888,24 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
|
def parse_image(
|
||||||
image_coro = self._connector.fetch_image_async(image_url)
|
self, image_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
image_coro = (
|
||||||
|
self._connector.fetch_image_async(image_url) if image_url else None
|
||||||
|
)
|
||||||
|
|
||||||
placeholder = self._tracker.add("image", image_coro, uuid)
|
placeholder = self._tracker.add("image", image_coro, uuid)
|
||||||
self._add_placeholder("image", placeholder)
|
self._add_placeholder("image", placeholder)
|
||||||
|
|
||||||
def parse_image_embeds(
|
def parse_image_embeds(
|
||||||
self,
|
self,
|
||||||
image_embeds: Union[str, dict[str, str]],
|
image_embeds: Union[str, dict[str, str], None],
|
||||||
uuid: Optional[str] = None,
|
uuid: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
|
future: asyncio.Future[Union[str, dict[str, str], None]] = (
|
||||||
|
asyncio.Future()
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(image_embeds, dict):
|
if isinstance(image_embeds, dict):
|
||||||
embeds = {
|
embeds = {
|
||||||
@ -889,35 +918,58 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
embedding = self._connector.fetch_image_embedding(image_embeds)
|
embedding = self._connector.fetch_image_embedding(image_embeds)
|
||||||
future.set_result(embedding)
|
future.set_result(embedding)
|
||||||
|
|
||||||
|
if image_embeds is None:
|
||||||
|
future.set_result(None)
|
||||||
|
|
||||||
placeholder = self._tracker.add("image_embeds", future, uuid)
|
placeholder = self._tracker.add("image_embeds", future, uuid)
|
||||||
self._add_placeholder("image", placeholder)
|
self._add_placeholder("image", placeholder)
|
||||||
|
|
||||||
def parse_image_pil(
|
def parse_image_pil(
|
||||||
self, image_pil: Image.Image, uuid: Optional[str] = None
|
self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
future: asyncio.Future[Image.Image] = asyncio.Future()
|
future: asyncio.Future[Optional[Image.Image]] = asyncio.Future()
|
||||||
future.set_result(image_pil)
|
if image_pil:
|
||||||
|
future.set_result(image_pil)
|
||||||
|
else:
|
||||||
|
future.set_result(None)
|
||||||
|
|
||||||
placeholder = self._tracker.add("image", future, uuid)
|
placeholder = self._tracker.add("image", future, uuid)
|
||||||
self._add_placeholder("image", placeholder)
|
self._add_placeholder("image", placeholder)
|
||||||
|
|
||||||
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
|
def parse_audio(
|
||||||
audio_coro = self._connector.fetch_audio_async(audio_url)
|
self, audio_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
audio_coro = (
|
||||||
|
self._connector.fetch_audio_async(audio_url) if audio_url else None
|
||||||
|
)
|
||||||
|
|
||||||
placeholder = self._tracker.add("audio", audio_coro, uuid)
|
placeholder = self._tracker.add("audio", audio_coro, uuid)
|
||||||
self._add_placeholder("audio", placeholder)
|
self._add_placeholder("audio", placeholder)
|
||||||
|
|
||||||
def parse_input_audio(
|
def parse_input_audio(
|
||||||
self, input_audio: InputAudio, uuid: Optional[str] = None
|
self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
audio_data = input_audio.get("data", "")
|
if input_audio:
|
||||||
audio_format = input_audio.get("format", "")
|
audio_data = input_audio.get("data", "")
|
||||||
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
|
audio_format = input_audio.get("format", "")
|
||||||
|
if audio_data:
|
||||||
|
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
|
||||||
|
else:
|
||||||
|
# If a UUID is provided, audio data may be empty.
|
||||||
|
audio_url = None
|
||||||
|
else:
|
||||||
|
audio_url = None
|
||||||
|
|
||||||
return self.parse_audio(audio_url, uuid)
|
return self.parse_audio(audio_url, uuid)
|
||||||
|
|
||||||
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
|
def parse_video(
|
||||||
video = self._connector.fetch_video_async(video_url=video_url)
|
self, video_url: Optional[str], uuid: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
|
video = (
|
||||||
|
self._connector.fetch_video_async(video_url=video_url)
|
||||||
|
if video_url
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
placeholder = self._tracker.add("video", video, uuid)
|
placeholder = self._tracker.add("video", video, uuid)
|
||||||
self._add_placeholder("video", placeholder)
|
self._add_placeholder("video", placeholder)
|
||||||
@ -1130,8 +1182,9 @@ def _parse_chat_message_content_mm_part(
|
|||||||
part, dict
|
part, dict
|
||||||
) # This is needed to avoid mypy errors: part.get() from str
|
) # This is needed to avoid mypy errors: part.get() from str
|
||||||
part_type = part.get("type", None)
|
part_type = part.get("type", None)
|
||||||
|
uuid = part.get("uuid", None)
|
||||||
|
|
||||||
if isinstance(part_type, str) and part_type in MM_PARSER_MAP:
|
if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501
|
||||||
content = MM_PARSER_MAP[part_type](part)
|
content = MM_PARSER_MAP[part_type](part)
|
||||||
|
|
||||||
# Special case for 'image_url.detail'
|
# Special case for 'image_url.detail'
|
||||||
@ -1146,25 +1199,54 @@ def _parse_chat_message_content_mm_part(
|
|||||||
|
|
||||||
# Handle missing 'type' but provided direct URL fields.
|
# Handle missing 'type' but provided direct URL fields.
|
||||||
# 'type' is required field by pydantic
|
# 'type' is required field by pydantic
|
||||||
if part_type is None:
|
if part_type is None or uuid is not None:
|
||||||
if part.get("image_url") is not None:
|
if "image_url" in part:
|
||||||
image_params = cast(
|
image_params = cast(
|
||||||
CustomChatCompletionContentSimpleImageParam, part
|
CustomChatCompletionContentSimpleImageParam, part
|
||||||
)
|
)
|
||||||
return "image_url", image_params.get("image_url", "")
|
image_url = image_params.get("image_url", None)
|
||||||
if part.get("audio_url") is not None:
|
if isinstance(image_url, dict):
|
||||||
|
# Can potentially happen if user provides a uuid
|
||||||
|
# with url as a dict of {"url": url}
|
||||||
|
image_url = image_url.get("url", None)
|
||||||
|
return "image_url", image_url
|
||||||
|
if "image_pil" in part:
|
||||||
|
# "image_pil" could be None if UUID is provided.
|
||||||
|
image_params = cast( # type: ignore
|
||||||
|
CustomChatCompletionContentPILImageParam, part
|
||||||
|
)
|
||||||
|
image_pil = image_params.get("image_pil", None)
|
||||||
|
return "image_pil", image_pil
|
||||||
|
if "image_embeds" in part:
|
||||||
|
# "image_embeds" could be None if UUID is provided.
|
||||||
|
image_params = cast( # type: ignore
|
||||||
|
ChatCompletionContentPartImageEmbedsParam, part
|
||||||
|
)
|
||||||
|
image_embeds = image_params.get("image_embeds", None)
|
||||||
|
return "image_embeds", image_embeds
|
||||||
|
if "audio_url" in part:
|
||||||
audio_params = cast(
|
audio_params = cast(
|
||||||
CustomChatCompletionContentSimpleAudioParam, part
|
CustomChatCompletionContentSimpleAudioParam, part
|
||||||
)
|
)
|
||||||
return "audio_url", audio_params.get("audio_url", "")
|
audio_url = audio_params.get("audio_url", None)
|
||||||
|
if isinstance(audio_url, dict):
|
||||||
|
# Can potentially happen if user provides a uuid
|
||||||
|
# with url as a dict of {"url": url}
|
||||||
|
audio_url = audio_url.get("url", None)
|
||||||
|
return "audio_url", audio_url
|
||||||
if part.get("input_audio") is not None:
|
if part.get("input_audio") is not None:
|
||||||
input_audio_params = cast(dict[str, str], part)
|
input_audio_params = cast(dict[str, str], part)
|
||||||
return "input_audio", input_audio_params
|
return "input_audio", input_audio_params
|
||||||
if part.get("video_url") is not None:
|
if "video_url" in part:
|
||||||
video_params = cast(
|
video_params = cast(
|
||||||
CustomChatCompletionContentSimpleVideoParam, part
|
CustomChatCompletionContentSimpleVideoParam, part
|
||||||
)
|
)
|
||||||
return "video_url", video_params.get("video_url", "")
|
video_url = video_params.get("video_url", None)
|
||||||
|
if isinstance(video_url, dict):
|
||||||
|
# Can potentially happen if user provides a uuid
|
||||||
|
# with url as a dict of {"url": url}
|
||||||
|
video_url = video_url.get("url", None)
|
||||||
|
return "video_url", video_url
|
||||||
# Raise an error if no 'type' or direct URL is found.
|
# Raise an error if no 'type' or direct URL is found.
|
||||||
raise ValueError("Missing 'type' field in multimodal part.")
|
raise ValueError("Missing 'type' field in multimodal part.")
|
||||||
|
|
||||||
@ -1173,15 +1255,9 @@ def _parse_chat_message_content_mm_part(
|
|||||||
return part_type, "unknown part_type content"
|
return part_type, "unknown part_type content"
|
||||||
|
|
||||||
|
|
||||||
VALID_MESSAGE_CONTENT_MM_PART_TYPES = (
|
PART_TYPES_TO_SKIP_NONE_CONTENT = (
|
||||||
"text",
|
"text",
|
||||||
"refusal",
|
"refusal",
|
||||||
"image_url",
|
|
||||||
"image_embeds",
|
|
||||||
"image_pil",
|
|
||||||
"audio_url",
|
|
||||||
"input_audio",
|
|
||||||
"video_url",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -1242,7 +1318,7 @@ def _parse_chat_message_content_part(
|
|||||||
part_type, content = _parse_chat_message_content_mm_part(part)
|
part_type, content = _parse_chat_message_content_mm_part(part)
|
||||||
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
|
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
|
||||||
# content is None, log a warning and skip
|
# content is None, log a warning and skip
|
||||||
if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
|
if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Skipping multimodal part '%s' (type: '%s') "
|
"Skipping multimodal part '%s' (type: '%s') "
|
||||||
"with empty / unparsable content.",
|
"with empty / unparsable content.",
|
||||||
@ -1266,7 +1342,10 @@ def _parse_chat_message_content_part(
|
|||||||
|
|
||||||
modality = None
|
modality = None
|
||||||
if part_type == "image_pil":
|
if part_type == "image_pil":
|
||||||
image_content = cast(Image.Image, content)
|
if content is not None:
|
||||||
|
image_content = cast(Image.Image, content)
|
||||||
|
else:
|
||||||
|
image_content = None
|
||||||
mm_parser.parse_image_pil(image_content, uuid)
|
mm_parser.parse_image_pil(image_content, uuid)
|
||||||
modality = "image"
|
modality = "image"
|
||||||
elif part_type in ("image_url", "input_image"):
|
elif part_type in ("image_url", "input_image"):
|
||||||
@ -1274,7 +1353,10 @@ def _parse_chat_message_content_part(
|
|||||||
mm_parser.parse_image(str_content, uuid)
|
mm_parser.parse_image(str_content, uuid)
|
||||||
modality = "image"
|
modality = "image"
|
||||||
elif part_type == "image_embeds":
|
elif part_type == "image_embeds":
|
||||||
content = cast(Union[str, dict[str, str]], content)
|
if content is not None:
|
||||||
|
content = cast(Union[str, dict[str, str]], content)
|
||||||
|
else:
|
||||||
|
content = None
|
||||||
mm_parser.parse_image_embeds(content, uuid)
|
mm_parser.parse_image_embeds(content, uuid)
|
||||||
modality = "image"
|
modality = "image"
|
||||||
elif part_type == "audio_url":
|
elif part_type == "audio_url":
|
||||||
|
|||||||
@ -1491,6 +1491,11 @@ class LLM:
|
|||||||
|
|
||||||
for i, prompt in enumerate(it):
|
for i, prompt in enumerate(it):
|
||||||
|
|
||||||
|
if isinstance(prompt, dict):
|
||||||
|
self._validate_mm_data_and_uuids(
|
||||||
|
prompt.get("multi_modal_data"),
|
||||||
|
prompt.get("multi_modal_uuids"))
|
||||||
|
|
||||||
param = params[i] if isinstance(params, Sequence) else params
|
param = params[i] if isinstance(params, Sequence) else params
|
||||||
|
|
||||||
tokenization_kwargs: dict[str, Any] = {}
|
tokenization_kwargs: dict[str, Any] = {}
|
||||||
@ -1507,6 +1512,41 @@ class LLM:
|
|||||||
priority=priority[i] if priority else 0,
|
priority=priority[i] if priority else 0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _validate_mm_data_and_uuids(
|
||||||
|
self,
|
||||||
|
multi_modal_data: Optional[Any], # MultiModalDataDict
|
||||||
|
multi_modal_uuids: Optional[Any], # MultiModalUUIDDict
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Validate that if any multi-modal data is skipped (i.e. None),
|
||||||
|
then its corresponding UUID must be set.
|
||||||
|
"""
|
||||||
|
if multi_modal_data is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
for modality, data in multi_modal_data.items():
|
||||||
|
if isinstance(data, list):
|
||||||
|
for i, d in enumerate(data):
|
||||||
|
if d is None:
|
||||||
|
if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[ # noqa: E501
|
||||||
|
modality] is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"Multi-modal data for {modality} is None "
|
||||||
|
f"but UUID is not provided")
|
||||||
|
else:
|
||||||
|
if len(
|
||||||
|
multi_modal_uuids[modality]
|
||||||
|
) <= i or multi_modal_uuids[modality][i] is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"Multi-modal data for {modality} is None "
|
||||||
|
f"but UUID is not provided")
|
||||||
|
else:
|
||||||
|
if data is None and (multi_modal_uuids is None
|
||||||
|
or modality not in multi_modal_uuids
|
||||||
|
or multi_modal_uuids[modality] is None):
|
||||||
|
raise ValueError(f"Multi-modal data for {modality} is None"
|
||||||
|
f" but UUID is not provided")
|
||||||
|
|
||||||
def _add_request(
|
def _add_request(
|
||||||
self,
|
self,
|
||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
|
|||||||
@ -85,9 +85,10 @@ which are treated as audio embeddings;
|
|||||||
these are directly passed to the model without HF processing.
|
these are directly passed to the model without HF processing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ModalityData: TypeAlias = Union[_T, list[_T]]
|
ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None]
|
||||||
"""
|
"""
|
||||||
Either a single data item, or a list of data items.
|
Either a single data item, or a list of data items. Can only be None if UUID
|
||||||
|
is provided.
|
||||||
|
|
||||||
The number of data items allowed per modality is restricted by
|
The number of data items allowed per modality is restricted by
|
||||||
`--limit-mm-per-prompt`.
|
`--limit-mm-per-prompt`.
|
||||||
|
|||||||
@ -36,7 +36,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
|
|||||||
def __init__(self, data: _T, modality: str) -> None:
|
def __init__(self, data: _T, modality: str) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.data = data
|
self.data: _T = data
|
||||||
self.modality = modality
|
self.modality = modality
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
@ -177,7 +177,9 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],
|
|||||||
|
|
||||||
class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
|
class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
|
||||||
|
|
||||||
def __init__(self, data: Sequence[HfAudioItem]) -> None:
|
def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None:
|
||||||
|
if data is None:
|
||||||
|
data = [None]
|
||||||
super().__init__(data, "audio")
|
super().__init__(data, "audio")
|
||||||
|
|
||||||
def get_audio_length(self, item_idx: int) -> int:
|
def get_audio_length(self, item_idx: int) -> int:
|
||||||
@ -198,7 +200,9 @@ class ImageSize(NamedTuple):
|
|||||||
|
|
||||||
class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
|
class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
|
||||||
|
|
||||||
def __init__(self, data: Sequence[HfImageItem]) -> None:
|
def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None:
|
||||||
|
if data is None:
|
||||||
|
data = [None]
|
||||||
super().__init__(data, "image")
|
super().__init__(data, "image")
|
||||||
|
|
||||||
def get_image_size(self, item_idx: int) -> ImageSize:
|
def get_image_size(self, item_idx: int) -> ImageSize:
|
||||||
@ -223,10 +227,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
data: Sequence[HfVideoItem],
|
data: Optional[Sequence[HfVideoItem]],
|
||||||
metadata: Optional[Union[dict[str, Any],
|
metadata: Optional[Union[dict[str, Any],
|
||||||
list[Optional[dict[str, Any]]]]] = None,
|
list[Optional[dict[str, Any]]]]] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if data is None:
|
||||||
|
data = [None]
|
||||||
super().__init__(data, "video")
|
super().__init__(data, "video")
|
||||||
self.metadata = metadata
|
self.metadata = metadata
|
||||||
|
|
||||||
@ -385,6 +391,9 @@ class MultiModalDataParser:
|
|||||||
self,
|
self,
|
||||||
data: ModalityData[AudioItem],
|
data: ModalityData[AudioItem],
|
||||||
) -> Optional[ModalityDataItems[Any, Any]]:
|
) -> Optional[ModalityDataItems[Any, Any]]:
|
||||||
|
if data is None:
|
||||||
|
return AudioProcessorItems(None)
|
||||||
|
|
||||||
# also check single audio item with sampling rate
|
# also check single audio item with sampling rate
|
||||||
if self._is_empty(data) or (isinstance(data, tuple)
|
if self._is_empty(data) or (isinstance(data, tuple)
|
||||||
and self._is_empty(data[0])):
|
and self._is_empty(data[0])):
|
||||||
@ -420,6 +429,9 @@ class MultiModalDataParser:
|
|||||||
self,
|
self,
|
||||||
data: ModalityData[ImageItem],
|
data: ModalityData[ImageItem],
|
||||||
) -> Optional[ModalityDataItems[Any, Any]]:
|
) -> Optional[ModalityDataItems[Any, Any]]:
|
||||||
|
if data is None:
|
||||||
|
return ImageProcessorItems(None)
|
||||||
|
|
||||||
if self._is_empty(data):
|
if self._is_empty(data):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -441,6 +453,9 @@ class MultiModalDataParser:
|
|||||||
self,
|
self,
|
||||||
data: ModalityData[VideoItem],
|
data: ModalityData[VideoItem],
|
||||||
) -> Optional[ModalityDataItems[Any, Any]]:
|
) -> Optional[ModalityDataItems[Any, Any]]:
|
||||||
|
if data is None:
|
||||||
|
return VideoProcessorItems(None)
|
||||||
|
|
||||||
if self._is_empty(data):
|
if self._is_empty(data):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@ -1075,7 +1075,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
|
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
|
||||||
"""
|
"""
|
||||||
mm_items = self.data_parser.parse_mm_data(mm_data)
|
mm_items = self.data_parser.parse_mm_data(mm_data)
|
||||||
|
|
||||||
for modality, items in mm_items.items():
|
for modality, items in mm_items.items():
|
||||||
self.validate_num_items(modality, len(items))
|
self.validate_num_items(modality, len(items))
|
||||||
|
|
||||||
@ -1436,10 +1435,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
]
|
]
|
||||||
for modality, items_is_cached in mm_is_cached.items()
|
for modality, items_is_cached in mm_is_cached.items()
|
||||||
}
|
}
|
||||||
mm_missing_data = {
|
mm_missing_data = {}
|
||||||
modality: [mm_data_items[modality][idx] for idx in idxs]
|
for modality, idxs in mm_missing_idxs.items():
|
||||||
for modality, idxs in mm_missing_idxs.items()
|
missing_modality_data = []
|
||||||
}
|
for idx in idxs:
|
||||||
|
data = mm_data_items[modality][idx]
|
||||||
|
if data is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"Cache miss for {modality} at index {idx} "
|
||||||
|
f"but data is not provided.")
|
||||||
|
else:
|
||||||
|
missing_modality_data.append(data)
|
||||||
|
mm_missing_data[modality] = missing_modality_data
|
||||||
|
|
||||||
return self._to_mm_items(mm_missing_data)
|
return self._to_mm_items(mm_missing_data)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user