diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 8cc51a5d73b3..a00387ef6b8c 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -30,6 +30,7 @@ QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
 MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
 HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
+MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
 
 
 @pytest.fixture(scope="function")
@@ -80,6 +81,30 @@ def mllama_tokenizer():
     )
 
 
+@pytest.fixture(scope="function")
+def mistral_model_config():
+    return ModelConfig(MISTRAL_MODEL_ID,
+                       task="generate",
+                       tokenizer=MISTRAL_MODEL_ID,
+                       tokenizer_mode="auto",
+                       trust_remote_code=True,
+                       dtype="auto",
+                       seed=0,
+                       limit_mm_per_prompt={
+                           "image": 2,
+                       })
+
+
+@pytest.fixture(scope="module")
+def mistral_tokenizer():
+    return TokenizerGroup(
+        tokenizer_id=MISTRAL_MODEL_ID,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+
+
 @pytest.fixture(scope="module")
 def image_url():
     image = ImageAsset('cherry_blossom')
@@ -131,6 +156,66 @@ def test_parse_chat_messages_single_image(
     _assert_mm_data_is_image_input(mm_data, 1)
 
 
+def test_parse_chat_messages_empty_system(
+    mistral_model_config,
+    mistral_tokenizer,
+):
+    # Test string format
+    conversation, _ = parse_chat_messages(
+        [{
+            "role": "system",
+            "content": ""
+        }, {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "Who are you?"
+            }]
+        }],
+        mistral_model_config,
+        mistral_tokenizer,
+        content_format="string",
+    )
+    assert conversation == [{
+        "role": "system",
+        "content": ""
+    }, {
+        "role": "user",
+        "content": "Who are you?"
+    }]
+
+    # Test openai format
+    conversation, _ = parse_chat_messages(
+        [{
+            "role": "system",
+            "content": ""
+        }, {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "Who are you?"
+            }]
+        }],
+        mistral_model_config,
+        mistral_tokenizer,
+        content_format="openai",
+    )
+    assert conversation == [{
+        "role": "system",
+        "content": [{
+            "type": "text",
+            "text": ""
+        }]
+    }, {
+        "role":
+        "user",
+        "content": [{
+            "type": "text",
+            "text": "Who are you?"
+        }]
+    }]
+
+
 @pytest.mark.asyncio
 async def test_parse_chat_messages_single_image_async(
     phi3v_model_config,
@@ -671,7 +756,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
     # Build a config for the model
     model_config = ModelConfig(model,
                                task="generate",
-                               tokenizer=MLLAMA_MODEL_ID,
+                               tokenizer=model,
                                tokenizer_mode="auto",
                                trust_remote_code=True,
                                dtype="auto",
@@ -682,7 +767,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
 
     # Build the tokenizer group and grab the underlying tokenizer
     tokenizer_group = TokenizerGroup(
-        MLLAMA_MODEL_ID,
+        model,
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index d7e8d045108e..11c759a6174e 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -872,19 +872,19 @@ MM_PARSER_MAP: dict[
     Callable[[ChatCompletionContentPartParam], _ContentPart],
 ] = {
     "text":
-    lambda part: _TextParser(part).get("text", ""),
+    lambda part: _TextParser(part).get("text", None),
     "image_url":
-    lambda part: _ImageParser(part).get("image_url", {}).get("url", ""),
+    lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
-    lambda part: _ImageEmbedsParser(part).get("image_embeds", {}),
+    lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
     "audio_url":
-    lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""),
+    lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
-    lambda part: _InputAudioParser(part).get("input_audio", {}),
+    lambda part: _InputAudioParser(part).get("input_audio", None),
     "refusal":
-    lambda part: _RefusalParser(part).get("refusal", ""),
+    lambda part: _RefusalParser(part).get("refusal", None),
     "video_url":
-    lambda part: _VideoParser(part).get("video_url", {}).get("url", ""),
+    lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
 }
 
 
@@ -1003,11 +1003,11 @@ def _parse_chat_message_content_part(
     part_type, content = _parse_chat_message_content_mm_part(part)
 
     # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
-    # content is empty, log a warning and skip
-    if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
+    # content is None, log a warning and skip
+    if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
         logger.warning(
-            "Skipping multimodal part (type: '%s') "
-            "with empty / unparsable content.", part_type)
+            "Skipping multimodal part '%s' (type: '%s') "
+            "with empty / unparsable content.", part, part_type)
         return None
 
     if part_type in ("text", "refusal"):