Delete HF version of Phi 4 MM (#30049)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-12-09 01:35:01 +08:00 · 2025-12-04 13:44:50 +00:00 · 2025-12-04 13:44:50 +00:00 · 9998ea5b57
commit 9998ea5b57
parent 74c4d80c6c
6 changed files with 1 additions and 1756 deletions
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -711,7 +711,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
-| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
 | `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
--- a/tests/models/multimodal/generation/test_phi4_multimodal.py
+++ b/tests/models/multimodal/generation/test_phi4_multimodal.py
@ -1,281 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-from collections.abc import Sequence
-
-import librosa
-import pytest
-from huggingface_hub import snapshot_download
-
-from vllm.assets.image import ImageAsset
-from vllm.lora.request import LoRARequest
-from vllm.multimodal.image import rescale_image_size
-
-from ....conftest import (
-    IMAGE_ASSETS,
-    HfRunner,
-    PromptAudioInput,
-    PromptImageInput,
-    VllmRunner,
-)
-from ....utils import large_gpu_test
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
-    {
-        "stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-        "cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
-    }
-)
-HF_MULTIIMAGE_IMAGE_PROMPT = (
-    "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
-)
-
-model_path = snapshot_download(
-    "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
-)
-# Since the vision-lora and speech-lora co-exist with the base model,
-# we have to manually specify the path of the lora weights.
-vision_lora_path = os.path.join(model_path, "vision-lora")
-speech_question = os.path.join(
-    model_path, "examples", "what_is_shown_in_this_image.wav"
-)
-models = [model_path]
-
-target_dtype = "half"
-
-
-def run_test(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
-    model: str,
-    *,
-    max_model_len: int,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    mm_limit: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: str | None = None,
-):
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test are from IMAGE_ASSETS.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(
-        model,
-        task="generate",
-        max_model_len=max_model_len,
-        max_num_seqs=2,
-        dtype=dtype,
-        limit_mm_per_prompt={"image": mm_limit},
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-        enable_lora=True,
-        max_lora_rank=320,
-        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
-        enforce_eager=True,
-        trust_remote_code=False,
-    ) as vllm_model:
-        lora_request = LoRARequest("vision", 1, vision_lora_path)
-        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images,
-                audios=audios,
-                lora_request=lora_request,
-            )
-            for prompts, images, audios in inputs
-        ]
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_model.model.load_adapter(
-            vision_lora_path,
-            adapter_name="vision",
-        )
-        hf_processor = hf_model.processor
-        eos_token_id = hf_processor.tokenizer.eos_token_id
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images,
-                audios=audios,
-                eos_token_id=eos_token_id,
-            )
-            for prompts, images, audios in inputs
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [12800])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    image_assets,
-    model,
-    size_factors,
-    dtype: str,
-    max_model_len: int,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [
-        (
-            [prompt for _ in size_factors],
-            [rescale_image_size(image, factor) for factor in size_factors],
-            None,
-        )
-        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
-    ]
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_image,
-        model,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
-
-
-@large_gpu_test(min_gb=48)
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        # [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [25600])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_multi_images_models(
-    hf_runner,
-    vllm_runner,
-    image_assets,
-    model,
-    size_factors,
-    dtype: str,
-    max_model_len: int,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_case = [
-        (
-            [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-            [
-                [rescale_image_size(image, factor) for image in images]
-                for factor in size_factors
-            ],
-            None,
-        ),
-    ]
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_per_case,
-        model,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=2,
-        tensor_parallel_size=1,
-    )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [12800])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_vision_speech_models(
-    hf_runner,
-    vllm_runner,
-    model,
-    dtype: str,
-    max_model_len: int,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    # use the example speech question so that the model outputs are reasonable
-    audio = librosa.load(speech_question, sr=16000)
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-
-    inputs_vision_speech = [
-        (
-            ["<|user|><|image|><|audio|><|end|><|assistant|>"],
-            [image],
-            [audio],
-        ),
-    ]
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        inputs_vision_speech,
-        model,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@ -396,28 +396,6 @@ def test_processing_correctness(
    )


-# Phi4MultimodalForCausalLM share same model repo with original format
-# Phi4MMForCausalLM, so we add it as a separate test case
-# Remove this test after conversion PR merged:
-# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
-@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-def test_processing_correctness_phi4_multimodal(
-    model_arch: str,
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    _test_processing_correctness(
-        model_arch,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-    )
-
-
 def _assert_inputs_equal(
    a: MultiModalInputs,
    b: MultiModalInputs,
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -771,10 +771,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "Phi4MMForCausalLM": _HfExamplesInfo(
        "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
    ),
-    "Phi4MultimodalForCausalLM": _HfExamplesInfo(
-        "microsoft/Phi-4-multimodal-instruct",
-        revision="refs/pr/70",
-    ),
    "PixtralForConditionalGeneration": _HfExamplesInfo(
        "mistralai/Pixtral-12B-2409",
        extras={
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@ -374,7 +374,6 @@ _MULTIMODAL_MODELS = {
    ),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
-    "Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"),  # noqa: E501
    "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
    "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),  # noqa: E501
    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
@ -507,6 +506,7 @@ _PREVIOUSLY_SUPPORTED_MODELS = {
    "MotifForCausalLM": "0.10.2",
    "Phi3SmallForCausalLM": "0.9.2",
    "Phi4FlashForCausalLM": "0.10.2",
+    "Phi4MultimodalForCausalLM": "0.12.0",
    # encoder-decoder models except whisper
    # have been removed for V0 deprecation.
    "BartModel": "0.10.2",