[VLM] Clean up models (#16873)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-12-12 08:35:01 +08:00 · 2025-04-19 20:13:06 +08:00 · 2025-04-19 20:13:06 +08:00 · 205d84aaa9
commit 205d84aaa9
parent 5124f5bf51
4 changed files with 2 additions and 42 deletions
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace):
        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
        config_format="mistral" if args.format == "mistral" else "auto",
        load_format="mistral" if args.format == "mistral" else "auto",
        limit_mm_per_prompt={"image": 1},
        max_model_len=4096,
        max_num_seqs=2,
        tensor_parallel_size=2,
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -957,7 +957,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )
    if modality == "image":
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -503,26 +503,6 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": None, "image": None}
    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        return {
            "image": self.get_max_image_tokens(),
            "audio": self.get_max_audio_tokens(),
        }
    def get_max_audio_tokens(self) -> int:
        sr = self.get_feature_extractor().sampling_rate
        num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr)
        return self._compute_audio_embed_size(num_frames)
    def get_max_image_tokens(self) -> int:
        target_width, target_height = self.get_image_size_with_most_features()
        return self.get_num_image_tokens(image_width=target_width,
                                         image_height=target_height)
    def _find_target_aspect_ratio(
        self,
        orig_width: int,
@@ -764,9 +744,6 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
        num_audios = mm_counts.get("audio", 0)
        num_images = mm_counts.get("image", 0)
        target_width, target_height = \
            self.info.get_image_size_with_most_features()
        target_width, target_height = \
            self.info.get_image_size_with_most_features()
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -172,26 +172,9 @@ class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo,
        assert isinstance(feature_extractor, WhisperFeatureExtractor)
        return feature_extractor
    def get_max_audio_tokens(self) -> int:
        hf_config = self.get_hf_config()
        max_source_position = hf_config.audio_config.max_source_positions
        output_lengths = (max_source_position - 2) // 2 + 1
        return output_lengths
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": None, "image": None, "video": None}
    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        return {
            "audio": self.get_max_audio_tokens(),
            "image": self.get_max_image_tokens(),
            "video": self.get_max_video_tokens(seq_len, mm_counts),
        }
 class Qwen2_5OmniThinkerDummyInputsBuilder(
        BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]):
@@ -210,7 +193,6 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
        return (audio_token * num_audios + image_token * num_images +
                video_token * num_videos)
    # TODO: @abstractmethod after transition
    def get_dummy_mm_data(
        self,
        seq_len: int,