[Bugfix] Fix deepseek-vl2 inference with more than 2 images (#13818)
parent fa82074167
commit 6ff518626c
vllm/model_executor/models/deepseek_vl2.py

@@ -25,7 +25,8 @@ from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
@@ -138,18 +139,24 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
-    def get_num_image_tokens(self, *, image_width: int,
-                             image_height: int) -> int:
+    def get_num_image_tokens(self,
+                             *,
+                             image_width: int,
+                             image_height: int,
+                             cropping: bool = True) -> int:
         hf_processor = self.get_hf_processor()
         image_size = hf_processor.image_size
         patch_size = hf_processor.patch_size
         downsample_ratio = hf_processor.downsample_ratio
 
-        best_width, best_height = hf_processor.select_best_resolution(
-            (image_width, image_height))
-
-        num_width_tiles, num_height_tiles = (best_width // image_size,
-                                             best_height // image_size)
+        if cropping:
+            best_width, best_height = hf_processor.select_best_resolution(
+                (image_width, image_height))
+            num_width_tiles, num_height_tiles = (best_width // image_size,
+                                                 best_height // image_size)
+        else:
+            num_width_tiles = num_height_tiles = 1
+
         h = w = math.ceil((image_size // patch_size) / downsample_ratio)
 
         global_views_tokens = h * (w + 1)
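To make the new cropping flag concrete, here is a worked sketch of the token arithmetic from the hunk above. The constants are assumptions for illustration (image_size=384, patch_size=14, downsample_ratio=2); the real values come from the HF processor and are not shown in this diff:

import math

# Constants assumed for illustration; the real ones come from hf_processor.
image_size, patch_size, downsample_ratio = 384, 14, 2

# Per-view grid of downsampled patches, exactly as in the hunk above.
h = w = math.ceil((image_size // patch_size) / downsample_ratio)  # ceil(27 / 2) = 14
global_views_tokens = h * (w + 1)                                 # 14 * 15 = 210

# cropping=True: the tile grid comes from select_best_resolution(), so a
# large or wide image can get e.g. a 2x1 grid and extra local-view tokens.
# cropping=False: the grid is pinned to a single tile, so every image costs
# the same number of tokens regardless of its resolution.
num_width_tiles = num_height_tiles = 1  # the no-cropping branch

Evidently (this is what the fix encodes) the underlying processor stops cropping once a prompt carries more than two images, so the per-image token count depends on how many images share the prompt; the cropping=num_images <= 2 call in the next hunk threads that same threshold through profiling.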
@@ -169,10 +176,12 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> Mapping[str, int]:
+        num_images = mm_counts.get("image", 0)
         max_image_size = self.get_image_size_with_most_features()
         max_image_tokens = self.get_num_image_tokens(
             image_height=max_image_size.height,
-            image_width=max_image_size.width)
+            image_width=max_image_size.width,
+            cropping=num_images <= 2)
 
         return {"image": max_image_tokens}
 
@@ -207,6 +216,30 @@ class DeepseekVL2DummyInputsBuilder(
 class DeepseekVL2MultiModalProcessor(
         BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
 
+    def __init__(
+        self,
+        info: DeepseekVL2ProcessingInfo,
+        dummy_inputs: "BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]",
+        *,
+        cache: Optional[ProcessingCache] = None,
+        enable_sanity_checks: bool = True) -> None:
+        super().__init__(
+            info,
+            dummy_inputs,
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
+        if self.cache is not None and mm_limit["image"] > 2:
+            # The processor output depends on the number of images passed,
+            # making it incompatible with processing cache which is supposed
+            # to be invariant of how many images are passed per prompt
+            self.cache = None
+            logger.warning_once(
+                f"{type(self).__name__} does not support processing cache with "
+                "image limit larger than 2.")
+
     def _call_hf_processor(
         self,
         prompt: str,
@@ -271,6 +304,7 @@ class DeepseekVL2MultiModalProcessor(
             num_image_tokens = self.info.get_num_image_tokens(
                 image_width=image_size.width,
                 image_height=image_size.height,
+                cropping=len(images) <= 2,
             )
             return [image_token_id] * num_image_tokens
 
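The __init__ guard above exists because the processing cache is keyed per item, while (as the comment in the hunk says) the processor's output depends on how many images arrive per prompt. A minimal sketch of the failure mode, using a hypothetical dict cache and placeholder token counts (this is not vLLM's ProcessingCache API):

# Hypothetical per-image cache; the key ignores the per-prompt image count
# even though the answer depends on it.
cache: dict[bytes, int] = {}

def cached_num_tokens(image_key: bytes, num_images_in_prompt: int) -> int:
    if image_key in cache:
        return cache[image_key]
    cropping = num_images_in_prompt <= 2  # same threshold as the fix
    tokens = 630 if cropping else 210     # placeholder counts
    cache[image_key] = tokens
    return tokens

print(cached_num_tokens(b"img", 1))  # 630: cropped views, result cached
print(cached_num_tokens(b"img", 3))  # still 630 -- stale; a 3-image prompt
                                     # should take the no-cropping path (210)

Dropping the cache only when mm_limit["image"] > 2 keeps caching for the common one- or two-image case, where the cropping decision can never flip.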
vllm/model_executor/models/h2ovl.py
@@ -477,13 +477,15 @@ class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
             enable_sanity_checks=enable_sanity_checks,
         )
 
-        if self.cache is not None:
+        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
+        if self.cache is not None and mm_limit["image"] >= 2:
             # The processor output depends on the number of images passed,
             # making it incompatible with processing cache which is supposed
             # to be invariant of how many images are passed per prompt
             self.cache = None
             logger.warning_once(
-                f"{type(self).__name__} does not support processing cache.")
+                f"{type(self).__name__} does not support processing cache with "
+                "multi-image support enabled.")
 
     def _get_prompt_replacements(
         self,
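This mirrors the DeepseekVL2 change but tightens an existing guard: H2OVL previously dropped the processing cache unconditionally, and now keeps it whenever the per-prompt image limit stays below 2. A hedged usage sketch (the checkpoint name is illustrative):

from vllm import LLM

# With an image limit of 1, the processor output cannot vary with image
# count, so after this commit the processing cache stays enabled.
llm = LLM(
    model="h2oai/h2ovl-mississippi-2b",  # example H2OVL checkpoint
    limit_mm_per_prompt={"image": 1},
)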