[V1] Fix multimodal profiling for Molmo (#11325)
Signed-off-by: ywang96 <ywang@example.com>
Co-authored-by: ywang96 <ywang@example.com>
commit 7379b3d4b2
parent 6c7f881541
@@ -928,7 +928,11 @@ def image_input_mapper_for_molmo(
     data: object,
 ):
     if isinstance(data, list):
         assert len(data) == 1, "Molmo supports only one image per prompt."
         data = data[0]
+
+    # Remove unused dummy PIL image
+    data.pop('raw_mm_data', None)
+
     return MultiModalKwargs(data)
 
@@ -974,6 +978,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     dummy_imgdata = {
         "images": out["images"],
         "image_input_idx": out["image_input_idx"],
+        "raw_mm_data": dummy_image,
     }
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
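For context, the two molmo.py hunks above work as a pair: the dummy profiling data now carries the raw PIL image under a "raw_mm_data" key so the V1 hasher can hash it, and the input mapper drops that key again before wrapping the processed outputs in MultiModalKwargs. The sketch below is a minimal, standalone illustration of that flow; the helper names (make_dummy_image_data, map_image_input) and the placeholder tensor values are hypothetical and not vLLM APIs.

# Minimal sketch (hypothetical names, not vLLM code) of the raw_mm_data flow.
from PIL import Image

def make_dummy_image_data(width: int = 64, height: int = 64) -> dict:
    # Mirrors dummy_data_for_molmo: processed outputs plus the raw PIL image.
    dummy_image = Image.new("RGB", (width, height))
    return {
        "images": [[0.0]],           # placeholder for processed image tensors
        "image_input_idx": [[0]],    # placeholder for image token indices
        "raw_mm_data": dummy_image,  # raw image kept around only for hashing
    }

def map_image_input(data: dict) -> dict:
    # Mirrors image_input_mapper_for_molmo: the raw image is not a model input,
    # so it is removed before the kwargs are handed to the model.
    data.pop("raw_mm_data", None)
    return data

if __name__ == "__main__":
    dummy = make_dummy_image_data()
    image_hash = str(hash(dummy["raw_mm_data"].tobytes()))  # stand-in for the real hasher
    kwargs = map_image_input(dummy)
    print(image_hash, sorted(kwargs))  # "raw_mm_data" is gone from the kwargs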
@@ -151,17 +151,31 @@ class MMHasher:
     def __init__(self):
         pass
 
-    def hash_mm_data(
+    def hash_dummy_mm_data(
             self,
             mm_data: Optional[MultiModalDataDict]) -> Optional[List[str]]:
+        """Hash user-defined dummy multimodal data used for profiling."""
+
         if mm_data is None:
             return None
 
         image_inputs = mm_data['image']
 
+        # This is a temporary workaround for models (e.g, Molmo) that
+        # process multimodal data in the input processor (therefore
+        # image_inputs is MultiModalKwargs instead of raw input format).
+        # `raw_mm_data` with the original input format is expected
+        # in this case.
+        if isinstance(image_inputs, dict):
+            assert "raw_mm_data" in image_inputs and isinstance(
+                image_inputs["raw_mm_data"], PIL.Image.Image)
+            image_inputs = image_inputs.pop("raw_mm_data")
+
         return self.hash_images(image_inputs)
 
-    def hash_prompt(self, prompt: PromptType) -> Optional[List[str]]:
+    def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]:
+        """Hash multimodal data in the user input prompt if they exist."""
+
         if "multi_modal_data" not in prompt:
             return None
 
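The workaround in hash_dummy_mm_data comes down to a single dispatch: if the model's input processor has already turned the dummy image into a processed dict (the Molmo case), recover the original PIL image from "raw_mm_data" before hashing; otherwise hash the raw input directly. A rough standalone restatement of that dispatch, with hash_images reduced to a placeholder digest:

# Rough sketch of the dispatch in hash_dummy_mm_data (hash_images is a stand-in here).
from typing import List, Optional
import PIL.Image

def hash_images(image_inputs) -> List[str]:
    if not isinstance(image_inputs, list):
        image_inputs = [image_inputs]
    return [str(hash(img.tobytes())) for img in image_inputs]  # placeholder hash

def hash_dummy_mm_data(mm_data: Optional[dict]) -> Optional[List[str]]:
    if mm_data is None:
        return None
    image_inputs = mm_data["image"]
    # Already-processed dict (e.g. Molmo): recover the raw PIL image for hashing.
    if isinstance(image_inputs, dict):
        image_inputs = image_inputs.pop("raw_mm_data")
    return hash_images(image_inputs)

if __name__ == "__main__":
    raw = PIL.Image.new("RGB", (32, 32))
    print(hash_dummy_mm_data({"image": raw}))                              # raw input path
    print(hash_dummy_mm_data({"image": {"images": [], "raw_mm_data": raw}}))  # Molmo-style path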
@@ -171,6 +185,7 @@ class MMHasher:
         return self.hash_images(image_inputs)
 
     def hash_images(self, image_inputs) -> Optional[List[str]]:
+        """Hash PIL image objects to strings."""
         if not isinstance(image_inputs, list):
             image_inputs = [image_inputs]
         assert len(image_inputs) > 0
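The new docstring only promises that hash_images turns PIL images into strings; the digest scheme itself is not part of this diff. Purely as an illustration (this is not vLLM's implementation), one conventional way to hash a PIL image to a stable string is to feed its pixel bytes, mode, and size into a cryptographic hash:

# Illustrative only: one way to hash a PIL image to a stable hex string.
import hashlib
from PIL import Image

def hash_image(image: Image.Image) -> str:
    h = hashlib.blake2b()
    # Include mode and size so images with identical pixel bytes but
    # different shapes or formats do not collide.
    h.update(image.mode.encode())
    h.update(str(image.size).encode())
    h.update(image.tobytes())
    return h.hexdigest()

if __name__ == "__main__":
    print(hash_image(Image.new("RGB", (16, 16))))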
@@ -79,7 +79,7 @@ class Processor:
         # Compute MM hashes (if enabled)
         mm_hashes = None
         if self.use_hash:
-            mm_hashes = self.mm_hasher.hash_prompt(prompt)
+            mm_hashes = self.mm_hasher.hash_prompt_mm_data(prompt)
 
         # Process inputs.
         preprocessed_inputs = self.input_preprocessor.preprocess(
@@ -638,7 +638,7 @@ class GPUModelRunner:
         # Compute MM hashes (if enabled)
         mm_hashes = None
         if self.use_hash:
-            mm_hashes = self.mm_hasher.hash_mm_data(dummy_mm_data)
+            mm_hashes = self.mm_hasher.hash_dummy_mm_data(dummy_mm_data)
 
         dummy_mm_kwargs = self.mm_input_mapper_client.process_inputs(
             mm_data=dummy_mm_data,