mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-23 15:37:51 +08:00
[VLM] Avoid unnecessary dummy multimodal data during processing (#16416)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
dd143ef541
commit
56d4aefa33
@ -21,12 +21,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
|||||||
from vllm.model_executor.model_loader.weight_utils import (
|
from vllm.model_executor.model_loader.weight_utils import (
|
||||||
default_weight_loader, maybe_remap_kv_scale_name)
|
default_weight_loader, maybe_remap_kv_scale_name)
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import MultiModalDataItems
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
@ -415,31 +416,31 @@ class AriaProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
|
class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
processor = self.info.get_hf_processor()
|
||||||
|
image_token: str = processor.tokenizer.image_token # type: ignore
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
vision_config = self.info.get_vision_config()
|
vision_config = self.info.get_vision_config()
|
||||||
|
|
||||||
max_image_size = vision_config.image_size
|
max_image_size = vision_config.image_size
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=max_image_size,
|
self._get_dummy_images(width=max_image_size,
|
||||||
height=max_image_size,
|
height=max_image_size,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
hf_processor = self.info.get_hf_processor()
|
|
||||||
image_token: str = hf_processor.tokenizer.image_token # type: ignore
|
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
|
class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from vllm.jsontree import json_map_leaves
|
|||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalKwargs
|
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
@ -28,7 +28,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
|||||||
MultiModalFieldConfig,
|
MultiModalFieldConfig,
|
||||||
PromptReplacement, PromptUpdate,
|
PromptReplacement, PromptUpdate,
|
||||||
PromptUpdateDetails)
|
PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
@ -146,28 +146,29 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
|
|||||||
class AyaVisionDummyInputsBuilder(
|
class AyaVisionDummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[AyaVisionProcessingInfo]):
|
BaseDummyInputsBuilder[AyaVisionProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self,
|
num_images = mm_counts.get("image", 0)
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> ProcessorInputs:
|
|
||||||
processor = self.info.get_hf_processor()
|
processor = self.info.get_hf_processor()
|
||||||
image_token = processor.image_token
|
image_token = processor.image_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
image_size = \
|
image_size = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=image_size.width,
|
self._get_dummy_images(width=image_size.width,
|
||||||
height=image_size.height,
|
height=image_size.height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class AyaVisionMultiModalProcessor(
|
class AyaVisionMultiModalProcessor(
|
||||||
|
|||||||
@ -15,12 +15,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
|||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import MultiModalDataItems
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptIndexTargets,
|
BaseProcessingInfo, PromptIndexTargets,
|
||||||
PromptInsertion, PromptUpdate)
|
PromptInsertion, PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .blip import BlipVisionModel
|
from .blip import BlipVisionModel
|
||||||
@ -413,29 +414,27 @@ class Blip2ProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
|
class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
hf_config = self.info.get_hf_config()
|
hf_config = self.info.get_hf_config()
|
||||||
vision_config = hf_config.vision_config
|
vision_config = hf_config.vision_config
|
||||||
|
|
||||||
max_image_size = vision_config.image_size
|
max_image_size = vision_config.image_size
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=max_image_size,
|
self._get_dummy_images(width=max_image_size,
|
||||||
height=max_image_size,
|
height=max_image_size,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="",
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
|
class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -30,12 +30,13 @@ from vllm.model_executor.model_loader.weight_utils import (
|
|||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import MultiModalDataItems
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
|
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
|
||||||
@ -72,28 +73,31 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
|
|||||||
class ChameleonDummyInputsBuilder(
|
class ChameleonDummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[ChameleonProcessingInfo]):
|
BaseDummyInputsBuilder[ChameleonProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
processor = self.info.get_hf_processor()
|
||||||
|
image_token = processor.image_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
config = self.info.get_hf_config()
|
config = self.info.get_hf_config()
|
||||||
|
|
||||||
width = height = config.vq_config.resolution
|
width = height = config.vq_config.resolution
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=width,
|
self._get_dummy_images(width=width,
|
||||||
height=height,
|
height=height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="<image>" * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class ChameleonMultiModalProcessor(
|
class ChameleonMultiModalProcessor(
|
||||||
BaseMultiModalProcessor[ChameleonProcessingInfo]):
|
BaseMultiModalProcessor[ChameleonProcessingInfo]):
|
||||||
|
|||||||
@ -19,14 +19,14 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
|||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
NestedTensors)
|
MultiModalKwargs, NestedTensors)
|
||||||
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||||
ImageSize, MultiModalDataItems)
|
ImageSize, MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
|
from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
|
||||||
MlpProjectorConfig,
|
MlpProjectorConfig,
|
||||||
@ -172,29 +172,30 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
|
|||||||
class DeepseekVL2DummyInputsBuilder(
|
class DeepseekVL2DummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
|
BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
processor = self.info.get_hf_processor()
|
||||||
|
image_token = processor.image_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
hf_processor = self.info.get_hf_processor()
|
|
||||||
image_token: str = hf_processor.image_token
|
|
||||||
|
|
||||||
max_image_size = self.info.get_image_size_with_most_features()
|
max_image_size = self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=max_image_size.width,
|
self._get_dummy_images(width=max_image_size.width,
|
||||||
height=max_image_size.height,
|
height=max_image_size.height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class DeepseekVL2MultiModalProcessor(
|
class DeepseekVL2MultiModalProcessor(
|
||||||
BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
|
BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
|
||||||
|
|||||||
@ -21,13 +21,14 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
|
|||||||
BartScaledWordEmbedding)
|
BartScaledWordEmbedding)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
from vllm.multimodal.parse import MultiModalDataDict, MultiModalDataItems
|
MultiModalKwargs)
|
||||||
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
from vllm.multimodal.processing import (BaseProcessingInfo,
|
from vllm.multimodal.processing import (BaseProcessingInfo,
|
||||||
EncDecMultiModalProcessor,
|
EncDecMultiModalProcessor,
|
||||||
PromptIndexTargets, PromptInsertion,
|
PromptIndexTargets, PromptInsertion,
|
||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
|
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
|
||||||
@ -772,27 +773,25 @@ class Florence2ProcessingInfo(BaseProcessingInfo):
|
|||||||
class Florence2DummyInputsBuilder(
|
class Florence2DummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[Florence2ProcessingInfo]):
|
BaseDummyInputsBuilder[Florence2ProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width = target_height = self.info.get_hf_config().projection_dim
|
target_width = target_height = self.info.get_hf_config().projection_dim
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="",
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Florence2MultiModalProcessor(
|
class Florence2MultiModalProcessor(
|
||||||
EncDecMultiModalProcessor[Florence2ProcessingInfo]):
|
EncDecMultiModalProcessor[Florence2ProcessingInfo]):
|
||||||
|
|||||||
@ -31,13 +31,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput
|
|||||||
from vllm.model_executor.models.persimmon import PersimmonForCausalLM
|
from vllm.model_executor.models.persimmon import PersimmonForCausalLM
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
@ -125,27 +126,25 @@ class FuyuProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
|
class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="",
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
|
class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -15,8 +15,9 @@ from vllm.model_executor.layers.layernorm import GemmaRMSNorm
|
|||||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
@ -28,7 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
|||||||
find_mm_placeholders,
|
find_mm_placeholders,
|
||||||
replace_token_matches)
|
replace_token_matches)
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
@ -224,31 +225,31 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
|
class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self,
|
num_images = mm_counts.get("image", 0)
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> ProcessorInputs:
|
|
||||||
processor = self.info.get_hf_processor()
|
processor = self.info.get_hf_processor()
|
||||||
image_token = processor.boi_token
|
image_token = processor.boi_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
|
class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -12,7 +12,7 @@ from torch import nn
|
|||||||
from torch.nn import LayerNorm
|
from torch.nn import LayerNorm
|
||||||
from torchvision import transforms
|
from torchvision import transforms
|
||||||
from torchvision.transforms import InterpolationMode
|
from torchvision.transforms import InterpolationMode
|
||||||
from transformers import PreTrainedTokenizer, TensorType
|
from transformers import BatchFeature, PreTrainedTokenizer, TensorType
|
||||||
from transformers.image_utils import ImageInput
|
from transformers.image_utils import ImageInput
|
||||||
from transformers.tokenization_utils_base import TextInput
|
from transformers.tokenization_utils_base import TextInput
|
||||||
|
|
||||||
@ -28,13 +28,13 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
|||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import MultiModalDataItems
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, BatchFeature,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
MultiModalFieldConfig,
|
PromptUpdate)
|
||||||
PromptReplacement, PromptUpdate)
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.configs import ChatGLMConfig
|
from vllm.transformers_utils.configs import ChatGLMConfig
|
||||||
|
|
||||||
@ -447,31 +447,31 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
|
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"
|
||||||
|
|
||||||
|
return base_text * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
hf_config = self.info.get_hf_config()
|
hf_config = self.info.get_hf_config()
|
||||||
vision_config = hf_config.vision_config
|
vision_config = hf_config.vision_config
|
||||||
|
|
||||||
target_width = target_height = vision_config["image_size"]
|
target_width = target_height = vision_config["image_size"]
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"
|
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=base_text * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
|
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -32,18 +32,18 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
|||||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import ImageProcessorItems, ImageSize
|
from vllm.multimodal.parse import ImageProcessorItems, ImageSize
|
||||||
# yapf conflicts with isort for this block
|
# yapf conflicts with isort for this block
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo,
|
BaseProcessingInfo,
|
||||||
MultiModalDataItems,
|
MultiModalDataItems, PromptReplacement,
|
||||||
MultiModalFieldConfig,
|
PromptUpdate, PromptUpdateDetails)
|
||||||
PromptReplacement, PromptUpdate,
|
|
||||||
PromptUpdateDetails)
|
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
@ -284,29 +284,31 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
|
|||||||
class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
|
class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
|
||||||
):
|
):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
processor = self.info.get_hf_processor()
|
||||||
|
image_token, _, _ = self.info._get_image_token(processor)
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
hf_processor = self.info.get_hf_processor()
|
hf_processor = self.info.get_hf_processor()
|
||||||
image_processor: Idefics3ImageProcessor = hf_processor.image_processor
|
image_processor: Idefics3ImageProcessor = hf_processor.image_processor
|
||||||
longest_edge = image_processor.max_image_size['longest_edge']
|
longest_edge = image_processor.max_image_size['longest_edge']
|
||||||
image_token, _, _ = self.info._get_image_token(hf_processor)
|
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=longest_edge,
|
self._get_dummy_images(width=longest_edge,
|
||||||
height=longest_edge,
|
height=longest_edge,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Idefics3MultiModalProcessor(
|
class Idefics3MultiModalProcessor(
|
||||||
BaseMultiModalProcessor[Idefics3ProcessingInfo]):
|
BaseMultiModalProcessor[Idefics3ProcessingInfo]):
|
||||||
|
|||||||
@ -25,14 +25,14 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
|
|||||||
InternVisionPatchModel)
|
InternVisionPatchModel)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
NestedTensors)
|
MultiModalKwargs, NestedTensors)
|
||||||
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||||
ImageSize, MultiModalDataItems)
|
ImageSize, MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
|
|
||||||
@ -504,27 +504,27 @@ _I = TypeVar("_I", bound=BaseInternVLProcessingInfo)
|
|||||||
|
|
||||||
class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
return "<image>" * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="<image>" * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||||
|
|
||||||
|
|||||||
@ -34,7 +34,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
|||||||
BaseProcessingInfo, ProcessingCache,
|
BaseProcessingInfo, ProcessingCache,
|
||||||
PromptReplacement, PromptUpdate,
|
PromptReplacement, PromptUpdate,
|
||||||
PromptUpdateDetails)
|
PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .clip import CLIPVisionModel
|
from .clip import CLIPVisionModel
|
||||||
@ -186,30 +186,31 @@ _I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
|
|||||||
|
|
||||||
class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> ProcessorInputs:
|
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
processor = self.info.get_hf_processor()
|
processor = self.info.get_hf_processor()
|
||||||
image_token = processor.image_token
|
image_token = processor.image_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class LlavaProcessingInfo(BaseLlavaProcessingInfo):
|
class LlavaProcessingInfo(BaseLlavaProcessingInfo):
|
||||||
|
|
||||||
|
|||||||
@ -16,13 +16,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
|||||||
from vllm.model_executor.models.clip import CLIPVisionModel
|
from vllm.model_executor.models.clip import CLIPVisionModel
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
|
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
|
||||||
VideoEmbeddingItems, VideoProcessorItems)
|
VideoEmbeddingItems, VideoProcessorItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import is_list_of
|
from vllm.utils import is_list_of
|
||||||
|
|
||||||
@ -130,22 +131,27 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
|
|||||||
class LlavaNextVideoDummyInputsBuilder(
|
class LlavaNextVideoDummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]):
|
BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> ProcessorInputs:
|
|
||||||
num_videos = mm_counts.get("video", 0)
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
processor = self.info.get_hf_processor()
|
processor = self.info.get_hf_processor()
|
||||||
video_token = processor.video_token
|
video_token = processor.video_token
|
||||||
|
|
||||||
|
return video_token * num_videos
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
target_num_frames = \
|
target_num_frames = \
|
||||||
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
|
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"video":
|
"video":
|
||||||
self._get_dummy_videos(
|
self._get_dummy_videos(
|
||||||
width=target_width,
|
width=target_width,
|
||||||
@ -155,11 +161,6 @@ class LlavaNextVideoDummyInputsBuilder(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=video_token * num_videos,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class LlavaNextVideoMultiModalProcessor(
|
class LlavaNextVideoMultiModalProcessor(
|
||||||
BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]):
|
BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]):
|
||||||
|
|||||||
@ -19,11 +19,11 @@ from vllm.model_executor.layers.activation import get_act_fn
|
|||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
|
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
|
||||||
VideoEmbeddingItems, VideoProcessorItems)
|
VideoEmbeddingItems, VideoProcessorItems)
|
||||||
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
|
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
|
||||||
from vllm.multimodal.profiling import ProcessorInputs
|
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .clip import CLIPVisionModel
|
from .clip import CLIPVisionModel
|
||||||
@ -226,11 +226,7 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
|
|||||||
class LlavaOnevisionDummyInputsBuilder(
|
class LlavaOnevisionDummyInputsBuilder(
|
||||||
LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]):
|
LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> ProcessorInputs:
|
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
num_videos = mm_counts.get("video", 0)
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
@ -238,13 +234,23 @@ class LlavaOnevisionDummyInputsBuilder(
|
|||||||
image_token = processor.image_token
|
image_token = processor.image_token
|
||||||
video_token = processor.video_token
|
video_token = processor.video_token
|
||||||
|
|
||||||
|
return image_token * num_images + video_token * num_videos
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
target_num_frames = \
|
target_num_frames = \
|
||||||
self.info.get_num_frames_with_most_features(seq_len,
|
self.info.get_num_frames_with_most_features(seq_len,
|
||||||
mm_counts)
|
mm_counts)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
@ -258,11 +264,6 @@ class LlavaOnevisionDummyInputsBuilder(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images + video_token * num_videos,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class LlavaOnevisionMultiModalProcessor(
|
class LlavaOnevisionMultiModalProcessor(
|
||||||
BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]):
|
BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]):
|
||||||
|
|||||||
@ -35,14 +35,14 @@ from transformers.models.whisper.modeling_whisper import (
|
|||||||
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
NestedTensors)
|
||||||
from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
|
from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
|
||||||
DictEmbeddingItems, ModalityData,
|
DictEmbeddingItems, ModalityData,
|
||||||
ModalityDataItems, MultiModalDataItems,
|
ModalityDataItems, MultiModalDataItems,
|
||||||
MultiModalDataParser)
|
MultiModalDataParser)
|
||||||
from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
|
from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
|
||||||
PromptUpdateDetails)
|
PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import ProcessorInputs
|
|
||||||
|
|
||||||
from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
|
from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
|
||||||
MiniCPMVDummyInputsBuilder,
|
MiniCPMVDummyInputsBuilder,
|
||||||
@ -206,29 +206,31 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
|
|||||||
class MiniCPMODummyInputsBuilder(
|
class MiniCPMODummyInputsBuilder(
|
||||||
MiniCPMVDummyInputsBuilder[MiniCPMOProcessingInfo]):
|
MiniCPMVDummyInputsBuilder[MiniCPMOProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self, seq_len: int, mm_counts: Mapping[str,
|
num_audios = mm_counts.get("audio", 0)
|
||||||
int]) -> ProcessorInputs:
|
|
||||||
|
audio_prompt_texts = self.info.audio_pattern * num_audios
|
||||||
|
|
||||||
|
return super().get_dummy_text(mm_counts) + audio_prompt_texts
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
num_audios = mm_counts.get("audio", 0)
|
num_audios = mm_counts.get("audio", 0)
|
||||||
audio_len = self.info.get_max_audio_chunks_with_most_features() * \
|
audio_len = self.info.get_max_audio_chunks_with_most_features() * \
|
||||||
self.info.get_default_audio_sampling_rate()
|
self.info.get_default_audio_sampling_rate()
|
||||||
|
|
||||||
processor_inputs = super().get_dummy_processor_inputs(
|
|
||||||
seq_len, mm_counts)
|
|
||||||
|
|
||||||
audio_prompt_texts = self.info.audio_pattern * num_audios
|
|
||||||
audio_mm_data = {
|
audio_mm_data = {
|
||||||
"audio":
|
"audio":
|
||||||
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
return {
|
||||||
prompt_text=processor_inputs.prompt_text + audio_prompt_texts,
|
**super().get_dummy_mm_data(seq_len, mm_counts),
|
||||||
mm_data={
|
**audio_mm_data,
|
||||||
**processor_inputs.mm_data,
|
}
|
||||||
**audio_mm_data,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class MiniCPMOMultiModalProcessor(
|
class MiniCPMOMultiModalProcessor(
|
||||||
|
|||||||
@ -48,7 +48,8 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
|
|||||||
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
|
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
NestedTensors)
|
||||||
from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,
|
from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,
|
||||||
ImageProcessorItems, ImageSize,
|
ImageProcessorItems, ImageSize,
|
||||||
ModalityData, ModalityDataItems,
|
ModalityData, ModalityDataItems,
|
||||||
@ -57,7 +58,7 @@ from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,
|
|||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import flatten_2d_lists
|
from vllm.utils import flatten_2d_lists
|
||||||
@ -471,11 +472,20 @@ _I = TypeVar("_I",
|
|||||||
|
|
||||||
class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
|
image_prompt_texts = self.info.image_pattern * num_images
|
||||||
|
video_prompt_texts = self.info.video_pattern * num_videos
|
||||||
|
|
||||||
|
return image_prompt_texts + video_prompt_texts
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
num_videos = mm_counts.get("video", 0)
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
@ -486,7 +496,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
|||||||
num_video_frames = \
|
num_video_frames = \
|
||||||
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
|
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=image_width,
|
self._get_dummy_images(width=image_width,
|
||||||
height=image_height,
|
height=image_height,
|
||||||
@ -498,13 +508,6 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
|||||||
] * num_videos,
|
] * num_videos,
|
||||||
}
|
}
|
||||||
|
|
||||||
image_prompt_texts = self.info.image_pattern * num_images
|
|
||||||
video_prompt_texts = self.info.video_pattern * num_videos
|
|
||||||
|
|
||||||
return ProcessorInputs(prompt_text=image_prompt_texts +
|
|
||||||
video_prompt_texts,
|
|
||||||
mm_data=mm_data)
|
|
||||||
|
|
||||||
|
|
||||||
class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||||
|
|
||||||
|
|||||||
@ -22,14 +22,15 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
|||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, ProcessingCache,
|
BaseProcessingInfo, ProcessingCache,
|
||||||
PromptReplacement, PromptUpdate,
|
PromptReplacement, PromptUpdate,
|
||||||
PromptUpdateDetails)
|
PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
@ -185,30 +186,31 @@ _I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
|
|||||||
|
|
||||||
class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> ProcessorInputs:
|
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
processor = self.info.get_hf_processor()
|
processor = self.info.get_hf_processor()
|
||||||
image_token = processor.image_token
|
image_token = processor.image_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Mistral3ProcessingInfo(BaseLlavaProcessingInfo):
|
class Mistral3ProcessingInfo(BaseLlavaProcessingInfo):
|
||||||
|
|
||||||
|
|||||||
@ -54,14 +54,14 @@ from vllm.model_executor.model_loader.weight_utils import (
|
|||||||
default_weight_loader, maybe_remap_kv_scale_name)
|
default_weight_loader, maybe_remap_kv_scale_name)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalEncDecInputs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
|
||||||
MultiModalFieldConfig, MultiModalKwargs)
|
MultiModalFieldConfig, MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataDict, MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseProcessingInfo,
|
from vllm.multimodal.processing import (BaseProcessingInfo,
|
||||||
EncDecMultiModalProcessor,
|
EncDecMultiModalProcessor,
|
||||||
PromptReplacement, PromptUpdate)
|
PromptReplacement, PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
|
|
||||||
from .clip import CLIPMLP
|
from .clip import CLIPMLP
|
||||||
from .interfaces import SupportsMultiModal, SupportsV0Only
|
from .interfaces import SupportsMultiModal, SupportsV0Only
|
||||||
@ -131,31 +131,31 @@ class MllamaProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]):
|
class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
processor = self.info.get_hf_processor()
|
||||||
|
image_token = processor.image_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
hf_processor = self.info.get_hf_processor()
|
|
||||||
image_token: str = hf_processor.image_token
|
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
|
class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
|
||||||
):
|
):
|
||||||
|
|||||||
@ -43,14 +43,14 @@ from vllm.model_executor.model_loader.loader import _initialize_model
|
|||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
NestedTensors)
|
MultiModalKwargs, NestedTensors)
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
@ -619,29 +619,31 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
|
|||||||
|
|
||||||
class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
|
class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
processor = self.info.get_hf_processor()
|
||||||
|
image_token = processor.fake_image_token
|
||||||
|
|
||||||
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
(target_width,
|
(target_width,
|
||||||
target_height) = self.info.get_image_size_with_most_features()
|
target_height) = self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
image_token = self.info.get_hf_processor().fake_image_token
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@MULTIMODAL_REGISTRY.register_processor(
|
@MULTIMODAL_REGISTRY.register_processor(
|
||||||
Mllama4MultiModalProcessor,
|
Mllama4MultiModalProcessor,
|
||||||
|
|||||||
@ -41,14 +41,15 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptIndexTargets,
|
BaseProcessingInfo, PromptIndexTargets,
|
||||||
PromptInsertion, PromptUpdate,
|
PromptInsertion, PromptUpdate,
|
||||||
PromptUpdateDetails)
|
PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
@ -1216,27 +1217,25 @@ class MolmoProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
|
class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="",
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
|
class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -15,12 +15,11 @@ from transformers import PretrainedConfig
|
|||||||
|
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalKwargs
|
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
|
||||||
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
|
from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
|
||||||
PromptUpdateDetails)
|
PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import ProcessorInputs
|
|
||||||
|
|
||||||
from .intern_vit import InternVisionModel
|
from .intern_vit import InternVisionModel
|
||||||
from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor,
|
from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor,
|
||||||
@ -87,29 +86,29 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
|
|||||||
|
|
||||||
class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):
|
class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
# The newline is necessary to separate ">" of the current item
|
||||||
|
# and "<" of the next item
|
||||||
|
return "<image>\n" * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
# The newline is necessary to separate ">" of the current item
|
|
||||||
# and "<" of the next item
|
|
||||||
prompt_text="<image>\n" * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
|
class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
|||||||
BaseProcessingInfo, PromptIndexTargets,
|
BaseProcessingInfo, PromptIndexTargets,
|
||||||
PromptInsertion, PromptUpdate,
|
PromptInsertion, PromptUpdate,
|
||||||
PromptUpdateDetails)
|
PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
@ -90,29 +90,27 @@ class PaliGemmaProcessingInfo(BaseProcessingInfo):
|
|||||||
class PaliGemmaDummyInputsBuilder(
|
class PaliGemmaDummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[PaliGemmaProcessingInfo]):
|
BaseDummyInputsBuilder[PaliGemmaProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
hf_config = self.info.get_hf_config()
|
hf_config = self.info.get_hf_config()
|
||||||
vision_config = hf_config.vision_config
|
vision_config = hf_config.vision_config
|
||||||
max_image_size = vision_config.image_size
|
max_image_size = vision_config.image_size
|
||||||
|
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=max_image_size,
|
self._get_dummy_images(width=max_image_size,
|
||||||
height=max_image_size,
|
height=max_image_size,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="",
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PaliGemmaMultiModalProcessor(
|
class PaliGemmaMultiModalProcessor(
|
||||||
BaseMultiModalProcessor[PaliGemmaProcessingInfo]):
|
BaseMultiModalProcessor[PaliGemmaProcessingInfo]):
|
||||||
|
|||||||
@ -32,7 +32,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|||||||
VocabParallelEmbedding)
|
VocabParallelEmbedding)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||||
ImageSize, MultiModalDataItems)
|
ImageSize, MultiModalDataItems)
|
||||||
# yapf conflicts with isort for this block
|
# yapf conflicts with isort for this block
|
||||||
@ -42,7 +43,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
|||||||
PlaceholderFeaturesInfo,
|
PlaceholderFeaturesInfo,
|
||||||
PromptReplacement, PromptUpdate)
|
PromptReplacement, PromptUpdate)
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import is_list_of
|
from vllm.utils import is_list_of
|
||||||
|
|
||||||
@ -343,31 +344,31 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
|
class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
hf_processor = self.info.get_hf_processor()
|
||||||
|
image_tokens: list[str] = hf_processor.img_tokens # type: ignore
|
||||||
|
|
||||||
|
return "".join(image_tokens[:num_images])
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
hf_processor = self.info.get_hf_processor()
|
|
||||||
image_tokens: list[str] = hf_processor.img_tokens # type: ignore
|
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="".join(image_tokens[:num_images]),
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
|
class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -32,13 +32,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
|||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
NestedTensors)
|
||||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.tokenizer import (MistralTokenizer,
|
from vllm.transformers_utils.tokenizer import (MistralTokenizer,
|
||||||
cached_tokenizer_from_config)
|
cached_tokenizer_from_config)
|
||||||
@ -203,28 +204,26 @@ class PixtralProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="",
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
||||||
):
|
):
|
||||||
|
|||||||
@ -35,7 +35,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
|||||||
from vllm.multimodal.parse import MultiModalDataItems
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptUpdate)
|
BaseProcessingInfo, PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import (IntermediateTensors, PoolerOutput,
|
from vllm.sequence import (IntermediateTensors, PoolerOutput,
|
||||||
PoolingSequenceGroupOutput)
|
PoolingSequenceGroupOutput)
|
||||||
|
|
||||||
@ -49,20 +49,21 @@ class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
|
|||||||
class PrithviGeoSpatialMAEInputBuilder(
|
class PrithviGeoSpatialMAEInputBuilder(
|
||||||
BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]):
|
BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
return ProcessorInputs(
|
# This model input is fixed and is in the form of a torch Tensor.
|
||||||
prompt_text="",
|
# The size of pixel_values might change in the cases where we resize
|
||||||
# This model input is fixed and is in the form of a torch Tensor.
|
# the input but never exceeds the dimensions below.
|
||||||
# The size of pixel_values might change in the cases where we resize
|
return {
|
||||||
# the input but never exceeds the dimensions below.
|
"pixel_values": torch.full((1, 6, 512, 512), 1.0),
|
||||||
mm_data={
|
"location_coords": torch.full((1, 2), 1.0),
|
||||||
"pixel_values": torch.full((1, 6, 512, 512), 1.0),
|
}
|
||||||
"location_coords": torch.full((1, 2), 1.0)
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
||||||
|
|||||||
@ -37,13 +37,14 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
|
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
|
||||||
MultiModalDataParser)
|
MultiModalDataParser)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
@ -113,27 +114,30 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
|
|||||||
class Qwen2AudioDummyInputsBuilder(
|
class Qwen2AudioDummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
|
BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_audios = mm_counts.get("audio", 0)
|
||||||
|
|
||||||
|
hf_processor = self.info.get_hf_processor()
|
||||||
|
audio_token = hf_processor.audio_token
|
||||||
|
|
||||||
|
return audio_token * num_audios
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
feature_extractor = self.info.get_feature_extractor()
|
feature_extractor = self.info.get_feature_extractor()
|
||||||
|
|
||||||
sampling_rate = feature_extractor.sampling_rate
|
sampling_rate = feature_extractor.sampling_rate
|
||||||
audio_len = feature_extractor.chunk_length * sampling_rate
|
audio_len = feature_extractor.chunk_length * sampling_rate
|
||||||
num_audios = mm_counts.get("audio", 0)
|
num_audios = mm_counts.get("audio", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"audio":
|
"audio":
|
||||||
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="<|AUDIO|>" * num_audios,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Qwen2AudioMultiModalProcessor(
|
class Qwen2AudioMultiModalProcessor(
|
||||||
BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
|
BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
|
||||||
|
|||||||
@ -56,15 +56,15 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (ImageItem, ModalityData,
|
from vllm.multimodal.inputs import (ImageItem, ModalityData,
|
||||||
MultiModalFieldConfig, MultiModalKwargs,
|
MultiModalDataDict, MultiModalFieldConfig,
|
||||||
VideoItem)
|
MultiModalKwargs, VideoItem)
|
||||||
from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
|
from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
|
||||||
ModalityDataItems, MultiModalDataItems,
|
ModalityDataItems, MultiModalDataItems,
|
||||||
MultiModalDataParser)
|
MultiModalDataParser)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.platforms import _Backend
|
from vllm.platforms import _Backend
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.config import uses_mrope
|
from vllm.transformers_utils.config import uses_mrope
|
||||||
@ -965,11 +965,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
|
class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> ProcessorInputs:
|
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
num_videos = mm_counts.get("video", 0)
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
@ -977,12 +973,22 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
|
|||||||
image_token: str = hf_processor.image_token
|
image_token: str = hf_processor.image_token
|
||||||
video_token: str = hf_processor.video_token
|
video_token: str = hf_processor.video_token
|
||||||
|
|
||||||
|
return image_token * num_images + video_token * num_videos
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
num_videos = mm_counts.get("video", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
target_num_frames = \
|
target_num_frames = \
|
||||||
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
|
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
@ -996,11 +1002,6 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=image_token * num_images + video_token * num_videos,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
|
class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
|
||||||
):
|
):
|
||||||
|
|||||||
@ -32,12 +32,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
|||||||
from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
|
from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
|
MultiModalKwargs)
|
||||||
from vllm.multimodal.parse import MultiModalDataItems
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
@ -542,34 +543,34 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
|
class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
hf_processor = self.info.get_hf_processor()
|
||||||
|
img_start = hf_processor.image_start_tag
|
||||||
|
img_end = hf_processor.image_end_tag
|
||||||
|
|
||||||
|
return "".join(f"Picture {i}: {img_start}{img_end}\n"
|
||||||
|
for i in range(1, num_images + 1))
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
hf_config = self.info.get_hf_config()
|
hf_config = self.info.get_hf_config()
|
||||||
vision_config = hf_config.visual
|
vision_config = hf_config.visual
|
||||||
|
|
||||||
processor = self.info.get_hf_processor()
|
|
||||||
img_start = processor.image_start_tag
|
|
||||||
img_end = processor.image_end_tag
|
|
||||||
|
|
||||||
target_width = target_height = vision_config["image_size"]
|
target_width = target_height = vision_config["image_size"]
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n"
|
|
||||||
for i in range(1, num_images + 1)),
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
@ -26,14 +26,14 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
|
|||||||
InternVisionPatchModel)
|
InternVisionPatchModel)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
NestedTensors)
|
MultiModalKwargs, NestedTensors)
|
||||||
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||||
ImageSize, MultiModalDataItems)
|
ImageSize, MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate, PromptUpdateDetails)
|
PromptUpdate, PromptUpdateDetails)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
|
|
||||||
@ -505,27 +505,27 @@ _I = TypeVar("_I", bound=BaseSkyworkR1VProcessingInfo)
|
|||||||
|
|
||||||
class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
return "<image>" * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="<image>" * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||||
|
|
||||||
|
|||||||
@ -23,13 +23,13 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader
|
|||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
NestedTensors)
|
MultiModalKwargs, NestedTensors)
|
||||||
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
|
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
||||||
|
|
||||||
@ -110,11 +110,16 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
|
|||||||
class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
|
class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
|
||||||
):
|
):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_audios = mm_counts.get("audio", 0)
|
||||||
|
|
||||||
|
return "<|audio|>" * num_audios
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
feature_extractor = self.info.get_feature_extractor()
|
feature_extractor = self.info.get_feature_extractor()
|
||||||
|
|
||||||
sampling_rate = feature_extractor.sampling_rate
|
sampling_rate = feature_extractor.sampling_rate
|
||||||
@ -122,16 +127,11 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
|
|||||||
_MAX_ENCODER_BATCH_SIZE)
|
_MAX_ENCODER_BATCH_SIZE)
|
||||||
num_audios = mm_counts.get("audio", 0)
|
num_audios = mm_counts.get("audio", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"audio":
|
"audio":
|
||||||
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="<|audio|>" * num_audios,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class UltravoxMultiModalProcessor(
|
class UltravoxMultiModalProcessor(
|
||||||
BaseMultiModalProcessor[UltravoxProcessingInfo]):
|
BaseMultiModalProcessor[UltravoxProcessingInfo]):
|
||||||
|
|||||||
@ -26,13 +26,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
|||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
|
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
|
||||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
from vllm.multimodal.parse import (MultiModalDataDict, MultiModalDataItems,
|
MultiModalKwargs)
|
||||||
MultiModalDataParser)
|
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
|
||||||
from vllm.multimodal.processing import (BaseProcessingInfo,
|
from vllm.multimodal.processing import (BaseProcessingInfo,
|
||||||
EncDecMultiModalProcessor,
|
EncDecMultiModalProcessor,
|
||||||
PromptReplacement, PromptUpdate)
|
PromptReplacement, PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
|
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
|
||||||
SupportsTranscription, SupportsV0Only)
|
SupportsTranscription, SupportsV0Only)
|
||||||
@ -544,27 +544,27 @@ class WhisperProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
|
class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
num_audios = mm_counts.get("audio", 0)
|
||||||
|
|
||||||
|
return "<|startoftranscript|>" * num_audios
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
feature_extractor = self.info.get_feature_extractor()
|
feature_extractor = self.info.get_feature_extractor()
|
||||||
|
|
||||||
sampling_rate = feature_extractor.sampling_rate
|
sampling_rate = feature_extractor.sampling_rate
|
||||||
audio_len = feature_extractor.chunk_length * sampling_rate
|
audio_len = feature_extractor.chunk_length * sampling_rate
|
||||||
num_audios = mm_counts.get("audio", 0)
|
num_audios = mm_counts.get("audio", 0)
|
||||||
|
|
||||||
mm_data = {
|
return {
|
||||||
"audio":
|
"audio":
|
||||||
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
self._get_dummy_audios(length=audio_len, num_audios=num_audios)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text="<|startoftranscript|>" * num_audios,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class WhisperMultiModalProcessor(
|
class WhisperMultiModalProcessor(
|
||||||
EncDecMultiModalProcessor[WhisperProcessingInfo]):
|
EncDecMultiModalProcessor[WhisperProcessingInfo]):
|
||||||
|
|||||||
@ -1051,12 +1051,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
*,
|
*,
|
||||||
cache: Optional[ProcessingCache] = None,
|
cache: Optional[ProcessingCache] = None,
|
||||||
enable_sanity_checks: bool = True) -> None:
|
enable_sanity_checks: bool = True) -> None:
|
||||||
if get_repls := getattr(self, "_get_prompt_replacements", None):
|
|
||||||
logger.warning_once("`_get_prompt_replacements` has been renamed "
|
|
||||||
"to `_get_prompt_updates`. The old name will "
|
|
||||||
"be removed in an upcoming release.")
|
|
||||||
self._get_prompt_updates = get_repls # type: ignore[method-assign]
|
|
||||||
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.info = info
|
self.info = info
|
||||||
@ -1274,13 +1268,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
"""
|
"""
|
||||||
mm_counts = mm_items.get_all_counts()
|
mm_counts = mm_items.get_all_counts()
|
||||||
|
|
||||||
dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs(
|
|
||||||
self.info.ctx.model_config.max_model_len,
|
|
||||||
mm_counts,
|
|
||||||
)
|
|
||||||
|
|
||||||
_, mm_kwargs, _ = self._apply_hf_processor_text_mm(
|
_, mm_kwargs, _ = self._apply_hf_processor_text_mm(
|
||||||
prompt_text=dummy_inputs.prompt_text,
|
prompt_text=self.dummy_inputs.get_dummy_text(mm_counts),
|
||||||
mm_items=mm_items,
|
mm_items=mm_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Generic, NamedTuple, Optional, TypeVar, cast
|
from typing import Generic, NamedTuple, Optional, TypeVar, cast
|
||||||
@ -60,7 +60,35 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
|
|||||||
|
|
||||||
self.info = info
|
self.info = info
|
||||||
|
|
||||||
@abstractmethod
|
# TODO: @abstractmethod after transition
|
||||||
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
"""
|
||||||
|
Build the text input corresponding to :code:`mm_counts`.
|
||||||
|
"""
|
||||||
|
if (type(self).get_dummy_processor_inputs ==
|
||||||
|
BaseDummyInputsBuilder.get_dummy_processor_inputs):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
logger.warning_once("`get_dummy_processor_inputs` has been split up "
|
||||||
|
"into `get_dummy_text` and `get_dummy_mm_data`. "
|
||||||
|
"These two methods will be marked as abstract "
|
||||||
|
"in an upcoming release.")
|
||||||
|
|
||||||
|
seq_len = self.info.ctx.model_config.max_model_len
|
||||||
|
return self.get_dummy_processor_inputs(seq_len, mm_counts).prompt_text
|
||||||
|
|
||||||
|
# TODO: @abstractmethod after transition
|
||||||
|
def get_dummy_mm_data(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
|
"""
|
||||||
|
Build the multimodal input which, after processing, results in
|
||||||
|
the maximum possible number of placeholder tokens.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
def get_dummy_processor_inputs(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
@ -70,7 +98,10 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
|
|||||||
Build the input which, after processing, results in
|
Build the input which, after processing, results in
|
||||||
the maximum possible number of placeholder tokens.
|
the maximum possible number of placeholder tokens.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
dummy_text = self.get_dummy_text(mm_counts)
|
||||||
|
dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
|
||||||
|
|
||||||
|
return ProcessorInputs(prompt_text=dummy_text, mm_data=dummy_mm_data)
|
||||||
|
|
||||||
def _get_dummy_audios(
|
def _get_dummy_audios(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user