Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-12 18:05:34 +08:00

[Chore] Deprecate SupportsMultiModal.merge_by_field_config (#30170)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

commit c46b932df2 (parent 6476382384)
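Most of this diff is mechanical: it deletes the per-model `merge_by_field_config = True` override, which is now the interface default, and turns the class-level flag on `SupportsMultiModal` into a deprecated tri-state. After this change a multimodal model simply omits the attribute. A minimal sketch of an updated model class (the class name and mapper contents are illustrative, not taken from the diff; import paths follow the usual vLLM layout and should be treated as assumptions):

import torch.nn as nn

from vllm.model_executor.models.interfaces import SupportsMultiModal, SupportsPP
from vllm.model_executor.models.utils import WeightsMapper


class MyVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
    # No `merge_by_field_config = True` here anymore: after this commit an
    # explicit True only triggers a redundancy warning, and False is rejected.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={"model.": "language_model.model."},
    )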
@@ -499,8 +499,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     model to perform tasks that involve both image and text inputs.
     """
 
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52

@@ -318,8 +318,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
     dummy_inputs=AyaVisionDummyInputsBuilder,
 )
 class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52

@@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
 class Blip2ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):

@@ -918,8 +918,6 @@ class ChameleonModel(nn.Module):
 class ChameleonForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     is_pooling_model = True
 
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
-    merge_by_field_config = True
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:

@@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor(
     dummy_inputs=Cohere2VisionDummyInputsBuilder,
 )
 class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_tower.": "vision_tower.",

@@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor(
     dummy_inputs=DeepseekOCRDummyInputsBuilder,
 )
 class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # map prefix for language backbone

@@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor(
     dummy_inputs=DeepseekVL2DummyInputsBuilder,
 )
 class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "language.": "language_model.",

@@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module):
     dummy_inputs=DotsOCRDummyInputsBuilder,
 )
 class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".attn.qkv_proj.": ".attn.qkv.",

@@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
 class Ernie4_5_VLMoeForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

@@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
     dummy_inputs=FuyuDummyInputsBuilder,
 )
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.": "vision_embed_tokens.",

@@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module):
 class Gemma3ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

@@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module):
 class Gemma3nForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsTranscription
 ):
-    merge_by_field_config = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {

@@ -1424,8 +1424,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
 class Glm4vForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

@@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
 class GLM4VForCausalLM(
     ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"],

@@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration(
     SupportsLoRA,
     SupportsTranscription,
 ):
-    merge_by_field_config = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {

@@ -786,7 +786,6 @@ class HunYuanVLForConditionalGeneration(
     SupportsQuant,
     SupportsXDRoPE,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     # To ensure correct weight loading and mapping.

@@ -592,8 +592,6 @@ class HCXVisionCAbstractor(nn.Module):
     dummy_inputs=HCXVisionDummyInputsBuilder,
 )
 class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -576,8 +576,6 @@ class Idefics3Model(nn.Module):
     dummy_inputs=Idefics3DummyInputsBuilder,
 )
 class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -78,9 +78,9 @@ class SupportsMultiModal(Protocol):
     `multimodal_config.mm_encoder_tp_mode="data"`.
     """
 
-    merge_by_field_config: ClassVar[bool] = True
+    merge_by_field_config: ClassVar[bool | None] = None
     """
-    A flag that indicates which implementation of
+    [DEPRECATED] A flag that indicates which implementation of
     `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
     """
 
@@ -260,7 +260,26 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
 def supports_multimodal(
     model: type[object] | object,
 ) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
-    return getattr(model, "supports_multimodal", False)
+    res = getattr(model, "supports_multimodal", False)
+
+    if res:
+        # We can remove this starting from v0.14
+        merge_by_field_config = getattr(model, "merge_by_field_config", None)
+        if merge_by_field_config is False:
+            raise ValueError(
+                "`merge_by_field_config=False` is no longer effective, "
+                "please update your model to consider the new batching logic "
+                "in `group_mm_kwargs_by_modality` (refer to "
+                "https://github.com/vllm-project/vllm/issues/26149), "
+                "and then remove the override from your model."
+            )
+        if merge_by_field_config is True:
+            logger.warning_once(
+                "`merge_by_field_config=True` is redundant, "
+                "please remove the override from your model."
+            )
+
+    return res
 
 
 def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
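Taken together with the protocol change above, the flag is now a tri-state: `None` (the new default) is silent, an explicit `True` logs a one-time redundancy warning, and an explicit `False` fails fast. A hedged sketch of the observable behavior (the model classes below are hypothetical):

import torch.nn as nn

from vllm.model_executor.models.interfaces import (
    SupportsMultiModal,
    supports_multimodal,
)


class UpdatedModel(nn.Module, SupportsMultiModal):  # no override: silent
    ...


class RedundantModel(nn.Module, SupportsMultiModal):
    merge_by_field_config = True  # warns once that the override is redundant


class LegacyModel(nn.Module, SupportsMultiModal):
    merge_by_field_config = False  # no longer supported


assert supports_multimodal(UpdatedModel)    # True, no output
assert supports_multimodal(RedundantModel)  # True, plus a warning
try:
    supports_multimodal(LegacyModel)
except ValueError as e:
    print(e)  # message points at https://github.com/vllm-project/vllm/issues/26149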
@@ -509,8 +509,6 @@ class InternS1MultiModalProcessor(BaseMultiModalProcessor[InternS1ProcessingInfo
 class InternS1ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 ):
-    merge_by_field_config = True
-
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={

@@ -1074,8 +1074,6 @@ class InternVLMultiModalProcessor(
     dummy_inputs=InternVLDummyInputsBuilder,
 )
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod

@@ -1292,8 +1292,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
 
 
 class BaseKeyeModule(nn.Module):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

@@ -298,8 +298,6 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
     dummy_inputs=KimiVLDummyInputsBuilder,
 )
 class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod

@@ -506,8 +506,6 @@ def init_vision_tower_for_llava(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -223,8 +223,6 @@ class LlavaNextMultiModalProcessor(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52

@@ -299,8 +299,6 @@ class LlavaNextMultiModalProjector(nn.Module):
     dummy_inputs=LlavaNextVideoDummyInputsBuilder,
 )
 class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52

@@ -479,8 +479,6 @@ class LlavaOnevisionMultiModalProjector(nn.Module):
     dummy_inputs=LlavaOnevisionDummyInputsBuilder,
 )
 class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52

@@ -683,8 +683,6 @@ class MiDashengLMMultiModalProcessor(
     dummy_inputs=MiDashengLMDummyInputsBuilder,
 )
 class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

@@ -1003,8 +1003,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     instantiated.
     """
 
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod

@@ -179,8 +179,6 @@ class MiniMaxVL01MultiModalProcessor(
     dummy_inputs=MiniMaxVL01DummyInputsBuilder,
 )
 class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -423,8 +423,6 @@ def init_vision_tower_for_llava(
 class Mistral3ForConditionalGeneration(
     nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -741,8 +741,6 @@ class Llama4ForConditionalGeneration(
     SupportsEagle3,
     SupportsLoRA,
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -1354,8 +1354,6 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
 class MolmoForCausalLM(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsQuant
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             # vision backbone mapping

@@ -1116,8 +1116,6 @@ class NanoNemotronVLDummyInputsBuilder(
 class NemotronH_Nano_VL_V2(
     nn.Module, HasInnerState, IsHybrid, SupportsMultiModal, SupportsMultiModalPruning
 ):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):

@@ -358,8 +358,6 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
     dummy_inputs=BaseInternVLDummyInputsBuilder[NemotronVLProcessingInfo],
 )
 class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):

@@ -201,7 +201,6 @@ class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder):
     dummy_inputs=OpenCUADummyInputsBuilder,
 )
 class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     packed_modules_mapping = {

@@ -414,8 +414,6 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
     dummy_inputs=OvisDummyInputsBuilder,
 )
 class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):

@@ -456,8 +456,6 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
     dummy_inputs=Ovis2_5DummyInputsBuilder,
 )
 class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config

@@ -1103,8 +1103,6 @@ class SiglipVisionModel(nn.Module):
     dummy_inputs=PaddleOCRVLDummyInputsBuilder,
 )
 class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsMRoPE):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",

@@ -251,8 +251,6 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
     dummy_inputs=PaliGemmaDummyInputsBuilder,
 )
 class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

@@ -562,8 +562,6 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
     dummy_inputs=Phi3VDummyInputsBuilder,
 )
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.wte": "embed_tokens",

@@ -984,8 +984,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     Implements the Phi-4-multimodal-instruct model in vLLM.
     """
 
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "qkv_proj",

@@ -365,8 +365,6 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
     dummy_inputs=PixtralDummyInputsBuilder,
 )
 class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):

@@ -773,8 +773,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     SupportsMRoPE,
     Qwen2_5OmniConditionalGenerationMixin,
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",

@@ -1039,7 +1039,6 @@ class Qwen2_5_VLForConditionalGeneration(
     SupportsMultiModalPruning,
     SupportsMRoPE,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {

@@ -313,8 +313,6 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessing
     dummy_inputs=Qwen2AudioDummyInputsBuilder,
 )
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):

@@ -1131,7 +1131,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
 class Qwen2VLForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     # To ensure correct weight loading and mapping.

@@ -1131,8 +1131,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     SupportsMRoPE,
     Qwen3OmniMoeConditionalGenerationMixin,
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",

@@ -1190,7 +1190,6 @@ class Qwen3VLForConditionalGeneration(
     SupportsMRoPE,
     SupportsEagle3,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {

@@ -703,8 +703,6 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
 class QwenVLForConditionalGeneration(
     QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "c_attn": ["c_attn"],
         "gate_up_proj": [

@@ -989,7 +989,6 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     is_pooling_model = True
 
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
-    merge_by_field_config = True
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:

@@ -647,8 +647,6 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
     dummy_inputs=SkyworkR1VDummyInputsBuilder,
 )
 class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):

@@ -916,8 +916,6 @@ class Step3VisionTransformer(nn.Module):
     dummy_inputs=Step3VLDummyInputsBuilder,
 )
 class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",

@@ -400,8 +400,6 @@ def init_vision_tower_for_tarsier(
     dummy_inputs=TarsierDummyInputsBuilder,
 )
 class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -227,7 +227,6 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
     dummy_inputs=TerratorchInputBuilder,
 )
 class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
-    merge_by_field_config = True
     supports_multimodal_raw_input_only = True
     is_pooling_model = True
 
@@ -264,7 +264,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
 
 class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
     supports_multimodal_raw_input_only = True
-    merge_by_field_config = True
+
     # Backwards compatibility for prev released models. State dicts back then
     # had different formats and cannot be loaded with `AutoModel` mapping as is
     hf_to_vllm_mapper = WeightsMapper(
@@ -498,8 +498,6 @@ class ModifiedWhisperEncoder(WhisperEncoder):
     dummy_inputs=UltravoxDummyInputsBuilder,
 )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

@@ -330,8 +330,6 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
 class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
-    merge_by_field_config = True
-
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {

@@ -775,7 +775,6 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
 class WhisperForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal
 ):
-    merge_by_field_config = True
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
@@ -426,7 +426,6 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
-    # TODO: After v0.13, remove merge_by_field_config attribute from model impls
     if merge_by_field_config is not None:
         logger.warning_once(
             "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
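For callers, the migration is just dropping the deprecated keyword; the yield contract quoted in the hunk above, `(modality, num_items, grouped_kwargs)`, is unchanged. A usage sketch (the `device` keyword and the encoder dispatch are assumptions for illustration, not taken from the diff):

from vllm.multimodal.utils import group_mm_kwargs_by_modality


def encode_all(mm_kwargs, encoders, device):
    # Old call sites passed merge_by_field_config=...; new ones simply omit it,
    # which also avoids the deprecation warning added in this commit.
    outputs = []
    for modality, num_items, grouped_kwargs in group_mm_kwargs_by_modality(
        mm_kwargs,
        device=device,  # assumption: keyword accepted by the current signature
    ):
        encoder = encoders[modality]  # hypothetical per-modality encoder table
        outputs.append(encoder(**grouped_kwargs))
    return outputs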