Fixed a property-access bug in the processor and added abstract methods to BaseProcessingInfo

This commit is contained in:
Anexdeus 2025-12-17 01:31:34 +03:00
parent da0adea88e
commit 36121c6db0
3 changed files with 57 additions and 21 deletions

View File

@ -1007,25 +1007,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
for modality in ("image", "video")
]
def get_num_mm_encoder_tokens(
    self,
    num_image_tokens: int,
) -> int:
    """Return the number of vision-encoder tokens for ``num_image_tokens``.

    Each post-merge image token corresponds to ``spatial_merge_size ** 2``
    encoder (pre-merge) tokens, per the model's vision config.

    Args:
        num_image_tokens: Number of image placeholder tokens in the prompt.

    Returns:
        The corresponding number of multi-modal encoder tokens.
    """
    # BUGFIX: the HF config accessor lives on the processing-info object,
    # not on the processor itself — access it via ``self.info`` to match
    # the pattern used by the Qwen2VL processor.
    hf_config = self.info.get_hf_config()
    vision_config = hf_config.vision_config
    merge_size = vision_config.spatial_merge_size
    return num_image_tokens * merge_size**2
def get_num_mm_connector_tokens(
    self,
    num_vision_tokens: int,
) -> int:
    """Return the number of connector tokens for ``num_vision_tokens``.

    The connector merges every ``spatial_merge_size ** 2`` vision-encoder
    tokens into one output token (floor division).

    Args:
        num_vision_tokens: Number of vision-encoder (pre-merge) tokens.

    Returns:
        The corresponding number of multi-modal connector tokens.
    """
    # BUGFIX: the HF config accessor lives on the processing-info object,
    # not on the processor itself — access it via ``self.info`` to match
    # the pattern used by the Qwen2VL processor.
    hf_config = self.info.get_hf_config()
    vision_config = hf_config.vision_config
    merge_size = vision_config.spatial_merge_size
    return num_vision_tokens // merge_size**2
@MULTIMODAL_REGISTRY.register_processor(
Qwen2_5_VLMultiModalProcessor,

View File

@ -1132,6 +1132,25 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
self.info.get_hf_config().vision_config.spatial_merge_size
)(hf_inputs)
def get_num_mm_encoder_tokens(
    self,
    num_image_tokens: int,
) -> int:
    """Map a count of image tokens to the matching encoder-token count.

    Every merged image token stands for a ``spatial_merge_size x
    spatial_merge_size`` patch grid in the vision encoder.
    """
    spatial_merge = self.info.get_hf_config().vision_config.spatial_merge_size
    return num_image_tokens * spatial_merge * spatial_merge
def get_num_mm_connector_tokens(
    self,
    num_vision_tokens: int,
) -> int:
    """Map a count of vision-encoder tokens to the connector-token count.

    The connector folds each ``spatial_merge_size x spatial_merge_size``
    group of encoder tokens into a single token (floor division).
    """
    spatial_merge = self.info.get_hf_config().vision_config.spatial_merge_size
    return num_vision_tokens // (spatial_merge * spatial_merge)
@MULTIMODAL_REGISTRY.register_processor(
Qwen2VLMultiModalProcessor,

View File

@ -1185,6 +1185,32 @@ class BaseProcessingInfo:
"""
return self.ctx.get_hf_processor(**kwargs)
@abstractmethod
def get_num_mm_encoder_tokens(
    self,
    num_image_tokens: int,
) -> int:
    """Given the number of image tokens, return the number of
    multi-modal encoder tokens.

    Implement this method to enable LoRA support for the tower
    (vision encoder) module of the multi-modal model.
    """
    raise NotImplementedError
@abstractmethod
def get_num_mm_connector_tokens(
    self,
    num_vision_tokens: int,
) -> int:
    """Given the number of vision tokens, return the number of
    multi-modal connector tokens.

    Implement this method to enable LoRA support for the connector
    module of the multi-modal model.
    """
    raise NotImplementedError
@abstractmethod
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
"""
@ -1394,7 +1420,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
self,
num_image_tokens: int,
) -> int:
"""Given the number of image tokens, output the number of multi-modal encoder tokens"""
"""
Implement this function to enable LoRA support
for the tower module of the multi-modal model
Given the number of image tokens, output the number of multi-modal encoder tokens
"""
raise NotImplementedError
@abstractmethod
@ -1402,7 +1433,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
self,
num_vision_tokens: int,
) -> int:
"""Given the number of vision tokens, output the number of multi-modal connector tokens"""
"""
Implement this function to enable LoRA support
for the connector module of the multi-modal model
Given the number of vision tokens, output the number of multi-modal connector tokens
"""
raise NotImplementedError
@abstractmethod