diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index a2ca92cdec072..f42d48e919cd0 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -28,12 +28,16 @@ from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict,
 
 import torch
 from torch import nn
-from transformers import BatchFeature
+from transformers import BatchFeature, PretrainedConfig
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.models.whisper.modeling_whisper import (
     ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder)
 
 from vllm.config import VllmConfig
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig)
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     NestedTensors)
@@ -512,6 +516,36 @@ class MiniCPMO(MiniCPMV2_6):
 
         self.audio_token_id = None
 
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules; however,
+        # AutoGPTQ seems to avoid vision encoder sections for some models.
+        # See: https://huggingface.co/openbmb/MiniCPM-o-2_6-int4
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        # MiniCPMO GPTQ models leave the vpm unquantized.
+        quant_config = self._maybe_ignore_quant_config(quant_config)
+        return super().init_vision_module(config, quant_config, prefix)
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        # MiniCPMO GPTQ models leave the resampler unquantized.
+        quant_config = self._maybe_ignore_quant_config(quant_config)
+        return super().init_resampler(embed_dim, vision_dim, quant_config,
+                                      prefix)
+
     def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # Do not use parameters temporarily
         audio_config = self.config.audio_config
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 65a26eadd5c81..300360f785aec 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1181,7 +1181,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> nn.Module:
         model = Idefics2VisionTransformer(config.vision_config,
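For context on the pattern used above: the new `_maybe_ignore_quant_config` hook drops the quantization config before the vision tower and resampler are built, because the MiniCPM-o-2.6 GPTQ checkpoint keeps those modules in full precision while GPTQ-style configs cannot express an ignore-list. The sketch below illustrates the idea with hypothetical stand-ins; `FakeGPTQConfig`, `ToyVisionTower`, and `maybe_ignore_quant_config` are illustrative only and not vLLM APIs.

```python
# Minimal sketch, assuming simplified stand-in classes: when the quantization
# method cannot express an ignore-list, drop the config before constructing
# submodules the quantizer left unquantized.
from typing import Optional


class FakeGPTQConfig:
    """Hypothetical stand-in for a GPTQ-style config with no ignore-list."""


def maybe_ignore_quant_config(
        quant_config: Optional[object]) -> Optional[object]:
    # Returning None forces the caller to build plain (unquantized) layers.
    if isinstance(quant_config, FakeGPTQConfig):
        return None
    return quant_config


class ToyVisionTower:
    def __init__(self, quant_config: Optional[object] = None):
        # Real model code would pick quantized vs. full-precision linear
        # layer implementations based on quant_config here.
        self.quantized = quant_config is not None


quant_config = FakeGPTQConfig()
tower = ToyVisionTower(quant_config=maybe_ignore_quant_config(quant_config))
assert tower.quantized is False  # vision tower stays in full precision
```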