diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 94cbb20cee5f..f8e9e3181367 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -272,6 +272,9 @@ class Mistral3MultiModalProcessor(
 
         vision_config = hf_config.vision_config
         assert isinstance(vision_config, PixtralVisionConfig)
+        # Need to sneak in spatial_merge_size for Mistral3
+        vision_config.spatial_merge_size = getattr(hf_config,
+                                                   "spatial_merge_size", 1)
         encoder_info = PixtralHFEncoderInfo(vision_config)
 
         def get_replacement(item_idx: int):
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 58323d639d5d..73fd80146955 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -911,9 +911,8 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
         return self.vision_config.image_size
 
     def get_patch_size(self) -> int:
-        spatial_merge_size = getattr(self.vision_config, "spatial_merge_size",
-                                     1)
-        return (self.vision_config.patch_size * spatial_merge_size)
+        return (self.vision_config.patch_size *
+                self.vision_config.spatial_merge_size)
 
     def get_patch_grid_length(self) -> int:
         image_size, patch_size = self.get_image_size(), self.get_patch_size()