diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index a4cb1a68833a..92390d8c2f7e 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -153,4 +153,4 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], if hasattr(model, method_name): getattr(model, method_name)(**mm_kwargs) - vllm_model.apply_model(validate_model_input) + vllm_model.apply_model(validate_model_input) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 89d2817b57e0..4927d6b62c6d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -72,8 +72,9 @@ class PixtralHFImagePixelInputs(TensorSchema): in which case the data is passed as a list instead of a batched tensor. """ type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" - pixel_values: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", "c", "h", "w")] + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"})] class LlavaImageEmbeddingInputs(TensorSchema):