diff --git a/requirements/test.txt b/requirements/test.txt
index 10fb1f14c3a1..c733364fd871 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -235,7 +235,7 @@ mbstrdecoder==1.1.3
     #   typepy
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.5.1
+mistral-common==1.5.4
     # via -r requirements/test.in
 more-itertools==10.5.0
     # via lm-eval
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 8e5454328bda..f9facdf1831d 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -73,7 +73,7 @@ class PixtralImagePixelInputs(TypedDict):
     """
     A boolean mask indicating which image embeddings correspond
     to patch tokens.
-    
+
     Shape: `(batch_size, num_images, num_embeds)`
     """
 
@@ -849,10 +849,10 @@ class VisionTransformer(nn.Module):
     ) -> torch.Tensor:
         """
         Args:
-            images: list of N_img images of variable sizes, 
+            images: list of N_img images of variable sizes,
                 each of shape (C, H, W)
         Returns:
-            image_features: tensor of token features for 
+            image_features: tensor of token features for
                 all tokens of all images of shape (N_toks, D)
         """
         # pass images through initial convolution independently
@@ -935,7 +935,8 @@ class PatchMerger(nn.Module):
         # x is (N, vision_encoder_dim)
         x = self.permute(x, image_sizes)
 
-        # x is (N / spatial_merge_size ** 2, vision_encoder_dim * spatial_merge_size ** 2)
+        # x is (N / spatial_merge_size ** 2,
+        #      vision_encoder_dim * spatial_merge_size ** 2)
 
         x = self.merging_layer(x)
 
         # x is (N / spatial_merge_size ** 2, vision_encoder_dim)
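Note on the `PatchMerger` hunk: the rewrapped comment documents the shape contract of the merge step. `permute` groups each block of `spatial_merge_size ** 2` spatially adjacent patch embeddings into one concatenated vector, and `merging_layer` projects it back down to `vision_encoder_dim`. A minimal sketch of that shape arithmetic with toy sizes follows; the row-major `reshape` standing in for the spatially aware `permute`, and the bare `nn.Linear` standing in for `merging_layer`, are illustrative assumptions, not the module's actual implementation:

```python
import torch
import torch.nn as nn

# Toy sizes; the real values come from the vision encoder config.
N = 16                     # patch embeddings, assumed divisible by smc2
vision_encoder_dim = 8
spatial_merge_size = 2
smc2 = spatial_merge_size ** 2

x = torch.randn(N, vision_encoder_dim)               # (N, vision_encoder_dim)

# Stand-in for self.permute: concatenate each group of smc2 patch
# vectors. (The real permute uses image_sizes to pick spatially
# adjacent patches rather than consecutive rows.)
x = x.reshape(N // smc2, vision_encoder_dim * smc2)  # (N / smc2, dim * smc2)

# Stand-in for self.merging_layer: project back to vision_encoder_dim.
merging_layer = nn.Linear(vision_encoder_dim * smc2, vision_encoder_dim)
x = merging_layer(x)                                 # (N / smc2, vision_encoder_dim)

print(x.shape)  # torch.Size([4, 8]): 4x fewer tokens, same feature width
```

With `spatial_merge_size = 2`, every four neighboring patch embeddings collapse into one token, which is why the comment shows the row count shrinking by `spatial_merge_size ** 2` while the feature width returns to `vision_encoder_dim` after the linear layer.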