From 9a8966bcc207036e9b65bd7951e808254ffba542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Sat, 13 Sep 2025 16:13:44 +0900 Subject: [PATCH] [Docs] Fix warnings in mkdocs build (continued) (#24791) Signed-off-by: Zerohertz --- vllm/distributed/eplb/eplb_state.py | 8 +++--- vllm/distributed/eplb/rebalance_algo.py | 6 ++--- .../kv_transfer/kv_connector/v1/base.py | 5 ++-- .../kv_connector/v1/lmcache_connector.py | 5 ++-- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 7 ++--- .../kv_connector/v1/p2p/tensor_memory_pool.py | 5 ++-- .../v1/shared_storage_connector.py | 7 ++--- .../kv_transfer/kv_pipe/pynccl_pipe.py | 4 +-- vllm/model_executor/models/interfaces.py | 2 +- vllm/model_executor/models/keye.py | 12 +++------ vllm/model_executor/models/keye_vl1_5.py | 11 ++++---- vllm/model_executor/models/llava.py | 4 ++- vllm/model_executor/models/llava_next.py | 5 ++-- vllm/model_executor/models/mistral3.py | 4 ++- vllm/model_executor/models/mllama4.py | 9 +++---- vllm/model_executor/models/moonvit.py | 18 ++++++++++--- vllm/model_executor/models/phi4_multimodal.py | 4 +-- vllm/model_executor/models/qwen2_vl.py | 12 +++------ vllm/model_executor/models/siglip2navit.py | 27 ++++++------------- vllm/model_executor/models/ultravox.py | 9 ++++--- vllm/model_executor/models/zamba2.py | 4 +-- vllm/transformers_utils/config.py | 13 ++++----- vllm/transformers_utils/configs/jais.py | 8 +++--- vllm/transformers_utils/configs/ultravox.py | 4 --- .../processors/deepseek_vl2.py | 9 +++---- vllm/transformers_utils/runai_utils.py | 4 +-- vllm/transformers_utils/s3_utils.py | 6 ++--- 27 files changed, 102 insertions(+), 110 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index d5ab61473ab01..8f8baa7d59db7 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -337,11 +337,11 @@ class EplbState: Args: model (MixtureOfExperts): The MoE model. is_dummy (bool): If `True`, this is a dummy step and the load - metrics recorded in this forward pass will not count. Defaults - to `False`. + metrics recorded in this forward pass will not count. Defaults + to `False`. is_profile (bool): If `True`, perform a dummy rearrangement - with maximum communication cost. This is used in `profile_run` - to reserve enough memory for the communication buffer. + with maximum communication cost. This is used in `profile_run` + to reserve enough memory for the communication buffer. log_stats (bool): If `True`, log the expert load metrics. 
# Stats diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py index 879b5b9f18240..3564a10dfc684 100644 --- a/vllm/distributed/eplb/rebalance_algo.py +++ b/vllm/distributed/eplb/rebalance_algo.py @@ -102,14 +102,14 @@ def rebalance_experts_hierarchical( num_groups: int, num_nodes: int, num_gpus: int, -): +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: weight: [num_moe_layers, num_logical_experts] num_physical_experts: number of physical experts after replication num_groups: number of expert groups - num_nodes: number of server nodes, where the intra-node network - (e.g, NVLink) is faster + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster num_gpus: number of GPUs, must be a multiple of `num_nodes` Returns: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index cd4561154b78b..7e0b927c5b78f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC): @abstractmethod def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """ Start loading the KV cache from the connector to vLLM's paged KV buffer. This is called from the forward context before the @@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC): @abstractmethod def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """ Start saving a layer of KV cache from vLLM's paged buffer to the connector. This is called from within attention layer to diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index c99f538ee4185..2b0abe983fbb3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): # Worker-side methods # ============================== def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """ Start loading the KV cache from the connector to vLLM's paged KV buffer. This is called from the forward context before the @@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1): self._lmcache_engine.wait_for_layer_load(layer_name) def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """ Start saving the a layer of KV cache from vLLM's paged buffer to the connector. This is called from within attention layer to diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 2485c57d86ecc..ec72905a0d3ec 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1): # ============================== def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """Start loading the KV cache from the connector buffer to vLLM's paged KV buffer. 
@@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1): return def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. @@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1): def get_finished( self, finished_req_ids: set[str], - **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: + **kwargs: Any) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have finished generating tokens. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index b775276d4a846..26070488bad89 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -218,8 +218,9 @@ class TensorMemoryPool: return addr - def load_tensor(self, addr: int, dtype: torch.dtype, - shape: tuple[int, ...], device) -> torch.Tensor: + def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int, + ...], + device: torch.device) -> torch.Tensor: """Loads a tensor from pinned host memory to the specified device. Args: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 25460ed295d30..48fa1a82c6775 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -3,7 +3,7 @@ import hashlib import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional import safetensors import torch @@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1): logger.info("Shared storage path is %s", self._storage_path) def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """Start loading the KV cache from the connector buffer to vLLM's paged KV buffer. @@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1): return def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 66120e9a0a1a0..7a79a8cc0c932 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase): """ Receives a tensor and its metadata from the source rank. Blocking call. - Args: - tensor: The received tensor, or `None` if no tensor is received. + Returns: + The received tensor, or `None` if no tensor is received. 
""" if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index d5b71b057831b..8f8e300c84d71 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -823,7 +823,7 @@ class SupportsEagle3(Protocol): Args: layers: Tuple of layer indices that should output auxiliary - hidden states. + hidden states. """ ... diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index cb4cd60a89177..afe33b4d4ad26 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module): batch. **NOTE**: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + otherwise it will be `(seq_len,)`. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py index b3c44d132435c..93a3bf5f98f7b 100644 --- a/vllm/model_executor/models/keye_vl1_5.py +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor: return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0) -def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int], - torch.Tensor]): +def get_num_patches(grid_thw: torch.Tensor, + num_frames: Union[list[int], torch.Tensor]) -> list[int]: """ Return num_patches per video. Args: - t: tensor with shape [N, ...] where each item is a list/tensor - cu_seqlens: list indicating the boundaries of groups + grid_thw: Tensor with shape [N, 3] containing temporal, height, width + dimensions + num_frames: List or tensor indicating the number of frames per video Returns: - list of ints representing the sum of products for each group + List of ints representing the number of patches for each video Examples: >>> # Suppose there are 2 videos with a total of 3 grids diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d692b2783048f..9591deea06ce9 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. 
Info: [LlavaImageInputs][] diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a63c18493df5e..5e82f9799e0fe 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each grid patch for each input image. - image_sizes: The original `(height, width)` for each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. Info: [LlavaNextImageInputs][] diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 08948960b275c..09479012a03ad 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. Info: [Mistral3ImagePixelInputs][] diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 6dc77666e1582..2f0e8a2a5e575 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -387,11 +387,10 @@ class Llama4VisionEncoder(nn.Module): ) -> torch.Tensor: r""" Args: - inputs_embeds (`torch.FloatTensor` of shape - `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you - want more control over how to convert `input_ids` indices into + hidden_states: Input tensor of shape + (batch_size, sequence_length, hidden_size). + Hidden states from the model embeddings, representing + the input tokens. associated vectors than the model's internal embedding lookup matrix. """ diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 41a2c836b09f3..caa00763fc3d4 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -70,11 +70,15 @@ def multihead_attention( v: torch.Tensor, q_cu_seqlens: Optional[torch.Tensor] = None, k_cu_seqlens: Optional[torch.Tensor] = None, -): +) -> torch.Tensor: """Multi-head attention using flash attention 2. Args: - q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), + q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), or (tot_seqlens, num_heads, head_dim) if packing. q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q. The first element should be 0 and the last element should be q.shape[0]. @@ -123,8 +127,14 @@ def sdpa_attention( """SDPA attention. Args: - q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), + q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), or (tot_seqlens, num_heads, head_dim) if packing. 
+ k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + q_cu_seqlens: Optional cumulative sequence lengths of q. + k_cu_seqlens: Optional cumulative sequence lengths of k. """ seq_length = q.shape[0] attention_mask = torch.zeros([1, seq_length, seq_length], @@ -387,7 +397,7 @@ class MLP2(nn.Module): def __init__(self, dims: list[int], activation, - bias=True, + bias: bool = True, prefix: str = "", use_data_parallel: bool = False): super().__init__() diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index ab63649b43561..25df9e9261d91 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module): Typically used as a very first layer in a model. Args: - input_size: int - layer input size. + config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig) + object containing model parameters. """ def __init__(self, config: Phi4MultimodalAudioConfig): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index cf15dfa67743f..d08181c5fd53b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, batch. **NOTE**: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + otherwise it will be `(seq_len,)`. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index a86700fe68dd3..7d90d3a7ef128 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module): position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]: """ Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under - returned tensors for more detail. + hidden_states: Input tensor of shape (batch, seq_len, embed_dim). + cu_seqlens: Cumulative sequence lengths tensor. + position_embeddings: Position embeddings tensor. """ residual = hidden_states @@ -534,19 +531,11 @@ class Siglip2Encoder(nn.Module): ) -> torch.Tensor: r""" Args: - inputs_embeds (`torch.FloatTensor` of shape - `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. 
This is useful if - you want more control over how to convert `input_ids` indices - into associated vectors than the model's internal embedding - lookup matrix. - grid_thws (`torch.LongTensor`): - grid shape (num_patches, 3) - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): + inputs_embeds: Input tensor of shape + (batch_size, sequence_length, hidden_size). + Embedded representation of the input tokens. + grid_thws: Grid tensor of shape (num_patches, 3) + containing grid dimensions. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 9e28b0c443df4..ad911ebedf895 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): with the `input_ids`. Args: - audio_features: A batch of audio input chunks [B, N, 80, M]. - audio_lens: Length of audio frames for each audio chunk [B]. - audio_token_len: Length of audio tokens for each audio chunk [B']. - Note: batch dim is different from batch dim in audio chunks. + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. """ diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 34b9c1ad07d76..86335d48c1454 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -909,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): prefix: Optional prefix for parameter names Raises: - AssertionError: If prefix caching is enabled - (not supported by Mamba) + AssertionError: If prefix caching is enabled + (not supported by Mamba) """ config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index a76f9504eccef..fd19d33ca0c89 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -679,20 +679,21 @@ def get_hf_file_to_dict(file_name: str, @cache -def get_pooling_config(model: str, revision: Optional[str] = 'main'): +def get_pooling_config(model: str, + revision: Optional[str] = 'main') -> Optional[dict]: """ This function gets the pooling and normalize config from the model - only applies to sentence-transformers models. Args: - model (str): The name of the Hugging Face model. - revision (str, optional): The specific version - of the model to use. Defaults to 'main'. + model: The name of the Hugging Face model. + revision: The specific version of the model to use. + Defaults to 'main'. Returns: - dict: A dictionary containing the pooling - type and whether normalization is used. + A dictionary containing the pooling type and whether + normalization is used, or None if no pooling configuration is found. 
""" modules_file_name = "modules.json" diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index 767c4ddae870d..d5ca2c7b4751a 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -74,10 +74,10 @@ class JAISConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - scale_attn_by_inverse_layer_idx (`bool`, *optional*, - defaults to `False`): - Whether to additionally scale attention weights by - `1 / layer_idx + 1`. + scale_attn_by_inverse_layer_idx + (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights + by `1 / layer_idx + 1`. reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): Whether to scale keys (K) prior to computing attention (dot-product) diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index e67479516560f..aaf31d84d0c1a 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -37,10 +37,6 @@ class UltravoxConfig(transformers.PretrainedConfig): The initialization value for the layer normalization. projector_act (`str`, *optional*, defaults to `"swiglu"`): The activation function used by the multimodal projector. - text_model_lora_config (`LoraConfigSimplified`, *optional*): - The LoRA configuration for finetuning the text model. - audio_model_lora_config (`LoraConfigSimplified`, *optional*): - The LoRA configuration for finetuning the audio model. projector_ln_mid (`bool`, *optional*, defaults to `False`): Whether to apply layer normalization at the middle of the projector or at the end. Versions v0.4.1 and below diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index 5896bde312657..d1d117b4e2cf4 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -25,6 +25,7 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import math +from typing import Any import torch import torchvision.transforms as T @@ -178,17 +179,15 @@ class DeepseekVLV2Processor(ProcessorMixin): prompt: str, images: list[Image.Image], inference_mode: bool = True, - **kwargs, + **kwargs: Any, ): """ Args: prompt (str): the formatted prompt; - conversations (list[dict]): conversations with a list of messages; images (list[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; - system_prompt (str): the system prompt; - **kwargs: + **kwargs: Additional keyword arguments. Returns: outputs (BaseProcessorOutput): the output of the processor, @@ -259,7 +258,7 @@ class DeepseekVLV2Processor(ProcessorMixin): text: str, images: list[Image.Image], inference_mode: bool = True, - **kwargs, + **kwargs: Any, ): """ diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py index 357c180ed35f2..b7bee1974de5b 100644 --- a/vllm/transformers_utils/runai_utils.py +++ b/vllm/transformers_utils/runai_utils.py @@ -33,7 +33,6 @@ def list_safetensors(path: str = "") -> list[str]: Args: path: The object storage path to list from. - allow_pattern: A list of patterns of which files to pull. 
Returns: list[str]: List of full object storage paths allowed by the pattern @@ -54,8 +53,7 @@ class ObjectStorageModel: dir: The temporary created directory. Methods: - pull_files(): Pull model from object storage to the temporary - directory. + pull_files(): Pull model from object storage to the temporary directory. """ def __init__(self) -> None: diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 62c87c167e682..d17c1afe9b504 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -from typing import Optional +from typing import Any, Optional from vllm.utils import PlaceholderModule @@ -26,7 +26,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]: ] -def glob(s3=None, +def glob(s3: Optional[Any] = None, path: str = "", allow_pattern: Optional[list[str]] = None) -> list[str]: """ @@ -51,7 +51,7 @@ def glob(s3=None, def list_files( - s3, + s3: Any, path: str, allow_pattern: Optional[list[str]] = None, ignore_pattern: Optional[list[str]] = None
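
Note on the convention the hunks above converge on (illustrative, not part of the commit itself): mkdocstrings parses Google-style docstrings, and judging by these hunks the warnings largely stem from two patterns, continuation lines that are not indented under their argument, and `**kwargs` parameters left without an annotation. The sketch below shows the layout the edits move toward; `save_kv_layer` here is a simplified, hypothetical stand-in, not code from any of the files touched above.

    from typing import Any, Optional

    import torch  # assumed available, as in the connector modules above


    def save_kv_layer(layer_name: str, kv_layer: torch.Tensor,
                      **kwargs: Any) -> Optional[str]:
        """Start saving one layer of KV cache to a connector buffer.

        Args:
            layer_name: Name of the attention layer being saved. Continuation
                lines are indented one extra level so the docstring parser
                keeps them attached to this argument instead of warning.
            kv_layer: The KV tensor for this layer.
            **kwargs: Additional, connector-specific options.

        Returns:
            The storage key the layer was saved under, or None if skipped.
        """
        del kv_layer, kwargs  # placeholder body; a real connector performs the transfer
        return layer_name

The `**kwargs: Any` annotations added in `base.py`, `lmcache_connector.py`, and the P2P/shared-storage connectors follow the same goal of giving every parameter an annotation the documentation renderer can display; the exact warning text silenced depends on the project's mkdocstrings configuration.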