[Docs] Fix warnings in mkdocs build (#23649)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-06-04 17:15:43 +08:00 · 2025-08-27 03:19:23 +09:00 · 2025-08-27 03:19:23 +09:00 · 730d0ac8b9
commit 730d0ac8b9
parent 9b0187003e
14 changed files with 66 additions and 58 deletions
--- a/vllm/attention/backends/differential_flash_attn.py
+++ b/vllm/attention/backends/differential_flash_attn.py
@@ -805,14 +805,18 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
        """Forward pass with FlashAttention.
        Args:
-            query: shape = [num_tokens, num_heads, head_size]
+            layer: Attention layer instance.
-            key: shape = [num_tokens, num_kv_heads, head_size]
+            q: Query tensor with shape = [num_tokens, num_heads, head_size]
-            value: shape = [num_tokens, num_kv_heads, head_size]
+            k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
-            output: shape = [num_tokens, num_heads, head_size]
+            v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape 
+                [2, num_blocks, block_size, num_kv_heads, head_size].
                NOTE: kv_cache will be an empty tensor with shape [0]
                for profiling run.
            attn_metadata: Metadata for attention.
+            output: Output tensor with shape [num_tokens, num_heads, head_size]
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
        NOTE: It in-place updates the output tensor.
        NOTE: FP8 quantization, flash-attn expect the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -605,7 +605,8 @@ class FlashAttentionImpl(AttentionImpl):
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            output: shape = [num_tokens, num_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape 
+                [2, num_blocks, block_size, num_kv_heads, head_size].
                NOTE: kv_cache will be an empty tensor with shape [0]
                for profiling run.
            attn_metadata: Metadata for attention.
@@ -850,7 +851,7 @@ class FlashAttentionImpl(AttentionImpl):
 def _get_query_key_seq_metadata(
-    attn_metadata,
+    attn_metadata: FlashAttentionMetadata,
    is_prompt: bool,
    attn_type: str,
 ) -> tuple:
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -584,17 +584,18 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                use prefill sequence attributes
        Args:
+            layer: Attention layer instance.
            query: shape = [num_tokens, num_heads * head_size]
            key: shape = [num_tokens, num_kv_heads * head_size]
            value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape 
+                [2, num_blocks, block_size * num_kv_heads * head_size].
                NOTE: kv_cache will be an empty tensor with shape [0]
                for profiling run.
            attn_metadata: Metadata for attention.
-            attn_type: Select attention type, between encoder attention,
+            output: Optional output tensor.
-                       decoder self-attention, or encoder/decoder cross-
+            output_scale: Optional output scale tensor.
-                       attention. Defaults to decoder self-attention,
+            output_block_scale: Optional output block scale tensor.
-                       which is the vLLM default generally
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(
    Raises:
        AssertionError: If the number of encoder tokens in `attn_metadata` 
-        is `None` when required for the calculations.
+            is `None` when required for the calculations.
    """
    num_prefill_query_tokens = 0
    num_decode_query_tokens = 0
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -471,17 +471,18 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
                max_encoder_seq_len)
        Args:
+            layer: Attention layer instance.
            query: shape = [num_tokens, num_heads * head_size]
            key: shape = [num_tokens, num_kv_heads * head_size]
            value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape 
+                [2, num_blocks, block_size * num_kv_heads * head_size].
                NOTE: kv_cache will be an empty tensor with shape [0]
                for profiling run.
            attn_metadata: Metadata for attention.
-            attn_type: Select attention type, between encoder attention,
+            output: Optional output tensor.
-                       decoder self-attention, or encoder/decoder cross-
+            output_scale: Optional output scale tensor.
-                       attention. Defaults to decoder self-attention,
+            output_block_scale: Optional output block scale tensor.
-                       which is the vLLM default generally
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
@@ -644,7 +645,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
        for API spec.
        Args:
            output: shape = [num_prefill_tokens, num_heads, head_size]
            query: shape = [num_prefill_tokens, num_heads, head_size]
            key: shape = [num_prefill_tokens, num_kv_heads, head_size]
            value: shape = [num_prefill_tokens, num_kv_heads, head_size]
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -352,7 +352,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
        with num_lookahead_slots.
        Args:
-            sequence_group (SequenceGroup): The sequence group to swap in.
+            seq_group (SequenceGroup): The sequence group to swap in.
            num_lookahead_slots (int): Number of lookahead slots used in 
                speculative decoding, default to 0.
@@ -405,8 +405,6 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
        Args:
            seq_group (SequenceGroup): The sequence group to swap out.
            num_lookahead_slots (int): Number of lookahead slots used in 
                speculative decoding, default to 0.
        Returns:
            bool: Whether it's possible to swap out current sequence group.
@@ -420,7 +418,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
        swapping out the given sequence_group with num_lookahead_slots.
        Args:
-            sequence_group (SequenceGroup): The sequence group to swap out.
+            seq_group (SequenceGroup): The sequence group to swap out.
        Returns:
            List[Tuple[int, int]]: The mapping of swapping block from 
@@ -473,7 +471,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
        on to the 'device'.
        Args:
-            sequence_group (SequenceGroup): The sequence group to swap in/out.
+            seq_group (SequenceGroup): The sequence group to swap in/out.
            device (Device): device to swap the 'seq_group' on.
            status (SequenceStatus): The status of sequence which is needed
                for action. RUNNING for swap out and SWAPPED for swap in
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
    _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
    def __init__(self,
-                 *args,
+                 *args: Any,
                 log_requests: bool = True,
                 start_engine_loop: bool = True,
-                 **kwargs) -> None:
+                 **kwargs: Any) -> None:
        if envs.VLLM_USE_V1:
            raise ValueError(
                "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -644,10 +644,10 @@ class LLMEngine:
        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
-            - Create `n` number of [Sequence][vllm.Sequence] objects.
+            - Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
-            - Create a [SequenceGroup][vllm.SequenceGroup] object
+            - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
-              from the list of [Sequence][vllm.Sequence].
+              from the list of [Sequence][vllm.sequence.Sequence].
-            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
+            - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
              scheduler.
        Example:
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -186,7 +186,7 @@ class LLM:
                                           CompilationConfig]] = None,
        logits_processors: Optional[list[Union[str,
                                               type[LogitsProcessor]]]] = None,
-        **kwargs,
+        **kwargs: Any,
    ) -> None:
        """LLM constructor."""
@@ -697,8 +697,8 @@ class LLM:
        Generate responses for a chat conversation.
        The chat conversation is converted into a text prompt using the
-        tokenizer and calls the [generate][] method to generate the
+        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
-        responses.
+        the responses.
        Multi-modal inputs can be passed in the same way you would pass them
        to the OpenAI API.
@@ -1334,8 +1334,8 @@ class LLM:
    def wake_up(self, tags: Optional[list[str]] = None):
        """
-        Wake up the engine from sleep mode. See the [sleep][] method
+        Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
-        for more details.
+        method for more details.
        Args:
            tags: An optional list of tags to reallocate the engine memory
--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser):
                i += 1
        return boundaries
-    def _extract_tool_args(self, tool_content: str, args_match) -> str:
+    def _extract_tool_args(self, tool_content: str,
+                           args_match: re.Match[str]) -> str:
        """
        Extract tool arguments from tool content.
--- a/vllm/model_executor/layers/lightning_attn.py
+++ b/vllm/model_executor/layers/lightning_attn.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 import torch
 from einops import rearrange
@@ -453,7 +455,14 @@ class _attention(torch.autograd.Function):
 lightning_attention_ = _attention.apply
-def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
+def lightning_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    ed: torch.Tensor,
+    block_size: int = 256,
+    kv_history: Optional[torch.Tensor] = None
+) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Apply lightning attention algorithm 
    to compute attention efficiently.
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -233,10 +233,10 @@ class LinearBase(CustomOp):
    Args:
        input_size: input dimension of the linear layer.
        output_size: output dimension of the linear layer.
        bias: If true, add bias.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: Prefix for parameter names.
        return_bias: If true, return bias together with outputs in forward pass.
    """
@@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):
    Args:
        input_size: input dimension of the linear layer.
-        output_size: output dimension of the linear layer.
+        output_sizes: list of output dimensions of the linear layer.
        bias: If true, add bias.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
+        return_bias: If true, return bias together with outputs in forward pass.
    """
    def __init__(
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -409,7 +409,7 @@ class EmbeddingOutput:
    Args:
        embedding: The embedding vector, which is a list of floats.
-        Its length depends on the hidden dimension of the model.
+            Its length depends on the hidden dimension of the model.
    """
    embedding: list[float]
@@ -447,7 +447,7 @@ class ClassificationOutput:
    Args:
        probs: The probability vector, which is a list of floats.
-        Its length depends on the number of classes.
+            Its length depends on the number of classes.
    """
    probs: list[float]
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -147,18 +147,7 @@ class SequenceDataDelta(
 class SequenceData(msgspec.Struct,
                   omit_defaults=True):  # type: ignore[call-arg]
-    """Data associated with a sequence.
+    """Data associated with a sequence."""
-    Args:
-        prompt_token_ids: The token IDs of the prompt.
-        output_token_ids: The token IDs of the output. Set to an empty list if
-            None.
-    Attributes:
-        prompt_token_ids: The token IDs of the prompt.
-        output_token_ids: The token IDs of the output.
-        cumulative_logprob: The cumulative log probability of the output.
-    """
    # NOTE: we cannot use Union[list, array] because msgspec cannot support
    # union of 2 list types.
    _prompt_token_ids: array
@@ -256,10 +245,12 @@ class SequenceData(msgspec.Struct,
    @property
    def cumulative_logprob(self) -> float:
+        """The cumulative log probability of the output."""
        return self._cumulative_logprob
    @property
    def prompt_token_ids(self) -> tuple[int, ...]:
+        """The token IDs of the prompt."""
        return self._prompt_token_ids_tuple
    @prompt_token_ids.setter
@@ -277,6 +268,7 @@ class SequenceData(msgspec.Struct,
    @property
    def output_token_ids(self) -> tuple[int, ...]:
+        """The token IDs of the output."""
        return tuple(self._output_token_ids)
    @output_token_ids.setter
@@ -940,7 +932,7 @@ class SequenceGroupMetadata(
        omit_defaults=True):  # type: ignore[call-arg]
    """Metadata for a sequence group. Used to create `AttentionMetadata`.
-    Args:
+    Attributes:
        request_id: The ID of the request.
        is_prompt: Whether the request is at prompt stage.
        seq_data: The sequence data. (Seq id -> sequence data)
@@ -950,14 +942,14 @@ class SequenceGroupMetadata(
        do_sample: True if sampling is required. Sampling is not required when
            e.g., prefill is chunked, and the current iteration only computes
            query tokens for prefill, we don't need sampling.
-        token_chunk_size: The number of tokens to be processed (per sequence).
+        pooling_params: Pooling parameters.
-            None if chunking is not required.
        lora_request: LoRA request.
        computed_block_nums: The block numbers that are already computed,
            used in prefix caching.
        state: Internal state tied to this sequence group.
        token_type_ids: Token type IDs.
        multi_modal_data: Multi modal data.
-        mm_processor_kwargs: Multimodal input processor / mapper overrides.
+        multi_modal_placeholders: Multi modal placeholders.
        encoder_seq_data: Optional sequence data for encoder prompt
                          (SequenceGroup.encoder_seq). Should be None
                          unless you are working with an encoder/decoder
@@ -1043,12 +1035,13 @@ class SequenceOutput(
        array_like=True):  # type: ignore[call-arg]
    """The model output associated with a sequence.
-    Args:
+    Attributes:
        parent_seq_id: The ID of the parent sequence (for forking in beam
            search).
        output_token: The output token ID.
        logprobs: The logprobs of the output token.
            (Token id -> logP(x_i+1 | x_0, ..., x_i))
+        output_embed: Optional output embedding tensor.
    """
    parent_seq_id: int
    output_token: int