[Docs] Fix warnings in mkdocs build (#23649)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Hyogeun Oh (오효근) 2025-08-27 03:19:23 +09:00 committed by GitHub
parent 9b0187003e
commit 730d0ac8b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 66 additions and 58 deletions

View File

@ -805,14 +805,18 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
"""Forward pass with FlashAttention. """Forward pass with FlashAttention.
Args: Args:
query: shape = [num_tokens, num_heads, head_size] layer: Attention layer instance.
key: shape = [num_tokens, num_kv_heads, head_size] q: Query tensor with shape = [num_tokens, num_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size] k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size] v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0] NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run. for profiling run.
attn_metadata: Metadata for attention. attn_metadata: Metadata for attention.
output: Output tensor with shape [num_tokens, num_heads, head_size]
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
NOTE: It in-place updates the output tensor. NOTE: It in-place updates the output tensor.
NOTE: For FP8 quantization, flash-attn expects the size of NOTE: For FP8 quantization, flash-attn expects the size of
{q,k,v}_descale to be (num_sequences, num_kv_heads). {q,k,v}_descale to be (num_sequences, num_kv_heads).

View File

@ -605,7 +605,8 @@ class FlashAttentionImpl(AttentionImpl):
key: shape = [num_tokens, num_kv_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size] output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0] NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run. for profiling run.
attn_metadata: Metadata for attention. attn_metadata: Metadata for attention.
@ -850,7 +851,7 @@ class FlashAttentionImpl(AttentionImpl):
def _get_query_key_seq_metadata( def _get_query_key_seq_metadata(
attn_metadata, attn_metadata: FlashAttentionMetadata,
is_prompt: bool, is_prompt: bool,
attn_type: str, attn_type: str,
) -> tuple: ) -> tuple:

View File

@ -584,17 +584,18 @@ class ROCmFlashAttentionImpl(AttentionImpl):
use prefill sequence attributes use prefill sequence attributes
Args: Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size] query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0] NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run. for profiling run.
attn_metadata: Metadata for attention. attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention, output: Optional output tensor.
decoder self-attention, or encoder/decoder cross- output_scale: Optional output scale tensor.
attention. Defaults to decoder self-attention, output_block_scale: Optional output block scale tensor.
which is the vLLM default generally
Returns: Returns:
shape = [num_tokens, num_heads * head_size] shape = [num_tokens, num_heads * head_size]
""" """

View File

@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(
Raises: Raises:
AssertionError: If the number of encoder tokens in `attn_metadata` AssertionError: If the number of encoder tokens in `attn_metadata`
is `None` when required for the calculations. is `None` when required for the calculations.
""" """
num_prefill_query_tokens = 0 num_prefill_query_tokens = 0
num_decode_query_tokens = 0 num_decode_query_tokens = 0

View File

@ -471,17 +471,18 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
max_encoder_seq_len) max_encoder_seq_len)
Args: Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size] query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0] NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run. for profiling run.
attn_metadata: Metadata for attention. attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention, output: Optional output tensor.
decoder self-attention, or encoder/decoder cross- output_scale: Optional output scale tensor.
attention. Defaults to decoder self-attention, output_block_scale: Optional output block scale tensor.
which is the vLLM default generally
Returns: Returns:
shape = [num_tokens, num_heads * head_size] shape = [num_tokens, num_heads * head_size]
""" """
@ -644,7 +645,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
for API spec. for API spec.
Args: Args:
output: shape = [num_prefill_tokens, num_heads, head_size]
query: shape = [num_prefill_tokens, num_heads, head_size] query: shape = [num_prefill_tokens, num_heads, head_size]
key: shape = [num_prefill_tokens, num_kv_heads, head_size] key: shape = [num_prefill_tokens, num_kv_heads, head_size]
value: shape = [num_prefill_tokens, num_kv_heads, head_size] value: shape = [num_prefill_tokens, num_kv_heads, head_size]

View File

@ -352,7 +352,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
with num_lookahead_slots. with num_lookahead_slots.
Args: Args:
sequence_group (SequenceGroup): The sequence group to swap in. seq_group (SequenceGroup): The sequence group to swap in.
num_lookahead_slots (int): Number of lookahead slots used in num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, defaults to 0. speculative decoding, defaults to 0.
@ -405,8 +405,6 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
Args: Args:
seq_group (SequenceGroup): The sequence group to swap out. seq_group (SequenceGroup): The sequence group to swap out.
num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, default to 0.
Returns: Returns:
bool: Whether it's possible to swap out current sequence group. bool: Whether it's possible to swap out current sequence group.
@ -420,7 +418,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
swapping out the given sequence_group with num_lookahead_slots. swapping out the given sequence_group with num_lookahead_slots.
Args: Args:
sequence_group (SequenceGroup): The sequence group to swap out. seq_group (SequenceGroup): The sequence group to swap out.
Returns: Returns:
List[Tuple[int, int]]: The mapping of swapping block from List[Tuple[int, int]]: The mapping of swapping block from
@ -473,7 +471,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
on to the 'device'. on to the 'device'.
Args: Args:
sequence_group (SequenceGroup): The sequence group to swap in/out. seq_group (SequenceGroup): The sequence group to swap in/out.
device (Device): device to swap the 'seq_group' on. device (Device): device to swap the 'seq_group' on.
status (SequenceStatus): The status of sequence which is needed status (SequenceStatus): The status of sequence which is needed
for action. RUNNING for swap out and SWAPPED for swap in for action. RUNNING for swap out and SWAPPED for swap in

View File

@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
def __init__(self, def __init__(self,
*args, *args: Any,
log_requests: bool = True, log_requests: bool = True,
start_engine_loop: bool = True, start_engine_loop: bool = True,
**kwargs) -> None: **kwargs: Any) -> None:
if envs.VLLM_USE_V1: if envs.VLLM_USE_V1:
raise ValueError( raise ValueError(
"Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "

View File

@ -644,10 +644,10 @@ class LLMEngine:
Details: Details:
- Set arrival_time to the current time if it is None. - Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None. - Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of [Sequence][vllm.Sequence] objects. - Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
- Create a [SequenceGroup][vllm.SequenceGroup] object - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
from the list of [Sequence][vllm.Sequence]. from the list of [Sequence][vllm.sequence.Sequence].
- Add the [SequenceGroup][vllm.SequenceGroup] object to the - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
scheduler. scheduler.
Example: Example:

View File

@ -186,7 +186,7 @@ class LLM:
CompilationConfig]] = None, CompilationConfig]] = None,
logits_processors: Optional[list[Union[str, logits_processors: Optional[list[Union[str,
type[LogitsProcessor]]]] = None, type[LogitsProcessor]]]] = None,
**kwargs, **kwargs: Any,
) -> None: ) -> None:
"""LLM constructor.""" """LLM constructor."""
@ -697,8 +697,8 @@ class LLM:
Generate responses for a chat conversation. Generate responses for a chat conversation.
The chat conversation is converted into a text prompt using the The chat conversation is converted into a text prompt using the
tokenizer and calls the [generate][] method to generate the tokenizer and calls the [generate][vllm.LLM.generate] method to generate
responses. the responses.
Multi-modal inputs can be passed in the same way you would pass them Multi-modal inputs can be passed in the same way you would pass them
to the OpenAI API. to the OpenAI API.
@ -1334,8 +1334,8 @@ class LLM:
def wake_up(self, tags: Optional[list[str]] = None): def wake_up(self, tags: Optional[list[str]] = None):
""" """
Wake up the engine from sleep mode. See the [sleep][] method Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
for more details. method for more details.
Args: Args:
tags: An optional list of tags to reallocate the engine memory tags: An optional list of tags to reallocate the engine memory

View File

@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser):
i += 1 i += 1
return boundaries return boundaries
def _extract_tool_args(self, tool_content: str, args_match) -> str: def _extract_tool_args(self, tool_content: str,
args_match: re.Match[str]) -> str:
""" """
Extract tool arguments from tool content. Extract tool arguments from tool content.

View File

@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import torch import torch
from einops import rearrange from einops import rearrange
@ -453,7 +455,14 @@ class _attention(torch.autograd.Function):
lightning_attention_ = _attention.apply lightning_attention_ = _attention.apply
def lightning_attention(q, k, v, ed, block_size=256, kv_history=None): def lightning_attention(
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
ed: torch.Tensor,
block_size: int = 256,
kv_history: Optional[torch.Tensor] = None
) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Apply lightning attention algorithm Apply lightning attention algorithm
to compute attention efficiently. to compute attention efficiently.

View File

@ -233,10 +233,10 @@ class LinearBase(CustomOp):
Args: Args:
input_size: input dimension of the linear layer. input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer. output_size: output dimension of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it. skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters. params_dtype: Data type for the parameters.
quant_config: Quantization configuration. quant_config: Quantization configuration.
prefix: Prefix for parameter names.
return_bias: If true, return bias together with outputs in forward pass. return_bias: If true, return bias together with outputs in forward pass.
""" """
@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):
Args: Args:
input_size: input dimension of the linear layer. input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer. output_sizes: list of output dimensions of the linear layer.
bias: If true, add bias. bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it. skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters. params_dtype: Data type for the parameters.
quant_config: Quantization configuration. quant_config: Quantization configuration.
prefix: The name of the layer in the state dict, including all parents prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj) (e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
""" """
def __init__( def __init__(

View File

@ -409,7 +409,7 @@ class EmbeddingOutput:
Args: Args:
embedding: The embedding vector, which is a list of floats. embedding: The embedding vector, which is a list of floats.
Its length depends on the hidden dimension of the model. Its length depends on the hidden dimension of the model.
""" """
embedding: list[float] embedding: list[float]
@ -447,7 +447,7 @@ class ClassificationOutput:
Args: Args:
probs: The probability vector, which is a list of floats. probs: The probability vector, which is a list of floats.
Its length depends on the number of classes. Its length depends on the number of classes.
""" """
probs: list[float] probs: list[float]

View File

@ -147,18 +147,7 @@ class SequenceDataDelta(
class SequenceData(msgspec.Struct, class SequenceData(msgspec.Struct,
omit_defaults=True): # type: ignore[call-arg] omit_defaults=True): # type: ignore[call-arg]
"""Data associated with a sequence. """Data associated with a sequence."""
Args:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output. Set to an empty list if
None.
Attributes:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output.
cumulative_logprob: The cumulative log probability of the output.
"""
# NOTE: we cannot use Union[list, array] because msgspec cannot support # NOTE: we cannot use Union[list, array] because msgspec cannot support
# union of 2 list types. # union of 2 list types.
_prompt_token_ids: array _prompt_token_ids: array
@ -256,10 +245,12 @@ class SequenceData(msgspec.Struct,
@property @property
def cumulative_logprob(self) -> float: def cumulative_logprob(self) -> float:
"""The cumulative log probability of the output."""
return self._cumulative_logprob return self._cumulative_logprob
@property @property
def prompt_token_ids(self) -> tuple[int, ...]: def prompt_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the prompt."""
return self._prompt_token_ids_tuple return self._prompt_token_ids_tuple
@prompt_token_ids.setter @prompt_token_ids.setter
@ -277,6 +268,7 @@ class SequenceData(msgspec.Struct,
@property @property
def output_token_ids(self) -> tuple[int, ...]: def output_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the output."""
return tuple(self._output_token_ids) return tuple(self._output_token_ids)
@output_token_ids.setter @output_token_ids.setter
@ -940,7 +932,7 @@ class SequenceGroupMetadata(
omit_defaults=True): # type: ignore[call-arg] omit_defaults=True): # type: ignore[call-arg]
"""Metadata for a sequence group. Used to create `AttentionMetadata`. """Metadata for a sequence group. Used to create `AttentionMetadata`.
Args: Attributes:
request_id: The ID of the request. request_id: The ID of the request.
is_prompt: Whether the request is at prompt stage. is_prompt: Whether the request is at prompt stage.
seq_data: The sequence data. (Seq id -> sequence data) seq_data: The sequence data. (Seq id -> sequence data)
@ -950,14 +942,14 @@ class SequenceGroupMetadata(
do_sample: True if sampling is required. Sampling is not required when do_sample: True if sampling is required. Sampling is not required when
e.g., prefill is chunked, and the current iteration only computes e.g., prefill is chunked, and the current iteration only computes
query tokens for prefill, we don't need sampling. query tokens for prefill, we don't need sampling.
token_chunk_size: The number of tokens to be processed (per sequence). pooling_params: Pooling parameters.
None if chunking is not required.
lora_request: LoRA request. lora_request: LoRA request.
computed_block_nums: The block numbers that are already computed, computed_block_nums: The block numbers that are already computed,
used in prefix caching. used in prefix caching.
state: Internal state tied to this sequence group. state: Internal state tied to this sequence group.
token_type_ids: Token type IDs.
multi_modal_data: Multi modal data. multi_modal_data: Multi modal data.
mm_processor_kwargs: Multimodal input processor / mapper overrides. multi_modal_placeholders: Multi modal placeholders.
encoder_seq_data: Optional sequence data for encoder prompt encoder_seq_data: Optional sequence data for encoder prompt
(SequenceGroup.encoder_seq). Should be None (SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder unless you are working with an encoder/decoder
@ -1043,12 +1035,13 @@ class SequenceOutput(
array_like=True): # type: ignore[call-arg] array_like=True): # type: ignore[call-arg]
"""The model output associated with a sequence. """The model output associated with a sequence.
Args: Attributes:
parent_seq_id: The ID of the parent sequence (for forking in beam parent_seq_id: The ID of the parent sequence (for forking in beam
search). search).
output_token: The output token ID. output_token: The output token ID.
logprobs: The logprobs of the output token. logprobs: The logprobs of the output token.
(Token id -> logP(x_i+1 | x_0, ..., x_i)) (Token id -> logP(x_i+1 | x_0, ..., x_i))
output_embed: Optional output embedding tensor.
""" """
parent_seq_id: int parent_seq_id: int
output_token: int output_token: int