mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-04 17:15:43 +08:00
[Docs] Fix warnings in mkdocs build (#23649)
Signed-off-by: Zerohertz <ohg3417@gmail.com> Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
9b0187003e
commit
730d0ac8b9
@ -805,14 +805,18 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
|
|||||||
"""Forward pass with FlashAttention.
|
"""Forward pass with FlashAttention.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query: shape = [num_tokens, num_heads, head_size]
|
layer: Attention layer instance.
|
||||||
key: shape = [num_tokens, num_kv_heads, head_size]
|
q: Query tensor with shape = [num_tokens, num_heads, head_size]
|
||||||
value: shape = [num_tokens, num_kv_heads, head_size]
|
k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
|
||||||
output: shape = [num_tokens, num_heads, head_size]
|
v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
|
||||||
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
|
kv_cache: KV cache tensor with shape
|
||||||
|
[2, num_blocks, block_size, num_kv_heads, head_size].
|
||||||
NOTE: kv_cache will be an empty tensor with shape [0]
|
NOTE: kv_cache will be an empty tensor with shape [0]
|
||||||
for profiling run.
|
for profiling run.
|
||||||
attn_metadata: Metadata for attention.
|
attn_metadata: Metadata for attention.
|
||||||
|
output: Output tensor with shape [num_tokens, num_heads, head_size]
|
||||||
|
output_scale: Optional output scale tensor.
|
||||||
|
output_block_scale: Optional output block scale tensor.
|
||||||
NOTE: It in-place updates the output tensor.
|
NOTE: It in-place updates the output tensor.
|
||||||
NOTE: FP8 quantization, flash-attn expect the size of
|
NOTE: FP8 quantization, flash-attn expect the size of
|
||||||
{q,k,v}_descale to be (num_sequences, num_kv_heads).
|
{q,k,v}_descale to be (num_sequences, num_kv_heads).
|
||||||
|
|||||||
@ -605,7 +605,8 @@ class FlashAttentionImpl(AttentionImpl):
|
|||||||
key: shape = [num_tokens, num_kv_heads, head_size]
|
key: shape = [num_tokens, num_kv_heads, head_size]
|
||||||
value: shape = [num_tokens, num_kv_heads, head_size]
|
value: shape = [num_tokens, num_kv_heads, head_size]
|
||||||
output: shape = [num_tokens, num_heads, head_size]
|
output: shape = [num_tokens, num_heads, head_size]
|
||||||
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
|
kv_cache: KV cache tensor with shape
|
||||||
|
[2, num_blocks, block_size, num_kv_heads, head_size].
|
||||||
NOTE: kv_cache will be an empty tensor with shape [0]
|
NOTE: kv_cache will be an empty tensor with shape [0]
|
||||||
for profiling run.
|
for profiling run.
|
||||||
attn_metadata: Metadata for attention.
|
attn_metadata: Metadata for attention.
|
||||||
@ -850,7 +851,7 @@ class FlashAttentionImpl(AttentionImpl):
|
|||||||
|
|
||||||
|
|
||||||
def _get_query_key_seq_metadata(
|
def _get_query_key_seq_metadata(
|
||||||
attn_metadata,
|
attn_metadata: FlashAttentionMetadata,
|
||||||
is_prompt: bool,
|
is_prompt: bool,
|
||||||
attn_type: str,
|
attn_type: str,
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
|
|||||||
@ -584,17 +584,18 @@ class ROCmFlashAttentionImpl(AttentionImpl):
|
|||||||
use prefill sequence attributes
|
use prefill sequence attributes
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
layer: Attention layer instance.
|
||||||
query: shape = [num_tokens, num_heads * head_size]
|
query: shape = [num_tokens, num_heads * head_size]
|
||||||
key: shape = [num_tokens, num_kv_heads * head_size]
|
key: shape = [num_tokens, num_kv_heads * head_size]
|
||||||
value: shape = [num_tokens, num_kv_heads * head_size]
|
value: shape = [num_tokens, num_kv_heads * head_size]
|
||||||
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
|
kv_cache: KV cache tensor with shape
|
||||||
|
[2, num_blocks, block_size * num_kv_heads * head_size].
|
||||||
NOTE: kv_cache will be an empty tensor with shape [0]
|
NOTE: kv_cache will be an empty tensor with shape [0]
|
||||||
for profiling run.
|
for profiling run.
|
||||||
attn_metadata: Metadata for attention.
|
attn_metadata: Metadata for attention.
|
||||||
attn_type: Select attention type, between encoder attention,
|
output: Optional output tensor.
|
||||||
decoder self-attention, or encoder/decoder cross-
|
output_scale: Optional output scale tensor.
|
||||||
attention. Defaults to decoder self-attention,
|
output_block_scale: Optional output block scale tensor.
|
||||||
which is the vLLM default generally
|
|
||||||
Returns:
|
Returns:
|
||||||
shape = [num_tokens, num_heads * head_size]
|
shape = [num_tokens, num_heads * head_size]
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(
|
|||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
AssertionError: If the number of encoder tokens in `attn_metadata`
|
AssertionError: If the number of encoder tokens in `attn_metadata`
|
||||||
is `None` when required for the calculations.
|
is `None` when required for the calculations.
|
||||||
"""
|
"""
|
||||||
num_prefill_query_tokens = 0
|
num_prefill_query_tokens = 0
|
||||||
num_decode_query_tokens = 0
|
num_decode_query_tokens = 0
|
||||||
|
|||||||
@ -471,17 +471,18 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
|
|||||||
max_encoder_seq_len)
|
max_encoder_seq_len)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
layer: Attention layer instance.
|
||||||
query: shape = [num_tokens, num_heads * head_size]
|
query: shape = [num_tokens, num_heads * head_size]
|
||||||
key: shape = [num_tokens, num_kv_heads * head_size]
|
key: shape = [num_tokens, num_kv_heads * head_size]
|
||||||
value: shape = [num_tokens, num_kv_heads * head_size]
|
value: shape = [num_tokens, num_kv_heads * head_size]
|
||||||
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
|
kv_cache: KV cache tensor with shape
|
||||||
|
[2, num_blocks, block_size * num_kv_heads * head_size].
|
||||||
NOTE: kv_cache will be an empty tensor with shape [0]
|
NOTE: kv_cache will be an empty tensor with shape [0]
|
||||||
for profiling run.
|
for profiling run.
|
||||||
attn_metadata: Metadata for attention.
|
attn_metadata: Metadata for attention.
|
||||||
attn_type: Select attention type, between encoder attention,
|
output: Optional output tensor.
|
||||||
decoder self-attention, or encoder/decoder cross-
|
output_scale: Optional output scale tensor.
|
||||||
attention. Defaults to decoder self-attention,
|
output_block_scale: Optional output block scale tensor.
|
||||||
which is the vLLM default generally
|
|
||||||
Returns:
|
Returns:
|
||||||
shape = [num_tokens, num_heads * head_size]
|
shape = [num_tokens, num_heads * head_size]
|
||||||
"""
|
"""
|
||||||
@ -644,7 +645,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
|
|||||||
for API spec.
|
for API spec.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
output: shape = [num_prefill_tokens, num_heads, head_size]
|
|
||||||
query: shape = [num_prefill_tokens, num_heads, head_size]
|
query: shape = [num_prefill_tokens, num_heads, head_size]
|
||||||
key: shape = [num_prefill_tokens, num_kv_heads, head_size]
|
key: shape = [num_prefill_tokens, num_kv_heads, head_size]
|
||||||
value: shape = [num_prefill_tokens, num_kv_heads, head_size]
|
value: shape = [num_prefill_tokens, num_kv_heads, head_size]
|
||||||
|
|||||||
@ -352,7 +352,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
|
|||||||
with num_lookahead_slots.
|
with num_lookahead_slots.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sequence_group (SequenceGroup): The sequence group to swap in.
|
seq_group (SequenceGroup): The sequence group to swap in.
|
||||||
num_lookahead_slots (int): Number of lookahead slots used in
|
num_lookahead_slots (int): Number of lookahead slots used in
|
||||||
speculative decoding, default to 0.
|
speculative decoding, default to 0.
|
||||||
|
|
||||||
@ -405,8 +405,6 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
seq_group (SequenceGroup): The sequence group to swap out.
|
seq_group (SequenceGroup): The sequence group to swap out.
|
||||||
num_lookahead_slots (int): Number of lookahead slots used in
|
|
||||||
speculative decoding, default to 0.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bool: Whether it's possible to swap out current sequence group.
|
bool: Whether it's possible to swap out current sequence group.
|
||||||
@ -420,7 +418,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
|
|||||||
swapping out the given sequence_group with num_lookahead_slots.
|
swapping out the given sequence_group with num_lookahead_slots.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sequence_group (SequenceGroup): The sequence group to swap out.
|
seq_group (SequenceGroup): The sequence group to swap out.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Tuple[int, int]]: The mapping of swapping block from
|
List[Tuple[int, int]]: The mapping of swapping block from
|
||||||
@ -473,7 +471,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
|
|||||||
on to the 'device'.
|
on to the 'device'.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sequence_group (SequenceGroup): The sequence group to swap in/out.
|
seq_group (SequenceGroup): The sequence group to swap in/out.
|
||||||
device (Device): device to swap the 'seq_group' on.
|
device (Device): device to swap the 'seq_group' on.
|
||||||
status (SequenceStatus): The status of sequence which is needed
|
status (SequenceStatus): The status of sequence which is needed
|
||||||
for action. RUNNING for swap out and SWAPPED for swap in
|
for action. RUNNING for swap out and SWAPPED for swap in
|
||||||
|
|||||||
@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
|
|||||||
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
|
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
*args,
|
*args: Any,
|
||||||
log_requests: bool = True,
|
log_requests: bool = True,
|
||||||
start_engine_loop: bool = True,
|
start_engine_loop: bool = True,
|
||||||
**kwargs) -> None:
|
**kwargs: Any) -> None:
|
||||||
if envs.VLLM_USE_V1:
|
if envs.VLLM_USE_V1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
|
"Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
|
||||||
|
|||||||
@ -644,10 +644,10 @@ class LLMEngine:
|
|||||||
Details:
|
Details:
|
||||||
- Set arrival_time to the current time if it is None.
|
- Set arrival_time to the current time if it is None.
|
||||||
- Set prompt_token_ids to the encoded prompt if it is None.
|
- Set prompt_token_ids to the encoded prompt if it is None.
|
||||||
- Create `n` number of [Sequence][vllm.Sequence] objects.
|
- Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
|
||||||
- Create a [SequenceGroup][vllm.SequenceGroup] object
|
- Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
|
||||||
from the list of [Sequence][vllm.Sequence].
|
from the list of [Sequence][vllm.sequence.Sequence].
|
||||||
- Add the [SequenceGroup][vllm.SequenceGroup] object to the
|
- Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
|
||||||
scheduler.
|
scheduler.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|||||||
@ -186,7 +186,7 @@ class LLM:
|
|||||||
CompilationConfig]] = None,
|
CompilationConfig]] = None,
|
||||||
logits_processors: Optional[list[Union[str,
|
logits_processors: Optional[list[Union[str,
|
||||||
type[LogitsProcessor]]]] = None,
|
type[LogitsProcessor]]]] = None,
|
||||||
**kwargs,
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""LLM constructor."""
|
"""LLM constructor."""
|
||||||
|
|
||||||
@ -697,8 +697,8 @@ class LLM:
|
|||||||
Generate responses for a chat conversation.
|
Generate responses for a chat conversation.
|
||||||
|
|
||||||
The chat conversation is converted into a text prompt using the
|
The chat conversation is converted into a text prompt using the
|
||||||
tokenizer and calls the [generate][] method to generate the
|
tokenizer and calls the [generate][vllm.LLM.generate] method to generate
|
||||||
responses.
|
the responses.
|
||||||
|
|
||||||
Multi-modal inputs can be passed in the same way you would pass them
|
Multi-modal inputs can be passed in the same way you would pass them
|
||||||
to the OpenAI API.
|
to the OpenAI API.
|
||||||
@ -1334,8 +1334,8 @@ class LLM:
|
|||||||
|
|
||||||
def wake_up(self, tags: Optional[list[str]] = None):
|
def wake_up(self, tags: Optional[list[str]] = None):
|
||||||
"""
|
"""
|
||||||
Wake up the engine from sleep mode. See the [sleep][] method
|
Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
|
||||||
for more details.
|
method for more details.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
tags: An optional list of tags to reallocate the engine memory
|
tags: An optional list of tags to reallocate the engine memory
|
||||||
|
|||||||
@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser):
|
|||||||
i += 1
|
i += 1
|
||||||
return boundaries
|
return boundaries
|
||||||
|
|
||||||
def _extract_tool_args(self, tool_content: str, args_match) -> str:
|
def _extract_tool_args(self, tool_content: str,
|
||||||
|
args_match: re.Match[str]) -> str:
|
||||||
"""
|
"""
|
||||||
Extract tool arguments from tool content.
|
Extract tool arguments from tool content.
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
|
|
||||||
@ -453,7 +455,14 @@ class _attention(torch.autograd.Function):
|
|||||||
lightning_attention_ = _attention.apply
|
lightning_attention_ = _attention.apply
|
||||||
|
|
||||||
|
|
||||||
def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
|
def lightning_attention(
|
||||||
|
q: torch.Tensor,
|
||||||
|
k: torch.Tensor,
|
||||||
|
v: torch.Tensor,
|
||||||
|
ed: torch.Tensor,
|
||||||
|
block_size: int = 256,
|
||||||
|
kv_history: Optional[torch.Tensor] = None
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
"""
|
"""
|
||||||
Apply lightning attention algorithm
|
Apply lightning attention algorithm
|
||||||
to compute attention efficiently.
|
to compute attention efficiently.
|
||||||
|
|||||||
@ -233,10 +233,10 @@ class LinearBase(CustomOp):
|
|||||||
Args:
|
Args:
|
||||||
input_size: input dimension of the linear layer.
|
input_size: input dimension of the linear layer.
|
||||||
output_size: output dimension of the linear layer.
|
output_size: output dimension of the linear layer.
|
||||||
bias: If true, add bias.
|
|
||||||
skip_bias_add: If true, skip adding bias but instead return it.
|
skip_bias_add: If true, skip adding bias but instead return it.
|
||||||
params_dtype: Data type for the parameters.
|
params_dtype: Data type for the parameters.
|
||||||
quant_config: Quantization configure.
|
quant_config: Quantization configure.
|
||||||
|
prefix: Prefix for parameter names.
|
||||||
return_bias: If true, return bias together with outputs in forward pass.
|
return_bias: If true, return bias together with outputs in forward pass.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
input_size: input dimension of the linear layer.
|
input_size: input dimension of the linear layer.
|
||||||
output_size: output dimension of the linear layer.
|
output_sizes: list of output dimensions of the linear layer.
|
||||||
bias: If true, add bias.
|
bias: If true, add bias.
|
||||||
skip_bias_add: If true, skip adding bias but instead return it.
|
skip_bias_add: If true, skip adding bias but instead return it.
|
||||||
params_dtype: Data type for the parameters.
|
params_dtype: Data type for the parameters.
|
||||||
quant_config: Quantization configure.
|
quant_config: Quantization configure.
|
||||||
prefix: The name of the layer in the state dict, including all parents
|
prefix: The name of the layer in the state dict, including all parents
|
||||||
(e.g. model.layers.0.qkv_proj)
|
(e.g. model.layers.0.qkv_proj)
|
||||||
|
return_bias: If true, return bias together with outputs in forward pass.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@ -409,7 +409,7 @@ class EmbeddingOutput:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
embedding: The embedding vector, which is a list of floats.
|
embedding: The embedding vector, which is a list of floats.
|
||||||
Its length depends on the hidden dimension of the model.
|
Its length depends on the hidden dimension of the model.
|
||||||
"""
|
"""
|
||||||
embedding: list[float]
|
embedding: list[float]
|
||||||
|
|
||||||
@ -447,7 +447,7 @@ class ClassificationOutput:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
probs: The probability vector, which is a list of floats.
|
probs: The probability vector, which is a list of floats.
|
||||||
Its length depends on the number of classes.
|
Its length depends on the number of classes.
|
||||||
"""
|
"""
|
||||||
probs: list[float]
|
probs: list[float]
|
||||||
|
|
||||||
|
|||||||
@ -147,18 +147,7 @@ class SequenceDataDelta(
|
|||||||
|
|
||||||
class SequenceData(msgspec.Struct,
|
class SequenceData(msgspec.Struct,
|
||||||
omit_defaults=True): # type: ignore[call-arg]
|
omit_defaults=True): # type: ignore[call-arg]
|
||||||
"""Data associated with a sequence.
|
"""Data associated with a sequence."""
|
||||||
|
|
||||||
Args:
|
|
||||||
prompt_token_ids: The token IDs of the prompt.
|
|
||||||
output_token_ids: The token IDs of the output. Set to an empty list if
|
|
||||||
None.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
prompt_token_ids: The token IDs of the prompt.
|
|
||||||
output_token_ids: The token IDs of the output.
|
|
||||||
cumulative_logprob: The cumulative log probability of the output.
|
|
||||||
"""
|
|
||||||
# NOTE: we cannot use Union[list, array] because msgspec cannot support
|
# NOTE: we cannot use Union[list, array] because msgspec cannot support
|
||||||
# union of 2 list types.
|
# union of 2 list types.
|
||||||
_prompt_token_ids: array
|
_prompt_token_ids: array
|
||||||
@ -256,10 +245,12 @@ class SequenceData(msgspec.Struct,
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def cumulative_logprob(self) -> float:
|
def cumulative_logprob(self) -> float:
|
||||||
|
"""The cumulative log probability of the output."""
|
||||||
return self._cumulative_logprob
|
return self._cumulative_logprob
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def prompt_token_ids(self) -> tuple[int, ...]:
|
def prompt_token_ids(self) -> tuple[int, ...]:
|
||||||
|
"""The token IDs of the prompt."""
|
||||||
return self._prompt_token_ids_tuple
|
return self._prompt_token_ids_tuple
|
||||||
|
|
||||||
@prompt_token_ids.setter
|
@prompt_token_ids.setter
|
||||||
@ -277,6 +268,7 @@ class SequenceData(msgspec.Struct,
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def output_token_ids(self) -> tuple[int, ...]:
|
def output_token_ids(self) -> tuple[int, ...]:
|
||||||
|
"""The token IDs of the output."""
|
||||||
return tuple(self._output_token_ids)
|
return tuple(self._output_token_ids)
|
||||||
|
|
||||||
@output_token_ids.setter
|
@output_token_ids.setter
|
||||||
@ -940,7 +932,7 @@ class SequenceGroupMetadata(
|
|||||||
omit_defaults=True): # type: ignore[call-arg]
|
omit_defaults=True): # type: ignore[call-arg]
|
||||||
"""Metadata for a sequence group. Used to create `AttentionMetadata`.
|
"""Metadata for a sequence group. Used to create `AttentionMetadata`.
|
||||||
|
|
||||||
Args:
|
Attributes:
|
||||||
request_id: The ID of the request.
|
request_id: The ID of the request.
|
||||||
is_prompt: Whether the request is at prompt stage.
|
is_prompt: Whether the request is at prompt stage.
|
||||||
seq_data: The sequence data. (Seq id -> sequence data)
|
seq_data: The sequence data. (Seq id -> sequence data)
|
||||||
@ -950,14 +942,14 @@ class SequenceGroupMetadata(
|
|||||||
do_sample: True if sampling is required. Sampling is not required when
|
do_sample: True if sampling is required. Sampling is not required when
|
||||||
e.g., prefill is chunked, and the current iteration only computes
|
e.g., prefill is chunked, and the current iteration only computes
|
||||||
query tokens for prefill, we don't need sampling.
|
query tokens for prefill, we don't need sampling.
|
||||||
token_chunk_size: The number of tokens to be processed (per sequence).
|
pooling_params: Pooling parameters.
|
||||||
None if chunking is not required.
|
|
||||||
lora_request: LoRA request.
|
lora_request: LoRA request.
|
||||||
computed_block_nums: The block numbers that are already computed,
|
computed_block_nums: The block numbers that are already computed,
|
||||||
used in prefix caching.
|
used in prefix caching.
|
||||||
state: Internal state tied to this sequence group.
|
state: Internal state tied to this sequence group.
|
||||||
|
token_type_ids: Token type IDs.
|
||||||
multi_modal_data: Multi modal data.
|
multi_modal_data: Multi modal data.
|
||||||
mm_processor_kwargs: Multimodal input processor / mapper overrides.
|
multi_modal_placeholders: Multi modal placeholders.
|
||||||
encoder_seq_data: Optional sequence data for encoder prompt
|
encoder_seq_data: Optional sequence data for encoder prompt
|
||||||
(SequenceGroup.encoder_seq). Should be None
|
(SequenceGroup.encoder_seq). Should be None
|
||||||
unless you are working with an encoder/decoder
|
unless you are working with an encoder/decoder
|
||||||
@ -1043,12 +1035,13 @@ class SequenceOutput(
|
|||||||
array_like=True): # type: ignore[call-arg]
|
array_like=True): # type: ignore[call-arg]
|
||||||
"""The model output associated with a sequence.
|
"""The model output associated with a sequence.
|
||||||
|
|
||||||
Args:
|
Attributes:
|
||||||
parent_seq_id: The ID of the parent sequence (for forking in beam
|
parent_seq_id: The ID of the parent sequence (for forking in beam
|
||||||
search).
|
search).
|
||||||
output_token: The output token ID.
|
output_token: The output token ID.
|
||||||
logprobs: The logprobs of the output token.
|
logprobs: The logprobs of the output token.
|
||||||
(Token id -> logP(x_i+1 | x_0, ..., x_i))
|
(Token id -> logP(x_i+1 | x_0, ..., x_i))
|
||||||
|
output_embed: Optional output embedding tensor.
|
||||||
"""
|
"""
|
||||||
parent_seq_id: int
|
parent_seq_id: int
|
||||||
output_token: int
|
output_token: int
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user