[Doc] Fix top-level API links/docs (#18621)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-05-24 00:46:56 +08:00 committed by GitHub
parent 8ddd1cf26a
commit 273cb3b4d9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 88 additions and 103 deletions

View File

@ -129,16 +129,17 @@ class BenchmarkDataset(ABC):
Args: Args:
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
LoRA is selected. max_loras (Optional[int]): The maximum number of LoRA is selected.
LoRAs available. If None, LoRA is not used. lora_path max_loras (Optional[int]): The maximum number of LoRAs available.
(Optional[str]): Path to the LoRA parameters on disk. If None, LoRA If `None`, LoRA is not used.
is not used. lora_path (Optional[str]): Path to the LoRA parameters on disk.
If `None`, LoRA is not used.
Returns: Returns:
tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first A tuple with the following elements:
element is a LoRARequest (or None if not applicable) and the second - A new [LoRARequest][] (or `None` if not applicable).
element is the tokenizer associated with the LoRA request (or the - The tokenizer associated with the LoRA request
base tokenizer). (or the base tokenizer).
""" """
if max_loras is None or lora_path is None: if max_loras is None or lora_path is None:
return None, tokenizer return None, tokenizer
@ -167,7 +168,7 @@ class BenchmarkDataset(ABC):
Args: Args:
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text. for processing the dataset's text.
num_requests (int): The number of sample requests to generate. num_requests (int): The number of sample requests to generate.
Returns: Returns:
@ -184,7 +185,8 @@ class BenchmarkDataset(ABC):
Args: Args:
requests (List[SampleRequest]): The current list of sampled requests (List[SampleRequest]): The current list of sampled
requests. num_requests (int): The target number of requests. requests.
num_requests (int): The target number of requests.
""" """
if len(requests) < num_requests: if len(requests) < num_requests:
random.seed(self.random_seed) random.seed(self.random_seed)

View File

@ -4552,7 +4552,7 @@ def contains_object_print(text):
text (str): The text to check text (str): The text to check
Returns: Returns:
bool: True if a match is found, False otherwise result (bool): `True` if a match is found, `False` otherwise.
""" """
pattern = r'at 0x[a-fA-F0-9]{2,16}>' pattern = r'at 0x[a-fA-F0-9]{2,16}>'
match = re.search(pattern, text) match = re.search(pattern, text)

View File

@ -210,10 +210,11 @@ class KVConnectorBase_V1(ABC):
computed tokens for this request computed tokens for this request
Returns: Returns:
* the number of tokens that can be loaded from the A tuple with the following elements:
external KV cache beyond what is already computed. - The number of tokens that can be loaded from the
* true if external KV cache tokens will be loaded external KV cache beyond what is already computed.
asynchronously (between scheduler steps). - `True` if external KV cache tokens will be loaded
asynchronously (between scheduler steps).
""" """
pass pass

View File

@ -118,11 +118,11 @@ class PyNcclPipe(KVPipeBase):
""" """
Create the metadata as a dictionary based on the input tensor. Create the metadata as a dictionary based on the input tensor.
Parameters: Args:
- tensor: The input tensor or None if no tensor is provided. tensor: The input tensor or None if no tensor is provided.
Returns: Returns:
- metadata: A dictionary with the following keys: metadata: A dictionary with the following keys:
- "dtype": The data type of the tensor or None. - "dtype": The data type of the tensor or None.
- "shape": The shape of the tensor or None. - "shape": The shape of the tensor or None.
""" """
@ -135,13 +135,13 @@ class PyNcclPipe(KVPipeBase):
""" """
Create a buffer to receive the tensor based on the provided metadata. Create a buffer to receive the tensor based on the provided metadata.
Parameters: Args:
- metadata: A dictionary with keys "dtype" and "shape", describing metadata: A dictionary with keys "dtype" and "shape",
the tensor's data type and shape. describing the tensor's data type and shape.
Returns: Returns:
- buffer: A tensor of the specified type and shape, allocated on buffer: A tensor of the specified type and shape,
self.device. allocated on `self.device`.
""" """
return torch.empty(metadata["shape"], return torch.empty(metadata["shape"],
dtype=metadata["dtype"], dtype=metadata["dtype"],
@ -151,8 +151,8 @@ class PyNcclPipe(KVPipeBase):
""" """
Send the metadata dictionary to the target rank. Send the metadata dictionary to the target rank.
Parameters: Args:
- metadata: A dictionary with keys "dtype" and "shape". metadata: A dictionary with keys "dtype" and "shape".
""" """
self.group.send_obj(metadata, self.target_rank_for_send) self.group.send_obj(metadata, self.target_rank_for_send)
@ -161,8 +161,8 @@ class PyNcclPipe(KVPipeBase):
Receive the metadata dictionary from the target rank. Receive the metadata dictionary from the target rank.
Returns: Returns:
- metadata: A dictionary with keys "dtype" and "shape" describing metadata: A dictionary with keys "dtype" and "shape"
the tensor. describing the tensor.
""" """
return self.group.recv_obj(self.target_rank_for_recv) return self.group.recv_obj(self.target_rank_for_recv)
@ -171,9 +171,9 @@ class PyNcclPipe(KVPipeBase):
The actual implementation of sending the tensor and its metadata to the The actual implementation of sending the tensor and its metadata to the
target rank. target rank.
Parameters: Args:
- tensor: The input tensor to be sent, or None if no tensor is tensor: The input tensor to be sent, or `None` if no tensor is
being sent. being sent.
""" """
metadata = self._make_metadata(tensor) metadata = self._make_metadata(tensor)
self._send_metadata(metadata) self._send_metadata(metadata)
@ -187,7 +187,7 @@ class PyNcclPipe(KVPipeBase):
the target rank. the target rank.
Returns: Returns:
- buffer: The received tensor, or None if no tensor is received. buffer: The received tensor, or `None` if no tensor is received.
""" """
metadata = self._recv_metadata() metadata = self._recv_metadata()
if metadata["dtype"] is None: if metadata["dtype"] is None:
@ -227,8 +227,8 @@ class PyNcclPipe(KVPipeBase):
Sends a tensor and its metadata to the destination rank in a Sends a tensor and its metadata to the destination rank in a
non-blocking way. non-blocking way.
Parameters: Args:
- tensor: The tensor to send, or None if no tensor is being sent. tensor: The tensor to send, or `None` if no tensor is being sent.
""" """
if self.transport_thread is None: if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1) self.transport_thread = ThreadPoolExecutor(max_workers=1)
@ -250,8 +250,8 @@ class PyNcclPipe(KVPipeBase):
""" """
Receives a tensor and its metadata from the source rank. Blocking call. Receives a tensor and its metadata from the source rank. Blocking call.
Returns: Args:
- tensor: The received tensor, or None if no tensor is received. tensor: The received tensor, or `None` if no tensor is received.
""" """
if self.transport_thread is None: if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1) self.transport_thread = ThreadPoolExecutor(max_workers=1)

View File

@ -130,26 +130,16 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the iteration-level scheduling and efficient memory management to maximize the
serving throughput. serving throughput.
The {class}`~vllm.LLM` class wraps this class for offline batched inference The [LLM][vllm.LLM] class wraps this class for offline batched inference
and the {class}`AsyncLLMEngine` class wraps this class for online serving. and the [AsyncLLMEngine][] class wraps this class for online serving.
The config arguments are derived from {class}`~vllm.EngineArgs`. (See The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See
{ref}`engine-args`) [engine-args][])
Args: Args:
model_config: The configuration related to the LLM model. vllm_config: The configuration for initializing and running vLLM.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
device_config: The configuration related to the device.
lora_config (Optional): The configuration related to serving multi-LoRA.
speculative_config (Optional): The configuration related to speculative
decoding.
executor_class: The model executor class for managing distributed executor_class: The model executor class for managing distributed
execution. execution.
prompt_adapter_config (Optional): The configuration related to serving
prompt adapters.
log_stats: Whether to log statistics. log_stats: Whether to log statistics.
usage_context: Specified entry point, used for usage info collection. usage_context: Specified entry point, used for usage info collection.
""" """
@ -695,11 +685,12 @@ class LLMEngine:
Args: Args:
request_id: The unique ID of the request. request_id: The unique ID of the request.
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
[PromptType][vllm.inputs.PromptType]
for more details about the format of each input. for more details about the format of each input.
params: Parameters for sampling or pooling. params: Parameters for sampling or pooling.
{class}`~vllm.SamplingParams` for text generation. [SamplingParams][vllm.SamplingParams] for text generation.
{class}`~vllm.PoolingParams` for pooling. [PoolingParams][vllm.PoolingParams] for pooling.
arrival_time: The arrival time of the request. If None, we use arrival_time: The arrival time of the request. If None, we use
the current monotonic time. the current monotonic time.
lora_request: The LoRA request to add. lora_request: The LoRA request to add.
@ -711,10 +702,11 @@ class LLMEngine:
Details: Details:
- Set arrival_time to the current time if it is None. - Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None. - Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of {class}`~vllm.Sequence` objects. - Create `n` number of [Sequence][vllm.Sequence] objects.
- Create a {class}`~vllm.SequenceGroup` object - Create a [SequenceGroup][vllm.SequenceGroup] object
from the list of {class}`~vllm.Sequence`. from the list of [Sequence][vllm.Sequence].
- Add the {class}`~vllm.SequenceGroup` object to the scheduler. - Add the [SequenceGroup][vllm.SequenceGroup] object to the
scheduler.
Example: Example:
>>> # initialize engine >>> # initialize engine
@ -861,9 +853,7 @@ class LLMEngine:
request_id: The ID(s) of the request to abort. request_id: The ID(s) of the request to abort.
Details: Details:
- Refer to the - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].
{meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class {class}`~vllm.core.scheduler.Scheduler`.
Example: Example:
>>> # initialize engine and add a request with request_id >>> # initialize engine and add a request with request_id

View File

@ -116,7 +116,8 @@ class LLM:
to eager mode. Additionally for encoder-decoder models, if the to eager mode. Additionally for encoder-decoder models, if the
sequence length of the encoder input is larger than this, we fall sequence length of the encoder input is larger than this, we fall
back to the eager mode. back to the eager mode.
disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig` disable_custom_all_reduce: See
[ParallelConfig][vllm.config.ParallelConfig].
disable_async_output_proc: Disable async output processing. disable_async_output_proc: Disable async output processing.
This may result in lower performance. This may result in lower performance.
hf_token: The token to use as HTTP bearer authorization for remote files hf_token: The token to use as HTTP bearer authorization for remote files
@ -128,12 +129,12 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration. is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for {class}`~vllm.EngineArgs`. (See **kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See
{ref}`engine-args`) [engine-args][])
Note: Note:
This class is intended to be used for offline inference. For online This class is intended to be used for offline inference. For online
serving, use the {class}`~vllm.AsyncLLMEngine` class instead. serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead.
""" """
DEPRECATE_LEGACY: ClassVar[bool] = True DEPRECATE_LEGACY: ClassVar[bool] = True
@ -142,7 +143,7 @@ class LLM:
DEPRECATE_INIT_POSARGS: ClassVar[bool] = True DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
""" """
A flag to toggle whether to deprecate positional arguments in A flag to toggle whether to deprecate positional arguments in
{meth}`LLM.__init__`. [LLM.__init__][].
""" """
@classmethod @classmethod
@ -403,7 +404,7 @@ class LLM:
Args: Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType` for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts. for more details about the format of each prompts.
sampling_params: The sampling parameters for text generation. If sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters. None, we use the default sampling parameters.
@ -669,7 +670,7 @@ class LLM:
Generate responses for a chat conversation. Generate responses for a chat conversation.
The chat conversation is converted into a text prompt using the The chat conversation is converted into a text prompt using the
tokenizer and calls the {meth}`generate` method to generate the tokenizer and calls the [generate][] method to generate the
responses. responses.
Multi-modal inputs can be passed in the same way you would pass them Multi-modal inputs can be passed in the same way you would pass them
@ -678,8 +679,8 @@ class LLM:
Args: Args:
messages: A list of conversations or a single conversation. messages: A list of conversations or a single conversation.
- Each conversation is represented as a list of messages. - Each conversation is represented as a list of messages.
- Each message is a dictionary with 'role' and 'content' keys. - Each message is a dictionary with 'role' and 'content' keys.
sampling_params: The sampling parameters for text generation. sampling_params: The sampling parameters for text generation.
If None, we use the default sampling parameters. When it If None, we use the default sampling parameters. When it
@ -689,27 +690,27 @@ class LLM:
use_tqdm: Whether to use tqdm to display the progress bar. use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
chat_template: The template to use for structuring the chat. chat_template: The template to use for structuring the chat.
If not provided, the model's default chat template will be used. If not provided, the model's default chat template will be used.
chat_template_content_format: The format to render message content. chat_template_content_format: The format to render message content.
- "string" will render the content as a string. - "string" will render the content as a string.
Example: ``"Who are you?"`` Example: `"Who are you?"`
- "openai" will render the content as a list of dictionaries, - "openai" will render the content as a list of dictionaries,
similar to OpenAI schema. similar to OpenAI schema.
Example: ``[{"type": "text", "text": "Who are you?"}]`` Example: `[{"type": "text", "text": "Who are you?"}]`
add_generation_prompt: If True, adds a generation template add_generation_prompt: If True, adds a generation template
to each message. to each message.
continue_final_message: If True, continues the final message in continue_final_message: If True, continues the final message in
the conversation instead of starting a new one. Cannot be the conversation instead of starting a new one. Cannot be
``True`` if ``add_generation_prompt`` is also ``True``. `True` if `add_generation_prompt` is also `True`.
chat_template_kwargs: Additional kwargs to pass to the chat chat_template_kwargs: Additional kwargs to pass to the chat
template. template.
mm_processor_kwargs: Multimodal processor kwarg overrides for this mm_processor_kwargs: Multimodal processor kwarg overrides for this
chat request. Only used for offline requests. chat request. Only used for offline requests.
Returns: Returns:
A list of ``RequestOutput`` objects containing the generated A list of `RequestOutput` objects containing the generated
responses in the same order as the input messages. responses in the same order as the input messages.
""" """
list_of_messages: list[list[ChatCompletionMessageParam]] list_of_messages: list[list[ChatCompletionMessageParam]]
@ -908,7 +909,7 @@ class LLM:
Args: Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType` for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts. for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters. use the default pooling parameters.
@ -997,7 +998,7 @@ class LLM:
Args: Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType` for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts. for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters. use the default pooling parameters.
@ -1007,7 +1008,7 @@ class LLM:
generation, if any. generation, if any.
Returns: Returns:
A list of ``EmbeddingRequestOutput`` objects containing the A list of `EmbeddingRequestOutput` objects containing the
embedding vectors in the same order as the input prompts. embedding vectors in the same order as the input prompts.
""" """
if self.llm_engine.model_config.task != "embed": if self.llm_engine.model_config.task != "embed":
@ -1041,7 +1042,7 @@ class LLM:
Args: Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType` for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts. for more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar. use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
@ -1049,7 +1050,7 @@ class LLM:
generation, if any. generation, if any.
Returns: Returns:
A list of ``ClassificationRequestOutput`` objects containing the A list of `ClassificationRequestOutput` objects containing the
embedding vectors in the same order as the input prompts. embedding vectors in the same order as the input prompts.
""" """
if self.llm_engine.model_config.task != "classify": if self.llm_engine.model_config.task != "classify":
@ -1159,11 +1160,11 @@ class LLM:
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[ScoringRequestOutput]: ) -> list[ScoringRequestOutput]:
"""Generate similarity scores for all pairs ``<text,text_pair>``. """Generate similarity scores for all pairs `<text,text_pair>`.
The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``. The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
In the ``1 - N`` case the ``text_1`` sentence will be replicated ``N`` In the `1 - N` case the `text_1` sentence will be replicated `N`
times to pair with the ``text_2`` sentences. times to pair with the `text_2` sentences.
The input pairs are used to build a list of prompts for the The input pairs are used to build a list of prompts for the
cross encoder model. This class automatically batches the prompts, cross encoder model. This class automatically batches the prompts,
considering the memory constraint. For the best performance, put all considering the memory constraint. For the best performance, put all
@ -1171,9 +1172,9 @@ class LLM:
Args: Args:
text_1: can be a single prompt or a list of prompts, in which text_1: can be a single prompt or a list of prompts, in which
case it has to have the same length as the ``text_2`` list case it has to have the same length as the `text_2` list
text_2: The texts to pair with the query to form the input text_2: The texts to pair with the query to form the input
to the LLM. See {class}`~vllm.inputs.PromptType` for to the LLM. See [PromptType][vllm.inputs.PromptType] for
more details about the format of each prompts. more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar. use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
@ -1181,7 +1182,7 @@ class LLM:
generation, if any. generation, if any.
Returns: Returns:
A list of ``ScoringRequestOutput`` objects containing the A list of `ScoringRequestOutput` objects containing the
generated scores in the same order as the input prompts. generated scores in the same order as the input prompts.
""" """
runner_type = self.llm_engine.model_config.runner_type runner_type = self.llm_engine.model_config.runner_type
@ -1282,13 +1283,13 @@ class LLM:
def wake_up(self, tags: Optional[list[str]] = None): def wake_up(self, tags: Optional[list[str]] = None):
""" """
Wake up the engine from sleep mode. See the {meth}`sleep` method Wake up the engine from sleep mode. See the [sleep][] method
for more details. for more details.
Args: Args:
tags: An optional list of tags to reallocate the engine memory tags: An optional list of tags to reallocate the engine memory
for specific memory allocations. Values must be in for specific memory allocations. Values must be in
("weights", "kv_cache",). If None, all memory is reallocated. `("weights", "kv_cache")`. If None, all memory is reallocated.
wake_up should be called with all tags (or None) before the wake_up should be called with all tags (or None) before the
engine is used again. engine is used again.
""" """

View File

@ -12,7 +12,7 @@ The global {class}`~MultiModalRegistry` is used by model runners to
dispatch data processing according to the target model. dispatch data processing according to the target model.
Info: Info:
{ref}`mm-processing` [mm-processing][]
""" """
__all__ = [ __all__ = [

View File

@ -215,7 +215,7 @@ class MultiModalRegistry:
invoked to transform the data into a dictionary of model inputs. invoked to transform the data into a dictionary of model inputs.
Info: Info:
{ref}`mm-processing` [mm-processing][]
""" """
def wrapper(model_cls: N) -> N: def wrapper(model_cls: N) -> N:
@ -260,7 +260,7 @@ class MultiModalRegistry:
Create a multi-modal processor for a specific model and tokenizer. Create a multi-modal processor for a specific model and tokenizer.
Info: Info:
{ref}`mm-processing` [mm-processing][]
""" """
if not model_config.is_multimodal_model: if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model") raise ValueError(f"{model_config.model} is not a multimodal model")

View File

@ -391,15 +391,6 @@ class PoolingRequestOutput(Generic[_O]):
prompt_token_ids, finished) prompt_token_ids, finished)
def __repr__(self): def __repr__(self):
"""
Returns a string representation of an PoolingRequestOutput instance.
The representation includes the request_id and the number of outputs,
providing a quick overview of the pooling request's results.
Returns:
str: A string representation of the PoolingRequestOutput instance.
"""
return (f"{type(self).__name__}(request_id={self.request_id!r}, " return (f"{type(self).__name__}(request_id={self.request_id!r}, "
f"outputs={self.outputs!r}, " f"outputs={self.outputs!r}, "
f"prompt_token_ids={self.prompt_token_ids}, " f"prompt_token_ids={self.prompt_token_ids}, "