diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 13c37c979dac7..74a9b2b03391b 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -129,16 +129,17 @@ class BenchmarkDataset(ABC): Args: tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no - LoRA is selected. max_loras (Optional[int]): The maximum number of - LoRAs available. If None, LoRA is not used. lora_path - (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA - is not used. + LoRA is selected. + max_loras (Optional[int]): The maximum number of LoRAs available. + If `None`, LoRA is not used. + lora_path (Optional[str]): Path to the LoRA parameters on disk. + If `None`, LoRA is not used. Returns: - tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first - element is a LoRARequest (or None if not applicable) and the second - element is the tokenizer associated with the LoRA request (or the - base tokenizer). + A tuple with the following elements: + - A new [LoRARequest][] (or `None` if not applicable). + - The tokenizer associated with the LoRA request + (or the base tokenizer). """ if max_loras is None or lora_path is None: return None, tokenizer @@ -167,7 +168,7 @@ class BenchmarkDataset(ABC): Args: tokenizer (PreTrainedTokenizerBase): The tokenizer to be used - for processing the dataset's text. + for processing the dataset's text. num_requests (int): The number of sample requests to generate. Returns: @@ -184,7 +185,8 @@ class BenchmarkDataset(ABC): Args: requests (List[SampleRequest]): The current list of sampled - requests. num_requests (int): The target number of requests. + requests. + num_requests (int): The target number of requests. """ if len(requests) < num_requests: random.seed(self.random_seed) diff --git a/vllm/config.py b/vllm/config.py index cd2eb4508de38..40dbc2824bcb6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4552,7 +4552,7 @@ def contains_object_print(text): text (str): The text to check Returns: - bool: True if a match is found, False otherwise + result (bool): `True` if a match is found, `False` otherwise. """ pattern = r'at 0x[a-fA-F0-9]{2,16}>' match = re.search(pattern, text) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index ef4460a592bd6..bc9258e9d07b6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -210,10 +210,11 @@ class KVConnectorBase_V1(ABC): computed tokens for this request Returns: - * the number of tokens that can be loaded from the - external KV cache beyond what is already computed. - * true if external KV cache tokens will be loaded - asynchronously (between scheduler steps). + A tuple with the following elements: + - The number of tokens that can be loaded from the + external KV cache beyond what is already computed. + - `True` if external KV cache tokens will be loaded + asynchronously (between scheduler steps). """ pass diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index fcc38d7fbd125..761c56f7e41f5 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -118,11 +118,11 @@ class PyNcclPipe(KVPipeBase): """ Create the metadata as a dictionary based on the input tensor. - Parameters: - - tensor: The input tensor or None if no tensor is provided. + Args: + tensor: The input tensor or None if no tensor is provided. Returns: - - metadata: A dictionary with the following keys: + metadata: A dictionary with the following keys: - "dtype": The data type of the tensor or None. - "shape": The shape of the tensor or None. """ @@ -135,13 +135,13 @@ class PyNcclPipe(KVPipeBase): """ Create a buffer to receive the tensor based on the provided metadata. - Parameters: - - metadata: A dictionary with keys "dtype" and "shape", describing - the tensor's data type and shape. + Args: + metadata: A dictionary with keys "dtype" and "shape", + describing the tensor's data type and shape. Returns: - - buffer: A tensor of the specified type and shape, allocated on - self.device. + buffer: A tensor of the specified type and shape, + allocated on `self.device`. """ return torch.empty(metadata["shape"], dtype=metadata["dtype"], @@ -151,8 +151,8 @@ class PyNcclPipe(KVPipeBase): """ Send the metadata dictionary to the target rank. - Parameters: - - metadata: A dictionary with keys "dtype" and "shape". + Args: + metadata: A dictionary with keys "dtype" and "shape". """ self.group.send_obj(metadata, self.target_rank_for_send) @@ -161,8 +161,8 @@ class PyNcclPipe(KVPipeBase): Receive the metadata dictionary from the target rank. Returns: - - metadata: A dictionary with keys "dtype" and "shape" describing - the tensor. + metadata: A dictionary with keys "dtype" and "shape" + describing the tensor. """ return self.group.recv_obj(self.target_rank_for_recv) @@ -171,9 +171,9 @@ class PyNcclPipe(KVPipeBase): The actual implementation of sending the tensor and its metadata to the target rank. - Parameters: - - tensor: The input tensor to be sent, or None if no tensor is - being sent. + Args: + tensor: The input tensor to be sent, or `None` if no tensor is + being sent. """ metadata = self._make_metadata(tensor) self._send_metadata(metadata) @@ -187,7 +187,7 @@ class PyNcclPipe(KVPipeBase): the target rank. Returns: - - buffer: The received tensor, or None if no tensor is received. + buffer: The received tensor, or `None` if no tensor is received. """ metadata = self._recv_metadata() if metadata["dtype"] is None: @@ -227,8 +227,8 @@ class PyNcclPipe(KVPipeBase): Sends a tensor and its metadata to the destination rank in a non-blocking way. - Parameters: - - tensor: The tensor to send, or None if no tensor is being sent. + Args: + tensor: The tensor to send, or `None` if no tensor is being sent. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) @@ -250,8 +250,8 @@ class PyNcclPipe(KVPipeBase): """ Receives a tensor and its metadata from the source rank. Blocking call. - Returns: - - tensor: The received tensor, or None if no tensor is received. + Args: + tensor: The received tensor, or `None` if no tensor is received. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c48d8a3869699..2e5361c4891b4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -130,26 +130,16 @@ class LLMEngine: iteration-level scheduling and efficient memory management to maximize the serving throughput. - The {class}`~vllm.LLM` class wraps this class for offline batched inference - and the {class}`AsyncLLMEngine` class wraps this class for online serving. + The [LLM][vllm.LLM] class wraps this class for offline batched inference + and the [AsyncLLMEngine][] class wraps this class for online serving. - The config arguments are derived from {class}`~vllm.EngineArgs`. (See - {ref}`engine-args`) + The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See + [engine-args][]) Args: - model_config: The configuration related to the LLM model. - cache_config: The configuration related to the KV cache memory - management. - parallel_config: The configuration related to distributed execution. - scheduler_config: The configuration related to the request scheduler. - device_config: The configuration related to the device. - lora_config (Optional): The configuration related to serving multi-LoRA. - speculative_config (Optional): The configuration related to speculative - decoding. + vllm_config: The configuration for initializing and running vLLM. executor_class: The model executor class for managing distributed execution. - prompt_adapter_config (Optional): The configuration related to serving - prompt adapters. log_stats: Whether to log statistics. usage_context: Specified entry point, used for usage info collection. """ @@ -695,11 +685,12 @@ class LLMEngine: Args: request_id: The unique ID of the request. - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See + [PromptType][vllm.inputs.PromptType] for more details about the format of each input. params: Parameters for sampling or pooling. - {class}`~vllm.SamplingParams` for text generation. - {class}`~vllm.PoolingParams` for pooling. + [SamplingParams][vllm.SamplingParams] for text generation. + [PoolingParams][vllm.PoolingParams] for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. lora_request: The LoRA request to add. @@ -711,10 +702,11 @@ class LLMEngine: Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of {class}`~vllm.Sequence` objects. - - Create a {class}`~vllm.SequenceGroup` object - from the list of {class}`~vllm.Sequence`. - - Add the {class}`~vllm.SequenceGroup` object to the scheduler. + - Create `n` number of [Sequence][vllm.Sequence] objects. + - Create a [SequenceGroup][vllm.SequenceGroup] object + from the list of [Sequence][vllm.Sequence]. + - Add the [SequenceGroup][vllm.SequenceGroup] object to the + scheduler. Example: >>> # initialize engine @@ -861,9 +853,7 @@ class LLMEngine: request_id: The ID(s) of the request to abort. Details: - - Refer to the - {meth}`~vllm.core.scheduler.Scheduler.abort_seq_group` - from class {class}`~vllm.core.scheduler.Scheduler`. + - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][]. Example: >>> # initialize engine and add a request with request_id diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0465302c5a1c8..f818e1737975b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -116,7 +116,8 @@ class LLM: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. - disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig` + disable_custom_all_reduce: See + [ParallelConfig][vllm.config.ParallelConfig]. disable_async_output_proc: Disable async output processing. This may result in lower performance. hf_token: The token to use as HTTP bearer authorization for remote files @@ -128,12 +129,12 @@ class LLM: compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. - **kwargs: Arguments for {class}`~vllm.EngineArgs`. (See - {ref}`engine-args`) + **kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See + [engine-args][]) Note: This class is intended to be used for offline inference. For online - serving, use the {class}`~vllm.AsyncLLMEngine` class instead. + serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead. """ DEPRECATE_LEGACY: ClassVar[bool] = True @@ -142,7 +143,7 @@ class LLM: DEPRECATE_INIT_POSARGS: ClassVar[bool] = True """ A flag to toggle whether to deprecate positional arguments in - {meth}`LLM.__init__`. + [LLM.__init__][]. """ @classmethod @@ -403,7 +404,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. @@ -669,7 +670,7 @@ class LLM: Generate responses for a chat conversation. The chat conversation is converted into a text prompt using the - tokenizer and calls the {meth}`generate` method to generate the + tokenizer and calls the [generate][] method to generate the responses. Multi-modal inputs can be passed in the same way you would pass them @@ -678,8 +679,8 @@ class LLM: Args: messages: A list of conversations or a single conversation. - - Each conversation is represented as a list of messages. - - Each message is a dictionary with 'role' and 'content' keys. + - Each conversation is represented as a list of messages. + - Each message is a dictionary with 'role' and 'content' keys. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it @@ -689,27 +690,27 @@ class LLM: use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. chat_template: The template to use for structuring the chat. - If not provided, the model's default chat template will be used. + If not provided, the model's default chat template will be used. chat_template_content_format: The format to render message content. - - "string" will render the content as a string. - Example: ``"Who are you?"`` - - "openai" will render the content as a list of dictionaries, - similar to OpenAI schema. - Example: ``[{"type": "text", "text": "Who are you?"}]`` + - "string" will render the content as a string. + Example: `"Who are you?"` + - "openai" will render the content as a list of dictionaries, + similar to OpenAI schema. + Example: `[{"type": "text", "text": "Who are you?"}]` add_generation_prompt: If True, adds a generation template to each message. continue_final_message: If True, continues the final message in the conversation instead of starting a new one. Cannot be - ``True`` if ``add_generation_prompt`` is also ``True``. + `True` if `add_generation_prompt` is also `True`. chat_template_kwargs: Additional kwargs to pass to the chat template. mm_processor_kwargs: Multimodal processor kwarg overrides for this chat request. Only used for offline requests. Returns: - A list of ``RequestOutput`` objects containing the generated + A list of `RequestOutput` objects containing the generated responses in the same order as the input messages. """ list_of_messages: list[list[ChatCompletionMessageParam]] @@ -908,7 +909,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -997,7 +998,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -1007,7 +1008,7 @@ class LLM: generation, if any. Returns: - A list of ``EmbeddingRequestOutput`` objects containing the + A list of `EmbeddingRequestOutput` objects containing the embedding vectors in the same order as the input prompts. """ if self.llm_engine.model_config.task != "embed": @@ -1041,7 +1042,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1049,7 +1050,7 @@ class LLM: generation, if any. Returns: - A list of ``ClassificationRequestOutput`` objects containing the + A list of `ClassificationRequestOutput` objects containing the embedding vectors in the same order as the input prompts. """ if self.llm_engine.model_config.task != "classify": @@ -1159,11 +1160,11 @@ class LLM: lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: - """Generate similarity scores for all pairs ````. + """Generate similarity scores for all pairs ``. - The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``. - In the ``1 - N`` case the ``text_1`` sentence will be replicated ``N`` - times to pair with the ``text_2`` sentences. + The inputs can be `1 -> 1`, `1 -> N` or `N -> N`. + In the `1 - N` case the `text_1` sentence will be replicated `N` + times to pair with the `text_2` sentences. The input pairs are used to build a list of prompts for the cross encoder model. This class automatically batches the prompts, considering the memory constraint. For the best performance, put all @@ -1171,9 +1172,9 @@ class LLM: Args: text_1: can be a single prompt or a list of prompts, in which - case it has to have the same length as the ``text_2`` list + case it has to have the same length as the `text_2` list text_2: The texts to pair with the query to form the input - to the LLM. See {class}`~vllm.inputs.PromptType` for + to the LLM. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1181,7 +1182,7 @@ class LLM: generation, if any. Returns: - A list of ``ScoringRequestOutput`` objects containing the + A list of `ScoringRequestOutput` objects containing the generated scores in the same order as the input prompts. """ runner_type = self.llm_engine.model_config.runner_type @@ -1282,13 +1283,13 @@ class LLM: def wake_up(self, tags: Optional[list[str]] = None): """ - Wake up the engine from sleep mode. See the {meth}`sleep` method + Wake up the engine from sleep mode. See the [sleep][] method for more details. Args: tags: An optional list of tags to reallocate the engine memory for specific memory allocations. Values must be in - ("weights", "kv_cache",). If None, all memory is reallocated. + `("weights", "kv_cache")`. If None, all memory is reallocated. wake_up should be called with all tags (or None) before the engine is used again. """ diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 70568a195fd83..22fee2f747129 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -12,7 +12,7 @@ The global {class}`~MultiModalRegistry` is used by model runners to dispatch data processing according to the target model. Info: - {ref}`mm-processing` + [mm-processing][] """ __all__ = [ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 8a27d866e88e3..0d0d4a4363f4d 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -215,7 +215,7 @@ class MultiModalRegistry: invoked to transform the data into a dictionary of model inputs. Info: - {ref}`mm-processing` + [mm-processing][] """ def wrapper(model_cls: N) -> N: @@ -260,7 +260,7 @@ class MultiModalRegistry: Create a multi-modal processor for a specific model and tokenizer. Info: - {ref}`mm-processing` + [mm-processing][] """ if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") diff --git a/vllm/outputs.py b/vllm/outputs.py index 05026b5696913..33cc50c872b67 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -391,15 +391,6 @@ class PoolingRequestOutput(Generic[_O]): prompt_token_ids, finished) def __repr__(self): - """ - Returns a string representation of an PoolingRequestOutput instance. - - The representation includes the request_id and the number of outputs, - providing a quick overview of the pooling request's results. - - Returns: - str: A string representation of the PoolingRequestOutput instance. - """ return (f"{type(self).__name__}(request_id={self.request_id!r}, " f"outputs={self.outputs!r}, " f"prompt_token_ids={self.prompt_token_ids}, "