[Doc] Convert Sphinx directives ({class}, {meth}, {attr}, ...) to MkDocs format for better documentation linking (#18663)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
This commit is contained in:
Hyogeun Oh (오효근) 2025-05-27 17:44:20 +09:00 committed by GitHub
parent 6881107948
commit a68e293cb9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 360 additions and 247 deletions

View File

@ -39,7 +39,8 @@ class CompilerInterface:
Gather all the relevant information from the vLLM config, Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model. to compute a hash so that we can cache the compiled model.
See {meth}`VllmConfig.compute_hash` to check what information See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
to check what information
is already considered by default. This function should only is already considered by default. This function should only
consider the information that is specific to the compiler. consider the information that is specific to the compiler.
""" """

View File

@ -2986,7 +2986,7 @@ class PoolerConfig:
pooling_type: Optional[str] = None pooling_type: Optional[str] = None
""" """
The pooling method of the pooling model. This should be a key in The pooling method of the pooling model. This should be a key in
{class}`vllm.model_executor.layers.pooler.PoolingType`. [`vllm.model_executor.layers.pooler.PoolingType`][].
""" """
normalize: Optional[bool] = None normalize: Optional[bool] = None
@ -3697,23 +3697,27 @@ class CompilationConfig:
"""Configuration for compilation. It has three parts: """Configuration for compilation. It has three parts:
- Top-level Compilation control: - Top-level Compilation control:
- {attr}`level` - [`level`][vllm.config.CompilationConfig.level]
- {attr}`debug_dump_path` - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
- {attr}`cache_dir` - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
- {attr}`backend` - [`backend`][vllm.config.CompilationConfig.backend]
- {attr}`custom_ops` - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
- {attr}`splitting_ops` - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
- CudaGraph capture: - CudaGraph capture:
- {attr}`use_cudagraph` - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
- {attr}`cudagraph_capture_sizes` - [`cudagraph_capture_sizes`]
- {attr}`cudagraph_num_of_warmups` [vllm.config.CompilationConfig.cudagraph_capture_sizes]
- {attr}`cudagraph_copy_inputs` - [`cudagraph_num_of_warmups`]
- {attr}`full_cuda_graph` [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
- [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
- [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
- Inductor compilation: - Inductor compilation:
- {attr}`use_inductor` - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- {attr}`compile_sizes` - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
- {attr}`inductor_compile_config` - [`inductor_compile_config`]
- {attr}`inductor_passes` [vllm.config.CompilationConfig.inductor_compile_config]
- [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
- custom inductor passes - custom inductor passes
Why we have different sizes for cudagraph and inductor: Why we have different sizes for cudagraph and inductor:

View File

@ -167,4 +167,7 @@ class HTTPConnection:
global_http_connection = HTTPConnection() global_http_connection = HTTPConnection()
"""The global {class}`HTTPConnection` instance used by vLLM.""" """
The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used
by vLLM.
"""

View File

@ -475,7 +475,8 @@ class _AsyncLLMEngine(LLMEngine):
*, *,
inputs: Optional[PromptType] = None, # DEPRECATED inputs: Optional[PromptType] = None, # DEPRECATED
) -> None: ) -> None:
"""Async version of {meth}`add_request`.""" """Async version of
[`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
if inputs is not None: if inputs is not None:
prompt = inputs prompt = inputs
assert prompt is not None and params is not None assert prompt is not None and params is not None
@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async(
class AsyncLLMEngine(EngineClient): class AsyncLLMEngine(EngineClient):
"""An asynchronous wrapper for {class}`LLMEngine`. """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].
This class is used to wrap the {class}`LLMEngine` class to make it This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
asynchronous. It uses asyncio to create a background loop that keeps make it asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The {class}`LLMEngine` is kicked by the processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
generate method when there are requests in the waiting queue. The generate by the generate method when there are requests in the waiting queue. The
method yields the outputs from the {class}`LLMEngine` to the caller. generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
to the caller.
Args: Args:
log_requests: Whether to log the requests. log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call. will be automatically started in the generate call.
*args: Arguments for {class}`LLMEngine`. *args: Arguments for [`LLMEngine`][vllm.LLMEngine].
**kwargs: Arguments for {class}`LLMEngine`. **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
""" """
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@ -985,8 +987,9 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request. sampling_params: The sampling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
@ -1003,7 +1006,7 @@ class AsyncLLMEngine(EngineClient):
Details: Details:
- If the engine is not running, start the background loop, - If the engine is not running, start the background loop,
which iteratively invokes which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
to process the waiting requests. to process the waiting requests.
- Add the request to the engine's `RequestTracker`. - Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to On the next background loop, this request will be sent to
@ -1075,8 +1078,9 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request. pooling_params: The pooling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
@ -1089,15 +1093,15 @@ class AsyncLLMEngine(EngineClient):
for the request. for the request.
Details: Details:
- If the engine is not running, start the background loop, - If the engine is not running, start the background loop,
which iteratively invokes which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
to process the waiting requests. to process the waiting requests.
- Add the request to the engine's `RequestTracker`. - Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to On the next background loop, this request will be sent to
the underlying engine. the underlying engine.
Also, a corresponding `AsyncStream` will be created. Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them. - Wait for the request outputs from `AsyncStream` and yield them.
Example: Example:
``` ```

View File

@ -130,11 +130,11 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the iteration-level scheduling and efficient memory management to maximize the
serving throughput. serving throughput.
The [LLM][vllm.LLM] class wraps this class for offline batched inference The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
and the [AsyncLLMEngine][] class wraps this class for online serving. and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
class wraps this class for online serving.
The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].
[engine-args][])
Args: Args:
vllm_config: The configuration for initializing and running vLLM. vllm_config: The configuration for initializing and running vLLM.

View File

@ -492,8 +492,9 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request. sampling_params: The sampling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
@ -561,8 +562,9 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request. pooling_params: The pooling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.

View File

@ -42,19 +42,22 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )
class MQLLMEngine: class MQLLMEngine:
"""A multiprocessing wrapper for {class}`LLMEngine`. """A multiprocessing wrapper for
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
This class is used to wrap the {class}`LLMEngine` class to enable use This class is used to wrap the
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
in a concurrent manner. It runs a background loop and uses zeromq to in a concurrent manner. It runs a background loop and uses zeromq to
receive new requests and stream outputs incrementally via ipc. receive new requests and stream outputs incrementally via ipc.
The {class}`LLMEngine` generate or encode process is kicked off when a new The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
RPCProcessRequest is received by the input_socket. process is kicked off when a new RPCProcessRequest is received by the
input_socket.
The self.engine_loop checks the input_socket for new requests, The self.engine_loop checks the input_socket for new requests,
adds them to the LLMEngine if there are any, calls the internal adds them to the LLMEngine if there are any, calls the internal
{class}`LLMEngine.step()`, and sends the RequestOutputs back over [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
the output_socket. the RequestOutputs back over the output_socket.
If use_async_sockets is set, the logic associated with reading new If use_async_sockets is set, the logic associated with reading new
requests from the socket and sending data to the socket is passed requests from the socket and sending data to the socket is passed
@ -65,8 +68,8 @@ class MQLLMEngine:
ipc_path: Base path for zeromq interprocess messaging ipc_path: Base path for zeromq interprocess messaging
use_async_sockets: Whether to make send/recv async with GPU use_async_sockets: Whether to make send/recv async with GPU
log_requests: Whether to log the requests. log_requests: Whether to log the requests.
*args: Arguments for {class}`LLMEngine`. *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
**kwargs: Arguments for {class}`LLMEngine`. **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
""" """
def __init__(self, def __init__(self,

View File

@ -56,8 +56,11 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation. scheduled computation.
Args: Args:
seq_group: the outputs are associated with this {class}`SequenceGroup` seq_group: the outputs are associated with this
outputs: the {class}`SequenceGroupOutput`s for all scheduler steps [`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s
for all scheduler steps
""" """
for output in outputs: for output in outputs:
# Concatenate single-step prompt logprob processing results. # Concatenate single-step prompt logprob processing results.

View File

@ -19,17 +19,21 @@ logger = init_logger(__name__)
def single_step_process_prompt_logprob( def single_step_process_prompt_logprob(
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
output: CompletionSequenceGroupOutput) -> None: output: CompletionSequenceGroupOutput) -> None:
"""Process prompt logprobs associated with the {class}`SequenceGroupOutput` """Process prompt logprobs associated with the
for a given step. [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.
Do nothing if the output has no prompt logprobs. Do nothing if the output has no prompt logprobs.
Account for the fact that transformers do not compute first-token logprobs. Account for the fact that transformers do not compute first-token logprobs.
Args: Args:
sg_output_proc: {class}`SequenceGroupOutputProcessor` instance sg_output_proc:
seq_group: the output is associated with this {class}`SequenceGroup` [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
output: the {class}`SequenceGroupOutput` for a single scheduler step instance
seq_group: the output is associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
""" """
prompt_logprobs = output.prompt_logprobs prompt_logprobs = output.prompt_logprobs
@ -103,8 +107,11 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation. scheduled computation.
Args: Args:
seq_group: the output is associated with this {class}`SequenceGroup` seq_group: the output is associated with this
outputs: the {class}`SequenceGroupOutput` for a single scheduler step [`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
""" """
assert len(outputs) == 1, "Single step should only have 1 output." assert len(outputs) == 1, "Single step should only have 1 output."
output = outputs[0] output = outputs[0]

View File

@ -129,8 +129,7 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration. is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].
[engine-args][])
Note: Note:
This class is intended to be used for offline inference. For online This class is intended to be used for offline inference. For online
@ -494,7 +493,7 @@ class LLM:
`self` argument, in addition to the arguments passed in `args` `self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object. and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely. [`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method. args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method.

View File

@ -582,7 +582,8 @@ class OpenAIServing:
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> TextTokensPrompt: ) -> TextTokensPrompt:
""" """
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes single input. that assumes single input.
""" """
return next( return next(
@ -603,7 +604,8 @@ class OpenAIServing:
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]: ) -> Iterator[TextTokensPrompt]:
""" """
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes multiple inputs. that assumes multiple inputs.
""" """
for text in prompt_inputs: for text in prompt_inputs:

View File

@ -74,7 +74,7 @@ class ExecutorBase(ABC):
`self` argument, in addition to the arguments passed in `args` `self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object. and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely. [`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method. args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method.

View File

@ -10,8 +10,9 @@ from .registry import (DummyData, InputContext, InputProcessingContext,
INPUT_REGISTRY = InputRegistry() INPUT_REGISTRY = InputRegistry()
""" """
The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine` The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
to dispatch data processing according to the target model. by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
target model.
""" """
__all__ = [ __all__ = [

View File

@ -80,22 +80,24 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
""" """
Set of possible schemas for a single prompt: Set of possible schemas for a single prompt:
- A text prompt ({class}`str` or {class}`TextPrompt`) - A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt ({class}`TokensPrompt`) - A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ({class}`EmbedsPrompt`) - An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
Note that "singleton" is as opposed to a data structure Note that "singleton" is as opposed to a data structure
which encapsulates multiple prompts, i.e. of the sort which encapsulates multiple prompts, i.e. of the sort
which may be utilized for encoder/decoder models when which may be utilized for encoder/decoder models when
the user desires to express both the encoder & decoder the user desires to express both the encoder & decoder
prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt` prompts explicitly, i.e.
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
A prompt of type {class}`SingletonPrompt` may be employed A prompt of type [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] may be
as (1) input to a decoder-only model, (2) input to employed as (1) input to a decoder-only model, (2) input to
the encoder of an encoder/decoder model, in the scenario the encoder of an encoder/decoder model, in the scenario
where the decoder-prompt is not specified explicitly, or where the decoder-prompt is not specified explicitly, or
(3) as a member of a larger data structure encapsulating (3) as a member of a larger data structure encapsulating
more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt` more than one prompt, i.e.
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
""" """
@ -126,18 +128,20 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
comprising an explicit encoder prompt and a decoder prompt. comprising an explicit encoder prompt and a decoder prompt.
The encoder and decoder prompts, respectively, may be formatted The encoder and decoder prompts, respectively, may be formatted
according to any of the {class}`SingletonPrompt` schemas, according to any of the
[`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] schemas,
and are not required to have the same schema. and are not required to have the same schema.
Only the encoder prompt may have multi-modal data. mm_processor_kwargs Only the encoder prompt may have multi-modal data. mm_processor_kwargs
should be at the top-level, and should not be set in the encoder/decoder should be at the top-level, and should not be set in the encoder/decoder
prompts, since they are agnostic to the encoder/decoder. prompts, since they are agnostic to the encoder/decoder.
Note that an {class}`ExplicitEncoderDecoderPrompt` may not Note that an
be used as an input to a decoder-only model, [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
may not be used as an input to a decoder-only model,
and that the `encoder_prompt` and `decoder_prompt` and that the `encoder_prompt` and `decoder_prompt`
fields of this data structure themselves must be fields of this data structure themselves must be
{class}`SingletonPrompt` instances. [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] instances.
""" """
encoder_prompt: _T1_co encoder_prompt: _T1_co
@ -152,11 +156,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
Set of possible schemas for an LLM input, including Set of possible schemas for an LLM input, including
both decoder-only and encoder/decoder input types: both decoder-only and encoder/decoder input types:
- A text prompt ({class}`str` or {class}`TextPrompt`) - A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt ({class}`TokensPrompt`) - A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ({class}`EmbedsPrompt`) - An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
- A single data structure containing both an encoder and a decoder prompt - A single data structure containing both an encoder and a decoder prompt
({class}`ExplicitEncoderDecoderPrompt`) ([`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt])
""" """
@ -189,7 +193,8 @@ def token_inputs(
prompt: Optional[str] = None, prompt: Optional[str] = None,
cache_salt: Optional[str] = None, cache_salt: Optional[str] = None,
) -> TokenInputs: ) -> TokenInputs:
"""Construct {class}`TokenInputs` from optional values.""" """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
values."""
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
if prompt is not None: if prompt is not None:
@ -221,7 +226,8 @@ def embeds_inputs(
prompt_embeds: torch.Tensor, prompt_embeds: torch.Tensor,
cache_salt: Optional[str] = None, cache_salt: Optional[str] = None,
) -> EmbedsInputs: ) -> EmbedsInputs:
"""Construct :class:`EmbedsInputs` from optional values.""" """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional
values."""
inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds) inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds)
if cache_salt is not None: if cache_salt is not None:
@ -232,7 +238,7 @@ def embeds_inputs(
DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
""" """
The inputs in {class}`~vllm.LLMEngine` before they are The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are
passed to the model executor. passed to the model executor.
This specifies the data required for decoder-only models. This specifies the data required for decoder-only models.
""" """
@ -240,11 +246,12 @@ This specifies the data required for decoder-only models.
class EncoderDecoderInputs(TypedDict): class EncoderDecoderInputs(TypedDict):
""" """
The inputs in {class}`~vllm.LLMEngine` before they are The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they
passed to the model executor. are passed to the model executor.
This specifies the required data for encoder-decoder models. This specifies the required data for encoder-decoder models.
""" """
encoder: Union[TokenInputs, "MultiModalInputs"] encoder: Union[TokenInputs, "MultiModalInputs"]
"""The inputs for the encoder portion.""" """The inputs for the encoder portion."""
@ -254,13 +261,13 @@ class EncoderDecoderInputs(TypedDict):
SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
""" """
A processed {class}`SingletonPrompt` which can be passed to A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be
{class}`vllm.sequence.Sequence`. passed to [`vllm.sequence.Sequence`][].
""" """
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
""" """
The inputs to {data}`vllm.inputs.InputProcessor`. The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][].
""" """
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
@ -277,7 +284,8 @@ def build_explicit_enc_dec_prompt(
return ExplicitEncoderDecoderPrompt( return ExplicitEncoderDecoderPrompt(
encoder_prompt=encoder_prompt, encoder_prompt=encoder_prompt,
decoder_prompt=decoder_prompt, decoder_prompt=decoder_prompt,
mm_processor_kwargs=mm_processor_kwargs) mm_processor_kwargs=mm_processor_kwargs,
)
def zip_enc_dec_prompts( def zip_enc_dec_prompts(
@ -288,7 +296,8 @@ def zip_enc_dec_prompts(
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
""" """
Zip encoder and decoder prompts together into a list of Zip encoder and decoder prompts together into a list of
{class}`ExplicitEncoderDecoderPrompt` instances. [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
instances.
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
dictionary will be used for every encoder/decoder prompt. If an iterable is dictionary will be used for every encoder/decoder prompt. If an iterable is
@ -299,10 +308,11 @@ def zip_enc_dec_prompts(
if isinstance(mm_processor_kwargs, dict): if isinstance(mm_processor_kwargs, dict):
return [ return [
build_explicit_enc_dec_prompt( build_explicit_enc_dec_prompt(
encoder_prompt, decoder_prompt, encoder_prompt,
cast(dict[str, Any], mm_processor_kwargs)) decoder_prompt,
for (encoder_prompt, cast(dict[str, Any], mm_processor_kwargs),
decoder_prompt) in zip(enc_prompts, dec_prompts) ) for (encoder_prompt,
decoder_prompt) in zip(enc_prompts, dec_prompts)
] ]
return [ return [
build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt, build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,

View File

@ -23,13 +23,13 @@ class ParsedTokens(TypedDict):
@overload @overload
def parse_and_batch_prompt( def parse_and_batch_prompt(
prompt: Union[str, list[str]]) -> Sequence[ParsedText]: prompt: Union[str, list[str]], ) -> Sequence[ParsedText]:
... ...
@overload @overload
def parse_and_batch_prompt( def parse_and_batch_prompt(
prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]: prompt: Union[list[int], list[list[int]]], ) -> Sequence[ParsedTokens]:
... ...
@ -86,7 +86,7 @@ class ParsedTokensPrompt(TypedDict):
class ParsedEmbedsPrompt(TypedDict): class ParsedEmbedsPrompt(TypedDict):
type: Literal['embeds'] type: Literal["embeds"]
content: EmbedsPrompt content: EmbedsPrompt
@ -133,7 +133,7 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt:
def is_explicit_encoder_decoder_prompt( def is_explicit_encoder_decoder_prompt(
prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: prompt: PromptType, ) -> TypeIs[ExplicitEncoderDecoderPrompt]:
return isinstance(prompt, dict) and "encoder_prompt" in prompt return isinstance(prompt, dict) and "encoder_prompt" in prompt

View File

@ -67,11 +67,11 @@ class InputPreprocessor:
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
def get_decoder_start_token_id(self) -> Optional[int]: def get_decoder_start_token_id(self) -> Optional[int]:
''' """
Obtain the decoder start token id employed by an encoder/decoder Obtain the decoder start token id employed by an encoder/decoder
model. Returns None for non-encoder/decoder models or if the model. Returns None for non-encoder/decoder models or if the
model config is unavailable. model config is unavailable.
''' """
if not self.model_config.is_encoder_decoder: if not self.model_config.is_encoder_decoder:
logger.warning_once( logger.warning_once(
@ -79,14 +79,14 @@ class InputPreprocessor:
"this is not an encoder/decoder model.") "this is not an encoder/decoder model.")
return None return None
if (self.model_config is None or self.model_config.hf_config is None): if self.model_config is None or self.model_config.hf_config is None:
logger.warning_once( logger.warning_once(
"Using None for decoder start token id because " "Using None for decoder start token id because "
"model config is not available.") "model config is not available.")
return None return None
dec_start_token_id = getattr(self.model_config.hf_config, dec_start_token_id = getattr(self.model_config.hf_config,
'decoder_start_token_id', None) "decoder_start_token_id", None)
if dec_start_token_id is None: if dec_start_token_id is None:
logger.warning_once( logger.warning_once(
"Falling back on <BOS> for decoder start token " "Falling back on <BOS> for decoder start token "
@ -97,7 +97,7 @@ class InputPreprocessor:
return dec_start_token_id return dec_start_token_id
def _get_default_enc_dec_decoder_prompt(self) -> list[int]: def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
''' """
Specifically for encoder/decoder models: Specifically for encoder/decoder models:
generate a default decoder prompt for when generate a default decoder prompt for when
the user specifies only the encoder prompt. the user specifies only the encoder prompt.
@ -126,7 +126,7 @@ class InputPreprocessor:
Returns: Returns:
* prompt_token_ids * prompt_token_ids
''' """
bos_token_id = self.get_bos_token_id() bos_token_id = self.get_bos_token_id()
assert bos_token_id is not None assert bos_token_id is not None
@ -224,7 +224,10 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest], lora_request: Optional[LoRARequest],
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[int]: ) -> list[int]:
"""Async version of {meth}`_tokenize_prompt`.""" """
Async version of
[`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt].
"""
tokenizer = self.get_tokenizer_group() tokenizer = self.get_tokenizer_group()
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
@ -287,7 +290,10 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest], lora_request: Optional[LoRARequest],
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
"""Async version of {meth}`_process_multimodal`.""" """
Async version of
[`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal].
"""
tokenizer = await self._get_mm_tokenizer_async(lora_request) tokenizer = await self._get_mm_tokenizer_async(lora_request)
mm_processor = self.mm_registry.create_processor(self.model_config, mm_processor = self.mm_registry.create_processor(self.model_config,
@ -472,7 +478,7 @@ class InputPreprocessor:
Returns: Returns:
* {class}`SingletonInputs` instance * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
""" """
parsed = parse_singleton_prompt(prompt) parsed = parse_singleton_prompt(prompt)
@ -508,7 +514,10 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> SingletonInputs: ) -> SingletonInputs:
"""Async version of {meth}`_prompt_to_llm_inputs`.""" """
Async version of
[`_prompt_to_llm_inputs`][vllm.inputs.preprocess.InputPreprocessor._prompt_to_llm_inputs].
"""
parsed = parse_singleton_prompt(prompt) parsed = parse_singleton_prompt(prompt)
if parsed["type"] == "embeds": if parsed["type"] == "embeds":
@ -644,7 +653,9 @@ class InputPreprocessor:
) -> EncoderDecoderInputs: ) -> EncoderDecoderInputs:
""" """
For encoder/decoder models only: For encoder/decoder models only:
Process an input prompt into an {class}`EncoderDecoderInputs` instance. Process an input prompt into an
[`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance.
There are two types of input prompts: There are two types of input prompts:
singleton prompts which carry only the singleton prompts which carry only the
@ -670,7 +681,8 @@ class InputPreprocessor:
Returns: Returns:
* {class}`EncoderDecoderInputs` instance * [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance
""" """
encoder_inputs: SingletonInputs encoder_inputs: SingletonInputs
decoder_inputs: Optional[SingletonInputs] decoder_inputs: Optional[SingletonInputs]
@ -710,7 +722,10 @@ class InputPreprocessor:
prompt: PromptType, prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> EncoderDecoderInputs: ) -> EncoderDecoderInputs:
"""Async version of {meth}`_process_encoder_decoder_prompt`.""" """
Async version of
[`_process_encoder_decoder_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_encoder_decoder_prompt].
"""
encoder_inputs: SingletonInputs encoder_inputs: SingletonInputs
decoder_inputs: Optional[SingletonInputs] decoder_inputs: Optional[SingletonInputs]
@ -778,7 +793,8 @@ class InputPreprocessor:
) -> DecoderOnlyInputs: ) -> DecoderOnlyInputs:
""" """
For decoder-only models: For decoder-only models:
Process an input prompt into an {class}`DecoderOnlyInputs` instance. Process an input prompt into a
[`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.
Arguments: Arguments:
@ -789,7 +805,7 @@ class InputPreprocessor:
Returns: Returns:
* {class}`DecoderOnlyInputs` instance * [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
""" """
prompt_comps = self._prompt_to_llm_inputs( prompt_comps = self._prompt_to_llm_inputs(
@ -812,7 +828,10 @@ class InputPreprocessor:
prompt_adapter_request: Optional[PromptAdapterRequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> DecoderOnlyInputs: ) -> DecoderOnlyInputs:
"""Async version of {meth}`_process_decoder_only_prompt`.""" """
Async version of
[`_process_decoder_only_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_decoder_only_prompt].
"""
prompt_comps = await self._prompt_to_llm_inputs_async( prompt_comps = await self._prompt_to_llm_inputs_async(
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
@ -863,7 +882,10 @@ class InputPreprocessor:
prompt_adapter_request: Optional[PromptAdapterRequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> ProcessorInputs: ) -> ProcessorInputs:
"""Async version of {meth}`preprocess`.""" """
Async version of
[`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
"""
if self.model_config.is_encoder_decoder: if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, ( assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ", "Multimodal hashes for encoder-decoder models should not be ",

View File

@ -38,7 +38,7 @@ class InputContext:
) -> _C: ) -> _C:
""" """
Get the HuggingFace configuration Get the HuggingFace configuration
({class}`transformers.PretrainedConfig`) of the model, (`transformers.PretrainedConfig`) of the model,
additionally checking its type. additionally checking its type.
Raises: Raises:
@ -79,7 +79,7 @@ class InputContext:
) -> _P: ) -> _P:
""" """
Get the HuggingFace processor Get the HuggingFace processor
({class}`transformers.ProcessorMixin`) of the model, (`transformers.ProcessorMixin`) of the model,
additionally checking its type. additionally checking its type.
Raises: Raises:

View File

@ -68,22 +68,22 @@ class _VllmLogger(Logger):
""" """
Note: Note:
This class is just to provide type information. This class is just to provide type information.
We actually patch the methods directly on the {class}`logging.Logger` We actually patch the methods directly on the [`logging.Logger`][]
instance to avoid conflicting with other libraries such as instance to avoid conflicting with other libraries such as
`intel_extension_for_pytorch.utils._logger`. `intel_extension_for_pytorch.utils._logger`.
""" """
def info_once(self, msg: str, *args: Hashable) -> None: def info_once(self, msg: str, *args: Hashable) -> None:
""" """
As {meth}`info`, but subsequent calls with the same message As [`info`][logging.Logger.info], but subsequent calls with
are silently dropped. the same message are silently dropped.
""" """
_print_info_once(self, msg, *args) _print_info_once(self, msg, *args)
def warning_once(self, msg: str, *args: Hashable) -> None: def warning_once(self, msg: str, *args: Hashable) -> None:
""" """
As {meth}`warning`, but subsequent calls with the same message As [`warning`][logging.Logger.warning], but subsequent calls with
are silently dropped. the same message are silently dropped.
""" """
_print_warning_once(self, msg, *args) _print_warning_once(self, msg, *args)

View File

@ -228,17 +228,19 @@ class Sampler(nn.Module):
) -> Optional[SamplerOutput]: ) -> Optional[SamplerOutput]:
""" """
Single-step scheduling: Single-step scheduling:
* Perform GPU-side sampling computation & compute * Perform GPU-side sampling computation & compute
GPU-side logprobs tensor GPU-side logprobs tensor
* Pythonize sampling result & logprobs tensor * Pythonize sampling result & logprobs tensor
Multi-step scheduling: Multi-step scheduling:
* Perform GPU-side sampling computation & compute * Perform GPU-side sampling computation & compute
GPU-side logprobs tensor GPU-side logprobs tensor
* Defer Pythonization of sampling result & logprobs * Defer Pythonization of sampling result & logprobs
tensor tensor
* Encapsulate arguments required for deferred Pythonization * Encapsulate arguments required for deferred Pythonization
in the {class}`SamplerOutput` structure in the
[`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput]
structure
Args: Args:
logits: (num_tokens, vocab_size). logits: (num_tokens, vocab_size).

View File

@ -226,9 +226,11 @@ class SupportsPP(Protocol):
intermediate_tensors: Optional["IntermediateTensors"], intermediate_tensors: Optional["IntermediateTensors"],
) -> Union[Tensor, "IntermediateTensors"]: ) -> Union[Tensor, "IntermediateTensors"]:
""" """
Accept {class}`IntermediateTensors` when PP rank > 0. Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when
PP rank > 0.
Return {class}`IntermediateTensors` only for the last PP rank. Return [`IntermediateTensors`][vllm.sequence.IntermediateTensors] only
for the last PP rank.
""" """
... ...

View File

@ -965,7 +965,7 @@ def select_tiling(
class MolmoProcessorWrapper: class MolmoProcessorWrapper:
""" """
Wraps {class}`MolmoProcessor` so that it can be called directly. Wraps `MolmoProcessor` so that it can be called directly.
The original definition can be found here: The original definition can be found here:
https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py

View File

@ -67,14 +67,14 @@ class PixtralImagePixelInputs(TypedDict):
""" """
Shape: `(batch_size * num_images, num_channels, image_width, image_height)` Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
The result of stacking {attr}`ImageEncoding.tokens` from each prompt. The result of stacking `ImageEncoding.tokens` from each prompt.
""" """
class PixtralProcessorAdapter: class PixtralProcessorAdapter:
""" """
Provide a HF-compatible interface for Provide a HF-compatible interface for
{class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
""" """
def __init__(self, tokenizer: MistralTokenizer) -> None: def __init__(self, tokenizer: MistralTokenizer) -> None:

View File

@ -382,7 +382,8 @@ def _get_tokenizer_without_image_pad(
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
""" """
The logic of adding image pad tokens should only be applied in The logic of adding image pad tokens should only be applied in
{class}`QwenVLProcessor`, so they are patched out here. [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
so they are patched out here.
The definition of the wrapped tokenizer can be found here: The definition of the wrapped tokenizer can be found here:
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py

View File

@ -383,7 +383,7 @@ class _ModelRegistry:
`model_cls` can be either: `model_cls` can be either:
- A {class}`torch.nn.Module` class directly referencing the model. - A [`torch.nn.Module`][] class directly referencing the model.
- A string in the format `<module>:<class>` which can be used to - A string in the format `<module>:<class>` which can be used to
lazily import the model. This is useful to avoid initializing CUDA lazily import the model. This is useful to avoid initializing CUDA
when importing the model and thus the related error when importing the model and thus the related error

View File

@ -66,7 +66,7 @@ class WeightsMapper:
class AutoWeightsLoader: class AutoWeightsLoader:
""" """
Helper class to load weights into a {class}`torch.nn.Module`. It is able Helper class to load weights into a [`torch.nn.Module`][]. It is able
to automatically detect child modules and parameters while iterating over to automatically detect child modules and parameters while iterating over
the weights only once. the weights only once.

View File

@ -8,11 +8,12 @@ from .registry import MultiModalRegistry
MULTIMODAL_REGISTRY = MultiModalRegistry() MULTIMODAL_REGISTRY = MultiModalRegistry()
""" """
The global {class}`~MultiModalRegistry` is used by model runners to The global [`MultiModalRegistry`][vllm.multimodal.registry.MultiModalRegistry]
dispatch data processing according to the target model. is used by model runners to dispatch data processing according to the target
model.
Info: Info:
[mm-processing][] [mm_processing](../../../design/mm_processing.html)
""" """
__all__ = [ __all__ = [

View File

@ -29,14 +29,14 @@ _T = TypeVar("_T")
HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"] HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"]
""" """
A {class}`transformers.image_utils.ImageInput` representing a single image A `transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace `ImageProcessor`. item, which can be passed to a HuggingFace `ImageProcessor`.
""" """
HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor", HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor",
list[np.ndarray], list["torch.Tensor"]] list[np.ndarray], list["torch.Tensor"]]
""" """
A {class}`transformers.image_utils.VideoInput` representing a single video A `transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace `VideoProcessor`. item, which can be passed to a HuggingFace `VideoProcessor`.
""" """
@ -48,7 +48,7 @@ item, which can be passed to a HuggingFace `AudioProcessor`.
ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"] ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
""" """
A {class}`transformers.image_utils.ImageInput` representing a single image A `transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace `ImageProcessor`. item, which can be passed to a HuggingFace `ImageProcessor`.
Alternatively, a 3-D tensor or batch of 2-D tensors, Alternatively, a 3-D tensor or batch of 2-D tensors,
@ -58,7 +58,7 @@ these are directly passed to the model without HF processing.
VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"] VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"]
""" """
A {class}`transformers.image_utils.VideoInput` representing a single video A `transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace `VideoProcessor`. item, which can be passed to a HuggingFace `VideoProcessor`.
Alternatively, a 3-D tensor or batch of 2-D tensors, Alternatively, a 3-D tensor or batch of 2-D tensors,
@ -108,7 +108,8 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
""" """
A dictionary containing an entry for each modality type to input. A dictionary containing an entry for each modality type to input.
The built-in modalities are defined by {class}`MultiModalDataBuiltins`. The built-in modalities are defined by
[`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins].
""" """
@ -169,7 +170,8 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
"""Equality check between {data}`NestedTensors` objects.""" """Equality check between
[`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects."""
if isinstance(a, torch.Tensor): if isinstance(a, torch.Tensor):
return isinstance(b, torch.Tensor) and torch.equal(a, b) return isinstance(b, torch.Tensor) and torch.equal(a, b)
elif isinstance(b, torch.Tensor): elif isinstance(b, torch.Tensor):
@ -189,7 +191,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
""" """
A dictionary containing nested tensors which have been batched via A dictionary containing nested tensors which have been batched via
{meth}`MultiModalKwargs.batch`. [`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch].
""" """
@ -197,7 +199,7 @@ A dictionary containing nested tensors which have been batched via
class MultiModalFieldElem: class MultiModalFieldElem:
""" """
Represents a keyword argument corresponding to a multi-modal item Represents a keyword argument corresponding to a multi-modal item
in {class}`MultiModalKwargs`. in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
""" """
modality: str modality: str
@ -208,13 +210,15 @@ class MultiModalFieldElem:
key: str key: str
""" """
The key of this field in {class}`MultiModalKwargs`, The key of this field in
[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
i.e. the name of the keyword argument to be passed to the model. i.e. the name of the keyword argument to be passed to the model.
""" """
data: NestedTensors data: NestedTensors
""" """
The tensor data of this field in {class}`MultiModalKwargs`, The tensor data of this field in
[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
i.e. the value of the keyword argument to be passed to the model. i.e. the value of the keyword argument to be passed to the model.
""" """
@ -237,7 +241,8 @@ class MultiModalFieldElem:
class BaseMultiModalField(ABC): class BaseMultiModalField(ABC):
""" """
Defines how to interpret tensor data belonging to a keyword argument in Defines how to interpret tensor data belonging to a keyword argument in
{class}`MultiModalKwargs` for multiple multi-modal items, and vice versa. [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] for multiple
multi-modal items, and vice versa.
""" """
def _field_factory(self, *, modality: str, key: str): def _field_factory(self, *, modality: str, key: str):
@ -262,10 +267,12 @@ class BaseMultiModalField(ABC):
data: NestedTensors, data: NestedTensors,
) -> Sequence[MultiModalFieldElem]: ) -> Sequence[MultiModalFieldElem]:
""" """
Construct {class}`MultiModalFieldElem` instances to represent Construct
the provided data. [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
instances to represent the provided data.
This is the inverse of {meth}`reduce_data`. This is the inverse of
[`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data].
""" """
raise NotImplementedError raise NotImplementedError
@ -275,9 +282,11 @@ class BaseMultiModalField(ABC):
def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
""" """
Merge the data from multiple instances of {class}`MultiModalFieldElem`. Merge the data from multiple instances of
[`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem].
This is the inverse of {meth}`build_elems`. This is the inverse of
[`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems].
""" """
field_types = [type(item.field) for item in elems] field_types = [type(item.field) for item in elems]
if len(set(field_types)) > 1: if len(set(field_types)) > 1:
@ -290,7 +299,7 @@ class BaseMultiModalField(ABC):
class MultiModalBatchedField(BaseMultiModalField): class MultiModalBatchedField(BaseMultiModalField):
""" """
Info: Info:
[MultiModalFieldConfig.batched][] [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
""" """
def build_elems( def build_elems(
@ -320,8 +329,8 @@ class MultiModalBatchedField(BaseMultiModalField):
class MultiModalFlatField(BaseMultiModalField): class MultiModalFlatField(BaseMultiModalField):
""" """
Info: Info:
[MultiModalFieldConfig.flat][] [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
[MultiModalFieldConfig.flat_from_sizes][] [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
""" """
slices: Union[Sequence[slice], Sequence[Sequence[slice]]] slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
dim: int = 0 dim: int = 0
@ -362,7 +371,7 @@ class MultiModalFlatField(BaseMultiModalField):
class MultiModalSharedField(BaseMultiModalField): class MultiModalSharedField(BaseMultiModalField):
""" """
Info: Info:
[MultiModalFieldConfig.shared][] [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared]
""" """
batch_size: int batch_size: int
@ -508,7 +517,7 @@ class MultiModalFieldConfig:
``` ```
Info: Info:
[MultiModalFieldConfig.flat][] [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
""" """
if size_per_item.ndim != 1: if size_per_item.ndim != 1:
@ -572,8 +581,10 @@ class MultiModalFieldConfig:
class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
""" """
A collection of {class}`MultiModalFieldElem` A collection of
corresponding to a data item in {class}`MultiModalDataItems`. [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
corresponding to a data item in
[`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
""" """
@staticmethod @staticmethod
@ -592,11 +603,13 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
class MultiModalKwargs(UserDict[str, NestedTensors]): class MultiModalKwargs(UserDict[str, NestedTensors]):
""" """
A dictionary that represents the keyword arguments to A dictionary that represents the keyword arguments to
{meth}`~torch.nn.Module.forward`. [`torch.nn.Module.forward`][].
The metadata `items` enables us to obtain the keyword arguments The metadata `items` enables us to obtain the keyword arguments
corresponding to each data item in {class}`MultiModalDataItems`, via corresponding to each data item in
{meth}`get_item` and {meth}`get_items`. [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via
[`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and
[`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items].
""" """
@staticmethod @staticmethod
@ -635,7 +648,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
@staticmethod @staticmethod
def from_items(items: Sequence[MultiModalKwargsItem]): def from_items(items: Sequence[MultiModalKwargsItem]):
"""Construct a new {class}`MultiModalKwargs` from multiple items.""" """Construct a new
[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs]
from multiple items."""
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
for item in items: for item in items:
for key, elem in item.items(): for key, elem in item.items():
@ -800,7 +815,7 @@ A dictionary containing placeholder ranges for each modality.
class MultiModalInputs(TypedDict): class MultiModalInputs(TypedDict):
""" """
Represents the outputs of Represents the outputs of
{class}`vllm.multimodal.processing.BaseMultiModalProcessor`, [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor],
ready to be passed to vLLM internals. ready to be passed to vLLM internals.
""" """
@ -836,7 +851,8 @@ class MultiModalInputs(TypedDict):
class MultiModalEncDecInputs(MultiModalInputs): class MultiModalEncDecInputs(MultiModalInputs):
""" """
Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor` Represents the outputs of
[`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor],
ready to be passed to vLLM internals. ready to be passed to vLLM internals.
""" """

View File

@ -28,7 +28,8 @@ else:
class ModalityDataItems(ABC, Generic[_T, _I]): class ModalityDataItems(ABC, Generic[_T, _I]):
""" """
Represents data items for a modality in {class}`MultiModalDataItems`. Represents data items for a modality in
[`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
""" """
def __init__(self, data: _T, modality: str) -> None: def __init__(self, data: _T, modality: str) -> None:
@ -251,15 +252,15 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
""" """
As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but
such that each entry corresponds to a list. normalized such that each entry corresponds to a list.
""" """
def get_count(self, modality: str, *, strict: bool = True) -> int: def get_count(self, modality: str, *, strict: bool = True) -> int:
""" """
Get the number of data items belonging to a modality. Get the number of data items belonging to a modality.
If `strict=False`, return `0` instead of raising {exc}`KeyError` If `strict=False`, return `0` instead of raising [`KeyError`][]
even if the modality is not found. even if the modality is not found.
""" """
if modality not in self: if modality not in self:
@ -305,8 +306,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
class MultiModalDataParser: class MultiModalDataParser:
""" """
Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
{class}`MultiModalDataItems`. into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
Args: Args:
target_sr (float, optional): Enables automatic resampling of audio target_sr (float, optional): Enables automatic resampling of audio

View File

@ -114,13 +114,14 @@ class PromptUpdateDetails(Generic[_S]):
is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
""" """
Given {attr}`full`, return a boolean mask of shape `(len(full),)` Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full],
indicating which positions of `full` to assign embeddings to. return a boolean mask of shape `(len(full),)` indicating which positions
of `full` to assign embeddings to.
`None` (default) means to assign embeddings to all positions of `full`. `None` (default) means to assign embeddings to all positions of `full`.
The embeddings are obtained by calling The embeddings are obtained by calling
{class}`SupportsMultiModal.get_multimodal_embeddings`. [`SupportsMultiModal.get_multimodal_embeddings`][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings].
""" """
@staticmethod @staticmethod
@ -159,13 +160,15 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
The token sequence or text that is part of the update. The token sequence or text that is part of the update.
If only part of the content corresponds to feature placeholders, you can If only part of the content corresponds to feature placeholders, you can
use {class}`PromptUpdateDetails` to specify which part. use [`PromptUpdateDetails`][vllm.multimodal.processing.PromptUpdateDetails] to
specify which part.
""" """
PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
PromptUpdateInfo] PromptUpdateInfo]
""" """
Given the index of the processed item within {attr}`modality`, Given the index of the processed item within
[`modality`][vllm.multimodal.processing.PromptUpdate.modality],
output the corresponding token sequence (or text). output the corresponding token sequence (or text).
For convenience, you can directly pass in the token sequence (or text) For convenience, you can directly pass in the token sequence (or text)
@ -260,8 +263,10 @@ class PromptInsertion(PromptUpdate):
insertion: PromptUpdateContent = field(repr=False) insertion: PromptUpdateContent = field(repr=False)
""" """
Given the index of the processed item within {attr}`modality`, Given the index of the processed item within
output the token sequence (or text) to insert right after {attr}`target`. [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
output the token sequence (or text) to insert right after
[`target`][vllm.multimodal.processing.PromptUpdate.target].
For convenience, you can directly pass in the token sequence (or text) For convenience, you can directly pass in the token sequence (or text)
instead of a function if it does not depend on the input. instead of a function if it does not depend on the input.
@ -332,8 +337,10 @@ class PromptReplacement(PromptUpdate):
replacement: PromptUpdateContent = field(repr=False) replacement: PromptUpdateContent = field(repr=False)
""" """
Given the index of the processed item within {attr}`modality`, Given the index of the processed item within
output the token sequence (or text) to replace {attr}`target`. [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
output the token sequence (or text) to replace
[`target`][vllm.multimodal.processing.PromptUpdate.target].
For convenience, you can directly pass in the token sequence (or text) For convenience, you can directly pass in the token sequence (or text)
instead of a function if it does not depend on the input. instead of a function if it does not depend on the input.
@ -387,14 +394,16 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
"""Convenience function to apply [full_groupby][] based on modality.""" """Convenience function to apply [`full_groupby`][vllm.utils.full_groupby]
based on modality."""
return full_groupby(values, key=lambda x: x.modality) return full_groupby(values, key=lambda x: x.modality)
@dataclass @dataclass
class _BoundPromptSequence: class _BoundPromptSequence:
""" """
A {data}`_PromptSeq` bound to a tokenizer to automatically A [`PromptSeq`][vllm.multimodal.processing.PromptSeq] bound
to a tokenizer to automatically
convert between token sequence and text representations. convert between token sequence and text representations.
""" """
tokenizer: AnyTokenizer = field(repr=False) tokenizer: AnyTokenizer = field(repr=False)
@ -446,9 +455,11 @@ class _BoundPromptContent:
@dataclass @dataclass
class BoundPromptUpdate: class BoundPromptUpdate:
""" """
A {class}`PromptUpdate` bound to a tokenizer to automatically convert A [`PromptUpdate`][vllm.multimodal.processing.PromptUpdate] bound
{attr}`target` and the result of {meth}`get_content` between to a tokenizer to automatically convert
token sequence and text representations. [`target`][vllm.multimodal.processing.PromptUpdate.target] and the result of
[`get_content`][vllm.multimodal.processing.BoundPromptUpdate.get_content]
between token sequence and text representations.
""" """
_origin: PromptUpdate _origin: PromptUpdate
tokenizer: AnyTokenizer = field(repr=False) tokenizer: AnyTokenizer = field(repr=False)
@ -482,7 +493,8 @@ class BoundPromptUpdate:
def get_content(self, item_idx: int) -> _BoundPromptContent: def get_content(self, item_idx: int) -> _BoundPromptContent:
""" """
Given the index of the processed item within {attr}`modality`, Given the index of the processed item within
[`modality`][vllm.multimodal.processing.PromptUpdate.modality],
output the token sequence (or text) to update. output the token sequence (or text) to update.
""" """
content = self.content content = self.content
@ -1019,7 +1031,8 @@ class ProcessingCache:
) -> None: ) -> None:
""" """
Put a processed multi-modal item into the cache Put a processed multi-modal item into the cache
according to its dependencies (see {meth}`get`). according to its dependencies
(see [`get`][vllm.multimodal.processing.ProcessingCache.get]).
""" """
cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
**{modality: input_item}, **{modality: input_item},
@ -1091,7 +1104,8 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)
MultiModalHashes = dict[str, list[str]] MultiModalHashes = dict[str, list[str]]
""" """
A collection of hashes with a similar structure as {class}`MultiModalKwargs`. A collection of hashes with a similar structure as
[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
""" """
@ -1099,7 +1113,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
""" """
Abstract base class to process multi-modal inputs to be used in vLLM. Abstract base class to process multi-modal inputs to be used in vLLM.
Not to be confused with {class}`transformers.ProcessorMixin`. Not to be confused with `transformers.ProcessorMixin`.
""" """
def __init__(self, def __init__(self,
@ -1126,10 +1140,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
def _get_data_parser(self) -> MultiModalDataParser: def _get_data_parser(self) -> MultiModalDataParser:
""" """
Construct a parser to preprocess multi-modal data items Construct a parser to preprocess multi-modal data items
before passing them to {meth}`_get_hf_mm_data`. before passing them to
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
You can support additional modalities by creating a subclass You can support additional modalities by creating a subclass
of {class}`MultiModalDataParser` that has additional subparsers. of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser]
that has additional subparsers.
""" """
return MultiModalDataParser() return MultiModalDataParser()
@ -1138,8 +1154,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
) -> MultiModalDataItems: ) -> MultiModalDataItems:
""" """
Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems` Normalize
before passing them to {meth}`_get_hf_mm_data`. [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
to [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]
before passing them to
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
""" """
mm_items = self.data_parser.parse_mm_data(mm_data) mm_items = self.data_parser.parse_mm_data(mm_data)
supported_mm_limits = self.info.get_supported_mm_limits() supported_mm_limits = self.info.get_supported_mm_limits()
@ -1191,7 +1210,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
inputs. inputs.
Moreover, this information is critical to determine the token positions Moreover, this information is critical to determine the token positions
in order to construct {class}`~vllm-multimodal.input.PlaceholderRange` in order to construct
[`PlaceholderRange`][vllm.multimodal.inputs.PlaceholderRange]
for each multi-modal item. for each multi-modal item.
""" """
raise NotImplementedError raise NotImplementedError
@ -1315,7 +1335,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Most HF processors accept prompt text but not prompt tokens. Most HF processors accept prompt text but not prompt tokens.
If the HF processor adds or removes tokens that are not related to If the HF processor adds or removes tokens that are not related to
multi-modal data, you should override this method so it is consistent multi-modal data, you should override this method so it is consistent
with the output of {meth}`_apply_hf_processor_text_only` on the with the output of
[`_apply_hf_processor_text_only`][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_text_only]
on the
corresponding text. corresponding text.
""" """
return prompt_tokens return prompt_tokens
@ -1330,7 +1352,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Since HF processor requires that text and multi-modal items Since HF processor requires that text and multi-modal items
correspond to each other, we generate dummy text using correspond to each other, we generate dummy text using
{class}`DummyInputsBuilder` to go along with the multi-modal data. [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder]
to go along with the multi-modal data.
""" """
mm_counts = mm_items.get_all_counts() mm_counts = mm_items.get_all_counts()

View File

@ -25,7 +25,7 @@ logger = init_logger(__name__)
class ProcessorInputs: class ProcessorInputs:
""" """
Represents the keyword arguments to Represents the keyword arguments to
{meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
""" """
prompt: Union[str, list[int]] prompt: Union[str, list[int]]
mm_data: MultiModalDataDict mm_data: MultiModalDataDict

View File

@ -29,7 +29,11 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
class ProcessingInfoFactory(Protocol[_I_co]): class ProcessingInfoFactory(Protocol[_I_co]):
"""Constructs a {class}`MultiModalProcessor` instance from the context.""" """
Constructs a
[`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor]
instance from the context.
"""
def __call__( def __call__(
self, self,
@ -40,7 +44,9 @@ class ProcessingInfoFactory(Protocol[_I_co]):
class DummyInputsBuilderFactory(Protocol[_I]): class DummyInputsBuilderFactory(Protocol[_I]):
""" """
Constructs a {class}`BaseDummyInputsBuilder` instance from the context. Constructs a
[`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder]
instance from the context.
""" """
def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
@ -48,7 +54,11 @@ class DummyInputsBuilderFactory(Protocol[_I]):
class MultiModalProcessorFactory(Protocol[_I]): class MultiModalProcessorFactory(Protocol[_I]):
"""Constructs a {class}`MultiModalProcessor` instance from the context.""" """
Constructs a
[`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor]
instance from the context.
"""
def __call__( def __call__(
self, self,
@ -155,8 +165,6 @@ class MultiModalRegistry:
""" """
Get the maximum number of tokens from each modality Get the maximum number of tokens from each modality
for profiling the memory usage of a model. for profiling the memory usage of a model.
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
""" """
mm_limits = self.get_mm_limits_per_prompt(model_config) mm_limits = self.get_mm_limits_per_prompt(model_config)
@ -170,8 +178,6 @@ class MultiModalRegistry:
""" """
Get the maximum number of multi-modal tokens Get the maximum number of multi-modal tokens
for profiling the memory usage of a model. for profiling the memory usage of a model.
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
""" """
return sum(self.get_max_tokens_by_modality(model_config).values()) return sum(self.get_max_tokens_by_modality(model_config).values())
@ -213,9 +219,6 @@ class MultiModalRegistry:
When the model receives multi-modal data, the provided function is When the model receives multi-modal data, the provided function is
invoked to transform the data into a dictionary of model inputs. invoked to transform the data into a dictionary of model inputs.
Info:
[mm-processing][]
""" """
def wrapper(model_cls: N) -> N: def wrapper(model_cls: N) -> N:
@ -258,9 +261,6 @@ class MultiModalRegistry:
) -> BaseMultiModalProcessor[BaseProcessingInfo]: ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
""" """
Create a multi-modal processor for a specific model and tokenizer. Create a multi-modal processor for a specific model and tokenizer.
Info:
[mm-processing][]
""" """
if not model_config.is_multimodal_model: if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model") raise ValueError(f"{model_config.model} is not a multimodal model")

View File

@ -259,7 +259,8 @@ class MediaConnector:
global_media_connector = MediaConnector() global_media_connector = MediaConnector()
"""The global {class}`MediaConnector` instance used by vLLM.""" """The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector]
instance used by vLLM."""
fetch_audio = global_media_connector.fetch_audio fetch_audio = global_media_connector.fetch_audio
fetch_image = global_media_connector.fetch_image fetch_image = global_media_connector.fetch_image

View File

@ -84,7 +84,7 @@ class DeviceCapability(NamedTuple):
def to_int(self) -> int: def to_int(self) -> int:
""" """
Express device capability as an integer ``<major><minor>``. Express device capability as an integer `<major><minor>`.
It is assumed that the minor version is always a single digit. It is assumed that the minor version is always a single digit.
""" """
@ -206,10 +206,11 @@ class Platform:
""" """
Test whether this platform is compatible with a device capability. Test whether this platform is compatible with a device capability.
The ``capability`` argument can either be: The `capability` argument can either be:
- A tuple ``(major, minor)``. - A tuple `(major, minor)`.
- An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`) - An integer `<major><minor>`. (See
[`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
""" """
current_capability = cls.get_device_capability(device_id=device_id) current_capability = cls.get_device_capability(device_id=device_id)
if current_capability is None: if current_capability is None:

View File

@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1
def array_full(token_id: int, count: int): def array_full(token_id: int, count: int):
"""{class}`array` equivalent of [numpy.full][].""" """[`array`][] equivalent of [numpy.full][]."""
return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
@ -192,8 +192,8 @@ class SequenceData(msgspec.Struct,
def from_prompt_token_counts( def from_prompt_token_counts(
*token_counts: tuple[int, int]) -> "SequenceData": *token_counts: tuple[int, int]) -> "SequenceData":
""" """
Construct a {class}`SequenceData` instance by concatenating Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
prompt token sequences. by concatenating prompt token sequences.
Each tuple represents one token sequence, expressed in the form Each tuple represents one token sequence, expressed in the form
`(token_id, count)`. `(token_id, count)`.
@ -216,8 +216,8 @@ class SequenceData(msgspec.Struct,
prompt_embeds: Optional[torch.Tensor] = None, prompt_embeds: Optional[torch.Tensor] = None,
) -> "SequenceData": ) -> "SequenceData":
""" """
Construct a {class}`SequenceData` instance from prompt and output Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
token sequences. from prompt and output token sequences.
""" """
prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
prompt_token_ids) prompt_token_ids)
@ -452,9 +452,11 @@ class SequenceData(msgspec.Struct,
class Sequence: class Sequence:
"""Stores the data, status, and block information of a sequence. """Stores the data, status, and block information of a sequence.
The sequence is constructed from the {data}`DecoderOnlyInputs` The sequence is constructed from the
(for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder) [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only)
instance passed in through the `inputs` constructor argument. or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
(for encoder-decoder) instance passed in through the `inputs`
constructor argument.
Args: Args:
seq_id: The ID of the sequence. seq_id: The ID of the sequence.

View File

@ -1005,7 +1005,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]): def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
""" """
Unlike {class}`itertools.groupby`, groups are not broken by Unlike [`itertools.groupby`][], groups are not broken by
non-contiguous data. non-contiguous data.
""" """
groups = defaultdict[_K, list[_V]](list) groups = defaultdict[_K, list[_V]](list)
@ -1926,7 +1926,8 @@ class _PlaceholderBase:
Disallows downstream usage of placeholder modules. Disallows downstream usage of placeholder modules.
We need to explicitly override each dunder method because We need to explicitly override each dunder method because
{meth}`__getattr__` is not called when they are accessed. [`__getattr__`][vllm.utils._PlaceholderBase.__getattr__]
is not called when they are accessed.
Info: Info:
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)

View File

@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
) -> None: ) -> None:
""" """
Perform sanity checks for the result of Perform sanity checks for the result of
{meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`. [`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][].
""" """
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), ( assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
"Expected multimodal embeddings to be a list/tuple of 2D tensors, " "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
@ -39,7 +39,7 @@ def scatter_mm_placeholders(
Scatter the multimodal embeddings into a contiguous tensor that represents Scatter the multimodal embeddings into a contiguous tensor that represents
the placeholder tokens. the placeholder tokens.
{class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`. See [`vllm.multimodal.processing.PromptUpdateDetails.is_embed`][].
Args: Args:
embeds: The multimodal embeddings. embeds: The multimodal embeddings.

View File

@ -734,11 +734,12 @@ def _pythonize_sampler_output(
cache: Optional[PythonizationCache], cache: Optional[PythonizationCache],
) -> None: ) -> None:
""" This function is only called when the output tensors are ready. """ This function is only called when the output tensors are ready.
See {class}`ModelOutput`. See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput].
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
adding a Pythonized output data structure adding a Pythonized output data structure
({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`. ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput])
for each [`SequenceGroup`][vllm.sequence.SequenceGroup].
Args: Args:
model_input model_input