[Docs] Replace rst style double-backtick with md single-backtick (#27091)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-10-17 10:47:34 +01:00 committed by GitHub
parent 483ea64611
commit 6c9fdbf725
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
31 changed files with 98 additions and 98 deletions

View File

@ -1251,7 +1251,7 @@ async def main() -> None:
default=None, default=None,
help="The model name used in the API. " help="The model name used in the API. "
"If not specified, the model name will be the " "If not specified, the model name will be the "
"same as the ``--model`` argument. ", "same as the `--model` argument. ",
) )
parser.add_argument( parser.add_argument(

View File

@ -3,4 +3,4 @@ Loading Model weights with fastsafetensors
Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
To enable this feature, use the ``--load-format fastsafetensors`` command-line argument To enable this feature, use the `--load-format fastsafetensors` command-line argument

View File

@ -67,17 +67,17 @@ class _HfExamplesInfo:
is_available_online: bool = True is_available_online: bool = True
""" """
Set this to ``False`` if the name of this architecture no longer exists on Set this to `False` if the name of this architecture no longer exists on
the HF repo. To maintain backwards compatibility, we have not removed them the HF repo. To maintain backwards compatibility, we have not removed them
from the main model registry, so without this flag the registry tests will from the main model registry, so without this flag the registry tests will
fail. fail.
""" """
trust_remote_code: bool = False trust_remote_code: bool = False
"""The ``trust_remote_code`` level required to load the model.""" """The `trust_remote_code` level required to load the model."""
hf_overrides: dict[str, Any] = field(default_factory=dict) hf_overrides: dict[str, Any] = field(default_factory=dict)
"""The ``hf_overrides`` required to load the model.""" """The `hf_overrides` required to load the model."""
max_model_len: int | None = None max_model_len: int | None = None
""" """

View File

@ -162,7 +162,7 @@ def check_logprobs_close(
# Test prompt logprobs closeness # Test prompt logprobs closeness
if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None: if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
# Both sequences' prompt logprobs lists are not `None`` # Both sequences' prompt logprobs lists are not `None`
# (although individual list elements may be `None`); # (although individual list elements may be `None`);
# for each token's logprobs: # for each token's logprobs:
for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate( for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Ensure we perform lazy loading in vllm/__init__.py. """Ensure we perform lazy loading in vllm/__init__.py.
i.e: appears only within the ``if typing.TYPE_CHECKING:`` guard, i.e: appears only within the `if typing.TYPE_CHECKING:` guard,
**except** for a short whitelist. **except** for a short whitelist.
""" """

View File

@ -21,7 +21,7 @@ def get_cache_dir() -> Path:
@lru_cache @lru_cache
def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path: def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path:
""" """
Download an asset file from ``s3://vllm-public-assets`` Download an asset file from `s3://vllm-public-assets`
and return the path to the downloaded file. and return the path to the downloaded file.
""" """
asset_directory = get_cache_dir() / "vllm_public_assets" asset_directory = get_cache_dir() / "vllm_public_assets"

View File

@ -1231,7 +1231,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=None, default=None,
help="The model name used in the API. " help="The model name used in the API. "
"If not specified, the model name will be the " "If not specified, the model name will be the "
"same as the ``--model`` argument. ", "same as the `--model` argument. ",
) )
parser.add_argument( parser.add_argument(

View File

@ -138,8 +138,8 @@ def support_torch_compile(
""" """
def cls_decorator_helper(cls: _T) -> _T: def cls_decorator_helper(cls: _T) -> _T:
# helper to pass `dynamic_arg_dims`` to `_support_torch_compile`` # helper to pass `dynamic_arg_dims` to `_support_torch_compile`
# to avoid too much indentation for `_support_torch_compile`` # to avoid too much indentation for `_support_torch_compile`
if not hasattr(cls, "forward"): if not hasattr(cls, "forward"):
raise TypeError("decorated class should have a forward method.") raise TypeError("decorated class should have a forward method.")
sig = inspect.signature(cls.forward) sig = inspect.signature(cls.forward)

View File

@ -66,15 +66,15 @@ class PoolerConfig:
""" """
step_tag_id: int | None = None step_tag_id: int | None = None
""" """
If set, only the score corresponding to the ``step_tag_id`` in the If set, only the score corresponding to the `step_tag_id` in the
generated sentence should be returned. Otherwise, the scores for all tokens generated sentence should be returned. Otherwise, the scores for all tokens
are returned. are returned.
""" """
returned_token_ids: list[int] | None = None returned_token_ids: list[int] | None = None
""" """
A list of indices for the vocabulary dimensions to be extracted, A list of indices for the vocabulary dimensions to be extracted,
such as the token IDs of ``good_token`` and ``bad_token`` in the such as the token IDs of `good_token` and `bad_token` in the
``math-shepherd-mistral-7b-prm`` model. `math-shepherd-mistral-7b-prm` model.
""" """
def compute_hash(self) -> str: def compute_hash(self) -> str:

View File

@ -117,7 +117,7 @@ class ZmqEventPublisher(EventPublisher):
Parameters Parameters
---------- ----------
endpoint: endpoint:
PUB address. Use ``tcp://*:5557`` to bind or ``tcp://host:5557`` to PUB address. Use `tcp://*:5557` to bind or `tcp://host:5557` to
connect. connect.
replay_endpoint: replay_endpoint:
Optional ROUTER address for replay requests. When given, subscribers can Optional ROUTER address for replay requests. When given, subscribers can

View File

@ -515,7 +515,7 @@ class StreamingHarmonyContext(HarmonyContext):
def render_for_completion(self) -> list[int]: def render_for_completion(self) -> list[int]:
# now this list of tokens as next turn's starting tokens # now this list of tokens as next turn's starting tokens
# `<|start|>assistant``, # `<|start|>assistant`,
# we need to process them in parser. # we need to process them in parser.
rendered_tokens = super().render_for_completion() rendered_tokens = super().render_for_completion()

View File

@ -1504,7 +1504,7 @@ class LLM:
"""Return a snapshot of aggregated metrics from Prometheus. """Return a snapshot of aggregated metrics from Prometheus.
Returns: Returns:
A ``MetricSnapshot`` instance capturing the current state A `MetricSnapshot` instance capturing the current state
of all aggregated metrics from Prometheus. of all aggregated metrics from Prometheus.
Note: Note:

View File

@ -26,12 +26,12 @@ class RenderConfig:
max_length: int | None = None max_length: int | None = None
"""Maximum allowable total input token length. If provided, """Maximum allowable total input token length. If provided,
token inputs longer than this raise ``ValueError``.""" token inputs longer than this raise `ValueError`."""
truncate_prompt_tokens: int | None = None truncate_prompt_tokens: int | None = None
"""Number of tokens to keep. ``None`` means no truncation. """Number of tokens to keep. `None` means no truncation.
``0`` yields an empty list (and skips embeds). `0` yields an empty list (and skips embeds).
``-1`` maps to ``model_config.max_model_len``.""" `-1` maps to `model_config.max_model_len`."""
add_special_tokens: bool | None = True add_special_tokens: bool | None = True
"""Whether to add model-specific special tokens during tokenization.""" """Whether to add model-specific special tokens during tokenization."""
@ -107,10 +107,10 @@ class BaseRenderer(ABC):
Args: Args:
prompt_or_prompts: One of: prompt_or_prompts: One of:
- ``str``: Single text prompt. - `str`: Single text prompt.
- ``list[str]``: Batch of text prompts. - `list[str]`: Batch of text prompts.
- ``list[int]``: Single pre-tokenized sequence. - `list[int]`: Single pre-tokenized sequence.
- ``list[list[int]]``: Batch of pre-tokenized sequences. - `list[list[int]]`: Batch of pre-tokenized sequences.
config: Render configuration controlling how prompts are prepared config: Render configuration controlling how prompts are prepared
(e.g., tokenization and length handling). (e.g., tokenization and length handling).
@ -134,9 +134,9 @@ class BaseRenderer(ABC):
Convert text/token and/or base64-encoded embeddings inputs into Convert text/token and/or base64-encoded embeddings inputs into
engine-ready prompt objects using a unified RenderConfig. engine-ready prompt objects using a unified RenderConfig.
At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be At least one of `prompt_or_prompts` or `prompt_embeds` must be
provided and non-empty. If both are omitted or empty (e.g., empty provided and non-empty. If both are omitted or empty (e.g., empty
string and empty list), a ``ValueError`` is raised. string and empty list), a `ValueError` is raised.
Args: Args:
prompt_or_prompts: Text or token inputs to include. prompt_or_prompts: Text or token inputs to include.
@ -150,7 +150,7 @@ class BaseRenderer(ABC):
Engine-ready prompt objects. Engine-ready prompt objects.
Raises: Raises:
ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds`` ValueError: If both `prompt_or_prompts` and `prompt_embeds`
are omitted or empty (decoder prompt cannot be empty), or if are omitted or empty (decoder prompt cannot be empty), or if
length limits are exceeded. length limits are exceeded.
""" """

View File

@ -327,7 +327,7 @@ def zip_enc_dec_prompts(
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
instances. instances.
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same `mm_processor_kwargs` may also be provided; if a dict is passed, the same
dictionary will be used for every encoder/decoder prompt. If an iterable is dictionary will be used for every encoder/decoder prompt. If an iterable is
provided, it will be zipped with the encoder/decoder prompts. provided, it will be zipped with the encoder/decoder prompts.
""" """

View File

@ -27,7 +27,7 @@ __all__ = [
def is_flashinfer_fp4_cutlass_moe_available() -> bool: def is_flashinfer_fp4_cutlass_moe_available() -> bool:
"""Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" """Return `True` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
return ( return (
envs.VLLM_USE_FLASHINFER_MOE_FP4 envs.VLLM_USE_FLASHINFER_MOE_FP4
and has_flashinfer_cutlass_fused_moe() and has_flashinfer_cutlass_fused_moe()

View File

@ -887,11 +887,11 @@ def requant_weight_ue8m0_inplace(
UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace. UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace.
Args: Args:
weight: Block-quantised weight tensor stored in ``torch.float8_e4m3fn``. weight: Block-quantised weight tensor stored in `torch.float8_e4m3fn`.
Expected shape ``(..., M, K)``. Expected shape `(..., M, K)`.
weight_scale: Corresponding per-block scale tensor (``torch.float32``) weight_scale: Corresponding per-block scale tensor (`torch.float32`)
with shape ``(..., M // block_size[0], K // block_size[1])``. with shape `(..., M // block_size[0], K // block_size[1])`.
block_size: 2-element iterable ``[block_m, block_k]`` describing the block_size: 2-element iterable `[block_m, block_k]` describing the
block quantisation granularity. block quantisation granularity.
""" """
if weight.numel() == 0: if weight.numel() == 0:

View File

@ -64,7 +64,7 @@ from .utils import (
class OlmoAttention(nn.Module): class OlmoAttention(nn.Module):
""" """
This is the attention block where the output is computed as This is the attention block where the output is computed as
``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
(plus another skip connection). (plus another skip connection).
""" """
@ -144,7 +144,7 @@ class OlmoAttention(nn.Module):
class OlmoMLP(nn.Module): class OlmoMLP(nn.Module):
""" """
This is the MLP block where the output is computed as This is the MLP block where the output is computed as
``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` `MLP(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
(plus another skip connection). (plus another skip connection).
""" """
@ -193,7 +193,7 @@ class OlmoMLP(nn.Module):
class OlmoDecoderLayer(nn.Module): class OlmoDecoderLayer(nn.Module):
""" """
This is a typical transformer block where the output is This is a typical transformer block where the output is
computed as ``MLP(LN(x + Attention(LN(x))))`` computed as `MLP(LN(x + Attention(LN(x))))`
(plus another skip connection). (plus another skip connection).
""" """

View File

@ -69,7 +69,7 @@ from vllm.transformers_utils.configs import Olmo3Config
class Olmo2Attention(nn.Module): class Olmo2Attention(nn.Module):
""" """
This is the attention block where the output is computed as This is the attention block where the output is computed as
``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
(plus another skip connection). (plus another skip connection).
""" """
@ -190,7 +190,7 @@ class Olmo2Attention(nn.Module):
class Olmo2MLP(nn.Module): class Olmo2MLP(nn.Module):
""" """
This is the MLP block where the output is computed as This is the MLP block where the output is computed as
``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))`` `MLP(x)` in `LN(MLP(x + LN(Attention(x))))`
(plus another skip connection). (plus another skip connection).
""" """
@ -235,7 +235,7 @@ class Olmo2MLP(nn.Module):
class Olmo2DecoderLayer(nn.Module): class Olmo2DecoderLayer(nn.Module):
""" """
This is a typical transformer block where the output is This is a typical transformer block where the output is
computed as ``MLP(LN(x + Attention(LN(x))))`` computed as `MLP(LN(x + Attention(LN(x))))`
(plus another skip connection). (plus another skip connection).
""" """

View File

@ -166,7 +166,7 @@ class VisualTokenizer(torch.nn.Module):
# e.g., for hidden_stride=2, this leads to a token length reduction: # e.g., for hidden_stride=2, this leads to a token length reduction:
# 1024 -> 256 for aimv2 # 1024 -> 256 for aimv2
if self.config.hidden_stride > 1: if self.config.hidden_stride > 1:
# this `d` may be different from the above `d`` # this `d` may be different from the above `d`
n, L, d = features.shape n, L, d = features.shape
sqrt_l = int(L**0.5) sqrt_l = int(L**0.5)
assert sqrt_l**2 == L, ( assert sqrt_l**2 == L, (

View File

@ -99,13 +99,13 @@ class AutoWeightsLoader:
the weights only once. the weights only once.
The weight loading logic for individual modules can be overridden The weight loading logic for individual modules can be overridden
by defining a ``load_weights`` method. by defining a `load_weights` method.
Similarly, the weight loading logic for individual parameters can be Similarly, the weight loading logic for individual parameters can be
overridden by defining a ``weight_loader`` method. overridden by defining a `weight_loader` method.
Detailed weight loading information can be viewed by setting the Detailed weight loading information can be viewed by setting the
environment variable ``VLLM_LOGGING_LEVEL=DEBUG``. environment variable `VLLM_LOGGING_LEVEL=DEBUG`.
""" """
# Models trained using early version ColossalAI # Models trained using early version ColossalAI
@ -372,9 +372,9 @@ def flatten_bn(
concat: bool = False, concat: bool = False,
) -> list[torch.Tensor] | torch.Tensor: ) -> list[torch.Tensor] | torch.Tensor:
""" """
Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. Flatten the `B` and `N` dimensions of batched multimodal inputs.
The input tensor should have shape ``(B, N, ...)```. The input tensor should have shape `(B, N, ...)`.
""" """
if isinstance(x, torch.Tensor): if isinstance(x, torch.Tensor):
return x.flatten(0, 1) return x.flatten(0, 1)
@ -424,12 +424,12 @@ def _merge_multimodal_embeddings(
is_multimodal: torch.Tensor, is_multimodal: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
""" """
Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
positions in ``inputs_embeds`` corresponding to placeholder tokens in positions in `inputs_embeds` corresponding to placeholder tokens in
``input_ids``. `input_ids`.
Note: Note:
This updates ``inputs_embeds`` in place. This updates `inputs_embeds` in place.
""" """
if len(multimodal_embeddings) == 0: if len(multimodal_embeddings) == 0:
return inputs_embeds return inputs_embeds
@ -475,14 +475,14 @@ def merge_multimodal_embeddings(
placeholder_token_id: int | list[int], placeholder_token_id: int | list[int],
) -> torch.Tensor: ) -> torch.Tensor:
""" """
Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
positions in ``inputs_embeds`` corresponding to placeholder tokens in positions in `inputs_embeds` corresponding to placeholder tokens in
``input_ids``. `input_ids`.
``placeholder_token_id`` can be a list of token ids (e.g., token ids `placeholder_token_id` can be a list of token ids (e.g., token ids
of img_start, img_break, and img_end tokens) when needed: This means of img_start, img_break, and img_end tokens) when needed: This means
the order of these tokens in the ``input_ids`` MUST MATCH the order of the order of these tokens in the `input_ids` MUST MATCH the order of
their embeddings in ``multimodal_embeddings`` since we need to their embeddings in `multimodal_embeddings` since we need to
slice-merge instead of individually scattering. slice-merge instead of individually scattering.
For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
@ -497,7 +497,7 @@ def merge_multimodal_embeddings(
input_ids for a correct embedding merge. input_ids for a correct embedding merge.
Note: Note:
This updates ``inputs_embeds`` in place. This updates `inputs_embeds` in place.
""" """
if isinstance(placeholder_token_id, list): if isinstance(placeholder_token_id, list):
is_multimodal = isin_list(input_ids, placeholder_token_id) is_multimodal = isin_list(input_ids, placeholder_token_id)

View File

@ -70,7 +70,7 @@ class BasevLLMParameter(Parameter):
# NOTE(@ksayers) some models such as mamba_mixer2 override the # NOTE(@ksayers) some models such as mamba_mixer2 override the
# weight loader to support custom loading. In the future, model-specific # weight loader to support custom loading. In the future, model-specific
# weight loading should be implemented via Model.load_weights. In the # weight loading should be implemented via Model.load_weights. In the
# meantime, support deleting and overriding `weight_loader`` attribute # meantime, support deleting and overriding `weight_loader` attribute
if self._weight_loader is None: if self._weight_loader is None:
raise AttributeError( raise AttributeError(
f"{self.__class__.__name__} weight_loader attribute has been deleted" f"{self.__class__.__name__} weight_loader attribute has been deleted"

View File

@ -332,8 +332,8 @@ class PromptInsertion(PromptUpdate):
Example: Example:
For each image, insert a number of ``<image>`` feature placeholders For each image, insert a number of `<image>` feature placeholders
equal to the feature size of the vision encoder after the ``<s>`` token: equal to the feature size of the vision encoder after the `<s>` token:
```python ```python
PromptInsertion( PromptInsertion(
@ -353,7 +353,7 @@ class PromptInsertion(PromptUpdate):
) )
``` ```
Insert these tokens after a prefix ``Images:``: Insert these tokens after a prefix `Images:`:
```python ```python
PromptInsertion( PromptInsertion(
@ -401,8 +401,8 @@ class PromptReplacement(PromptUpdate):
Example: Example:
For each image, replace one ``<image>`` input placeholder in the prompt For each image, replace one `<image>` input placeholder in the prompt
with a number of ``<image>`` feature placeholders with a number of `<image>` feature placeholders
equal to the feature size of the vision encoder: equal to the feature size of the vision encoder:
```python ```python
@ -413,8 +413,8 @@ class PromptReplacement(PromptUpdate):
) )
``` ```
As above, but further pad the feature placeholders with ``<image_bos>`` As above, but further pad the feature placeholders with `<image_bos>`
and `<image_eos>``, which are not supposed to be passed to the vision and `<image_eos>`, which are not supposed to be passed to the vision
encoder: encoder:
```python ```python

View File

@ -307,7 +307,7 @@ class MultiModalRegistry:
""" """
Create dummy data for profiling the memory usage of a model. Create dummy data for profiling the memory usage of a model.
The model is identified by ``model_config``. The model is identified by `model_config`.
""" """
processor = self.create_processor(model_config, cache=cache) processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor) profiler: MultiModalProfiler = MultiModalProfiler(processor)
@ -340,7 +340,7 @@ class MultiModalRegistry:
""" """
Create dummy data for profiling the memory usage of a model. Create dummy data for profiling the memory usage of a model.
The model is identified by ``model_config``. The model is identified by `model_config`.
""" """
processor = self.create_processor(model_config, cache=cache) processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor) profiler: MultiModalProfiler = MultiModalProfiler(processor)

View File

@ -75,7 +75,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
"0x74bd": "AMD_Instinct_MI300X_HF", "0x74bd": "AMD_Instinct_MI300X_HF",
} }
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`` # Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
if "HIP_VISIBLE_DEVICES" in os.environ: if "HIP_VISIBLE_DEVICES" in os.environ:
val = os.environ["HIP_VISIBLE_DEVICES"] val = os.environ["HIP_VISIBLE_DEVICES"]
if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None): if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):

View File

@ -168,7 +168,7 @@ class XPUPlatform(Platform):
parallel_config.distributed_executor_backend = "uni" parallel_config.distributed_executor_backend = "uni"
elif parallel_config.distributed_executor_backend == "mp": elif parallel_config.distributed_executor_backend == "mp":
# FIXME(kunshang): # FIXME(kunshang):
# spawn needs calling `if __name__ == '__main__':`` # spawn needs calling `if __name__ == '__main__':`
# fork is not supported on xpu for starting new processes. # fork is not supported on xpu for starting new processes.
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn": if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

View File

@ -306,10 +306,10 @@ class SamplingParams(
) )
def __post_init__(self) -> None: def __post_init__(self) -> None:
# how we deal with `best_of``: # how we deal with `best_of`:
# if `best_of`` is not set, we default to `n`; # if `best_of` is not set, we default to `n`;
# if `best_of`` is set, we set `n`` to `best_of`, # if `best_of` is set, we set `n` to `best_of`,
# and set `_real_n`` to the original `n`. # and set `_real_n` to the original `n`.
# when we return the result, we will check # when we return the result, we will check
# if we need to return `n` or `_real_n` results # if we need to return `n` or `_real_n` results
if self.best_of: if self.best_of:

View File

@ -21,7 +21,7 @@ from vllm.utils import cdiv, has_deep_gemm
@functools.cache @functools.cache
def is_deep_gemm_supported() -> bool: def is_deep_gemm_supported() -> bool:
"""Return ``True`` if DeepGEMM is supported on the current platform. """Return `True` if DeepGEMM is supported on the current platform.
Currently, only Hopper and Blackwell GPUs are supported. Currently, only Hopper and Blackwell GPUs are supported.
""" """
is_supported_arch = current_platform.is_cuda() and ( is_supported_arch = current_platform.is_cuda() and (
@ -33,7 +33,7 @@ def is_deep_gemm_supported() -> bool:
@functools.cache @functools.cache
def is_deep_gemm_e8m0_used() -> bool: def is_deep_gemm_e8m0_used() -> bool:
"""Return ``True`` if vLLM is configured to use DeepGEMM """Return `True` if vLLM is configured to use DeepGEMM
E8M0 scale on a Hopper or Blackwell-class GPU. E8M0 scale on a Hopper or Blackwell-class GPU.
""" """
if not is_deep_gemm_supported(): if not is_deep_gemm_supported():
@ -311,9 +311,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor):
"""Return a global difference metric for unit tests. """Return a global difference metric for unit tests.
DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
error, causing ``torch.testing.assert_close`` to fail. Instead of checking error, causing `torch.testing.assert_close` to fail. Instead of checking
every element, we compute a cosine-style similarity over the whole tensor every element, we compute a cosine-style similarity over the whole tensor
and report ``1 - sim``. Once kernel accuracy improves this helper can be and report `1 - sim`. Once kernel accuracy improves this helper can be
removed. removed.
""" """

View File

@ -34,7 +34,7 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get(
@functools.cache @functools.cache
def has_flashinfer() -> bool: def has_flashinfer() -> bool:
"""Return ``True`` if FlashInfer is available.""" """Return `True` if FlashInfer is available."""
# Use find_spec to check if the module exists without importing it # Use find_spec to check if the module exists without importing it
# This avoids potential CUDA initialization side effects # This avoids potential CUDA initialization side effects
if importlib.util.find_spec("flashinfer") is None: if importlib.util.find_spec("flashinfer") is None:
@ -114,13 +114,13 @@ autotune = _lazy_import_wrapper(
@functools.cache @functools.cache
def has_flashinfer_comm() -> bool: def has_flashinfer_comm() -> bool:
"""Return ``True`` if FlashInfer comm module is available.""" """Return `True` if FlashInfer comm module is available."""
return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None
@functools.cache @functools.cache
def has_flashinfer_all2all() -> bool: def has_flashinfer_all2all() -> bool:
"""Return ``True`` if FlashInfer mnnvl all2all is available.""" """Return `True` if FlashInfer mnnvl all2all is available."""
if not has_flashinfer_comm(): if not has_flashinfer_comm():
return False return False
@ -141,7 +141,7 @@ def has_flashinfer_all2all() -> bool:
@functools.cache @functools.cache
def has_flashinfer_moe() -> bool: def has_flashinfer_moe() -> bool:
"""Return ``True`` if FlashInfer MoE module is available.""" """Return `True` if FlashInfer MoE module is available."""
return ( return (
has_flashinfer() has_flashinfer()
and importlib.util.find_spec("flashinfer.fused_moe") is not None and importlib.util.find_spec("flashinfer.fused_moe") is not None
@ -150,7 +150,7 @@ def has_flashinfer_moe() -> bool:
@functools.cache @functools.cache
def has_flashinfer_cutlass_fused_moe() -> bool: def has_flashinfer_cutlass_fused_moe() -> bool:
"""Return ``True`` if FlashInfer CUTLASS fused MoE is available.""" """Return `True` if FlashInfer CUTLASS fused MoE is available."""
if not has_flashinfer_moe(): if not has_flashinfer_moe():
return False return False
@ -171,7 +171,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
@functools.cache @functools.cache
def has_nvidia_artifactory() -> bool: def has_nvidia_artifactory() -> bool:
"""Return ``True`` if NVIDIA's artifactory is accessible. """Return `True` if NVIDIA's artifactory is accessible.
This checks connectivity to the kernel inference library artifactory This checks connectivity to the kernel inference library artifactory
which is required for downloading certain cubin kernels like TRTLLM FHMA. which is required for downloading certain cubin kernels like TRTLLM FHMA.
@ -218,9 +218,9 @@ def _force_use_trtllm_attention(env_value: bool | None) -> bool | None:
def force_use_trtllm_attention() -> bool | None: def force_use_trtllm_attention() -> bool | None:
""" """
Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set, Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set,
return ``True`` if TRTLLM attention is forced to be used, return `True` if TRTLLM attention is forced to be used,
return ``False`` if TRTLLM attention is forced to be not used. return `False` if TRTLLM attention is forced to be not used.
""" """
return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION) return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
@ -244,7 +244,7 @@ def use_trtllm_attention(
has_sinks: bool = False, has_sinks: bool = False,
has_spec: bool = False, has_spec: bool = False,
) -> bool: ) -> bool:
"""Return ``True`` if TRTLLM attention is used.""" """Return `True` if TRTLLM attention is used."""
force_use_trtllm = force_use_trtllm_attention() force_use_trtllm = force_use_trtllm_attention()
# Environment variable is set to 0 - respect it # Environment variable is set to 0 - respect it

View File

@ -26,17 +26,17 @@ from vllm.v1.kv_cache_interface import (
from vllm.v1.request import Request from vllm.v1.request import Request
# BlockHash represents the hash of a single KV-cache block used for # BlockHash represents the hash of a single KV-cache block used for
# prefix caching. Treating it as a distinct type from ``bytes`` helps # prefix caching. Treating it as a distinct type from `bytes` helps
# catch accidental misuse when passing around raw byte strings. # catch accidental misuse when passing around raw byte strings.
BlockHash = NewType("BlockHash", bytes) BlockHash = NewType("BlockHash", bytes)
# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID. # `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID.
# It is represented as raw bytes for compactness and efficiency. The helper # It is represented as raw bytes for compactness and efficiency. The helper
# functions below pack/unpack the ``BlockHash`` and group id into/from the key. # functions below pack/unpack the `BlockHash` and group id into/from the key.
BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes) BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
# ExternalBlockHash is used for reproducible prefix-cache block hashing. # ExternalBlockHash is used for reproducible prefix-cache block hashing.
# It's a union of ``bytes`` and ``int`` to keep backward compatibility # It's a union of `bytes` and `int` to keep backward compatibility
# after we default block hashing to use sha256 bytes. # after we default block hashing to use sha256 bytes.
ExternalBlockHash: TypeAlias = bytes | int ExternalBlockHash: TypeAlias = bytes | int
@ -44,7 +44,7 @@ ExternalBlockHash: TypeAlias = bytes | int
def make_block_hash_with_group_id( def make_block_hash_with_group_id(
block_hash: BlockHash, group_id: int block_hash: BlockHash, group_id: int
) -> BlockHashWithGroupId: ) -> BlockHashWithGroupId:
"""Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``. """Pack a `BlockHash` and group id into a `BlockHashWithGroupId`.
The group id is encoded using 4 bytes in big-endian order and appended to The group id is encoded using 4 bytes in big-endian order and appended to
the block hash bytes. This representation avoids creating tuples while the block hash bytes. This representation avoids creating tuples while
@ -54,12 +54,12 @@ def make_block_hash_with_group_id(
def get_block_hash(key: BlockHashWithGroupId) -> BlockHash: def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
"""Extract the ``BlockHash`` from a ``BlockHashWithGroupId``.""" """Extract the `BlockHash` from a `BlockHashWithGroupId`."""
return BlockHash(key[:-4]) return BlockHash(key[:-4])
def get_group_id(key: BlockHashWithGroupId) -> int: def get_group_id(key: BlockHashWithGroupId) -> int:
"""Extract the group id from a ``BlockHashWithGroupId``.""" """Extract the group id from a `BlockHashWithGroupId`."""
return int.from_bytes(key[-4:], "big", signed=False) return int.from_bytes(key[-4:], "big", signed=False)

View File

@ -128,7 +128,7 @@ class CPUWorker(Worker):
"Please try to bind threads manually." "Please try to bind threads manually."
) )
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`` # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore
logical_cpu_list = [ logical_cpu_list = [
x for x in logical_cpu_list if x.numa_node == selected_numa_node x for x in logical_cpu_list if x.numa_node == selected_numa_node

View File

@ -182,8 +182,8 @@ class TPUWorker:
if isinstance(layer_spec, AttentionSpec): if isinstance(layer_spec, AttentionSpec):
dtype = layer_spec.dtype dtype = layer_spec.dtype
# Use an empty tensor instead of `None`` to force Dynamo to pass # Use an empty tensor instead of `None` to force Dynamo to pass
# it by reference, rather than by specializing on the value ``None``. # it by reference, rather than by specializing on the value `None`.
tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device) tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
kv_caches[layer_name] = tpu_kv_cache kv_caches[layer_name] = tpu_kv_cache
else: else: