From 6c9fdbf7258146a9e335c50aab12969cd95e9227 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 17 Oct 2025 10:47:34 +0100 Subject: [PATCH] [Docs] Replace `rst` style double-backtick with `md` single-backtick (#27091) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../benchmark_serving_multi_turn.py | 2 +- docs/models/extensions/fastsafetensor.md | 2 +- tests/models/registry.py | 6 ++-- tests/models/utils.py | 2 +- tools/check_init_lazy_imports.py | 2 +- vllm/assets/base.py | 2 +- vllm/benchmarks/serve.py | 2 +- vllm/compilation/decorators.py | 4 +-- vllm/config/pooler.py | 6 ++-- vllm/distributed/kv_events.py | 2 +- vllm/entrypoints/context.py | 2 +- vllm/entrypoints/llm.py | 2 +- vllm/entrypoints/renderer.py | 22 ++++++------- vllm/inputs/data.py | 2 +- .../quantization/utils/flashinfer_fp4_moe.py | 2 +- .../layers/quantization/utils/fp8_utils.py | 10 +++--- vllm/model_executor/models/olmo.py | 6 ++-- vllm/model_executor/models/olmo2.py | 6 ++-- vllm/model_executor/models/ovis.py | 2 +- vllm/model_executor/models/utils.py | 32 +++++++++---------- vllm/model_executor/parameter.py | 2 +- vllm/multimodal/processing.py | 14 ++++---- vllm/multimodal/registry.py | 4 +-- vllm/platforms/rocm.py | 2 +- vllm/platforms/xpu.py | 2 +- vllm/sampling_params.py | 8 ++--- vllm/utils/deep_gemm.py | 8 ++--- vllm/utils/flashinfer.py | 20 ++++++------ vllm/v1/core/kv_cache_utils.py | 14 ++++---- vllm/v1/worker/cpu_worker.py | 2 +- vllm/v1/worker/tpu_worker.py | 4 +-- 31 files changed, 98 insertions(+), 98 deletions(-) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 2b0a6da60c256..67a085b40ed35 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -1251,7 +1251,7 @@ async def main() -> None: default=None, help="The model name used in the API. " "If not specified, the model name will be the " - "same as the ``--model`` argument. ", + "same as the `--model` argument. ", ) parser.add_argument( diff --git a/docs/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md index 2a5a18102dc28..0f30d4e2f69d2 100644 --- a/docs/models/extensions/fastsafetensor.md +++ b/docs/models/extensions/fastsafetensor.md @@ -3,4 +3,4 @@ Loading Model weights with fastsafetensors Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. -To enable this feature, use the ``--load-format fastsafetensors`` command-line argument +To enable this feature, use the `--load-format fastsafetensors` command-line argument diff --git a/tests/models/registry.py b/tests/models/registry.py index 8a3e2a6893e51..c6d6fa3f52ba5 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -67,17 +67,17 @@ class _HfExamplesInfo: is_available_online: bool = True """ - Set this to ``False`` if the name of this architecture no longer exists on + Set this to `False` if the name of this architecture no longer exists on the HF repo. To maintain backwards compatibility, we have not removed them from the main model registry, so without this flag the registry tests will fail. 
""" trust_remote_code: bool = False - """The ``trust_remote_code`` level required to load the model.""" + """The `trust_remote_code` level required to load the model.""" hf_overrides: dict[str, Any] = field(default_factory=dict) - """The ``hf_overrides`` required to load the model.""" + """The `hf_overrides` required to load the model.""" max_model_len: int | None = None """ diff --git a/tests/models/utils.py b/tests/models/utils.py index f5c16b3c65421..ffdb6950678c9 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -162,7 +162,7 @@ def check_logprobs_close( # Test prompt logprobs closeness if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None: - # Both sequences' prompt logprobs lists are not `None`` + # Both sequences' prompt logprobs lists are not `None` # (although individual list elements may be `None`); # for each token's logprobs: for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate( diff --git a/tools/check_init_lazy_imports.py b/tools/check_init_lazy_imports.py index 197cc8ff8f5ed..8b3a0b2a71be0 100644 --- a/tools/check_init_lazy_imports.py +++ b/tools/check_init_lazy_imports.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Ensure we perform lazy loading in vllm/__init__.py. -i.e: appears only within the ``if typing.TYPE_CHECKING:`` guard, +i.e: appears only within the `if typing.TYPE_CHECKING:` guard, **except** for a short whitelist. """ diff --git a/vllm/assets/base.py b/vllm/assets/base.py index abf397e1cc1ce..5ca9de4076ad0 100644 --- a/vllm/assets/base.py +++ b/vllm/assets/base.py @@ -21,7 +21,7 @@ def get_cache_dir() -> Path: @lru_cache def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path: """ - Download an asset file from ``s3://vllm-public-assets`` + Download an asset file from `s3://vllm-public-assets` and return the path to the downloaded file. """ asset_directory = get_cache_dir() / "vllm_public_assets" diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 89958848c06a8..71d136d61ceaf 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1231,7 +1231,7 @@ def add_cli_args(parser: argparse.ArgumentParser): default=None, help="The model name used in the API. " "If not specified, the model name will be the " - "same as the ``--model`` argument. ", + "same as the `--model` argument. ", ) parser.add_argument( diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 811cbef4afabe..4cbe3044e4060 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -138,8 +138,8 @@ def support_torch_compile( """ def cls_decorator_helper(cls: _T) -> _T: - # helper to pass `dynamic_arg_dims`` to `_support_torch_compile`` - # to avoid too much indentation for `_support_torch_compile`` + # helper to pass `dynamic_arg_dims` to `_support_torch_compile` + # to avoid too much indentation for `_support_torch_compile` if not hasattr(cls, "forward"): raise TypeError("decorated class should have a forward method.") sig = inspect.signature(cls.forward) diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index e40fc6a9bb20c..0590f74aa4c93 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -66,15 +66,15 @@ class PoolerConfig: """ step_tag_id: int | None = None """ - If set, only the score corresponding to the ``step_tag_id`` in the + If set, only the score corresponding to the `step_tag_id` in the generated sentence should be returned. 
Otherwise, the scores for all tokens are returned. """ returned_token_ids: list[int] | None = None """ A list of indices for the vocabulary dimensions to be extracted, - such as the token IDs of ``good_token`` and ``bad_token`` in the - ``math-shepherd-mistral-7b-prm`` model. + such as the token IDs of `good_token` and `bad_token` in the + `math-shepherd-mistral-7b-prm` model. """ def compute_hash(self) -> str: diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 6be2557ede40d..4711467dafbdc 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -117,7 +117,7 @@ class ZmqEventPublisher(EventPublisher): Parameters ---------- endpoint: - PUB address. Use ``tcp://*:5557`` to bind or ``tcp://host:5557`` to + PUB address. Use `tcp://*:5557` to bind or `tcp://host:5557` to connect. replay_endpoint: Optional ROUTER address for replay requests. When given, subscribers can diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 8f94880e431be..8886d7c42d8a6 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -515,7 +515,7 @@ class StreamingHarmonyContext(HarmonyContext): def render_for_completion(self) -> list[int]: # now this list of tokens as next turn's starting tokens - # `<|start|>assistant``, + # `<|start|>assistant`, # we need to process them in parser. rendered_tokens = super().render_for_completion() diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 177d2b9174c5a..30bcb59437d93 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1504,7 +1504,7 @@ class LLM: """Return a snapshot of aggregated metrics from Prometheus. Returns: - A ``MetricSnapshot`` instance capturing the current state + A `MetricSnapshot` instance capturing the current state of all aggregated metrics from Prometheus. Note: diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index a8ce5e3fc64dc..8fbc17e96f7bb 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -26,12 +26,12 @@ class RenderConfig: max_length: int | None = None """Maximum allowable total input token length. If provided, - token inputs longer than this raise ``ValueError``.""" + token inputs longer than this raise `ValueError`.""" truncate_prompt_tokens: int | None = None - """Number of tokens to keep. ``None`` means no truncation. - ``0`` yields an empty list (and skips embeds). - ``-1`` maps to ``model_config.max_model_len``.""" + """Number of tokens to keep. `None` means no truncation. + `0` yields an empty list (and skips embeds). + `-1` maps to `model_config.max_model_len`.""" add_special_tokens: bool | None = True """Whether to add model-specific special tokens during tokenization.""" @@ -107,10 +107,10 @@ class BaseRenderer(ABC): Args: prompt_or_prompts: One of: - - ``str``: Single text prompt. - - ``list[str]``: Batch of text prompts. - - ``list[int]``: Single pre-tokenized sequence. - - ``list[list[int]]``: Batch of pre-tokenized sequences. + - `str`: Single text prompt. + - `list[str]`: Batch of text prompts. + - `list[int]`: Single pre-tokenized sequence. + - `list[list[int]]`: Batch of pre-tokenized sequences. config: Render configuration controlling how prompts are prepared (e.g., tokenization and length handling). @@ -134,9 +134,9 @@ class BaseRenderer(ABC): Convert text/token and/or base64-encoded embeddings inputs into engine-ready prompt objects using a unified RenderConfig. 
- At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be + At least one of `prompt_or_prompts` or `prompt_embeds` must be provided and non-empty. If both are omitted or empty (e.g., empty - string and empty list), a ``ValueError`` is raised. + string and empty list), a `ValueError` is raised. Args: prompt_or_prompts: Text or token inputs to include. @@ -150,7 +150,7 @@ class BaseRenderer(ABC): Engine-ready prompt objects. Raises: - ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds`` + ValueError: If both `prompt_or_prompts` and `prompt_embeds` are omitted or empty (decoder prompt cannot be empty), or if length limits are exceeded. """ diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 5a8304ac05a67..1f138a72d0842 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -327,7 +327,7 @@ def zip_enc_dec_prompts( [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] instances. - ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same + `mm_processor_kwargs` may also be provided; if a dict is passed, the same dictionary will be used for every encoder/decoder prompt. If an iterable is provided, it will be zipped with the encoder/decoder prompts. """ diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 5ce0188b60aed..b3a4cb2de1395 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -27,7 +27,7 @@ __all__ = [ def is_flashinfer_fp4_cutlass_moe_available() -> bool: - """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" + """Return `True` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" return ( envs.VLLM_USE_FLASHINFER_MOE_FP4 and has_flashinfer_cutlass_fused_moe() diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 51af40a119147..7af1e0a5c84f1 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -887,11 +887,11 @@ def requant_weight_ue8m0_inplace( UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace. Args: - weight: Block-quantised weight tensor stored in ``torch.float8_e4m3fn``. - Expected shape ``(..., M, K)``. - weight_scale: Corresponding per-block scale tensor (``torch.float32``) - with shape ``(..., M // block_size[0], K // block_size[1])``. - block_size: 2-element iterable ``[block_m, block_k]`` describing the + weight: Block-quantised weight tensor stored in `torch.float8_e4m3fn`. + Expected shape `(..., M, K)`. + weight_scale: Corresponding per-block scale tensor (`torch.float32`) + with shape `(..., M // block_size[0], K // block_size[1])`. + block_size: 2-element iterable `[block_m, block_k]` describing the block quantisation granularity. """ if weight.numel() == 0: diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 1e1a1293136f4..390a91d3425ce 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -64,7 +64,7 @@ from .utils import ( class OlmoAttention(nn.Module): """ This is the attention block where the output is computed as - ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))` (plus another skip connection). 
""" @@ -144,7 +144,7 @@ class OlmoAttention(nn.Module): class OlmoMLP(nn.Module): """ This is the MLP block where the output is computed as - ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + `MLP(LN(x))` in `MLP(LN(x + Attention(LN(x))))` (plus another skip connection). """ @@ -193,7 +193,7 @@ class OlmoMLP(nn.Module): class OlmoDecoderLayer(nn.Module): """ This is a typical transformer block where the output is - computed as ``MLP(LN(x + Attention(LN(x))))`` + computed as `MLP(LN(x + Attention(LN(x))))` (plus another skip connection). """ diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index a0ae9923ad76e..7e39f6dff25e7 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -69,7 +69,7 @@ from vllm.transformers_utils.configs import Olmo3Config class Olmo2Attention(nn.Module): """ This is the attention block where the output is computed as - ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))` (plus another skip connection). """ @@ -190,7 +190,7 @@ class Olmo2Attention(nn.Module): class Olmo2MLP(nn.Module): """ This is the MLP block where the output is computed as - ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))`` + `MLP(x)` in `LN(MLP(x + LN(Attention(x))))` (plus another skip connection). """ @@ -235,7 +235,7 @@ class Olmo2MLP(nn.Module): class Olmo2DecoderLayer(nn.Module): """ This is a typical transformer block where the output is - computed as ``MLP(LN(x + Attention(LN(x))))`` + computed as `MLP(LN(x + Attention(LN(x))))` (plus another skip connection). """ diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index dd7cbf54857f1..cc6c9b4e72d76 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -166,7 +166,7 @@ class VisualTokenizer(torch.nn.Module): # e.g., for hidden_stride=2, this leads to a token length reduction: # 1024 -> 256 for aimv2 if self.config.hidden_stride > 1: - # this `d` maybe different from the above `d`` + # this `d` maybe different from the above `d` n, L, d = features.shape sqrt_l = int(L**0.5) assert sqrt_l**2 == L, ( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 71abfe98813da..4cac6e6133cdb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -99,13 +99,13 @@ class AutoWeightsLoader: the weights only once. The weight loading logic for individual modules can be overridden - by defining a ``load_weights`` method. + by defining a `load_weights` method. Similarly, the weight loading logic for individual parameters can be - overridden by defining a ``weight_loader`` method. + overridden by defining a `weight_loader` method. Detailed weight loading information can be viewed by setting the - environment variable ``VLLM_LOGGING_LEVEL=DEBUG``. + environment variable `VLLM_LOGGING_LEVEL=DEBUG`. """ # Models trained using early version ColossalAI @@ -372,9 +372,9 @@ def flatten_bn( concat: bool = False, ) -> list[torch.Tensor] | torch.Tensor: """ - Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. + Flatten the `B` and `N` dimensions of batched multimodal inputs. - The input tensor should have shape ``(B, N, ...)```. + The input tensor should have shape `(B, N, ...)`. 
""" if isinstance(x, torch.Tensor): return x.flatten(0, 1) @@ -424,12 +424,12 @@ def _merge_multimodal_embeddings( is_multimodal: torch.Tensor, ) -> torch.Tensor: """ - Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the - positions in ``inputs_embeds`` corresponding to placeholder tokens in - ``input_ids``. + Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the + positions in `inputs_embeds` corresponding to placeholder tokens in + `input_ids`. Note: - This updates ``inputs_embeds`` in place. + This updates `inputs_embeds` in place. """ if len(multimodal_embeddings) == 0: return inputs_embeds @@ -475,14 +475,14 @@ def merge_multimodal_embeddings( placeholder_token_id: int | list[int], ) -> torch.Tensor: """ - Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the - positions in ``inputs_embeds`` corresponding to placeholder tokens in - ``input_ids``. + Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the + positions in `inputs_embeds` corresponding to placeholder tokens in + `input_ids`. - ``placeholder_token_id`` can be a list of token ids (e.g, token ids + `placeholder_token_id` can be a list of token ids (e.g, token ids of img_start, img_break, and img_end tokens) when needed: This means - the order of these tokens in the ``input_ids`` MUST MATCH the order of - their embeddings in ``multimodal_embeddings`` since we need to + the order of these tokens in the `input_ids` MUST MATCH the order of + their embeddings in `multimodal_embeddings` since we need to slice-merge instead of individually scattering. For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where @@ -497,7 +497,7 @@ def merge_multimodal_embeddings( input_ids for a correct embedding merge. Note: - This updates ``inputs_embeds`` in place. + This updates `inputs_embeds` in place. """ if isinstance(placeholder_token_id, list): is_multimodal = isin_list(input_ids, placeholder_token_id) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index fd21a3244eb35..d3a91feab64d9 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -70,7 +70,7 @@ class BasevLLMParameter(Parameter): # NOTE(@ksayers) some models such as mamba_mixer2 override the # weight loader to support custom loading. In the future, model-specific # weight loading should be implemented via Model.load_weights. 
In the - # meantime, support deleting and overriding `weight_loader`` attribute + # meantime, support deleting and overriding `weight_loader` attribute if self._weight_loader is None: raise AttributeError( f"{self.__class__.__name__} weight_loader attribute has been deleted" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index b47e82a19d70a..d4477d8c855c0 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -332,8 +332,8 @@ class PromptInsertion(PromptUpdate): Example: - For each image, insert a number of ```` feature placeholders - equal to the feature size of the vision encoder after the ```` token: + For each image, insert a number of `` feature placeholders + equal to the feature size of the vision encoder after the `` token: ```python PromptInsertion( @@ -353,7 +353,7 @@ class PromptInsertion(PromptUpdate): ) ``` - Insert these tokens after a prefix ``Images:``: + Insert these tokens after a prefix `Images:`: ```python PromptInsertion( @@ -401,8 +401,8 @@ class PromptReplacement(PromptUpdate): Example: - For each image, replace one ```` input placeholder in the prompt - with a number of ```` feature placeholders + For each image, replace one `` input placeholder in the prompt + with a number of `` feature placeholders equal to the feature size of the vision encoder: ```python @@ -413,8 +413,8 @@ class PromptReplacement(PromptUpdate): ) ``` - As above, but further pad the feature placeholders with ```` - and ```, which are not supposed to be passed to the vision + As above, but further pad the feature placeholders with `` + and ``, which are not supposed to be passed to the vision encoder: ```python diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 1e1b4acdb8715..0ac10bd08b3f4 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -307,7 +307,7 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by ``model_config``. + The model is identified by `model_config`. """ processor = self.create_processor(model_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) @@ -340,7 +340,7 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by ``model_config``. + The model is identified by `model_config`. """ processor = self.create_processor(model_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d1d94048c0c46..4680050965bde 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -75,7 +75,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = { "0x74bd": "AMD_Instinct_MI300X_HF", } -# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`` +# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES` if "HIP_VISIBLE_DEVICES" in os.environ: val = os.environ["HIP_VISIBLE_DEVICES"] if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None): diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 5e109cccfe761..5799f97b8038d 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -168,7 +168,7 @@ class XPUPlatform(Platform): parallel_config.distributed_executor_backend = "uni" elif parallel_config.distributed_executor_backend == "mp": # FIXME(kunshang): - # spawn needs calling `if __name__ == '__main__':`` + # spawn needs calling `if __name__ == '__main__':` # fork is not supported for xpu start new process. 
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn": os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 76b89634f508c..3f583b393e204 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -306,10 +306,10 @@ class SamplingParams( ) def __post_init__(self) -> None: - # how we deal with `best_of``: - # if `best_of`` is not set, we default to `n`; - # if `best_of`` is set, we set `n`` to `best_of`, - # and set `_real_n`` to the original `n`. + # how we deal with `best_of`: + # if `best_of` is not set, we default to `n`; + # if `best_of` is set, we set `n` to `best_of`, + # and set `_real_n` to the original `n`. # when we return the result, we will check # if we need to return `n` or `_real_n` results if self.best_of: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 6c69e3fce7a15..fce4111f19bb2 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -21,7 +21,7 @@ from vllm.utils import cdiv, has_deep_gemm @functools.cache def is_deep_gemm_supported() -> bool: - """Return ``True`` if DeepGEMM is supported on the current platform. + """Return `True` if DeepGEMM is supported on the current platform. Currently, only Hopper and Blackwell GPUs are supported. """ is_supported_arch = current_platform.is_cuda() and ( @@ -33,7 +33,7 @@ def is_deep_gemm_supported() -> bool: @functools.cache def is_deep_gemm_e8m0_used() -> bool: - """Return ``True`` if vLLM is configured to use DeepGEMM " + """Return `True` if vLLM is configured to use DeepGEMM " "E8M0 scale on a Hopper or Blackwell-class GPU. """ if not is_deep_gemm_supported(): @@ -311,9 +311,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor): """Return a global difference metric for unit tests. DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element - error, causing ``torch.testing.assert_close`` to fail. Instead of checking + error, causing `torch.testing.assert_close` to fail. Instead of checking every element, we compute a cosine-style similarity over the whole tensor - and report ``1 - sim``. Once kernel accuracy improves this helper can be + and report `1 - sim`. Once kernel accuracy improves this helper can be removed. 
""" diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 24b80e389e838..d7e4ea2e03884 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -34,7 +34,7 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get( @functools.cache def has_flashinfer() -> bool: - """Return ``True`` if FlashInfer is available.""" + """Return `True` if FlashInfer is available.""" # Use find_spec to check if the module exists without importing it # This avoids potential CUDA initialization side effects if importlib.util.find_spec("flashinfer") is None: @@ -114,13 +114,13 @@ autotune = _lazy_import_wrapper( @functools.cache def has_flashinfer_comm() -> bool: - """Return ``True`` if FlashInfer comm module is available.""" + """Return `True` if FlashInfer comm module is available.""" return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None @functools.cache def has_flashinfer_all2all() -> bool: - """Return ``True`` if FlashInfer mnnvl all2all is available.""" + """Return `True` if FlashInfer mnnvl all2all is available.""" if not has_flashinfer_comm(): return False @@ -141,7 +141,7 @@ def has_flashinfer_all2all() -> bool: @functools.cache def has_flashinfer_moe() -> bool: - """Return ``True`` if FlashInfer MoE module is available.""" + """Return `True` if FlashInfer MoE module is available.""" return ( has_flashinfer() and importlib.util.find_spec("flashinfer.fused_moe") is not None @@ -150,7 +150,7 @@ def has_flashinfer_moe() -> bool: @functools.cache def has_flashinfer_cutlass_fused_moe() -> bool: - """Return ``True`` if FlashInfer CUTLASS fused MoE is available.""" + """Return `True` if FlashInfer CUTLASS fused MoE is available.""" if not has_flashinfer_moe(): return False @@ -171,7 +171,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool: @functools.cache def has_nvidia_artifactory() -> bool: - """Return ``True`` if NVIDIA's artifactory is accessible. + """Return `True` if NVIDIA's artifactory is accessible. This checks connectivity to the kernel inference library artifactory which is required for downloading certain cubin kernels like TRTLLM FHMA. @@ -218,9 +218,9 @@ def _force_use_trtllm_attention(env_value: bool | None) -> bool | None: def force_use_trtllm_attention() -> bool | None: """ - Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set, - return ``True`` if TRTLLM attention is forced to be used, - return ``False`` if TRTLLM attention is forced to be not used. + Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set, + return `True` if TRTLLM attention is forced to be used, + return `False` if TRTLLM attention is forced to be not used. """ return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION) @@ -244,7 +244,7 @@ def use_trtllm_attention( has_sinks: bool = False, has_spec: bool = False, ) -> bool: - """Return ``True`` if TRTLLM attention is used.""" + """Return `True` if TRTLLM attention is used.""" force_use_trtllm = force_use_trtllm_attention() # Environment variable is set to 0 - respect it diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6c9a77ccb2b6a..625ff28f2b4c3 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -26,17 +26,17 @@ from vllm.v1.kv_cache_interface import ( from vllm.v1.request import Request # BlockHash represents the hash of a single KV-cache block used for -# prefix caching. Treating it as a distinct type from ``bytes`` helps +# prefix caching. 
Treating it as a distinct type from `bytes` helps # catch accidental misuse when passing around raw byte strings. BlockHash = NewType("BlockHash", bytes) -# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID. +# `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID. # It is represented as raw bytes for compactness and efficiency. The helper -# functions below pack/unpack the ``BlockHash`` and group id into/from the key. +# functions below pack/unpack the `BlockHash` and group id into/from the key. BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes) # ExternalBlockHash is used for reproducible prefix-cache block hashing. -# It's a union of ``bytes`` and ``int`` to keep backward compatibility +# It's a union of `bytes` and `int` to keep backward compatibility # after we default block hashing to use sha256 bytes. ExternalBlockHash: TypeAlias = bytes | int @@ -44,7 +44,7 @@ ExternalBlockHash: TypeAlias = bytes | int def make_block_hash_with_group_id( block_hash: BlockHash, group_id: int ) -> BlockHashWithGroupId: - """Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``. + """Pack a `BlockHash` and group id into a `BlockHashWithGroupId`. The group id is encoded using 4 bytes in big-endian order and appended to the block hash bytes. This representation avoids creating tuples while @@ -54,12 +54,12 @@ def make_block_hash_with_group_id( def get_block_hash(key: BlockHashWithGroupId) -> BlockHash: - """Extract the ``BlockHash`` from a ``BlockHashWithGroupId``.""" + """Extract the `BlockHash` from a `BlockHashWithGroupId`.""" return BlockHash(key[:-4]) def get_group_id(key: BlockHashWithGroupId) -> int: - """Extract the group id from a ``BlockHashWithGroupId``.""" + """Extract the group id from a `BlockHashWithGroupId`.""" return int.from_bytes(key[-4:], "big", signed=False) diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index d3cf457ab5da4..5b57df2d472c8 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -128,7 +128,7 @@ class CPUWorker(Worker): "Please try to bind threads manually." ) - # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`` + # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]` selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore logical_cpu_list = [ x for x in logical_cpu_list if x.numa_node == selected_numa_node diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 9605ff6de9eb6..c19ed1fc0bea2 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -182,8 +182,8 @@ class TPUWorker: if isinstance(layer_spec, AttentionSpec): dtype = layer_spec.dtype - # Use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. + # Use an empty tensor instead of `None` to force Dynamo to pass + # it by reference, rather by specializing on the value `None`. tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device) kv_caches[layer_name] = tpu_kv_cache else:
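
For context, the change in this patch is purely mechanical: every inline rst-style double-backtick span in docstrings and comments becomes a Markdown single-backtick span, matching the Markdown-style docstrings referenced in the subject line. The snippet below is only a minimal, illustrative sketch of how such a rewrite could be scripted; it is not the tooling used to produce this patch, and the regex, the convert() helper, and the convert_backticks.py usage shown in the comments are assumptions for illustration.

# Illustrative sketch only: not the script used to produce this patch.
# It rewrites rst-style ``span`` markup to Markdown `span`, leaving longer
# backtick runs (such as ```python fences) untouched.
import re
import sys
from pathlib import Path

# Exactly two backticks on each side, with no backtick inside the span.
DOUBLE_BACKTICK_SPAN = re.compile(r"(?<!`)``(?!`)([^`\n]+?)``(?!`)")


def convert(text: str) -> str:
    # Replace ``span`` with `span`; longer backtick runs are not matched.
    return DOUBLE_BACKTICK_SPAN.sub(r"`\1`", text)


if __name__ == "__main__":
    # Usage (hypothetical): python convert_backticks.py path/to/file.py ...
    for path in map(Path, sys.argv[1:]):
        original = path.read_text(encoding="utf-8")
        converted = convert(original)
        if converted != original:
            path.write_text(converted, encoding="utf-8")
            print(f"updated {path}")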