mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-07 13:15:42 +08:00
[Docs] Replace rst style double-backtick with md single-backtick (#27091)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
483ea64611
commit
6c9fdbf725
@ -1251,7 +1251,7 @@ async def main() -> None:
|
|||||||
default=None,
|
default=None,
|
||||||
help="The model name used in the API. "
|
help="The model name used in the API. "
|
||||||
"If not specified, the model name will be the "
|
"If not specified, the model name will be the "
|
||||||
"same as the ``--model`` argument. ",
|
"same as the `--model` argument. ",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
@ -3,4 +3,4 @@ Loading Model weights with fastsafetensors
|
|||||||
|
|
||||||
Using the fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
|
Using the fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
|
||||||
|
|
||||||
To enable this feature, use the ``--load-format fastsafetensors`` command-line argument
|
To enable this feature, use the `--load-format fastsafetensors` command-line argument
|
||||||
|
|||||||
@ -67,17 +67,17 @@ class _HfExamplesInfo:
|
|||||||
|
|
||||||
is_available_online: bool = True
|
is_available_online: bool = True
|
||||||
"""
|
"""
|
||||||
Set this to ``False`` if the name of this architecture no longer exists on
|
Set this to `False` if the name of this architecture no longer exists on
|
||||||
the HF repo. To maintain backwards compatibility, we have not removed them
|
the HF repo. To maintain backwards compatibility, we have not removed them
|
||||||
from the main model registry, so without this flag the registry tests will
|
from the main model registry, so without this flag the registry tests will
|
||||||
fail.
|
fail.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
trust_remote_code: bool = False
|
trust_remote_code: bool = False
|
||||||
"""The ``trust_remote_code`` level required to load the model."""
|
"""The `trust_remote_code` level required to load the model."""
|
||||||
|
|
||||||
hf_overrides: dict[str, Any] = field(default_factory=dict)
|
hf_overrides: dict[str, Any] = field(default_factory=dict)
|
||||||
"""The ``hf_overrides`` required to load the model."""
|
"""The `hf_overrides` required to load the model."""
|
||||||
|
|
||||||
max_model_len: int | None = None
|
max_model_len: int | None = None
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -162,7 +162,7 @@ def check_logprobs_close(
|
|||||||
|
|
||||||
# Test prompt logprobs closeness
|
# Test prompt logprobs closeness
|
||||||
if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
|
if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
|
||||||
# Both sequences' prompt logprobs lists are not `None``
|
# Both sequences' prompt logprobs lists are not `None`
|
||||||
# (although individual list elements may be `None`);
|
# (although individual list elements may be `None`);
|
||||||
# for each token's logprobs:
|
# for each token's logprobs:
|
||||||
for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
|
for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""Ensure we perform lazy loading in vllm/__init__.py.
|
"""Ensure we perform lazy loading in vllm/__init__.py.
|
||||||
i.e: appears only within the ``if typing.TYPE_CHECKING:`` guard,
|
i.e: appears only within the `if typing.TYPE_CHECKING:` guard,
|
||||||
**except** for a short whitelist.
|
**except** for a short whitelist.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -21,7 +21,7 @@ def get_cache_dir() -> Path:
|
|||||||
@lru_cache
|
@lru_cache
|
||||||
def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path:
|
def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path:
|
||||||
"""
|
"""
|
||||||
Download an asset file from ``s3://vllm-public-assets``
|
Download an asset file from `s3://vllm-public-assets`
|
||||||
and return the path to the downloaded file.
|
and return the path to the downloaded file.
|
||||||
"""
|
"""
|
||||||
asset_directory = get_cache_dir() / "vllm_public_assets"
|
asset_directory = get_cache_dir() / "vllm_public_assets"
|
||||||
|
|||||||
@ -1231,7 +1231,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
default=None,
|
default=None,
|
||||||
help="The model name used in the API. "
|
help="The model name used in the API. "
|
||||||
"If not specified, the model name will be the "
|
"If not specified, the model name will be the "
|
||||||
"same as the ``--model`` argument. ",
|
"same as the `--model` argument. ",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
@ -138,8 +138,8 @@ def support_torch_compile(
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def cls_decorator_helper(cls: _T) -> _T:
|
def cls_decorator_helper(cls: _T) -> _T:
|
||||||
# helper to pass `dynamic_arg_dims`` to `_support_torch_compile``
|
# helper to pass `dynamic_arg_dims` to `_support_torch_compile`
|
||||||
# to avoid too much indentation for `_support_torch_compile``
|
# to avoid too much indentation for `_support_torch_compile`
|
||||||
if not hasattr(cls, "forward"):
|
if not hasattr(cls, "forward"):
|
||||||
raise TypeError("decorated class should have a forward method.")
|
raise TypeError("decorated class should have a forward method.")
|
||||||
sig = inspect.signature(cls.forward)
|
sig = inspect.signature(cls.forward)
|
||||||
|
|||||||
@ -66,15 +66,15 @@ class PoolerConfig:
|
|||||||
"""
|
"""
|
||||||
step_tag_id: int | None = None
|
step_tag_id: int | None = None
|
||||||
"""
|
"""
|
||||||
If set, only the score corresponding to the ``step_tag_id`` in the
|
If set, only the score corresponding to the `step_tag_id` in the
|
||||||
generated sentence should be returned. Otherwise, the scores for all tokens
|
generated sentence should be returned. Otherwise, the scores for all tokens
|
||||||
are returned.
|
are returned.
|
||||||
"""
|
"""
|
||||||
returned_token_ids: list[int] | None = None
|
returned_token_ids: list[int] | None = None
|
||||||
"""
|
"""
|
||||||
A list of indices for the vocabulary dimensions to be extracted,
|
A list of indices for the vocabulary dimensions to be extracted,
|
||||||
such as the token IDs of ``good_token`` and ``bad_token`` in the
|
such as the token IDs of `good_token` and `bad_token` in the
|
||||||
``math-shepherd-mistral-7b-prm`` model.
|
`math-shepherd-mistral-7b-prm` model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def compute_hash(self) -> str:
|
def compute_hash(self) -> str:
|
||||||
|
|||||||
@ -117,7 +117,7 @@ class ZmqEventPublisher(EventPublisher):
|
|||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
endpoint:
|
endpoint:
|
||||||
PUB address. Use ``tcp://*:5557`` to bind or ``tcp://host:5557`` to
|
PUB address. Use `tcp://*:5557` to bind or `tcp://host:5557` to
|
||||||
connect.
|
connect.
|
||||||
replay_endpoint:
|
replay_endpoint:
|
||||||
Optional ROUTER address for replay requests. When given, subscribers can
|
Optional ROUTER address for replay requests. When given, subscribers can
|
||||||
|
|||||||
@ -515,7 +515,7 @@ class StreamingHarmonyContext(HarmonyContext):
|
|||||||
|
|
||||||
def render_for_completion(self) -> list[int]:
|
def render_for_completion(self) -> list[int]:
|
||||||
# now this list of tokens as next turn's starting tokens
|
# now this list of tokens as next turn's starting tokens
|
||||||
# `<|start|>assistant``,
|
# `<|start|>assistant`,
|
||||||
# we need to process them in parser.
|
# we need to process them in parser.
|
||||||
rendered_tokens = super().render_for_completion()
|
rendered_tokens = super().render_for_completion()
|
||||||
|
|
||||||
|
|||||||
@ -1504,7 +1504,7 @@ class LLM:
|
|||||||
"""Return a snapshot of aggregated metrics from Prometheus.
|
"""Return a snapshot of aggregated metrics from Prometheus.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A ``MetricSnapshot`` instance capturing the current state
|
A `MetricSnapshot` instance capturing the current state
|
||||||
of all aggregated metrics from Prometheus.
|
of all aggregated metrics from Prometheus.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
|
|||||||
@ -26,12 +26,12 @@ class RenderConfig:
|
|||||||
|
|
||||||
max_length: int | None = None
|
max_length: int | None = None
|
||||||
"""Maximum allowable total input token length. If provided,
|
"""Maximum allowable total input token length. If provided,
|
||||||
token inputs longer than this raise ``ValueError``."""
|
token inputs longer than this raise `ValueError`."""
|
||||||
|
|
||||||
truncate_prompt_tokens: int | None = None
|
truncate_prompt_tokens: int | None = None
|
||||||
"""Number of tokens to keep. ``None`` means no truncation.
|
"""Number of tokens to keep. `None` means no truncation.
|
||||||
``0`` yields an empty list (and skips embeds).
|
`0` yields an empty list (and skips embeds).
|
||||||
``-1`` maps to ``model_config.max_model_len``."""
|
`-1` maps to `model_config.max_model_len`."""
|
||||||
|
|
||||||
add_special_tokens: bool | None = True
|
add_special_tokens: bool | None = True
|
||||||
"""Whether to add model-specific special tokens during tokenization."""
|
"""Whether to add model-specific special tokens during tokenization."""
|
||||||
@ -107,10 +107,10 @@ class BaseRenderer(ABC):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt_or_prompts: One of:
|
prompt_or_prompts: One of:
|
||||||
- ``str``: Single text prompt.
|
- `str`: Single text prompt.
|
||||||
- ``list[str]``: Batch of text prompts.
|
- `list[str]`: Batch of text prompts.
|
||||||
- ``list[int]``: Single pre-tokenized sequence.
|
- `list[int]`: Single pre-tokenized sequence.
|
||||||
- ``list[list[int]]``: Batch of pre-tokenized sequences.
|
- `list[list[int]]`: Batch of pre-tokenized sequences.
|
||||||
config: Render configuration controlling how prompts are prepared
|
config: Render configuration controlling how prompts are prepared
|
||||||
(e.g., tokenization and length handling).
|
(e.g., tokenization and length handling).
|
||||||
|
|
||||||
@ -134,9 +134,9 @@ class BaseRenderer(ABC):
|
|||||||
Convert text/token and/or base64-encoded embeddings inputs into
|
Convert text/token and/or base64-encoded embeddings inputs into
|
||||||
engine-ready prompt objects using a unified RenderConfig.
|
engine-ready prompt objects using a unified RenderConfig.
|
||||||
|
|
||||||
At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be
|
At least one of `prompt_or_prompts` or `prompt_embeds` must be
|
||||||
provided and non-empty. If both are omitted or empty (e.g., empty
|
provided and non-empty. If both are omitted or empty (e.g., empty
|
||||||
string and empty list), a ``ValueError`` is raised.
|
string and empty list), a `ValueError` is raised.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt_or_prompts: Text or token inputs to include.
|
prompt_or_prompts: Text or token inputs to include.
|
||||||
@ -150,7 +150,7 @@ class BaseRenderer(ABC):
|
|||||||
Engine-ready prompt objects.
|
Engine-ready prompt objects.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds``
|
ValueError: If both `prompt_or_prompts` and `prompt_embeds`
|
||||||
are omitted or empty (decoder prompt cannot be empty), or if
|
are omitted or empty (decoder prompt cannot be empty), or if
|
||||||
length limits are exceeded.
|
length limits are exceeded.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -327,7 +327,7 @@ def zip_enc_dec_prompts(
|
|||||||
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
|
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
|
||||||
instances.
|
instances.
|
||||||
|
|
||||||
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
|
`mm_processor_kwargs` may also be provided; if a dict is passed, the same
|
||||||
dictionary will be used for every encoder/decoder prompt. If an iterable is
|
dictionary will be used for every encoder/decoder prompt. If an iterable is
|
||||||
provided, it will be zipped with the encoder/decoder prompts.
|
provided, it will be zipped with the encoder/decoder prompts.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -27,7 +27,7 @@ __all__ = [
|
|||||||
|
|
||||||
|
|
||||||
def is_flashinfer_fp4_cutlass_moe_available() -> bool:
|
def is_flashinfer_fp4_cutlass_moe_available() -> bool:
|
||||||
"""Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
|
"""Return `True` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
|
||||||
return (
|
return (
|
||||||
envs.VLLM_USE_FLASHINFER_MOE_FP4
|
envs.VLLM_USE_FLASHINFER_MOE_FP4
|
||||||
and has_flashinfer_cutlass_fused_moe()
|
and has_flashinfer_cutlass_fused_moe()
|
||||||
|
|||||||
@ -887,11 +887,11 @@ def requant_weight_ue8m0_inplace(
|
|||||||
UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace.
|
UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
weight: Block-quantised weight tensor stored in ``torch.float8_e4m3fn``.
|
weight: Block-quantised weight tensor stored in `torch.float8_e4m3fn`.
|
||||||
Expected shape ``(..., M, K)``.
|
Expected shape `(..., M, K)`.
|
||||||
weight_scale: Corresponding per-block scale tensor (``torch.float32``)
|
weight_scale: Corresponding per-block scale tensor (`torch.float32`)
|
||||||
with shape ``(..., M // block_size[0], K // block_size[1])``.
|
with shape `(..., M // block_size[0], K // block_size[1])`.
|
||||||
block_size: 2-element iterable ``[block_m, block_k]`` describing the
|
block_size: 2-element iterable `[block_m, block_k]` describing the
|
||||||
block quantisation granularity.
|
block quantisation granularity.
|
||||||
"""
|
"""
|
||||||
if weight.numel() == 0:
|
if weight.numel() == 0:
|
||||||
|
|||||||
@ -64,7 +64,7 @@ from .utils import (
|
|||||||
class OlmoAttention(nn.Module):
|
class OlmoAttention(nn.Module):
|
||||||
"""
|
"""
|
||||||
This is the attention block where the output is computed as
|
This is the attention block where the output is computed as
|
||||||
``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
|
`Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
|
||||||
(plus another skip connection).
|
(plus another skip connection).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -144,7 +144,7 @@ class OlmoAttention(nn.Module):
|
|||||||
class OlmoMLP(nn.Module):
|
class OlmoMLP(nn.Module):
|
||||||
"""
|
"""
|
||||||
This is the MLP block where the output is computed as
|
This is the MLP block where the output is computed as
|
||||||
``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
|
`MLP(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
|
||||||
(plus another skip connection).
|
(plus another skip connection).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -193,7 +193,7 @@ class OlmoMLP(nn.Module):
|
|||||||
class OlmoDecoderLayer(nn.Module):
|
class OlmoDecoderLayer(nn.Module):
|
||||||
"""
|
"""
|
||||||
This is a typical transformer block where the output is
|
This is a typical transformer block where the output is
|
||||||
computed as ``MLP(LN(x + Attention(LN(x))))``
|
computed as `MLP(LN(x + Attention(LN(x))))`
|
||||||
(plus another skip connection).
|
(plus another skip connection).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -69,7 +69,7 @@ from vllm.transformers_utils.configs import Olmo3Config
|
|||||||
class Olmo2Attention(nn.Module):
|
class Olmo2Attention(nn.Module):
|
||||||
"""
|
"""
|
||||||
This is the attention block where the output is computed as
|
This is the attention block where the output is computed as
|
||||||
``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
|
`Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
|
||||||
(plus another skip connection).
|
(plus another skip connection).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -190,7 +190,7 @@ class Olmo2Attention(nn.Module):
|
|||||||
class Olmo2MLP(nn.Module):
|
class Olmo2MLP(nn.Module):
|
||||||
"""
|
"""
|
||||||
This is the MLP block where the output is computed as
|
This is the MLP block where the output is computed as
|
||||||
``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
|
`MLP(x)` in `LN(MLP(x + LN(Attention(x))))`
|
||||||
(plus another skip connection).
|
(plus another skip connection).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -235,7 +235,7 @@ class Olmo2MLP(nn.Module):
|
|||||||
class Olmo2DecoderLayer(nn.Module):
|
class Olmo2DecoderLayer(nn.Module):
|
||||||
"""
|
"""
|
||||||
This is a typical transformer block where the output is
|
This is a typical transformer block where the output is
|
||||||
computed as ``MLP(LN(x + Attention(LN(x))))``
|
computed as `MLP(LN(x + Attention(LN(x))))`
|
||||||
(plus another skip connection).
|
(plus another skip connection).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -166,7 +166,7 @@ class VisualTokenizer(torch.nn.Module):
|
|||||||
# e.g., for hidden_stride=2, this leads to a token length reduction:
|
# e.g., for hidden_stride=2, this leads to a token length reduction:
|
||||||
# 1024 -> 256 for aimv2
|
# 1024 -> 256 for aimv2
|
||||||
if self.config.hidden_stride > 1:
|
if self.config.hidden_stride > 1:
|
||||||
# this `d` maybe different from the above `d``
|
# this `d` may be different from the above `d`
|
||||||
n, L, d = features.shape
|
n, L, d = features.shape
|
||||||
sqrt_l = int(L**0.5)
|
sqrt_l = int(L**0.5)
|
||||||
assert sqrt_l**2 == L, (
|
assert sqrt_l**2 == L, (
|
||||||
|
|||||||
@ -99,13 +99,13 @@ class AutoWeightsLoader:
|
|||||||
the weights only once.
|
the weights only once.
|
||||||
|
|
||||||
The weight loading logic for individual modules can be overridden
|
The weight loading logic for individual modules can be overridden
|
||||||
by defining a ``load_weights`` method.
|
by defining a `load_weights` method.
|
||||||
|
|
||||||
Similarly, the weight loading logic for individual parameters can be
|
Similarly, the weight loading logic for individual parameters can be
|
||||||
overridden by defining a ``weight_loader`` method.
|
overridden by defining a `weight_loader` method.
|
||||||
|
|
||||||
Detailed weight loading information can be viewed by setting the
|
Detailed weight loading information can be viewed by setting the
|
||||||
environment variable ``VLLM_LOGGING_LEVEL=DEBUG``.
|
environment variable `VLLM_LOGGING_LEVEL=DEBUG`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Models trained using early version ColossalAI
|
# Models trained using early version ColossalAI
|
||||||
@ -372,9 +372,9 @@ def flatten_bn(
|
|||||||
concat: bool = False,
|
concat: bool = False,
|
||||||
) -> list[torch.Tensor] | torch.Tensor:
|
) -> list[torch.Tensor] | torch.Tensor:
|
||||||
"""
|
"""
|
||||||
Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs.
|
Flatten the `B` and `N` dimensions of batched multimodal inputs.
|
||||||
|
|
||||||
The input tensor should have shape ``(B, N, ...)```.
|
The input tensor should have shape `(B, N, ...)`.
|
||||||
"""
|
"""
|
||||||
if isinstance(x, torch.Tensor):
|
if isinstance(x, torch.Tensor):
|
||||||
return x.flatten(0, 1)
|
return x.flatten(0, 1)
|
||||||
@ -424,12 +424,12 @@ def _merge_multimodal_embeddings(
|
|||||||
is_multimodal: torch.Tensor,
|
is_multimodal: torch.Tensor,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
|
Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
|
||||||
positions in ``inputs_embeds`` corresponding to placeholder tokens in
|
positions in `inputs_embeds` corresponding to placeholder tokens in
|
||||||
``input_ids``.
|
`input_ids`.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
This updates ``inputs_embeds`` in place.
|
This updates `inputs_embeds` in place.
|
||||||
"""
|
"""
|
||||||
if len(multimodal_embeddings) == 0:
|
if len(multimodal_embeddings) == 0:
|
||||||
return inputs_embeds
|
return inputs_embeds
|
||||||
@ -475,14 +475,14 @@ def merge_multimodal_embeddings(
|
|||||||
placeholder_token_id: int | list[int],
|
placeholder_token_id: int | list[int],
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
|
Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
|
||||||
positions in ``inputs_embeds`` corresponding to placeholder tokens in
|
positions in `inputs_embeds` corresponding to placeholder tokens in
|
||||||
``input_ids``.
|
`input_ids`.
|
||||||
|
|
||||||
``placeholder_token_id`` can be a list of token ids (e.g, token ids
|
`placeholder_token_id` can be a list of token ids (e.g., token ids
|
||||||
of img_start, img_break, and img_end tokens) when needed: This means
|
of img_start, img_break, and img_end tokens) when needed: This means
|
||||||
the order of these tokens in the ``input_ids`` MUST MATCH the order of
|
the order of these tokens in the `input_ids` MUST MATCH the order of
|
||||||
their embeddings in ``multimodal_embeddings`` since we need to
|
their embeddings in `multimodal_embeddings` since we need to
|
||||||
slice-merge instead of individually scattering.
|
slice-merge instead of individually scattering.
|
||||||
|
|
||||||
For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
|
For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
|
||||||
@ -497,7 +497,7 @@ def merge_multimodal_embeddings(
|
|||||||
input_ids for a correct embedding merge.
|
input_ids for a correct embedding merge.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
This updates ``inputs_embeds`` in place.
|
This updates `inputs_embeds` in place.
|
||||||
"""
|
"""
|
||||||
if isinstance(placeholder_token_id, list):
|
if isinstance(placeholder_token_id, list):
|
||||||
is_multimodal = isin_list(input_ids, placeholder_token_id)
|
is_multimodal = isin_list(input_ids, placeholder_token_id)
|
||||||
|
|||||||
@ -70,7 +70,7 @@ class BasevLLMParameter(Parameter):
|
|||||||
# NOTE(@ksayers) some models such as mamba_mixer2 override the
|
# NOTE(@ksayers) some models such as mamba_mixer2 override the
|
||||||
# weight loader to support custom loading. In the future, model-specific
|
# weight loader to support custom loading. In the future, model-specific
|
||||||
# weight loading should be implemented via Model.load_weights. In the
|
# weight loading should be implemented via Model.load_weights. In the
|
||||||
# meantime, support deleting and overriding `weight_loader`` attribute
|
# meantime, support deleting and overriding `weight_loader` attribute
|
||||||
if self._weight_loader is None:
|
if self._weight_loader is None:
|
||||||
raise AttributeError(
|
raise AttributeError(
|
||||||
f"{self.__class__.__name__} weight_loader attribute has been deleted"
|
f"{self.__class__.__name__} weight_loader attribute has been deleted"
|
||||||
|
|||||||
@ -332,8 +332,8 @@ class PromptInsertion(PromptUpdate):
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
For each image, insert a number of ``<image>`` feature placeholders
|
For each image, insert a number of `<image>` feature placeholders
|
||||||
equal to the feature size of the vision encoder after the ``<s>`` token:
|
equal to the feature size of the vision encoder after the `<s>` token:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
PromptInsertion(
|
PromptInsertion(
|
||||||
@ -353,7 +353,7 @@ class PromptInsertion(PromptUpdate):
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Insert these tokens after a prefix ``Images:``:
|
Insert these tokens after a prefix `Images:`:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
PromptInsertion(
|
PromptInsertion(
|
||||||
@ -401,8 +401,8 @@ class PromptReplacement(PromptUpdate):
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
For each image, replace one ``<image>`` input placeholder in the prompt
|
For each image, replace one `<image>` input placeholder in the prompt
|
||||||
with a number of ``<image>`` feature placeholders
|
with a number of `<image>` feature placeholders
|
||||||
equal to the feature size of the vision encoder:
|
equal to the feature size of the vision encoder:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@ -413,8 +413,8 @@ class PromptReplacement(PromptUpdate):
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
As above, but further pad the feature placeholders with ``<image_bos>``
|
As above, but further pad the feature placeholders with `<image_bos>`
|
||||||
and `<image_eos>``, which are not supposed to be passed to the vision
|
and `<image_eos>`, which are not supposed to be passed to the vision
|
||||||
encoder:
|
encoder:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|||||||
@ -307,7 +307,7 @@ class MultiModalRegistry:
|
|||||||
"""
|
"""
|
||||||
Create dummy data for profiling the memory usage of a model.
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
The model is identified by ``model_config``.
|
The model is identified by `model_config`.
|
||||||
"""
|
"""
|
||||||
processor = self.create_processor(model_config, cache=cache)
|
processor = self.create_processor(model_config, cache=cache)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
@ -340,7 +340,7 @@ class MultiModalRegistry:
|
|||||||
"""
|
"""
|
||||||
Create dummy data for profiling the memory usage of a model.
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
The model is identified by ``model_config``.
|
The model is identified by `model_config`.
|
||||||
"""
|
"""
|
||||||
processor = self.create_processor(model_config, cache=cache)
|
processor = self.create_processor(model_config, cache=cache)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
|
|||||||
@ -75,7 +75,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
|
|||||||
"0x74bd": "AMD_Instinct_MI300X_HF",
|
"0x74bd": "AMD_Instinct_MI300X_HF",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
|
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
|
||||||
if "HIP_VISIBLE_DEVICES" in os.environ:
|
if "HIP_VISIBLE_DEVICES" in os.environ:
|
||||||
val = os.environ["HIP_VISIBLE_DEVICES"]
|
val = os.environ["HIP_VISIBLE_DEVICES"]
|
||||||
if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
|
if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
|
||||||
|
|||||||
@ -168,7 +168,7 @@ class XPUPlatform(Platform):
|
|||||||
parallel_config.distributed_executor_backend = "uni"
|
parallel_config.distributed_executor_backend = "uni"
|
||||||
elif parallel_config.distributed_executor_backend == "mp":
|
elif parallel_config.distributed_executor_backend == "mp":
|
||||||
# FIXME(kunshang):
|
# FIXME(kunshang):
|
||||||
# spawn needs calling `if __name__ == '__main__':``
|
# spawn needs calling `if __name__ == '__main__':`
|
||||||
# fork is not supported for xpu start new process.
|
# fork is not supported for xpu start new process.
|
||||||
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
|
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
|
||||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||||
|
|||||||
@ -306,10 +306,10 @@ class SamplingParams(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
# how we deal with `best_of``:
|
# how we deal with `best_of`:
|
||||||
# if `best_of`` is not set, we default to `n`;
|
# if `best_of` is not set, we default to `n`;
|
||||||
# if `best_of`` is set, we set `n`` to `best_of`,
|
# if `best_of` is set, we set `n` to `best_of`,
|
||||||
# and set `_real_n`` to the original `n`.
|
# and set `_real_n` to the original `n`.
|
||||||
# when we return the result, we will check
|
# when we return the result, we will check
|
||||||
# if we need to return `n` or `_real_n` results
|
# if we need to return `n` or `_real_n` results
|
||||||
if self.best_of:
|
if self.best_of:
|
||||||
|
|||||||
@ -21,7 +21,7 @@ from vllm.utils import cdiv, has_deep_gemm
|
|||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def is_deep_gemm_supported() -> bool:
|
def is_deep_gemm_supported() -> bool:
|
||||||
"""Return ``True`` if DeepGEMM is supported on the current platform.
|
"""Return `True` if DeepGEMM is supported on the current platform.
|
||||||
Currently, only Hopper and Blackwell GPUs are supported.
|
Currently, only Hopper and Blackwell GPUs are supported.
|
||||||
"""
|
"""
|
||||||
is_supported_arch = current_platform.is_cuda() and (
|
is_supported_arch = current_platform.is_cuda() and (
|
||||||
@ -33,7 +33,7 @@ def is_deep_gemm_supported() -> bool:
|
|||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def is_deep_gemm_e8m0_used() -> bool:
|
def is_deep_gemm_e8m0_used() -> bool:
|
||||||
"""Return ``True`` if vLLM is configured to use DeepGEMM "
|
"""Return `True` if vLLM is configured to use DeepGEMM
|
||||||
"E8M0 scale on a Hopper or Blackwell-class GPU.
|
E8M0 scale on a Hopper or Blackwell-class GPU.
|
||||||
"""
|
"""
|
||||||
if not is_deep_gemm_supported():
|
if not is_deep_gemm_supported():
|
||||||
@ -311,9 +311,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor):
|
|||||||
"""Return a global difference metric for unit tests.
|
"""Return a global difference metric for unit tests.
|
||||||
|
|
||||||
DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
|
DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
|
||||||
error, causing ``torch.testing.assert_close`` to fail. Instead of checking
|
error, causing `torch.testing.assert_close` to fail. Instead of checking
|
||||||
every element, we compute a cosine-style similarity over the whole tensor
|
every element, we compute a cosine-style similarity over the whole tensor
|
||||||
and report ``1 - sim``. Once kernel accuracy improves this helper can be
|
and report `1 - sim`. Once kernel accuracy improves this helper can be
|
||||||
removed.
|
removed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -34,7 +34,7 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get(
|
|||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def has_flashinfer() -> bool:
|
def has_flashinfer() -> bool:
|
||||||
"""Return ``True`` if FlashInfer is available."""
|
"""Return `True` if FlashInfer is available."""
|
||||||
# Use find_spec to check if the module exists without importing it
|
# Use find_spec to check if the module exists without importing it
|
||||||
# This avoids potential CUDA initialization side effects
|
# This avoids potential CUDA initialization side effects
|
||||||
if importlib.util.find_spec("flashinfer") is None:
|
if importlib.util.find_spec("flashinfer") is None:
|
||||||
@ -114,13 +114,13 @@ autotune = _lazy_import_wrapper(
|
|||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def has_flashinfer_comm() -> bool:
|
def has_flashinfer_comm() -> bool:
|
||||||
"""Return ``True`` if FlashInfer comm module is available."""
|
"""Return `True` if FlashInfer comm module is available."""
|
||||||
return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None
|
return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None
|
||||||
|
|
||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def has_flashinfer_all2all() -> bool:
|
def has_flashinfer_all2all() -> bool:
|
||||||
"""Return ``True`` if FlashInfer mnnvl all2all is available."""
|
"""Return `True` if FlashInfer mnnvl all2all is available."""
|
||||||
if not has_flashinfer_comm():
|
if not has_flashinfer_comm():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -141,7 +141,7 @@ def has_flashinfer_all2all() -> bool:
|
|||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def has_flashinfer_moe() -> bool:
|
def has_flashinfer_moe() -> bool:
|
||||||
"""Return ``True`` if FlashInfer MoE module is available."""
|
"""Return `True` if FlashInfer MoE module is available."""
|
||||||
return (
|
return (
|
||||||
has_flashinfer()
|
has_flashinfer()
|
||||||
and importlib.util.find_spec("flashinfer.fused_moe") is not None
|
and importlib.util.find_spec("flashinfer.fused_moe") is not None
|
||||||
@ -150,7 +150,7 @@ def has_flashinfer_moe() -> bool:
|
|||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def has_flashinfer_cutlass_fused_moe() -> bool:
|
def has_flashinfer_cutlass_fused_moe() -> bool:
|
||||||
"""Return ``True`` if FlashInfer CUTLASS fused MoE is available."""
|
"""Return `True` if FlashInfer CUTLASS fused MoE is available."""
|
||||||
if not has_flashinfer_moe():
|
if not has_flashinfer_moe():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -171,7 +171,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
|
|||||||
|
|
||||||
@functools.cache
|
@functools.cache
|
||||||
def has_nvidia_artifactory() -> bool:
|
def has_nvidia_artifactory() -> bool:
|
||||||
"""Return ``True`` if NVIDIA's artifactory is accessible.
|
"""Return `True` if NVIDIA's artifactory is accessible.
|
||||||
|
|
||||||
This checks connectivity to the kernel inference library artifactory
|
This checks connectivity to the kernel inference library artifactory
|
||||||
which is required for downloading certain cubin kernels like TRTLLM FHMA.
|
which is required for downloading certain cubin kernels like TRTLLM FHMA.
|
||||||
@ -218,9 +218,9 @@ def _force_use_trtllm_attention(env_value: bool | None) -> bool | None:
|
|||||||
|
|
||||||
def force_use_trtllm_attention() -> bool | None:
|
def force_use_trtllm_attention() -> bool | None:
|
||||||
"""
|
"""
|
||||||
Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set,
|
Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set,
|
||||||
return ``True`` if TRTLLM attention is forced to be used,
|
return `True` if TRTLLM attention is forced to be used,
|
||||||
return ``False`` if TRTLLM attention is forced to be not used.
|
return `False` if TRTLLM attention is forced to be not used.
|
||||||
"""
|
"""
|
||||||
return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
|
return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
|
||||||
|
|
||||||
@ -244,7 +244,7 @@ def use_trtllm_attention(
|
|||||||
has_sinks: bool = False,
|
has_sinks: bool = False,
|
||||||
has_spec: bool = False,
|
has_spec: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Return ``True`` if TRTLLM attention is used."""
|
"""Return `True` if TRTLLM attention is used."""
|
||||||
force_use_trtllm = force_use_trtllm_attention()
|
force_use_trtllm = force_use_trtllm_attention()
|
||||||
|
|
||||||
# Environment variable is set to 0 - respect it
|
# Environment variable is set to 0 - respect it
|
||||||
|
|||||||
@ -26,17 +26,17 @@ from vllm.v1.kv_cache_interface import (
|
|||||||
from vllm.v1.request import Request
|
from vllm.v1.request import Request
|
||||||
|
|
||||||
# BlockHash represents the hash of a single KV-cache block used for
|
# BlockHash represents the hash of a single KV-cache block used for
|
||||||
# prefix caching. Treating it as a distinct type from ``bytes`` helps
|
# prefix caching. Treating it as a distinct type from `bytes` helps
|
||||||
# catch accidental misuse when passing around raw byte strings.
|
# catch accidental misuse when passing around raw byte strings.
|
||||||
BlockHash = NewType("BlockHash", bytes)
|
BlockHash = NewType("BlockHash", bytes)
|
||||||
|
|
||||||
# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID.
|
# `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID.
|
||||||
# It is represented as raw bytes for compactness and efficiency. The helper
|
# It is represented as raw bytes for compactness and efficiency. The helper
|
||||||
# functions below pack/unpack the ``BlockHash`` and group id into/from the key.
|
# functions below pack/unpack the `BlockHash` and group id into/from the key.
|
||||||
BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
|
BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
|
||||||
|
|
||||||
# ExternalBlockHash is used for reproducible prefix-cache block hashing.
|
# ExternalBlockHash is used for reproducible prefix-cache block hashing.
|
||||||
# It's a union of ``bytes`` and ``int`` to keep backward compatibility
|
# It's a union of `bytes` and `int` to keep backward compatibility
|
||||||
# after we default block hashing to use sha256 bytes.
|
# after we default block hashing to use sha256 bytes.
|
||||||
ExternalBlockHash: TypeAlias = bytes | int
|
ExternalBlockHash: TypeAlias = bytes | int
|
||||||
|
|
||||||
@ -44,7 +44,7 @@ ExternalBlockHash: TypeAlias = bytes | int
|
|||||||
def make_block_hash_with_group_id(
|
def make_block_hash_with_group_id(
|
||||||
block_hash: BlockHash, group_id: int
|
block_hash: BlockHash, group_id: int
|
||||||
) -> BlockHashWithGroupId:
|
) -> BlockHashWithGroupId:
|
||||||
"""Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``.
|
"""Pack a `BlockHash` and group id into a `BlockHashWithGroupId`.
|
||||||
|
|
||||||
The group id is encoded using 4 bytes in big-endian order and appended to
|
The group id is encoded using 4 bytes in big-endian order and appended to
|
||||||
the block hash bytes. This representation avoids creating tuples while
|
the block hash bytes. This representation avoids creating tuples while
|
||||||
@ -54,12 +54,12 @@ def make_block_hash_with_group_id(
|
|||||||
|
|
||||||
|
|
||||||
def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
|
def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
|
||||||
"""Extract the ``BlockHash`` from a ``BlockHashWithGroupId``."""
|
"""Extract the `BlockHash` from a `BlockHashWithGroupId`."""
|
||||||
return BlockHash(key[:-4])
|
return BlockHash(key[:-4])
|
||||||
|
|
||||||
|
|
||||||
def get_group_id(key: BlockHashWithGroupId) -> int:
|
def get_group_id(key: BlockHashWithGroupId) -> int:
|
||||||
"""Extract the group id from a ``BlockHashWithGroupId``."""
|
"""Extract the group id from a `BlockHashWithGroupId`."""
|
||||||
return int.from_bytes(key[-4:], "big", signed=False)
|
return int.from_bytes(key[-4:], "big", signed=False)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -128,7 +128,7 @@ class CPUWorker(Worker):
|
|||||||
"Please try to bind threads manually."
|
"Please try to bind threads manually."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]``
|
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
|
||||||
selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore
|
selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore
|
||||||
logical_cpu_list = [
|
logical_cpu_list = [
|
||||||
x for x in logical_cpu_list if x.numa_node == selected_numa_node
|
x for x in logical_cpu_list if x.numa_node == selected_numa_node
|
||||||
|
|||||||
@ -182,8 +182,8 @@ class TPUWorker:
|
|||||||
if isinstance(layer_spec, AttentionSpec):
|
if isinstance(layer_spec, AttentionSpec):
|
||||||
dtype = layer_spec.dtype
|
dtype = layer_spec.dtype
|
||||||
|
|
||||||
# Use an empty tensor instead of `None`` to force Dynamo to pass
|
# Use an empty tensor instead of `None` to force Dynamo to pass
|
||||||
# it by reference, rather by specializing on the value ``None``.
|
# it by reference, rather by specializing on the value `None`.
|
||||||
tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
|
tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
|
||||||
kv_caches[layer_name] = tpu_kv_cache
|
kv_caches[layer_name] = tpu_kv_cache
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user