From 46fae69cf04db85a3b187a5fdc061b21e24b9571 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 4 May 2025 06:59:24 +0800 Subject: [PATCH 1/3] [Misc] V0 fallback for `--enable-prompt-embeds` (#17615) Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 6 ++++++ vllm/inputs/preprocess.py | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f6f8fb69fb706..08dbb4c450393 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1237,6 +1237,12 @@ class EngineArgs: recommend_to_remove=False) return False + # No text embedding inputs so far. + if self.enable_prompt_embeds: + _raise_or_fallback(feature_name="--enable-prompt-embeds", + recommend_to_remove=False) + return False + # Only Fp16 and Bf16 dtypes since we only support FA. V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16] if model_config.dtype not in V1_SUPPORTED_DTYPES: diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 53e0a477a12df..fe4775b210a87 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,7 +6,6 @@ from typing import Any, Optional, Union, cast from typing_extensions import assert_never -from vllm import envs from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -306,8 +305,6 @@ class InputPreprocessor: if not self.model_config.enable_prompt_embeds: raise ValueError("You must set `--enable-prompt-embeds` to input " "`prompt_embeds`.") - if envs.VLLM_USE_V1: - raise ValueError("`prompt_embeds` is only available in V0.") prompt_embeds = parsed_content["prompt_embeds"] From d6484ef3c3a01dc89176ed49874a327303356bee Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 4 May 2025 03:42:43 +0100 Subject: [PATCH 2/3] Add full API docs and improve the UX of navigating them (#17485) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 2 +- .gitignore | 1 + docs/Makefile | 1 + docs/source/api/engine/async_llm_engine.md | 7 - docs/source/api/engine/index.md | 17 -- docs/source/api/engine/llm_engine.md | 7 - docs/source/api/inference_params.md | 21 -- docs/source/api/model/adapters.md | 9 - docs/source/api/model/index.md | 11 - docs/source/api/model/interfaces.md | 9 - docs/source/api/model/interfaces_base.md | 9 - docs/source/api/multimodal/index.md | 28 --- docs/source/api/multimodal/inputs.md | 49 ---- docs/source/api/multimodal/parse.md | 9 - docs/source/api/multimodal/processing.md | 9 - docs/source/api/multimodal/profiling.md | 9 - docs/source/api/multimodal/registry.md | 9 - docs/source/api/offline_inference/index.md | 9 - docs/source/api/offline_inference/llm.md | 7 - .../api/offline_inference/llm_inputs.md | 19 -- docs/source/api/summary.md | 133 ++++++++++ docs/source/autodoc2_docstring_parser.py | 21 ++ docs/source/conf.py | 128 +++++----- docs/source/design/arch_overview.md | 4 +- docs/source/features/compatibility_matrix.md | 6 +- docs/source/index.md | 7 +- docs/source/models/generative_models.md | 2 +- docs/source/models/pooling_models.md | 2 +- docs/source/serving/offline_inference.md | 4 +- examples/offline_inference/profiling.py | 2 +- requirements/docs.txt | 16 +- tests/conftest.py | 8 +- tests/tokenization/test_get_eos.py | 2 +- tests/utils.py | 2 +- tests/v1/core/test_scheduler.py | 2 +- vllm/attention/backends/mla/common.py | 2 + vllm/attention/backends/utils.py | 2 +- vllm/compilation/compiler_interface.py | 
2 +- vllm/config.py | 10 +- vllm/connections.py | 2 +- vllm/distributed/kv_transfer/__init__.py | 1 + vllm/engine/async_llm_engine.py | 98 ++++---- vllm/engine/llm_engine.py | 105 ++++---- vllm/engine/multiprocessing/client.py | 4 +- vllm/engine/multiprocessing/engine.py | 12 +- vllm/engine/output_processor/multi_step.py | 4 +- vllm/engine/output_processor/single_step.py | 12 +- vllm/entrypoints/llm.py | 62 ++--- vllm/entrypoints/openai/protocol.py | 19 +- vllm/entrypoints/openai/serving_engine.py | 4 +- vllm/executor/executor_base.py | 2 +- vllm/inputs/__init__.py | 2 +- vllm/inputs/data.py | 42 ++-- vllm/inputs/preprocess.py | 22 +- vllm/inputs/registry.py | 8 +- vllm/logger.py | 6 +- vllm/lora/ops/triton_ops/__init__.py | 4 +- .../{lora_expand.py => lora_expand_op.py} | 0 .../{lora_shrink.py => lora_shrink_op.py} | 0 .../layers/rejection_sampler.py | 31 +-- vllm/model_executor/layers/sampler.py | 2 +- .../layers/typical_acceptance_sampler.py | 15 +- vllm/model_executor/models/blip2.py | 5 +- vllm/model_executor/models/interfaces.py | 4 +- vllm/model_executor/models/llava.py | 5 +- vllm/model_executor/models/llava_next.py | 7 +- vllm/model_executor/models/mistral3.py | 5 +- vllm/model_executor/models/molmo.py | 2 +- vllm/model_executor/models/phi4mm_utils.py | 4 +- vllm/model_executor/models/pixtral.py | 4 +- vllm/model_executor/models/qwen_vl.py | 2 +- vllm/model_executor/models/registry.py | 12 +- vllm/model_executor/models/utils.py | 2 +- vllm/multimodal/__init__.py | 7 +- vllm/multimodal/base.py | 44 ++-- vllm/multimodal/inputs.py | 231 +++++++++--------- vllm/multimodal/parse.py | 10 +- vllm/multimodal/processing.py | 220 ++++++++--------- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/registry.py | 20 +- vllm/multimodal/utils.py | 41 ++-- vllm/platforms/cpu.py | 2 - vllm/platforms/cuda.py | 8 +- vllm/platforms/interface.py | 6 +- vllm/profiler/__init__.py | 7 - vllm/sequence.py | 14 +- .../spec_decode/smaller_tp_proposer_worker.py | 3 +- vllm/transformers_utils/configs/dbrx.py | 3 +- vllm/transformers_utils/configs/exaone.py | 44 ++-- vllm/transformers_utils/tokenizer.py | 8 +- vllm/utils.py | 24 +- vllm/v1/attention/backends/mla/common.py | 2 + vllm/v1/core/kv_cache_manager.py | 2 + vllm/v1/engine/output_processor.py | 4 +- vllm/v1/sample/rejection_sampler.py | 2 +- vllm/v1/worker/gpu_worker.py | 7 +- vllm/v1/worker/utils.py | 6 +- vllm/worker/hpu_worker.py | 7 +- vllm/worker/multi_step_model_runner.py | 4 +- vllm/worker/worker.py | 7 +- vllm/worker/xpu_worker.py | 9 +- 101 files changed, 872 insertions(+), 980 deletions(-) delete mode 100644 docs/source/api/engine/async_llm_engine.md delete mode 100644 docs/source/api/engine/index.md delete mode 100644 docs/source/api/engine/llm_engine.md delete mode 100644 docs/source/api/inference_params.md delete mode 100644 docs/source/api/model/adapters.md delete mode 100644 docs/source/api/model/index.md delete mode 100644 docs/source/api/model/interfaces.md delete mode 100644 docs/source/api/model/interfaces_base.md delete mode 100644 docs/source/api/multimodal/index.md delete mode 100644 docs/source/api/multimodal/inputs.md delete mode 100644 docs/source/api/multimodal/parse.md delete mode 100644 docs/source/api/multimodal/processing.md delete mode 100644 docs/source/api/multimodal/profiling.md delete mode 100644 docs/source/api/multimodal/registry.md delete mode 100644 docs/source/api/offline_inference/index.md delete mode 100644 docs/source/api/offline_inference/llm.md delete mode 100644 
docs/source/api/offline_inference/llm_inputs.md create mode 100644 docs/source/api/summary.md create mode 100644 docs/source/autodoc2_docstring_parser.py rename vllm/lora/ops/triton_ops/{lora_expand.py => lora_expand_op.py} (100%) rename vllm/lora/ops/triton_ops/{lora_shrink.py => lora_shrink_op.py} (100%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 84ee991f56599..b3005b1b4b062 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -39,7 +39,7 @@ steps: - pip install -r ../../requirements/docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/api/inference_params.html + - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min source_file_dependencies: diff --git a/.gitignore b/.gitignore index 728213ceb74f0..2756c612b82f8 100644 --- a/.gitignore +++ b/.gitignore @@ -80,6 +80,7 @@ instance/ # Sphinx documentation docs/_build/ docs/source/getting_started/examples/ +docs/source/api/vllm # PyBuilder .pybuilder/ diff --git a/docs/Makefile b/docs/Makefile index 5b801f79d1f26..d3b429dfb9257 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -22,3 +22,4 @@ help: clean: @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) rm -rf "$(SOURCEDIR)/getting_started/examples" + rm -rf "$(SOURCEDIR)/api/vllm" diff --git a/docs/source/api/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md deleted file mode 100644 index 904feaa505164..0000000000000 --- a/docs/source/api/engine/async_llm_engine.md +++ /dev/null @@ -1,7 +0,0 @@ -# AsyncLLMEngine - -```{eval-rst} -.. autoclass:: vllm.AsyncLLMEngine - :members: - :show-inheritance: -``` diff --git a/docs/source/api/engine/index.md b/docs/source/api/engine/index.md deleted file mode 100644 index b6544d94afdf8..0000000000000 --- a/docs/source/api/engine/index.md +++ /dev/null @@ -1,17 +0,0 @@ -# vLLM Engine - -```{eval-rst} -.. automodule:: vllm.engine -``` - -```{eval-rst} -.. currentmodule:: vllm.engine -``` - -:::{toctree} -:caption: Engines -:maxdepth: 2 - -llm_engine -async_llm_engine -::: diff --git a/docs/source/api/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md deleted file mode 100644 index d6613ef5562dc..0000000000000 --- a/docs/source/api/engine/llm_engine.md +++ /dev/null @@ -1,7 +0,0 @@ -# LLMEngine - -```{eval-rst} -.. autoclass:: vllm.LLMEngine - :members: - :show-inheritance: -``` diff --git a/docs/source/api/inference_params.md b/docs/source/api/inference_params.md deleted file mode 100644 index 181c30cab9c4a..0000000000000 --- a/docs/source/api/inference_params.md +++ /dev/null @@ -1,21 +0,0 @@ -# Inference Parameters - -Inference parameters for vLLM APIs. - -(sampling-params)= - -## Sampling Parameters - -```{eval-rst} -.. autoclass:: vllm.SamplingParams - :members: -``` - -(pooling-params)= - -## Pooling Parameters - -```{eval-rst} -.. autoclass:: vllm.PoolingParams - :members: -``` diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md deleted file mode 100644 index e103a51d0070d..0000000000000 --- a/docs/source/api/model/adapters.md +++ /dev/null @@ -1,9 +0,0 @@ -# Model Adapters - -## Module Contents - -```{eval-rst} -.. 
automodule:: vllm.model_executor.models.adapters - :members: - :member-order: bysource -``` diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md deleted file mode 100644 index 8fee3a55c93de..0000000000000 --- a/docs/source/api/model/index.md +++ /dev/null @@ -1,11 +0,0 @@ -# Model Development - -## Submodules - -:::{toctree} -:maxdepth: 1 - -interfaces_base -interfaces -adapters -::: diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md deleted file mode 100644 index 55bee57f64faa..0000000000000 --- a/docs/source/api/model/interfaces.md +++ /dev/null @@ -1,9 +0,0 @@ -# Optional Interfaces - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.model_executor.models.interfaces - :members: - :member-order: bysource -``` diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md deleted file mode 100644 index 75d58d34228e9..0000000000000 --- a/docs/source/api/model/interfaces_base.md +++ /dev/null @@ -1,9 +0,0 @@ -# Base Model Interfaces - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.model_executor.models.interfaces_base - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md deleted file mode 100644 index 069ed53e545c5..0000000000000 --- a/docs/source/api/multimodal/index.md +++ /dev/null @@ -1,28 +0,0 @@ -(multi-modality)= - -# Multi-Modality - -vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. - -Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) -via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. - -Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). - -## Module Contents - -```{eval-rst} -.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY -``` - -## Submodules - -:::{toctree} -:maxdepth: 1 - -inputs -parse -processing -profiling -registry -::: diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md deleted file mode 100644 index 21bd938be9e89..0000000000000 --- a/docs/source/api/multimodal/inputs.md +++ /dev/null @@ -1,49 +0,0 @@ -# Input Definitions - -## User-facing inputs - -```{eval-rst} -.. autodata:: vllm.multimodal.inputs.MultiModalDataDict -``` - -## Internal data structures - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.PlaceholderRange - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autodata:: vllm.multimodal.inputs.NestedTensors -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalInputs - :members: - :show-inheritance: -``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md deleted file mode 100644 index 4676139efe626..0000000000000 --- a/docs/source/api/multimodal/parse.md +++ /dev/null @@ -1,9 +0,0 @@ -# Data Parsing - -## Module Contents - -```{eval-rst} -.. 
automodule:: vllm.multimodal.parse - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md deleted file mode 100644 index 0d81c8d3966ee..0000000000000 --- a/docs/source/api/multimodal/processing.md +++ /dev/null @@ -1,9 +0,0 @@ -# Data Processing - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.processing - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md deleted file mode 100644 index b455145212202..0000000000000 --- a/docs/source/api/multimodal/profiling.md +++ /dev/null @@ -1,9 +0,0 @@ -# Memory Profiling - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.profiling - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md deleted file mode 100644 index 0737a4385cf32..0000000000000 --- a/docs/source/api/multimodal/registry.md +++ /dev/null @@ -1,9 +0,0 @@ -# Registry - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.registry - :members: - :member-order: bysource -``` diff --git a/docs/source/api/offline_inference/index.md b/docs/source/api/offline_inference/index.md deleted file mode 100644 index ec2cc599d923c..0000000000000 --- a/docs/source/api/offline_inference/index.md +++ /dev/null @@ -1,9 +0,0 @@ -# Offline Inference - -:::{toctree} -:caption: Contents -:maxdepth: 1 - -llm -llm_inputs -::: diff --git a/docs/source/api/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md deleted file mode 100644 index 9f129d5e41686..0000000000000 --- a/docs/source/api/offline_inference/llm.md +++ /dev/null @@ -1,7 +0,0 @@ -# LLM Class - -```{eval-rst} -.. autoclass:: vllm.LLM - :members: - :show-inheritance: -``` diff --git a/docs/source/api/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md deleted file mode 100644 index 21f688a12c536..0000000000000 --- a/docs/source/api/offline_inference/llm_inputs.md +++ /dev/null @@ -1,19 +0,0 @@ -# LLM Inputs - -```{eval-rst} -.. autodata:: vllm.inputs.PromptType -``` - -```{eval-rst} -.. autoclass:: vllm.inputs.TextPrompt - :show-inheritance: - :members: - :member-order: bysource -``` - -```{eval-rst} -.. autoclass:: vllm.inputs.TokensPrompt - :show-inheritance: - :members: - :member-order: bysource -``` diff --git a/docs/source/api/summary.md b/docs/source/api/summary.md new file mode 100644 index 0000000000000..46de545f9ded4 --- /dev/null +++ b/docs/source/api/summary.md @@ -0,0 +1,133 @@ +# Summary + +(configuration)= + +## Configuration + +API documentation for vLLM's configuration classes. + +```{autodoc2-summary} + vllm.config.ModelConfig + vllm.config.CacheConfig + vllm.config.TokenizerPoolConfig + vllm.config.LoadConfig + vllm.config.ParallelConfig + vllm.config.SchedulerConfig + vllm.config.DeviceConfig + vllm.config.SpeculativeConfig + vllm.config.LoRAConfig + vllm.config.PromptAdapterConfig + vllm.config.MultiModalConfig + vllm.config.PoolerConfig + vllm.config.DecodingConfig + vllm.config.ObservabilityConfig + vllm.config.KVTransferConfig + vllm.config.CompilationConfig + vllm.config.VllmConfig +``` + +(offline-inference-api)= + +## Offline Inference + +LLM Class. + +```{autodoc2-summary} + vllm.LLM +``` + +LLM Inputs. + +```{autodoc2-summary} + vllm.inputs.PromptType + vllm.inputs.TextPrompt + vllm.inputs.TokensPrompt +``` + +## vLLM Engines + +Engine classes for offline and online inference. 
+ +```{autodoc2-summary} + vllm.LLMEngine + vllm.AsyncLLMEngine +``` + +## Inference Parameters + +Inference parameters for vLLM APIs. + +(sampling-params)= +(pooling-params)= + +```{autodoc2-summary} + vllm.SamplingParams + vllm.PoolingParams +``` + +(multi-modality)= + +## Multi-Modality + +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. + +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. + +Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). + +```{autodoc2-summary} + vllm.multimodal.MULTIMODAL_REGISTRY +``` + +### Inputs + +User-facing inputs. + +```{autodoc2-summary} + vllm.multimodal.inputs.MultiModalDataDict +``` + +Internal data structures. + +```{autodoc2-summary} + vllm.multimodal.inputs.PlaceholderRange + vllm.multimodal.inputs.NestedTensors + vllm.multimodal.inputs.MultiModalFieldElem + vllm.multimodal.inputs.MultiModalFieldConfig + vllm.multimodal.inputs.MultiModalKwargsItem + vllm.multimodal.inputs.MultiModalKwargs + vllm.multimodal.inputs.MultiModalInputs +``` + +### Data Parsing + +```{autodoc2-summary} + vllm.multimodal.parse +``` + +### Data Processing + +```{autodoc2-summary} + vllm.multimodal.processing +``` + +### Memory Profiling + +```{autodoc2-summary} + vllm.multimodal.profiling +``` + +### Registry + +```{autodoc2-summary} + vllm.multimodal.registry +``` + +## Model Development + +```{autodoc2-summary} + vllm.model_executor.models.interfaces_base + vllm.model_executor.models.interfaces + vllm.model_executor.models.adapters +``` diff --git a/docs/source/autodoc2_docstring_parser.py b/docs/source/autodoc2_docstring_parser.py new file mode 100644 index 0000000000000..41c49ed1c545a --- /dev/null +++ b/docs/source/autodoc2_docstring_parser.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +from docutils import nodes +from myst_parser.parsers.sphinx_ import MystParser +from sphinx.ext.napoleon import docstring + + +class NapoleonParser(MystParser): + + def parse(self, input_string: str, document: nodes.document) -> None: + # Get the Sphinx configuration + config = document.settings.env.config + + parsed_content = str( + docstring.GoogleDocstring( + str(docstring.NumpyDocstring(input_string, config)), + config, + )) + return super().parse(parsed_content, document) + + +Parser = NapoleonParser diff --git a/docs/source/conf.py b/docs/source/conf.py index c2ad6f9fa3a55..060649e43b96b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,16 +13,17 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. 
import datetime -import inspect import logging import os +import re import sys +from pathlib import Path import requests -from sphinx.ext import autodoc logger = logging.getLogger(__name__) -sys.path.append(os.path.abspath("../..")) +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.append(os.path.abspath(REPO_ROOT)) # -- Project information ----------------------------------------------------- @@ -40,8 +41,7 @@ extensions = [ "sphinx.ext.linkcode", "sphinx.ext.intersphinx", "sphinx_copybutton", - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", + "autodoc2", "myst_parser", "sphinxarg.ext", "sphinx_design", @@ -49,7 +49,22 @@ extensions = [ ] myst_enable_extensions = [ "colon_fence", + "fieldlist", ] +autodoc2_packages = [ + { + "path": "../../vllm", + "exclude_dirs": ["__pycache__", "third_party"], + }, +] +autodoc2_output_dir = "api" +autodoc2_render_plugin = "myst" +autodoc2_hidden_objects = ["dunder", "private", "inherited"] +autodoc2_docstring_parser_regexes = [ + (".*", "docs.source.autodoc2_docstring_parser"), +] +autodoc2_sort_names = True +autodoc2_index_template = None # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -77,6 +92,11 @@ html_theme_options = { 'repository_url': 'https://github.com/vllm-project/vllm', 'use_repository_button': True, 'use_edit_page_button': True, + # Prevents the full API being added to the left sidebar of every page. + # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB. + 'collapse_navbar': True, + # Makes API visible in the right sidebar on API reference pages. + 'show_toc_level': 3, } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -164,73 +184,64 @@ def linkcode_resolve(domain, info): return None if not info['module']: return None - filename = info['module'].replace('.', '/') - module = info['module'] - # try to determine the correct file and line number to link to - obj = sys.modules[module] + # Get path from module name + file = Path(f"{info['module'].replace('.', '/')}.py") + path = REPO_ROOT / file + if not path.exists(): + path = REPO_ROOT / file.with_suffix("") / "__init__.py" + if not path.exists(): + return None - # get as specific as we can - lineno: int = 0 - filename: str = "" - try: - for part in info['fullname'].split('.'): - obj = getattr(obj, part) + # Get the line number of the object + with open(path) as f: + lines = f.readlines() + name = info['fullname'].split(".")[-1] + pattern = fr"^( {{4}})*((def|class) )?{name}\b.*" + for lineno, line in enumerate(lines, 1): + if not line or line.startswith("#"): + continue + if re.match(pattern, line): + break - # Skip decorator wrappers by checking if the object is a function - # and has a __wrapped__ attribute (which decorators typically set) - while hasattr(obj, '__wrapped__'): - obj = obj.__wrapped__ + # If the line number is not found, return None + if lineno == len(lines): + return None - if not (inspect.isclass(obj) or inspect.isfunction(obj) - or inspect.ismethod(obj)): - obj = obj.__class__ # Get the class of the instance - - lineno = inspect.getsourcelines(obj)[1] - filename = (inspect.getsourcefile(obj) - or f"{filename}.py").split("vllm/", 1)[1] - except Exception: - # For some things, like a class member, won't work, so - # we'll use the line number of the parent (the class) - pass - - if filename.startswith("checkouts/"): + # If the line number is found, create the URL + 
filename = path.relative_to(REPO_ROOT) + if "checkouts" in path.parts: # a PR build on readthedocs - pr_number = filename.split("/")[1] - filename = filename.split("/", 2)[2] + pr_number = REPO_ROOT.name base, branch = get_repo_base_and_branch(pr_number) if base and branch: return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}" - # Otherwise, link to the source file on the main branch return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}" -# Mock out external dependencies here, otherwise the autodoc pages may be blank. +# Mock out external dependencies here, otherwise sphinx-argparse won't work. autodoc_mock_imports = [ + "huggingface_hub", + "pydantic", + "zmq", + "cloudpickle", + "aiohttp", + "starlette", "blake3", - "compressed_tensors", "cpuinfo", - "cv2", - "torch", "transformers", "psutil", - "prometheus_client", - "sentencepiece", "vllm._C", "PIL", "numpy", - 'triton', "tqdm", - "tensorizer", - "pynvml", - "outlines", - "xgrammar", - "librosa", - "soundfile", - "gguf", - "lark", - "decord", + # The mocks below are required by + # docs/source/serving/openai_compatible_server.md's + # vllm.entrypoints.openai.cli_args + "openai", + "fastapi", + "partial_json_parser", ] for mock_target in autodoc_mock_imports: @@ -241,18 +252,6 @@ for mock_target in autodoc_mock_imports: "been loaded into sys.modules when the sphinx build starts.", mock_target) - -class MockedClassDocumenter(autodoc.ClassDocumenter): - """Remove note about base class when a class is derived from object.""" - - def add_line(self, line: str, source: str, *lineno: int) -> None: - if line == " Bases: :py:class:`object`": - return - super().add_line(line, source, *lineno) - - -autodoc.ClassDocumenter = MockedClassDocumenter - intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "typing_extensions": @@ -264,7 +263,4 @@ intersphinx_mapping = { "psutil": ("https://psutil.readthedocs.io/en/stable", None), } -autodoc_preserve_defaults = True -autodoc_warningiserror = True - navigation_with_keys = False diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 7bed0a001d6f5..94bda8b5c58d5 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -52,8 +52,8 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -More API details can be found in the {doc}`Offline Inference -` section of the API docs. +More API details can be found in the [Offline Inference] +(#offline-inference-api) section of the API docs. The code for the `LLM` class can be found in . 
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 6056ca0d366b5..8865d26deaeda 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -42,7 +42,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h * [APC](#automatic-prefix-caching) * [LoRA](#lora-adapter) * prmpt adptr - * [SD](#spec_decode) + * [SD](#spec-decode) * CUDA graph * pooling * enc-dec @@ -122,7 +122,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h * * * -- * [SD](#spec_decode) +- * [SD](#spec-decode) * ✅ * ✅ * ❌ @@ -377,7 +377,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h * ✅ * [❌](gh-issue:8475) * ✅ -- * [SD](#spec_decode) +- * [SD](#spec-decode) * ✅ * ✅ * ✅ diff --git a/docs/source/index.md b/docs/source/index.md index 56ff7a485f58d..bbff7361f7528 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -194,11 +194,8 @@ contributing/vulnerability_management :caption: API Reference :maxdepth: 2 -api/offline_inference/index -api/engine/index -api/inference_params -api/multimodal/index -api/model/index +api/summary +api/vllm/vllm ::: % Latest news and acknowledgements diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 3291006ed668c..dd765e4a97658 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -14,7 +14,7 @@ Usually, this is automatically inferred so you don't have to specify it. ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. -See [Engine Arguments](#engine-args) for a list of options when initializing the model. +See for a list of options when initializing the model. ### `LLM.generate` diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 7daa0ec1de4de..8c8d1832d3821 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -60,7 +60,7 @@ which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. -See [Engine Arguments](#engine-args) for a list of options when initializing the model. +See for a list of options when initializing the model. ### `LLM.encode` diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 894878ed14e76..2621eda32542d 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -25,7 +25,7 @@ The available APIs depend on the type of model that is being run: Please refer to the above pages for more details about each API. :::{seealso} -[API Reference](/api/offline_inference/index) +[API Reference](#offline-inference-api) ::: (configuration-options)= @@ -33,7 +33,7 @@ Please refer to the above pages for more details about each API. ## Configuration Options This section lists the most common options for running the vLLM engine. -For a full list, refer to the [Engine Arguments](#engine-args) page. +For a full list, refer to the page. 
(model-resolution)= diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 9c818d0757345..99303950d39d2 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -14,7 +14,7 @@ import tqdm from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.profiler import layerwise_profile +from vllm.profiler.layerwise_profile import layerwise_profile from vllm.utils import FlexibleArgumentParser BATCH_SIZE_DEFAULT = 1 diff --git a/requirements/docs.txt b/requirements/docs.txt index cba86b52a9b93..401f714ae9f73 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,27 +1,15 @@ sphinx==8.2.3 sphinx-argparse==0.5.2 +sphinx-autodoc2==0.5.0 sphinx-book-theme==1.1.4 sphinx-copybutton==0.5.2 sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 myst-parser==4.0.1 msgspec -cloudpickle commonmark # Required by sphinx-argparse when using :markdownhelp: # packages to install to build the documentation cachetools -pydantic >= 2.8 -f https://download.pytorch.org/whl/cpu -torch -py-cpuinfo -transformers -mistral_common >= 1.5.4 -aiohttp -starlette -scipy -openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -requests -zmq +torch \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index b1b4af86fab72..fa979f1093bef 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -112,11 +112,11 @@ class AudioTestAssets(list[AudioAsset]): IMAGE_ASSETS = ImageTestAssets() -"""Singleton instance of :class:`ImageTestAssets`.""" +"""Singleton instance of {class}`ImageTestAssets`.""" VIDEO_ASSETS = VideoTestAssets() -"""Singleton instance of :class:`VideoTestAssets`.""" +"""Singleton instance of {class}`VideoTestAssets`.""" AUDIO_ASSETS = AudioTestAssets() -"""Singleton instance of :class:`AudioTestAssets`.""" +"""Singleton instance of {class}`AudioTestAssets`.""" @pytest.fixture(scope="function", autouse=True) @@ -724,7 +724,7 @@ def hf_runner(): class VllmRunner: """ The default value of some arguments have been modified from - :class:`~vllm.LLM` as follows: + {class}`~vllm.LLM` as follows: - `trust_remote_code`: Set to `True` instead of `False` for convenience. - `seed`: Set to `0` instead of `None` for test reproducibility. diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py index fc47bcb9de371..8942f88912830 100644 --- a/tests/tokenization/test_get_eos.py +++ b/tests/tokenization/test_get_eos.py @@ -2,7 +2,7 @@ """ This test file includes some cases where it is inappropriate to only get the `eos_token_id` from the tokenizer as defined by -:meth:`vllm.LLMEngine._get_eos_token_id`. +{meth}`vllm.LLMEngine._get_eos_token_id`. 
""" from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/tests/utils.py b/tests/utils.py index 8f8c102b73b8e..0983687e2ce9a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -952,7 +952,7 @@ def get_client_text_logprob_generations( completions: list[Completion]) -> list[TextTextLogprobs]: '''Operates on the output of a request made to an Open-AI-protocol completions endpoint; obtains top-rank logprobs for each token in - each :class:`SequenceGroup` + each {class}`SequenceGroup` ''' text_generations = get_client_text_generations(completions) text = ''.join(text_generations) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 9987688b02fad..bfe9df10d4d19 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -44,7 +44,7 @@ def create_scheduler( (None) Returns: - :class:`Scheduler` instance + {class}`Scheduler` instance ''' if max_model_len is None: max_model_len = max_num_batched_tokens diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 12d85b74244f4..0100c082aa213 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """ +# MLA Common Components + This file implements common components for MLA implementations. First we define: diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 89f1ea9b8a570..54ffd5c45ff91 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -550,7 +550,7 @@ def get_num_prefill_decode_query_kv_tokens( based on the attention metadata and the specified attention type. Args: - attn_metadata (FlashAttentionMetadata): Attention Metadata object. + attn_metadata (AttentionMetadata): Attention Metadata object. attn_type (AttentionType): The type of attention being used. Returns: Tuple[int, int, int]: A tuple containing three integers: diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index c5454ccdcbf7e..b7e7a79bef0b3 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -39,7 +39,7 @@ class CompilerInterface: Gather all the relevant information from the vLLM config, to compute a hash so that we can cache the compiled model. - See :meth:`VllmConfig.compute_hash` to check what information + See {meth}`VllmConfig.compute_hash` to check what information is already considered by default. This function should only consider the information that is specific to the compiler. """ diff --git a/vllm/config.py b/vllm/config.py index 91ef9dcdbd565..3bac36fcbbeaa 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1911,10 +1911,10 @@ class SchedulerConfig: cuda_graph_sizes: list[int] = field(default_factory=lambda: [512]) """Cuda graph capture sizes, default is 512. - 1. if one value is provided, then the capture list would follow the pattern: - [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] - 2. more than one value (e.g. 1 2 128) is provided, - then the capture list will follow the provided list.""" + 1. if one value is provided, then the capture list would follow the + pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] + 2. more than one value (e.g. 
1 2 128) is provided, then the capture list + will follow the provided list.""" delay_factor: float = 0.0 """Apply a delay (of delay factor multiplied by previous @@ -2888,7 +2888,7 @@ class PoolerConfig: pooling_type: Optional[str] = None """ The pooling method of the pooling model. This should be a key in - :class:`vllm.model_executor.layers.pooler.PoolingType`. + {class}`vllm.model_executor.layers.pooler.PoolingType`. """ normalize: Optional[bool] = None diff --git a/vllm/connections.py b/vllm/connections.py index 2c259bb7c3e64..9abc66050e18a 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -167,4 +167,4 @@ class HTTPConnection: global_http_connection = HTTPConnection() -"""The global :class:`HTTPConnection` instance used by vLLM.""" +"""The global {class}`HTTPConnection` instance used by vLLM.""" diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index ec07c6fe0d12d..a9f26607de49c 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_transfer_state import ( ensure_kv_transfer_initialized, get_kv_transfer_group, has_kv_transfer_group, is_v1_kv_transfer_group) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 50da9679d5aae..37bb12d442872 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -475,7 +475,7 @@ class _AsyncLLMEngine(LLMEngine): *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: - """Async version of :meth:`add_request`.""" + """Async version of {meth}`add_request`.""" if inputs is not None: prompt = inputs assert prompt is not None and params is not None @@ -582,20 +582,20 @@ async def build_guided_decoding_logits_processor_async( class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for :class:`LLMEngine`. + """An asynchronous wrapper for {class}`LLMEngine`. - This class is used to wrap the :class:`LLMEngine` class to make it + This class is used to wrap the {class}`LLMEngine` class to make it asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. The :class:`LLMEngine` is kicked by the + processing incoming requests. The {class}`LLMEngine` is kicked by the generate method when there are requests in the waiting queue. The generate - method yields the outputs from the :class:`LLMEngine` to the caller. + method yields the outputs from the {class}`LLMEngine` to the caller. Args: log_requests: Whether to log the requests. start_engine_loop: If True, the background task to run the engine will be automatically started in the generate call. - *args: Arguments for :class:`LLMEngine`. - **kwargs: Arguments for :class:`LLMEngine`. + *args: Arguments for {class}`LLMEngine`. + **kwargs: Arguments for {class}`LLMEngine`. """ _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine @@ -985,7 +985,7 @@ class AsyncLLMEngine(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` for more details about the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. 
@@ -1003,7 +1003,7 @@ class AsyncLLMEngine(EngineClient): Details: - If the engine is not running, start the background loop, which iteratively invokes - :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` + {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` to process the waiting requests. - Add the request to the engine's `RequestTracker`. On the next background loop, this request will be sent to @@ -1075,7 +1075,7 @@ class AsyncLLMEngine(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` for more details about the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. @@ -1089,46 +1089,48 @@ class AsyncLLMEngine(EngineClient): for the request. Details: - - If the engine is not running, start the background loop, - which iteratively invokes - :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. + - If the engine is not running, start the background loop, + which iteratively invokes + {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` + to process the waiting requests. + - Add the request to the engine's `RequestTracker`. + On the next background loop, this request will be sent to + the underlying engine. + Also, a corresponding `AsyncStream` will be created. + - Wait for the request outputs from `AsyncStream` and yield them. Example: - >>> # Please refer to entrypoints/api_server.py for - >>> # the complete example. - >>> - >>> # initialize the engine and the example input - >>> # note that engine_args here is AsyncEngineArgs instance - >>> engine = AsyncLLMEngine.from_engine_args(engine_args) - >>> example_input = { - >>> "input": "What is LLM?", - >>> "request_id": 0, - >>> } - >>> - >>> # start the generation - >>> results_generator = engine.encode( - >>> example_input["input"], - >>> PoolingParams(), - >>> example_input["request_id"]) - >>> - >>> # get the results - >>> final_output = None - >>> async for request_output in results_generator: - >>> if await request.is_disconnected(): - >>> # Abort the request if the client disconnects. - >>> await engine.abort(request_id) - >>> # Return or raise an error - >>> ... - >>> final_output = request_output - >>> - >>> # Process and return the final output - >>> ... + ``` + # Please refer to entrypoints/api_server.py for + # the complete example. + + # initialize the engine and the example input + # note that engine_args here is AsyncEngineArgs instance + engine = AsyncLLMEngine.from_engine_args(engine_args) + example_input = { + "input": "What is LLM?", + "request_id": 0, + } + + # start the generation + results_generator = engine.encode( + example_input["input"], + PoolingParams(), + example_input["request_id"]) + + # get the results + final_output = None + async for request_output in results_generator: + if await request.is_disconnected(): + # Abort the request if the client disconnects. + await engine.abort(request_id) + # Return or raise an error + ... + final_output = request_output + + # Process and return the final output + ... 
+ ``` """ try: async for output in await self.add_request( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4398852daac98..38a20a418e215 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -130,11 +130,11 @@ class LLMEngine: iteration-level scheduling and efficient memory management to maximize the serving throughput. - The :class:`~vllm.LLM` class wraps this class for offline batched inference - and the :class:`AsyncLLMEngine` class wraps this class for online serving. + The {class}`~vllm.LLM` class wraps this class for offline batched inference + and the {class}`AsyncLLMEngine` class wraps this class for online serving. - The config arguments are derived from :class:`~vllm.EngineArgs`. (See - :ref:`engine-args`) + The config arguments are derived from {class}`~vllm.EngineArgs`. (See + {ref}`engine-args`) Args: model_config: The configuration related to the LLM model. @@ -694,11 +694,11 @@ class LLMEngine: Args: request_id: The unique ID of the request. - prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` for more details about the format of each input. params: Parameters for sampling or pooling. - :class:`~vllm.SamplingParams` for text generation. - :class:`~vllm.PoolingParams` for pooling. + {class}`~vllm.SamplingParams` for text generation. + {class}`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. lora_request: The LoRA request to add. @@ -710,10 +710,10 @@ class LLMEngine: Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of :class:`~vllm.Sequence` objects. - - Create a :class:`~vllm.SequenceGroup` object - from the list of :class:`~vllm.Sequence`. - - Add the :class:`~vllm.SequenceGroup` object to the scheduler. + - Create `n` number of {class}`~vllm.Sequence` objects. + - Create a {class}`~vllm.SequenceGroup` object + from the list of {class}`~vllm.Sequence`. + - Add the {class}`~vllm.SequenceGroup` object to the scheduler. Example: >>> # initialize engine @@ -861,8 +861,8 @@ class LLMEngine: Details: - Refer to the - :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group` - from class :class:`~vllm.core.scheduler.Scheduler`. + {meth}`~vllm.core.scheduler.Scheduler.abort_seq_group` + from class {class}`~vllm.core.scheduler.Scheduler`. Example: >>> # initialize engine and add a request with request_id @@ -1258,53 +1258,56 @@ class LLMEngine: def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. - .. figure:: https://i.imgur.com/sv2HssD.png - :alt: Overview of the step function - :align: center + :::{figure} https://i.imgur.com/sv2HssD.png + :alt: Overview of the step function + :align: center - Overview of the step function. + Overview of the step function. + ::: Details: - - Step 1: Schedules the sequences to be executed in the next - iteration and the token blocks to be swapped in/out/copy. + - Step 1: Schedules the sequences to be executed in the next + iteration and the token blocks to be swapped in/out/copy. - - Depending on the scheduling policy, - sequences may be `preempted/reordered`. - - A Sequence Group (SG) refer to a group of sequences - that are generated from the same prompt. + - Depending on the scheduling policy, + sequences may be `preempted/reordered`. 
+ - A Sequence Group (SG) refer to a group of sequences + that are generated from the same prompt. - - Step 2: Calls the distributed executor to execute the model. - - Step 3: Processes the model output. This mainly includes: + - Step 2: Calls the distributed executor to execute the model. + - Step 3: Processes the model output. This mainly includes: - - Decodes the relevant outputs. - - Updates the scheduled sequence groups with model outputs - based on its `sampling parameters` (`use_beam_search` or not). - - Frees the finished sequence groups. + - Decodes the relevant outputs. + - Updates the scheduled sequence groups with model outputs + based on its `sampling parameters` (`use_beam_search` or not). + - Frees the finished sequence groups. - - Finally, it creates and returns the newly generated results. + - Finally, it creates and returns the newly generated results. Example: - >>> # Please see the example/ folder for more detailed examples. - >>> - >>> # initialize engine and request arguments - >>> engine = LLMEngine.from_engine_args(engine_args) - >>> example_inputs = [(0, "What is LLM?", - >>> SamplingParams(temperature=0.0))] - >>> - >>> # Start the engine with an event loop - >>> while True: - >>> if example_inputs: - >>> req_id, prompt, sampling_params = example_inputs.pop(0) - >>> engine.add_request(str(req_id),prompt,sampling_params) - >>> - >>> # continue the request processing - >>> request_outputs = engine.step() - >>> for request_output in request_outputs: - >>> if request_output.finished: - >>> # return or show the request output - >>> - >>> if not (engine.has_unfinished_requests() or example_inputs): - >>> break + ``` + # Please see the example/ folder for more detailed examples. + + # initialize engine and request arguments + engine = LLMEngine.from_engine_args(engine_args) + example_inputs = [(0, "What is LLM?", + SamplingParams(temperature=0.0))] + + # Start the engine with an event loop + while True: + if example_inputs: + req_id, prompt, sampling_params = example_inputs.pop(0) + engine.add_request(str(req_id),prompt,sampling_params) + + # continue the request processing + request_outputs = engine.step() + for request_output in request_outputs: + if request_output.finished: + # return or show the request output + + if not (engine.has_unfinished_requests() or example_inputs): + break + ``` """ if self.parallel_config.pipeline_parallel_size > 1: raise NotImplementedError( diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index d23a4c6ed598e..505d3d06b3ca2 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -491,7 +491,7 @@ class MQLLMEngineClient(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` for more details about the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. @@ -560,7 +560,7 @@ class MQLLMEngineClient(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` for more details about the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. 
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 6ed5ae0a94f1a..3d7b73f97a19a 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -41,18 +41,18 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) class MQLLMEngine: - """A multiprocessing wrapper for :class:`LLMEngine`. + """A multiprocessing wrapper for {class}`LLMEngine`. - This class is used to wrap the :class:`LLMEngine` class to enable use + This class is used to wrap the {class}`LLMEngine` class to enable use in concurrnet manner. It runs a background loop and uses zeromq to receive new requests and stream outputs incrementally via ipc. - The :class:`LLMEngine` generate or encode process is kicked off when a new + The {class}`LLMEngine` generate or encode process is kicked off when a new RPCProcessRequest is received by the input_socket. The self.engine_loop checks the input_socket for new requests, adds them to the LLMEngine if there are any, calls the internal - :class:`LLMEngine.step()`, and sends the RequestOutputs back over + {class}`LLMEngine.step()`, and sends the RequestOutputs back over the output_socket. If use_async_sockets is set, the logic associated with reading new @@ -64,8 +64,8 @@ class MQLLMEngine: ipc_path: Base path for zeromq interprocess messaging use_async_sockets: Whether to make send/recv async with GPU log_requests: Whether to log the requests. - *args: Arguments for :class:`LLMEngine`. - **kwargs: Arguments for :class:`LLMEngine`. + *args: Arguments for {class}`LLMEngine`. + **kwargs: Arguments for {class}`LLMEngine`. """ def __init__(self, diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 0f4c7517ebac8..4cfb22c5a7501 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -56,8 +56,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): scheduled computation. Args: - seq_group: the outputs are associated with this :class:`SequenceGroup` - outputs: the :class:`SequenceGroupOutput`s for all scheduler steps + seq_group: the outputs are associated with this {class}`SequenceGroup` + outputs: the {class}`SequenceGroupOutput`s for all scheduler steps """ for output in outputs: # Concatenate single-step prompt logprob processing results. diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index b5b51bb25a862..ea4b71a5b9cd2 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -19,7 +19,7 @@ logger = init_logger(__name__) def single_step_process_prompt_logprob( sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, output: CompletionSequenceGroupOutput) -> None: - """Process prompt logprobs associated with the :class:`SequenceGroupOutput` + """Process prompt logprobs associated with the {class}`SequenceGroupOutput` for a given step. Do nothing if the output has no prompt logprobs. @@ -27,9 +27,9 @@ def single_step_process_prompt_logprob( Account for the fact that transformers do not compute first-token logprobs. 
Args: - sg_output_proc: :class:`SequenceGroupOutputProcessor` instance - seq_group: the output is associated with this :class:`SequenceGroup` - output: the :class:`SequenceGroupOutput` for a single scheduler step + sg_output_proc: {class}`SequenceGroupOutputProcessor` instance + seq_group: the output is associated with this {class}`SequenceGroup` + output: the {class}`SequenceGroupOutput` for a single scheduler step """ prompt_logprobs = output.prompt_logprobs @@ -103,8 +103,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): scheduled computation. Args: - seq_group: the output is associated with this :class:`SequenceGroup` - outputs: the :class:`SequenceGroupOutput` for a single scheduler step + seq_group: the output is associated with this {class}`SequenceGroup` + outputs: the {class}`SequenceGroupOutput` for a single scheduler step """ assert len(outputs) == 1, "Single step should only have 1 output." output = outputs[0] diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 69523f36ffc41..a04ab885a72b8 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -115,7 +115,7 @@ class LLM: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. - disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig` + disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig` disable_async_output_proc: Disable async output processing. This may result in lower performance. hf_token: The token to use as HTTP bearer authorization for remote files @@ -127,12 +127,13 @@ class LLM: compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. - **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See - :ref:`engine-args`) + **kwargs: Arguments for {class}`~vllm.EngineArgs`. (See + {ref}`engine-args`) - Note: - This class is intended to be used for offline inference. For online - serving, use the :class:`~vllm.AsyncLLMEngine` class instead. + :::{note} + This class is intended to be used for offline inference. For online + serving, use the {class}`~vllm.AsyncLLMEngine` class instead. + ::: """ DEPRECATE_LEGACY: ClassVar[bool] = True @@ -141,7 +142,7 @@ class LLM: DEPRECATE_INIT_POSARGS: ClassVar[bool] = True """ A flag to toggle whether to deprecate positional arguments in - :meth:`LLM.__init__`. + {meth}`LLM.__init__`. """ @classmethod @@ -398,7 +399,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See :class:`~vllm.inputs.PromptType` + for batch inference. See {class}`~vllm.inputs.PromptType` for more details about the format of each prompts. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. @@ -413,13 +414,14 @@ class LLM: Only applicable when priority scheduling policy is enabled. Returns: - A list of ``RequestOutput`` objects containing the + A list of `RequestOutput` objects containing the generated completions in the same order as the input prompts. - Note: - Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is - considered legacy and may be deprecated in the future. You should - instead pass them via the ``inputs`` parameter. + :::{note} + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. 
You should + instead pass them via the `inputs` parameter. + ::: """ runner_type = self.llm_engine.model_config.runner_type if runner_type not in ["generate", "transcription"]: @@ -488,16 +490,17 @@ class LLM: `self` argument, in addition to the arguments passed in `args` and `kwargs`. The `self` argument will be the worker object. timeout: Maximum time in seconds to wait for execution. Raises a - :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + {exc}`TimeoutError` on timeout. `None` means wait indefinitely. args: Positional arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method. Returns: A list containing the results from each worker. - - Note: - It is recommended to use this API to only pass control messages, - and set up data-plane communication to pass data. + + :::{note} + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + ::: """ return self.llm_engine.collective_rpc(method, timeout, args, kwargs) @@ -664,7 +667,7 @@ class LLM: Generate responses for a chat conversation. The chat conversation is converted into a text prompt using the - tokenizer and calls the :meth:`generate` method to generate the + tokenizer and calls the {meth}`generate` method to generate the responses. Multi-modal inputs can be passed in the same way you would pass them @@ -903,7 +906,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See :class:`~vllm.inputs.PromptType` + for batch inference. See {class}`~vllm.inputs.PromptType` for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -913,13 +916,14 @@ class LLM: generation, if any. Returns: - A list of ``PoolingRequestOutput`` objects containing the + A list of `PoolingRequestOutput` objects containing the pooled hidden states in the same order as the input prompts. - Note: - Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is - considered legacy and may be deprecated in the future. You should - instead pass them via the ``inputs`` parameter. + :::{note} + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the `inputs` parameter. + ::: """ runner_type = self.llm_engine.model_config.runner_type if runner_type != "pooling": @@ -992,7 +996,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See :class:`~vllm.inputs.PromptType` + for batch inference. See {class}`~vllm.inputs.PromptType` for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -1036,7 +1040,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See :class:`~vllm.inputs.PromptType` + for batch inference. See {class}`~vllm.inputs.PromptType` for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1168,7 +1172,7 @@ class LLM: text_1: can be a single prompt or a list of prompts, in which case it has to have the same length as the ``text_2`` list text_2: The texts to pair with the query to form the input - to the LLM. See :class:`~vllm.inputs.PromptType` for + to the LLM. 
See {class}`~vllm.inputs.PromptType` for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1277,7 +1281,7 @@ class LLM: def wake_up(self, tags: Optional[list[str]] = None): """ - Wake up the engine from sleep mode. See the :meth:`sleep` method + Wake up the engine from sleep mode. See the {meth}`sleep` method for more details. Args: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 389557dfb7c35..40e477f031942 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,7 +5,6 @@ import json import re import time -from argparse import Namespace from typing import Annotated, Any, ClassVar, Literal, Optional, Union import torch @@ -25,23 +24,7 @@ from vllm.utils import random_uuid, resolve_obj_by_qualname logger = init_logger(__name__) -# torch is mocked during docs generation, -# so we have to provide the values as literals -_MOCK_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807) -_LONG_INFO: Union["torch.iinfo", Namespace] - -try: - from sphinx.ext.autodoc.mock import _MockModule - - if isinstance(torch, _MockModule): - _LONG_INFO = _MOCK_LONG_INFO - else: - _LONG_INFO = torch.iinfo(torch.long) -except ModuleNotFoundError: - _LONG_INFO = torch.iinfo(torch.long) - -assert _LONG_INFO.min == _MOCK_LONG_INFO.min -assert _LONG_INFO.max == _MOCK_LONG_INFO.max +_LONG_INFO = torch.iinfo(torch.long) class OpenAIBaseModel(BaseModel): diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 6123811aabe1d..25069c28a0a27 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -275,7 +275,7 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> TextTokensPrompt: """ - A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs` + A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` that assumes single input. """ return next( @@ -296,7 +296,7 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> Iterator[TextTokensPrompt]: """ - A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs` + A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` that assumes multiple inputs. """ for text in prompt_inputs: diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 58796e5d7326c..522bd940211f8 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -74,7 +74,7 @@ class ExecutorBase(ABC): `self` argument, in addition to the arguments passed in `args` and `kwargs`. The `self` argument will be the worker object. timeout: Maximum time in seconds to wait for execution. Raises a - :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + {exc}`TimeoutError` on timeout. `None` means wait indefinitely. args: Positional arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method. 
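As a companion to the `vllm/entrypoints/llm.py` docstring changes above, here is a minimal offline-inference sketch of the documented `LLM.generate` usage; the model name and sampling settings are illustrative only:

```python
from vllm import LLM, SamplingParams

# Offline inference: LLM is the synchronous entry point described above;
# online serving goes through AsyncLLMEngine instead.
llm = LLM(model="facebook/opt-125m")

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

# Prompts are passed positionally; passing a list performs batched generation.
outputs = llm.generate(["Hello, my name is", "The capital of France is"],
                       sampling_params)

for output in outputs:
    # Each RequestOutput keeps the original prompt and the generated text,
    # in the same order as the input prompts.
    print(output.prompt, "->", output.outputs[0].text)
```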
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 9914a9dcffcc0..0673aece91087 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -10,7 +10,7 @@ from .registry import (DummyData, InputContext, InputProcessingContext, INPUT_REGISTRY = InputRegistry() """ -The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine` +The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine` to dispatch data processing according to the target model. """ diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 86dbca1804126..c83ab73b614a7 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -80,22 +80,22 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt] """ Set of possible schemas for a single prompt: -- A text prompt (:class:`str` or :class:`TextPrompt`) -- A tokenized prompt (:class:`TokensPrompt`) -- An embeddings prompt (:class:`EmbedsPrompt`) +- A text prompt ({class}`str` or {class}`TextPrompt`) +- A tokenized prompt ({class}`TokensPrompt`) +- An embeddings prompt ({class}`EmbedsPrompt`) Note that "singleton" is as opposed to a data structure which encapsulates multiple prompts, i.e. of the sort which may be utilized for encoder/decoder models when the user desires to express both the encoder & decoder -prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt` +prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt` -A prompt of type :class:`SingletonPrompt` may be employed +A prompt of type {class}`SingletonPrompt` may be employed as (1) input to a decoder-only model, (2) input to the encoder of an encoder/decoder model, in the scenario where the decoder-prompt is not specified explicitly, or (3) as a member of a larger data structure encapsulating -more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt` +more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt` """ _T1_co = TypeVar("_T1_co", @@ -115,18 +115,18 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): comprising an explicit encoder prompt and a decoder prompt. The encoder and decoder prompts, respectively, may be formatted - according to any of the :class:`SingletonPrompt` schemas, + according to any of the {class}`SingletonPrompt` schemas, and are not required to have the same schema. Only the encoder prompt may have multi-modal data. mm_processor_kwargs should be at the top-level, and should not be set in the encoder/decoder prompts, since they are agnostic to the encoder/decoder. - Note that an :class:`ExplicitEncoderDecoderPrompt` may not + Note that an {class}`ExplicitEncoderDecoderPrompt` may not be used as an input to a decoder-only model, - and that the :code:`encoder_prompt` and :code:`decoder_prompt` + and that the `encoder_prompt` and `decoder_prompt` fields of this data structure themselves must be - :class:`SingletonPrompt` instances. + {class}`SingletonPrompt` instances. 
""" encoder_prompt: _T1_co @@ -141,11 +141,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt] Set of possible schemas for an LLM input, including both decoder-only and encoder/decoder input types: -- A text prompt (:class:`str` or :class:`TextPrompt`) -- A tokenized prompt (:class:`TokensPrompt`) -- An embeddings prompt (:class:`EmbedsPrompt`) +- A text prompt ({class}`str` or {class}`TextPrompt`) +- A tokenized prompt ({class}`TokensPrompt`) +- An embeddings prompt ({class}`EmbedsPrompt`) - A single data structure containing both an encoder and a decoder prompt - (:class:`ExplicitEncoderDecoderPrompt`) + ({class}`ExplicitEncoderDecoderPrompt`) """ @@ -178,7 +178,7 @@ def token_inputs( prompt: Optional[str] = None, cache_salt: Optional[str] = None, ) -> TokenInputs: - """Construct :class:`TokenInputs` from optional values.""" + """Construct {class}`TokenInputs` from optional values.""" inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) if prompt is not None: @@ -221,7 +221,7 @@ def embeds_inputs( DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -The inputs in :class:`~vllm.LLMEngine` before they are +The inputs in {class}`~vllm.LLMEngine` before they are passed to the model executor. This specifies the data required for decoder-only models. """ @@ -229,7 +229,7 @@ This specifies the data required for decoder-only models. class EncoderDecoderInputs(TypedDict): """ - The inputs in :class:`~vllm.LLMEngine` before they are + The inputs in {class}`~vllm.LLMEngine` before they are passed to the model executor. This specifies the required data for encoder-decoder models. @@ -243,13 +243,13 @@ class EncoderDecoderInputs(TypedDict): SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -A processed :class:`SingletonPrompt` which can be passed to -:class:`vllm.sequence.Sequence`. +A processed {class}`SingletonPrompt` which can be passed to +{class}`vllm.sequence.Sequence`. """ ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] """ -The inputs to :data:`vllm.inputs.InputProcessor`. +The inputs to {data}`vllm.inputs.InputProcessor`. """ _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) @@ -277,7 +277,7 @@ def zip_enc_dec_prompts( ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - :class:`ExplicitEncoderDecoderPrompt` instances. + {class}`ExplicitEncoderDecoderPrompt` instances. ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same dictionary will be used for every encoder/decoder prompt. 
If an iterable is diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index fe4775b210a87..6e8effd60274f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -224,7 +224,7 @@ class InputPreprocessor: lora_request: Optional[LoRARequest], tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[int]: - """Async version of :meth:`_tokenize_prompt`.""" + """Async version of {meth}`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group() tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) @@ -287,7 +287,7 @@ class InputPreprocessor: lora_request: Optional[LoRARequest], return_mm_hashes: bool = False, ) -> MultiModalInputs: - """Async version of :meth:`_process_multimodal`.""" + """Async version of {meth}`_process_multimodal`.""" tokenizer = await self._get_mm_tokenizer_async(lora_request) mm_processor = self.mm_registry.create_processor(self.model_config, @@ -472,7 +472,7 @@ class InputPreprocessor: Returns: - * :class:`SingletonInputs` instance + * {class}`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) @@ -508,7 +508,7 @@ class InputPreprocessor: lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: - """Async version of :meth:`_prompt_to_llm_inputs`.""" + """Async version of {meth}`_prompt_to_llm_inputs`.""" parsed = parse_singleton_prompt(prompt) if parsed["type"] == "embeds": @@ -644,7 +644,7 @@ class InputPreprocessor: ) -> EncoderDecoderInputs: """ For encoder/decoder models only: - Process an input prompt into an :class:`EncoderDecoderInputs` instance. + Process an input prompt into an {class}`EncoderDecoderInputs` instance. There are two types of input prompts: singleton prompts which carry only the @@ -670,7 +670,7 @@ class InputPreprocessor: Returns: - * :class:`EncoderDecoderInputs` instance + * {class}`EncoderDecoderInputs` instance """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -710,7 +710,7 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> EncoderDecoderInputs: - """Async version of :meth:`_process_encoder_decoder_prompt`.""" + """Async version of {meth}`_process_encoder_decoder_prompt`.""" encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -778,7 +778,7 @@ class InputPreprocessor: ) -> DecoderOnlyInputs: """ For decoder-only models: - Process an input prompt into an :class:`DecoderOnlyInputs` instance. + Process an input prompt into an {class}`DecoderOnlyInputs` instance. 
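To complement the `vllm/inputs/data.py` schema docstrings above, the following sketch shows how the singleton and explicit encoder/decoder prompt forms can be constructed; the prompt text and token IDs are placeholders:

```python
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         TokensPrompt)

# Singleton prompts: plain text, a TextPrompt dict, or pre-tokenized IDs.
text_prompt = TextPrompt(prompt="What is the capital of France?")
tokens_prompt = TokensPrompt(prompt_token_ids=[101, 2054, 2003, 102])

# Explicit encoder/decoder prompt for encoder-decoder models. Each side may
# use any singleton schema, and only the encoder side may carry
# multi-modal data.
enc_dec_prompt = ExplicitEncoderDecoderPrompt(
    encoder_prompt=text_prompt,
    decoder_prompt=tokens_prompt,
)
```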
Arguments: @@ -789,7 +789,7 @@ class InputPreprocessor: Returns: - * :class:`DecoderOnlyInputs` instance + * {class}`DecoderOnlyInputs` instance """ prompt_comps = self._prompt_to_llm_inputs( @@ -812,7 +812,7 @@ class InputPreprocessor: prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> DecoderOnlyInputs: - """Async version of :meth:`_process_decoder_only_prompt`.""" + """Async version of {meth}`_process_decoder_only_prompt`.""" prompt_comps = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, @@ -863,7 +863,7 @@ class InputPreprocessor: prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> ProcessorInputs: - """Async version of :meth:`preprocess`.""" + """Async version of {meth}`preprocess`.""" if self.model_config.is_encoder_decoder: assert not return_mm_hashes, ( "Multimodal hashes for encoder-decoder models should not be ", diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index d969922d58456..aecddbcd75159 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -38,7 +38,7 @@ class InputContext: ) -> _C: """ Get the HuggingFace configuration - (:class:`transformers.PretrainedConfig`) of the model, + ({class}`transformers.PretrainedConfig`) of the model, additionally checking its type. Raises: @@ -79,7 +79,7 @@ class InputContext: ) -> _P: """ Get the HuggingFace processor - (:class:`transformers.ProcessorMixin`) of the model, + ({class}`transformers.ProcessorMixin`) of the model, additionally checking its type. Raises: @@ -135,8 +135,8 @@ class InputProcessingContext(InputContext): kwargs: Mapping[str, object] = {}, ) -> BatchFeature: """ - Call :code:`hf_processor` on the prompt :code:`data` - (text, image, audio...) with configurable options :code:`kwargs`. + Call `hf_processor` on the prompt `data` + (text, image, audio...) with configurable options `kwargs`. """ assert callable(hf_processor) diff --git a/vllm/logger.py b/vllm/logger.py index c162e2e0465a6..cf32041c5b700 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -68,21 +68,21 @@ class _VllmLogger(Logger): """ Note: This class is just to provide type information. - We actually patch the methods directly on the :class:`logging.Logger` + We actually patch the methods directly on the {class}`logging.Logger` instance to avoid conflicting with other libraries such as `intel_extension_for_pytorch.utils._logger`. """ def info_once(self, msg: str, *args: Hashable) -> None: """ - As :meth:`info`, but subsequent calls with the same message + As {meth}`info`, but subsequent calls with the same message are silently dropped. """ _print_info_once(self, msg, *args) def warning_once(self, msg: str, *args: Hashable) -> None: """ - As :meth:`warning`, but subsequent calls with the same message + As {meth}`warning`, but subsequent calls with the same message are silently dropped. 
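A short sketch of the `info_once`/`warning_once` helpers documented in the `vllm/logger.py` hunk above; the message text is arbitrary:

```python
from vllm.logger import init_logger

logger = init_logger(__name__)

# Emitted once; subsequent calls with the same message are silently dropped.
logger.info_once("Using the fallback code path.")
logger.warning_once("This warning is only printed the first time.")
```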
""" _print_warning_once(self, msg, *args) diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py index acae0d972f4e7..5a39705e85712 100644 --- a/vllm/lora/ops/triton_ops/__init__.py +++ b/vllm/lora/ops/triton_ops/__init__.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -from vllm.lora.ops.triton_ops.lora_expand import lora_expand +from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta -from vllm.lora.ops.triton_ops.lora_shrink import lora_shrink +from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink __all__ = [ "lora_expand", diff --git a/vllm/lora/ops/triton_ops/lora_expand.py b/vllm/lora/ops/triton_ops/lora_expand_op.py similarity index 100% rename from vllm/lora/ops/triton_ops/lora_expand.py rename to vllm/lora/ops/triton_ops/lora_expand_op.py diff --git a/vllm/lora/ops/triton_ops/lora_shrink.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py similarity index 100% rename from vllm/lora/ops/triton_ops/lora_shrink.py rename to vllm/lora/ops/triton_ops/lora_shrink_op.py diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 62e27b714866a..d1d3326ac3f23 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -261,15 +261,16 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): True, then a token can be accepted, else it should be rejected. - Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of - :math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according - to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the + Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of + {math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according + to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the same conditional probability according to the draft model, the token is accepted with probability: - .. math:: - \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)} - {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right) + :::{math} + \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)} + {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right) + ::: This implementation does not apply causality. When using the output, if a token is rejected, subsequent tokens should not be used. @@ -312,18 +313,20 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): target model is recovered (within hardware numerics). The probability distribution used in this rejection case is constructed - as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of - :math:`x` given context :math:`x_1, \dots, x_n` according to the target - model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability + as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of + {math}`x` given context {math}`x_1, \dots, x_n` according to the target + model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability according to the draft model: - .. math:: - x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ + :::{math} + x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ + ::: - where :math:`(f(x))_+` is defined as: + where {math}`(f(x))_+` is defined as: - .. 
math:: - (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} + :::{math} + (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} + ::: See https://github.com/vllm-project/vllm/pull/2336 for a visualization of the draft, target, and recovered probability distributions. diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 9368992b24fe0..920c0f5a6ec9a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -235,7 +235,7 @@ class Sampler(nn.Module): * Defer Pythonization of sampling result & logprobs tensor * Encapsulate arguments required for deferred Pythonization - in the :class:`SamplerOutput` structure + in the {class}`SamplerOutput` structure Args: logits: (num_tokens, vocab_size). diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index 95362c280b43b..527a301cd8e26 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -107,14 +107,15 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler): A draft token_id x_{n+k} is accepted if it satisfies the following condition - .. math:: - p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > - \min \left( \epsilon, \delta * \exp \left( - -H(p_{\text{original}}( - \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) + :::{math} + p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > + \min \left( \epsilon, \delta * \exp \left( + -H(p_{\text{original}}( + \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) + ::: - where :math:`p_{\text{original}}` corresponds to target_probs - and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters + where {math}`p_{\text{original}}` corresponds to target_probs + and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters specified using self._posterior_threshold and self._posterior_alpha This method computes the posterior probabilities for the given diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index eed49e74ac9f2..f44565bd2e01f 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -681,8 +681,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, batch. pixel_values: The pixels in each input image. - See also: - :class:`Blip2ImageInputs` + :::{seealso} + {class}`Blip2ImageInputs` + ::: """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 0cda199af471f..7fea9647ead97 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -226,9 +226,9 @@ class SupportsPP(Protocol): intermediate_tensors: Optional["IntermediateTensors"], ) -> Union[Tensor, "IntermediateTensors"]: """ - Accept :class:`IntermediateTensors` when PP rank > 0. + Accept {class}`IntermediateTensors` when PP rank > 0. - Return :class:`IntermediateTensors` only for the last PP rank. + Return {class}`IntermediateTensors` only for the last PP rank. """ ... diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 3791b92ecc2a0..6287fdb3300cd 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -721,8 +721,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): batch. pixel_values: The pixels in each input image. 
- See also: - :class:`LlavaImageInputs` + :::{seealso} + {class}`LlavaImageInputs` + ::: """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c646c0f03d1eb..c7e8d6991b25b 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -537,7 +537,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, Unlike in LLaVA-1.5, the number of image tokens inputted to the language model depends on the original size of the input image. Including the original image token in the input, the required number of image tokens - is given by :func:`get_llava_next_image_feature_size`. + is given by {func}`get_llava_next_image_feature_size`. This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. @@ -548,8 +548,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values: The pixels in each grid patch for each input image. image_sizes: The original `(height, width)` for each input image. - See also: - :class:`LlavaNextImageInputs` + :::{seealso} + {class}`LlavaNextImageInputs` + ::: """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 6352ba236818f..42ec786f3a590 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -559,8 +559,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, batch. pixel_values: The pixels in each input image. - See also: - :class:`Mistral3ImagePixelInputs` + :::{seealso} + {class}`Mistral3ImagePixelInputs` + ::: """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 46147a333b06e..75eebdacfdca0 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -965,7 +965,7 @@ def select_tiling( class MolmoProcessorWrapper: """ - Wraps :class:`MolmoProcessor` so that it can be called directly. + Wraps {class}`MolmoProcessor` so that it can be called directly. The original definition can be found here: https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index 9f08a1c4c6f5a..4051763cec8ca 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -12,7 +12,7 @@ import torch.nn.functional as F from torch import Tensor, nn -class Block(nn.Module): +class BlockBase(nn.Module): """Block abstract module""" def __init__(self, input_size, output_size): @@ -1602,7 +1602,7 @@ class AttModule(nn.Module): return x, memory, pos_emb, att_mask -class AttBlock(Block, AttModule): +class AttBlock(BlockBase, AttModule): """Attention Block module to support both Attention and Block module.""" def memory_dims(self, max_len=False): diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 7b11a616e585d..c0b492dbfcb9d 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -65,14 +65,14 @@ class PixtralImagePixelInputs(TypedDict): """ Shape: `(batch_size * num_images, num_channels, image_width, image_height)` - The result of stacking :attr:`ImageEncoding.tokens` from each prompt. 
+ The result of stacking {attr}`ImageEncoding.tokens` from each prompt. """ class PixtralProcessorAdapter: """ Provide a HF-compatible interface for - :class:`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. + {class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. """ def __init__(self, tokenizer: MistralTokenizer) -> None: diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 9f370d7aab4e4..199b885a58506 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -383,7 +383,7 @@ def _get_tokenizer_without_image_pad( tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: """ The logic of adding image pad tokens should only be applied in - :class:`QwenVLProcessor`, so they are patched out here. + {class}`QwenVLProcessor`, so they are patched out here. The definition of the wrapped tokenizer can be found here: https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 156a201de35ae..e25941faa148c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -19,7 +19,6 @@ import cloudpickle import torch.nn as nn from vllm.logger import init_logger -from vllm.utils import is_in_doc_build from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, @@ -375,13 +374,13 @@ class _ModelRegistry: """ Register an external model to be used in vLLM. - :code:`model_cls` can be either: + `model_cls` can be either: - - A :class:`torch.nn.Module` class directly referencing the model. - - A string in the format :code:`:` which can be used to + - A {class}`torch.nn.Module` class directly referencing the model. + - A string in the format `:` which can be used to lazily import the model. This is useful to avoid initializing CUDA when importing the model and thus the related error - :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`. + `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. """ if not isinstance(model_arch, str): msg = f"`model_arch` should be a string, not a {type(model_arch)}" @@ -400,8 +399,7 @@ class _ModelRegistry: raise ValueError(msg) model = _LazyRegisteredModel(*split_str) - elif isinstance(model_cls, type) and (is_in_doc_build() or issubclass( - model_cls, nn.Module)): + elif isinstance(model_cls, type) and issubclass(model_cls, nn.Module): model = _RegisteredModel.from_model_cls(model_cls) else: msg = ("`model_cls` should be a string or PyTorch model class, " diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 7ed0560ee43fe..1be40ecd3e28b 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -66,7 +66,7 @@ class WeightsMapper: class AutoWeightsLoader: """ - Helper class to load weights into a :class:`torch.nn.Module`. It is able + Helper class to load weights into a {class}`torch.nn.Module`. It is able to automatically detect child modules and parameters while iterating over the weights only once. 
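The `_ModelRegistry.register_model` docstring above describes registering an external model either as a class or as a lazy `module:Class` string; below is a hedged sketch of the string form, where the plugin module path and architecture name are hypothetical:

```python
from vllm import ModelRegistry

# Lazy string registration avoids importing the model class (and thus
# initializing CUDA) in the registering process; it is resolved on first use.
ModelRegistry.register_model(
    "MyLlavaForConditionalGeneration",
    "my_plugin.models.my_llava:MyLlavaForConditionalGeneration",
)
```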
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index c65d9407dcd1a..756ea11311daf 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -8,11 +8,12 @@ from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() """ -The global :class:`~MultiModalRegistry` is used by model runners to +The global {class}`~MultiModalRegistry` is used by model runners to dispatch data processing according to the target model. -See also: - :ref:`mm-processing` +:::{seealso} +{ref}`mm-processing` +::: """ __all__ = [ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 2f93922fcedb9..184c801e64d86 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -64,35 +64,35 @@ class MultiModalPlaceholderMap: Examples: - .. code-block:: + ``` + Prompt: |AAAA BBBB What's in these images?| + Positions: |.................................| - Prompt: |AAAA BBBB What's in these images?| - Positions: |.................................| + images = [A, B] + src_ranges = [(0, 4), (4, 8)] + dest_ranges = [(0, 4), (5, 9)] - images = [A, B] - src_ranges = [(0, 4), (4, 8)] - dest_ranges = [(0, 4), (5, 9)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ..... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | ..... | + images = [A, B] + src_ranges = [(2, 4), (4, 6)] + dest_ranges = [(0, 2), (3, 5)] - images = [A, B] - src_ranges = [(2, 4), (4, 6)] - dest_ranges = [(0, 2), (3, 5)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ......... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | ......... | + images = [B] + src_ranges = [(0, 4)] + dest_ranges = [(0, 4)] - images = [B] - src_ranges = [(0, 4)] - dest_ranges = [(0, 4)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | .......................| - Prompt: |AAAA BBBB What's in these images?| - Positions: | .......................| - - images = [] - src_ranges = [] - dest_ranges = [] + images = [] + src_ranges = [] + dest_ranges = [] + ``` """ seq_mm_data = seq_group.multi_modal_data seq_mm_placeholders = seq_group.multi_modal_placeholders diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 978fb42319391..61d8eb62ffafb 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -26,27 +26,27 @@ _T = TypeVar("_T") HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] """ -A :class:`transformers.image_utils.ImageInput` representing a single image -item, which can be passed to a HuggingFace :code:`ImageProcessor`. +A {class}`transformers.image_utils.ImageInput` representing a single image +item, which can be passed to a HuggingFace `ImageProcessor`. """ HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, list[np.ndarray], list[torch.Tensor]] """ -A :class:`transformers.image_utils.VideoInput` representing a single video -item, which can be passed to a HuggingFace :code:`VideoProcessor`. +A {class}`transformers.image_utils.VideoInput` representing a single video +item, which can be passed to a HuggingFace `VideoProcessor`. """ HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] """ Represents a single audio -item, which can be passed to a HuggingFace :code:`AudioProcessor`. +item, which can be passed to a HuggingFace `AudioProcessor`. """ ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] """ -A :class:`transformers.image_utils.ImageInput` representing a single image -item, which can be passed to a HuggingFace :code:`ImageProcessor`. 
+A {class}`transformers.image_utils.ImageInput` representing a single image +item, which can be passed to a HuggingFace `ImageProcessor`. Alternatively, a 3-D tensor or batch of 2-D tensors, which are treated as image embeddings; @@ -55,8 +55,8 @@ these are directly passed to the model without HF processing. VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] """ -A :class:`transformers.image_utils.VideoInput` representing a single video -item, which can be passed to a HuggingFace :code:`VideoProcessor`. +A {class}`transformers.image_utils.VideoInput` representing a single video +item, which can be passed to a HuggingFace `VideoProcessor`. Alternatively, a 3-D tensor or batch of 2-D tensors, which are treated as video embeddings; @@ -67,7 +67,7 @@ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], torch.Tensor] """ Represents a single audio -item, which can be passed to a HuggingFace :code:`AudioProcessor`. +item, which can be passed to a HuggingFace `AudioProcessor`. Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate is different from that expected by the model; @@ -83,7 +83,7 @@ ModalityData: TypeAlias = Union[_T, list[_T]] Either a single data item, or a list of data items. The number of data items allowed per modality is restricted by -:code:`--limit-mm-per-prompt`. +`--limit-mm-per-prompt`. """ @@ -105,7 +105,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] """ A dictionary containing an entry for each modality type to input. -The built-in modalities are defined by :class:`MultiModalDataBuiltins`. +The built-in modalities are defined by {class}`MultiModalDataBuiltins`. """ @@ -116,14 +116,14 @@ class PlaceholderRange: Example: - Prompt: :code:`AAAA BBBB What is in these images?` + Prompt: `AAAA BBBB What is in these images?` - Images A and B will have: + Images A and B will have: - .. code-block:: - - A: PlaceholderRange(offset=0, length=4) - B: PlaceholderRange(offset=5, length=4) + ``` + A: PlaceholderRange(offset=0, length=4) + B: PlaceholderRange(offset=5, length=4) + ``` """ offset: int @@ -166,7 +166,7 @@ Uses a list instead of a tensor if the dimensions of each element do not match. def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: - """Equality check between :data:`NestedTensors` objects.""" + """Equality check between {data}`NestedTensors` objects.""" if isinstance(a, torch.Tensor): return isinstance(b, torch.Tensor) and torch.equal(a, b) elif isinstance(b, torch.Tensor): @@ -186,7 +186,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via -:meth:`MultiModalKwargs.batch`. +{meth}`MultiModalKwargs.batch`. """ @@ -194,7 +194,7 @@ A dictionary containing nested tensors which have been batched via class MultiModalFieldElem: """ Represents a keyword argument corresponding to a multi-modal item - in :class:`MultiModalKwargs`. + in {class}`MultiModalKwargs`. """ modality: str @@ -205,13 +205,13 @@ class MultiModalFieldElem: key: str """ - The key of this field in :class:`MultiModalKwargs`, + The key of this field in {class}`MultiModalKwargs`, i.e. the name of the keyword argument to be passed to the model. """ data: NestedTensors """ - The tensor data of this field in :class:`MultiModalKwargs`, + The tensor data of this field in {class}`MultiModalKwargs`, i.e. the value of the keyword argument to be passed to the model. 
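For reference alongside the `MultiModalDataDict` and `PlaceholderRange` docstrings above, a minimal sketch of passing image data through the `multi_modal_data` field of a prompt; the model and chat template are illustrative only:

```python
from PIL import Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("cherry_blossom.jpg")

# The "image" key selects the built-in image modality; the number of items
# allowed per modality is bounded by --limit-mm-per-prompt.
outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image? ASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```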
""" @@ -234,7 +234,7 @@ class MultiModalFieldElem: class BaseMultiModalField(ABC): """ Defines how to interpret tensor data belonging to a keyword argument in - :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa. + {class}`MultiModalKwargs` for multiple multi-modal items, and vice versa. """ def _field_factory(self, *, modality: str, key: str): @@ -259,10 +259,10 @@ class BaseMultiModalField(ABC): data: NestedTensors, ) -> Sequence[MultiModalFieldElem]: """ - Construct :class:`MultiModalFieldElem` instances to represent + Construct {class}`MultiModalFieldElem` instances to represent the provided data. - This is the inverse of :meth:`reduce_data`. + This is the inverse of {meth}`reduce_data`. """ raise NotImplementedError @@ -272,9 +272,9 @@ class BaseMultiModalField(ABC): def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: """ - Merge the data from multiple instances of :class:`MultiModalFieldElem`. + Merge the data from multiple instances of {class}`MultiModalFieldElem`. - This is the inverse of :meth:`build_elems`. + This is the inverse of {meth}`build_elems`. """ field_types = [type(item.field) for item in elems] if len(set(field_types)) > 1: @@ -286,8 +286,9 @@ class BaseMultiModalField(ABC): @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - See also: - :func:`MultiModalFieldConfig.batched` + :::{seealso} + {func}`MultiModalFieldConfig.batched` + ::: """ def build_elems( @@ -316,9 +317,10 @@ class MultiModalBatchedField(BaseMultiModalField): @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - See also: - :func:`MultiModalFieldConfig.flat` - :func:`MultiModalFieldConfig.flat_from_sizes` + :::{seealso} + {func}`MultiModalFieldConfig.flat` + {func}`MultiModalFieldConfig.flat_from_sizes` + ::: """ slices: Union[Sequence[slice], Sequence[Sequence[slice]]] dim: int = 0 @@ -358,8 +360,9 @@ class MultiModalFlatField(BaseMultiModalField): @dataclass(frozen=True) class MultiModalSharedField(BaseMultiModalField): """ - See also: - :func:`MultiModalFieldConfig.shared` + :::{seealso} + {func}`MultiModalFieldConfig.shared` + ::: """ batch_size: int @@ -390,17 +393,17 @@ class MultiModalFieldConfig: Example: - .. code-block:: + ``` + Input: + Data: [[AAAA] + [BBBB] + [CCCC]] - Input: - Data: [[AAAA] - [BBBB] - [CCCC]] - - Output: - Element 1: [AAAA] - Element 2: [BBBB] - Element 3: [CCCC] + Output: + Element 1: [AAAA] + Element 2: [BBBB] + Element 3: [CCCC] + ``` """ return MultiModalFieldConfig( field=MultiModalBatchedField(), @@ -425,35 +428,35 @@ class MultiModalFieldConfig: Example: - .. code-block:: - - Given: - slices: [slice(0, 3), slice(3, 7), slice(7, 9)] + ``` + Given: + slices: [slice(0, 3), slice(3, 7), slice(7, 9)] - Input: - Data: [AAABBBBCC] + Input: + Data: [AAABBBBCC] - Output: - Element 1: [AAA] - Element 2: [BBBB] - Element 3: [CC] - - .. 
code-block:: + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + ``` - Given: - slices: [ - (slice(None), slice(0, 3)), - (slice(None), slice(3, 7)), - (slice(None), slice(7, 9))] - dim: 1 + ``` + Given: + slices: [ + (slice(None), slice(0, 3)), + (slice(None), slice(3, 7)), + (slice(None), slice(7, 9))] + dim: 1 - Input: - Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]] + Input: + Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]] - Output: - Element 1: [[A],[A],[A]] - Element 2: [[B],[B],[B],[B]] - Element 3: [[C],[C]] + Output: + Element 1: [[A],[A],[A]] + Element 2: [[B],[B],[B],[B]] + Element 3: [[C],[C]] + ``` """ return MultiModalFieldConfig( field=MultiModalFlatField(slices=slices, dim=dim), @@ -477,36 +480,36 @@ class MultiModalFieldConfig: Example: - .. code-block:: - - Given: - size_per_item: [3, 4, 2] + ``` + Given: + size_per_item: [3, 4, 2] - Input: - Data: [AAABBBBCC] + Input: + Data: [AAABBBBCC] - Output: - Element 1: [AAA] - Element 2: [BBBB] - Element 3: [CC] + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + ``` - - .. code-block:: + ``` + Given: + slices: [3, 4, 2] + dim: 1 - Given: - slices: [3, 4, 2] - dim: 1 + Input: + Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]] - Input: - Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]] + Output: + Element 1: [[A],[A],[A]] + Element 2: [[B],[B],[B],[B]] + Element 3: [[C],[C]] + ``` - Output: - Element 1: [[A],[A],[A]] - Element 2: [[B],[B],[B],[B]] - Element 3: [[C],[C]] - - See also: - :func:`MultiModalFieldConfig.flat` + :::{seealso} + {func}`MultiModalFieldConfig.flat` + ::: """ if size_per_item.ndim != 1: @@ -535,19 +538,19 @@ class MultiModalFieldConfig: Example: - .. code-block:: - - Given: - batch_size: 4 + ``` + Given: + batch_size: 4 - Input: - Data: [XYZ] + Input: + Data: [XYZ] - Output: - Element 1: [XYZ] - Element 2: [XYZ] - Element 3: [XYZ] - Element 4: [XYZ] + Output: + Element 1: [XYZ] + Element 2: [XYZ] + Element 3: [XYZ] + Element 4: [XYZ] + ``` """ return MultiModalFieldConfig( field=MultiModalSharedField(batch_size), @@ -570,8 +573,8 @@ class MultiModalFieldConfig: class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): """ - A collection of :class:`MultiModalFieldElem` - corresponding to a data item in :class:`MultiModalDataItems`. + A collection of {class}`MultiModalFieldElem` + corresponding to a data item in {class}`MultiModalDataItems`. """ @staticmethod @@ -590,11 +593,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. + {meth}`~torch.nn.Module.forward`. - The metadata :code:`items` enables us to obtain the keyword arguments - corresponding to each data item in :class:`MultiModalDataItems`, via - :meth:`get_item` and :meth:`get_items`. + The metadata `items` enables us to obtain the keyword arguments + corresponding to each data item in {class}`MultiModalDataItems`, via + {meth}`get_item` and {meth}`get_items`. """ @staticmethod @@ -633,7 +636,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod def from_items(items: Sequence[MultiModalKwargsItem]): - """Construct a new :class:`MultiModalKwargs` from multiple items.""" + """Construct a new {class}`MultiModalKwargs` from multiple items.""" elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) for item in items: for key, elem in item.items(): @@ -798,7 +801,7 @@ A dictionary containing placeholder ranges for each modality. 
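The `MultiModalFieldConfig.flat`/`flat_from_sizes` examples above describe how a flattened batch is sliced back into per-item elements; here is a standalone sketch of the same splitting semantics using plain `torch.split` (the data is arbitrary):

```python
import torch

# Flattened data for three items of sizes 3, 4 and 2 along dim 0,
# mirroring the [AAABBBBCC] example above.
data = torch.arange(9)
size_per_item = [3, 4, 2]

elements = torch.split(data, size_per_item, dim=0)
assert [len(e) for e in elements] == size_per_item
```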
class MultiModalInputs(TypedDict): """ Represents the outputs of - :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, + {class}`vllm.multimodal.processing.BaseMultiModalProcessor`, ready to be passed to vLLM internals. """ @@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict): mm_placeholders: MultiModalPlaceholderDict """ For each modality, information about the placeholder tokens in - :code:`prompt_token_ids`. + `prompt_token_ids`. """ cache_salt: NotRequired[str] @@ -834,7 +837,7 @@ class MultiModalInputs(TypedDict): class MultiModalEncDecInputs(MultiModalInputs): """ - Represents the outputs of :class:`vllm.multimodal.EncDecMultiModalProcessor` + Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor` ready to be passed to vLLM internals. """ diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 9707b9cfcf8bf..f9588431c8efe 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -25,7 +25,7 @@ _I = TypeVar("_I") class ModalityDataItems(ABC, Generic[_T, _I]): """ - Represents data items for a modality in :class:`MultiModalDataItems`. + Represents data items for a modality in {class}`MultiModalDataItems`. """ def __init__(self, data: _T, modality: str) -> None: @@ -246,7 +246,7 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): """ - As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized + As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized such that each entry corresponds to a list. """ @@ -254,7 +254,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): """ Get the number of data items belonging to a modality. - If `strict=False`, return `0` instead of raising :exc:`KeyError` + If `strict=False`, return `0` instead of raising {exc}`KeyError` even if the modality is not found. """ if modality not in self: @@ -300,8 +300,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], class MultiModalDataParser: """ - Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into - :class:`MultiModalDataItems`. + Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into + {class}`MultiModalDataItems`. Args: target_sr (float, optional): Enables automatic resampling of audio diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 58168d0e850c2..27b059b3ee62f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -111,13 +111,13 @@ class PromptUpdateDetails(Generic[_S]): is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None """ - Given :attr:`full`, return a boolean mask of shape `(len(full),)` + Given {attr}`full`, return a boolean mask of shape `(len(full),)` indicating which positions of `full` to assign embeddings to. `None` (default) means to assign embeddings to all positions of `full`. The embeddings are obtained by calling - :class:`SupportsMultiModal.get_multimodal_embeddings`. + {class}`SupportsMultiModal.get_multimodal_embeddings`. """ @staticmethod @@ -156,13 +156,13 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails] The token sequence or text that are part of the update. If only part of the content corresponds to feature placeholders, you can -use :class:`PromptUpdateDetails` to specify which part. +use {class}`PromptUpdateDetails` to specify which part. 
""" PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], PromptUpdateInfo] """ -Given the index of the processed item within :attr:`modality`, +Given the index of the processed item within {attr}`modality`, output the corresponding token sequence (or text). For convenience, you can directly pass in the token sequence (or text) @@ -213,52 +213,52 @@ class PromptInsertion(PromptUpdate): Example: - For each image, insert a number of ```` feature placeholders - equal to the feature size of the vision encoder after the ```` token: + For each image, insert a number of ```` feature placeholders + equal to the feature size of the vision encoder after the ```` token: - .. code-block:: python + ```python + PromptInsertion( + modality="image", + target="", + insertion="" * image_feature_size, + ) + ``` - PromptInsertion( - modality="image", - target="", - insertion="" * image_feature_size, - ) + Insert these tokens at the start of the prompt: - Insert these tokens at the start of the prompt: + ```python + PromptInsertion( + modality="image", + target=PromptIndexTargets.start(), + insertion="" * image_feature_size, + ) + ``` - .. code-block:: python + Insert these tokens after a prefix ``Images:``: - PromptInsertion( - modality="image", - target=PromptIndexTargets.start(), - insertion="" * image_feature_size, - ) + ```python + PromptInsertion( + modality="image", + target=PromptIndexTargets.prefix("Images:"), + insertion="" * image_feature_size, + ) + ``` - Insert these tokens after a prefix ``Images:``: + Insert these tokens at the end of the prompt: - .. code-block:: python - - PromptInsertion( - modality="image", - target=PromptIndexTargets.prefix("Images:"), - insertion="" * image_feature_size, - ) - - Insert these tokens at the end of the prompt: - - .. code-block:: python - - PromptInsertion( - modality="image", - target=PromptIndexTargets.end(), - insertion="" * image_feature_size, - ) + ```python + PromptInsertion( + modality="image", + target=PromptIndexTargets.end(), + insertion="" * image_feature_size, + ) + ``` """ insertion: PromptUpdateContent = field(repr=False) """ - Given the index of the processed item within :attr:`modality`, - output the token sequence (or text) to insert right after :attr:`target`. + Given the index of the processed item within {attr}`modality`, + output the token sequence (or text) to insert right after {attr}`target`. For convenience, you can directly pass in the token sequence (or text) instead of a function if it does not depend on the input. @@ -280,57 +280,57 @@ class PromptReplacement(PromptUpdate): Example: - For each image, replace one ```` input placeholder in the prompt - with a number of ```` feature placeholders - equal to the feature size of the vision encoder: + For each image, replace one ```` input placeholder in the prompt + with a number of ```` feature placeholders + equal to the feature size of the vision encoder: - .. 
code-block:: python + ```python + PromptReplacement( + modality="image", + target="", + replacement="" * image_feature_size, + ) + ``` - PromptReplacement( - modality="image", - target="", - replacement="" * image_feature_size, - ) + As above, but further pad the feature placeholders with ```` + and ```, which are not supposed to be passed to the vision + encoder: - As above, but further pad the feature placeholders with ```` - and ```, which are not supposed to be passed to the vision - encoder: + ```python + PromptReplacement( + modality="image", + target="", + replacement=PromptUpdateDetails( + full="".join([ + "", + "" * image_feature_size, + "", + ]), + features="" * image_feature_size, + ), + ) + ``` - .. code-block:: python + To avoid unnecessary tokenization during prompt replacement, + we recommended passing token sequences instead of text: - PromptReplacement( - modality="image", - target="", - replacement=PromptUpdateDetails( - full="".join([ - "", - "" * image_feature_size, - "", - ]), - features="" * image_feature_size, - ), - ) - - To avoid unnecessary tokenization during prompt replacement, - we recommended passing token sequences instead of text: - - .. code-block:: python - - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=PromptUpdateDetails( - full=([image_bos_id] + [image_token_id] * image_feature_size - + [image_eos_id]), - features=[image_token_id] * image_feature_size, - ), - ) + ```python + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=PromptUpdateDetails( + full=([image_bos_id] + [image_token_id] * image_feature_size + + [image_eos_id]), + features=[image_token_id] * image_feature_size, + ), + ) + ``` """ replacement: PromptUpdateContent = field(repr=False) """ - Given the index of the processed item within :attr:`modality`, - output the token sequence (or text) to replace :attr:`target`. + Given the index of the processed item within {attr}`modality`, + output the token sequence (or text) to replace {attr}`target`. For convenience, you can directly pass in the token sequence (or text) instead of a function if it does not depend on the input. @@ -384,14 +384,14 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp]) def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: - """Convenience function to apply :func:`full_groupby` based on modality.""" + """Convenience function to apply {func}`full_groupby` based on modality.""" return full_groupby(values, key=lambda x: x.modality) @dataclass class _BoundPromptSequence: """ - A :data:`_PromptSeq` bound to a tokenizer to automatically + A {data}`_PromptSeq` bound to a tokenizer to automatically convert between token sequence and text representations. """ tokenizer: AnyTokenizer = field(repr=False) @@ -443,8 +443,8 @@ class _BoundPromptContent: @dataclass class BoundPromptUpdate: """ - A :class:`PromptUpdate` bound to a tokenizer to automatically convert - :attr:`target` and the result of :meth:`get_content` between + A {class}`PromptUpdate` bound to a tokenizer to automatically convert + {attr}`target` and the result of {meth}`get_content` between token sequence and text representations. """ _origin: PromptUpdate @@ -479,7 +479,7 @@ class BoundPromptUpdate: def get_content(self, item_idx: int) -> _BoundPromptContent: """ - Given the index of the processed item within :attr:`modality`, + Given the index of the processed item within {attr}`modality`, output the token sequence (or text) to update. 
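`PromptUpdateContent` above also allows the replacement (or insertion) to be a callable of the processed item's index rather than a fixed sequence; a hedged sketch, where the per-item feature sizes are hypothetical values computed elsewhere:

```python
from vllm.multimodal.processing import PromptReplacement

# Hypothetical per-image feature sizes, e.g. derived from image resolution.
image_feature_sizes = [256, 576]

replacement = PromptReplacement(
    modality="image",
    target="<image>",
    # Called with the index of the processed image within its modality.
    replacement=lambda item_idx: "<image>" * image_feature_sizes[item_idx],
)
```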
""" content = self.content @@ -516,7 +516,7 @@ def iter_token_matches( match_ids: list[int], ) -> Generator[_TokenMatch]: """ - Yield each occurrence of :code:`match_ids` in :code:`token_ids`. + Yield each occurrence of `match_ids` in `token_ids`. Note that empty matches are ignored. """ @@ -545,8 +545,8 @@ def replace_token_matches( new_ids: list[int], ) -> list[int]: """ - Replace each occurrence of :code:`match_ids` in :code:`token_ids` - with :code:`new_ids`. + Replace each occurrence of `match_ids` in `token_ids` + with `new_ids`. Note that empty matches are ignored. """ @@ -654,7 +654,7 @@ def find_token_matches( prompt: list[int], prompt_updates: Sequence[BoundPromptUpdate], ) -> Sequence[PromptTargetMatch]: - """Return each target of :code:`prompt_updates` found in :code:`prompt`.""" + """Return each target of `prompt_updates` found in `prompt`.""" def get_matches(update: BoundPromptUpdate): target = update.target @@ -680,7 +680,7 @@ def find_text_matches( prompt: str, prompt_updates: Sequence[BoundPromptUpdate], ) -> Sequence[PromptTargetMatch]: - """Return each target of :code:`prompt_updates` found in :code:`prompt`.""" + """Return each target of `prompt_updates` found in `prompt`.""" def get_matches(update: BoundPromptUpdate): target = update.target @@ -707,7 +707,7 @@ def _resolve_matches( mm_matches: Mapping[str, Sequence[PromptTargetMatch]], ) -> list[PromptTargetMatch]: """ - Resolve :code:`mm_matches` to ensure that there are no overlapping matches, + Resolve `mm_matches` to ensure that there are no overlapping matches, and sort them such that earlier matches take priority over later ones. """ matches = [m for matches in mm_matches.values() for m in matches] @@ -731,7 +731,7 @@ def _apply_matches( mm_matches: Mapping[str, Sequence[PromptTargetMatch]], mm_item_counts: Mapping[str, int], ) -> list[_S]: - """Apply the updates in :code:`mm_matches` to :code:`prompt`.""" + """Apply the updates in `mm_matches` to `prompt`.""" out_seqs = list[Union[str, list[int]]]() prev_end_idx = 0 next_idx_by_modality = defaultdict[str, int](lambda: 0) @@ -780,7 +780,7 @@ def apply_token_matches( mm_matches: Mapping[str, Sequence[PromptTargetMatch]], mm_item_counts: Mapping[str, int], ) -> list[int]: - """Apply the updates in :code:`mm_matches` to :code:`prompt`.""" + """Apply the updates in `mm_matches` to `prompt`.""" if not mm_matches: return prompt @@ -794,7 +794,7 @@ def apply_text_matches( mm_matches: Mapping[str, Sequence[PromptTargetMatch]], mm_item_counts: Mapping[str, int], ) -> str: - """Apply the updates in :code:`mm_matches` to :code:`prompt`.""" + """Apply the updates in `mm_matches` to `prompt`.""" if not mm_matches: return prompt @@ -809,7 +809,7 @@ def _iter_placeholders( mm_item_counts: Mapping[str, int], ) -> Iterable[PlaceholderFeaturesInfo]: """ - Yield each set of placeholder tokens found in :code:`prompt`. + Yield each set of placeholder tokens found in `prompt`. Matches are exclusive even when multiple modalities share the same placeholder tokens. In that case, the modality that @@ -1016,7 +1016,7 @@ class ProcessingCache: ) -> None: """ Put a processed multi-modal item into the cache - according to its dependencies (see :meth:`get`). + according to its dependencies (see {meth}`get`). """ cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, **{modality: input_item}, @@ -1083,7 +1083,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo) MultiModalHashes = dict[str, list[str]] """ -A collection of hashes with a similar structure as :class:`MultiModalKwargs`. 
+A collection of hashes with a similar structure as {class}`MultiModalKwargs`. """ @@ -1091,7 +1091,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ Abstract base class to process multi-modal inputs to be used in vLLM. - Not to be confused with :class:`transformers.ProcessorMixin`. + Not to be confused with {class}`transformers.ProcessorMixin`. """ def __init__(self, @@ -1118,10 +1118,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _get_data_parser(self) -> MultiModalDataParser: """ Construct a parser to preprocess multi-modal data items - before passing them to :meth:`_get_hf_mm_data`. + before passing them to {meth}`_get_hf_mm_data`. You can support additional modalities by creating a subclass - of :class:`MultiModalDataParser` that has additional subparsers. + of {class}`MultiModalDataParser` that has additional subparsers. """ return MultiModalDataParser() @@ -1130,8 +1130,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data: MultiModalDataDict, ) -> MultiModalDataItems: """ - Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` - before passing them to :meth:`_get_hf_mm_data`. + Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems` + before passing them to {meth}`_get_hf_mm_data`. """ mm_items = self.data_parser.parse_mm_data(mm_data) supported_mm_limits = self.info.get_supported_mm_limits() @@ -1183,7 +1183,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): inputs. Moreover, this information is critical to determine the token positions - in order to construct :class:`~vllm-multimodal.input.PlaceholderRange` + in order to construct {class}`~vllm-multimodal.input.PlaceholderRange` for each multi-modal item. """ raise NotImplementedError @@ -1237,8 +1237,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ Return whether the HF processor applies prompt updates. - For most HF processors, this should be :code:`True` when multi-modal - data items are passed, but :code:`False` when multi-modal embeddings + For most HF processors, this should be `True` when multi-modal + data items are passed, but `False` when multi-modal embeddings are passed. """ return not any( @@ -1307,7 +1307,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Most HF processors accept prompt text but not prompt tokens. If the HF processor adds or removes tokens that are not related to multi-modal data, you should override this method so it is consistent - with the output of :meth:`_apply_hf_processor_text_only` on the + with the output of {meth}`_apply_hf_processor_text_only` on the corresponding text. """ return prompt_tokens @@ -1322,7 +1322,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Since HF processor requires that text and multi-modal items correspond to each other, we generate dummy text using - :class:`DummyInputsBuilder` to go along with the multi-modal data. + {class}`DummyInputsBuilder` to go along with the multi-modal data. """ mm_counts = mm_items.get_all_counts() @@ -1346,10 +1346,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Apply the HF processor on the prompt text and multi-modal data. In addition, return whether prompt updates have been applied - (for most HF processors, this should be :code:`True`). + (for most HF processors, this should be `True`). Note: - If :code:`enable_hf_prompt_update=False`, we use HF processor + If `enable_hf_prompt_update=False`, we use HF processor to perform prompt updates if available; HF processor requires that the prompt corresponds to multi-modal items. 
""" diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index b351acc8c5950..b5875124c1266 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -25,7 +25,7 @@ logger = init_logger(__name__) class ProcessorInputs: """ Represents the keyword arguments to - :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. + {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. """ prompt_text: str mm_data: MultiModalDataDict @@ -63,7 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): # TODO: @abstractmethod after transition def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: """ - Build the text input corresponding to :code:`mm_counts`. + Build the text input corresponding to `mm_counts`. """ if (type(self).get_dummy_processor_inputs == BaseDummyInputsBuilder.get_dummy_processor_inputs): diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 9d2b4e486b1af..3e62f4c43e10d 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -29,7 +29,7 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) class ProcessingInfoFactory(Protocol[_I_co]): - """Constructs a :class:`MultiModalProcessor` instance from the context.""" + """Constructs a {class}`MultiModalProcessor` instance from the context.""" def __call__( self, @@ -40,7 +40,7 @@ class ProcessingInfoFactory(Protocol[_I_co]): class DummyInputsBuilderFactory(Protocol[_I]): """ - Constructs a :class:`BaseDummyInputsBuilder` instance from the context. + Constructs a {class}`BaseDummyInputsBuilder` instance from the context. """ def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: @@ -48,7 +48,7 @@ class DummyInputsBuilderFactory(Protocol[_I]): class MultiModalProcessorFactory(Protocol[_I]): - """Constructs a :class:`MultiModalProcessor` instance from the context.""" + """Constructs a {class}`MultiModalProcessor` instance from the context.""" def __call__( self, @@ -150,7 +150,7 @@ class MultiModalRegistry: Get the maximum number of tokens from each modality for profiling the memory usage of a model. - See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. + See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details. """ mm_limits = self.get_mm_limits_per_prompt(model_config) @@ -165,7 +165,7 @@ class MultiModalRegistry: Get the maximum number of multi-modal tokens for profiling the memory usage of a model. - See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. + See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details. """ return sum(self.get_max_tokens_by_modality(model_config).values()) @@ -208,8 +208,9 @@ class MultiModalRegistry: When the model receives multi-modal data, the provided function is invoked to transform the data into a dictionary of model inputs. - See also: - :ref:`mm-processing` + :::{seealso} + {ref}`mm-processing` + ::: """ def wrapper(model_cls: N) -> N: @@ -253,8 +254,9 @@ class MultiModalRegistry: """ Create a multi-modal processor for a specific model and tokenizer. 
- See also: - :ref:`mm-processing` + :::{seealso} + {ref}`mm-processing` + ::: """ if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3f9b5be28b02b..aef5f669ac689 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -2,7 +2,7 @@ from itertools import groupby from pathlib import Path -from typing import TYPE_CHECKING, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse import numpy as np @@ -24,6 +24,10 @@ _M = TypeVar("_M") if TYPE_CHECKING: from .hasher import MultiModalHashDict from .inputs import MultiModalKwargs, MultiModalPlaceholderDict +else: + MultiModalHashDict = Any + MultiModalKwargs = Any + MultiModalPlaceholderDict = Any class MediaConnector: @@ -255,7 +259,7 @@ class MediaConnector: global_media_connector = MediaConnector() -"""The global :class:`MediaConnector` instance used by vLLM.""" +"""The global {class}`MediaConnector` instance used by vLLM.""" fetch_audio = global_media_connector.fetch_audio fetch_image = global_media_connector.fetch_image @@ -293,24 +297,24 @@ def encode_video_base64(frames: npt.NDArray) -> str: def merge_and_sort_multimodal_metadata( - mm_positions: "MultiModalPlaceholderDict", - mm_hashes: Optional["MultiModalHashDict"], + mm_positions: MultiModalPlaceholderDict, + mm_hashes: Optional[MultiModalHashDict], ) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]: """Given a MultiModalPlaceholderDict, merge all PlaceholderRange objects from all available modalities into a single list of - PlaceholderRange, sorted by their offset (starting index in the input + PlaceholderRange, sorted by their offset (starting index in the input sequence) in the ascending order. - Optionally if a MultiModalHashDict is given, same operation will be + Optionally if a `MultiModalHashDict` is given, same operation will be applied to the object and the sorted list of hashes will be returned. Returns: - list[str]: List of item modalities in order of their positions in - the input sequence. - list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from - mm_positions. - Optional[list[str]]: Sorted list of all hashes from mm_hashes if - given, None otherwise. + list[str]: List of item modalities in order of their positions in the + input sequence. + list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from + mm_positions. + Optional[list[str]]: Sorted list of all hashes from mm_hashes if given, + None otherwise. """ modalities = list(mm_positions.keys()) @@ -352,22 +356,23 @@ def merge_and_sort_multimodal_metadata( def group_mm_inputs_by_modality( - mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]: - """Group consecutive MultiModalKwargs from mm_inputs with the same modality - together into the same list for batching purpose. For MultiModalKwargs with + mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]: + """Group consecutive MultiModalKwargs from mm_inputs with the same modality + together into the same list for batching purpose. For MultiModalKwargs with multiple modalities, put them into their own list. Args: mm_inputs: List of MultiModalKwargs. Returns: - list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each - inner list contains consecutive MultiModalKwargs with same modality. 
+ list[list[vllm.multimodal.MultiModalKwargs]]: List of list of + `MultiModalKwargs`, each inner list contains consecutive + `MultiModalKwargs` with same modality. """ if not mm_inputs: return [] - def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]: + def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]: # If the input has multiple modalities, return a id as the unique key # for the mm_input input. if len(mm_input.modalities) > 1: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 70553354a0602..e45522a4c407e 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -19,8 +19,6 @@ if TYPE_CHECKING: else: VllmConfig = None -logger = init_logger(__name__) - class CpuPlatform(Platform): _enum = PlatformEnum.CPU diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index f82af426b5a8b..ab03dece8c136 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -454,10 +454,4 @@ finally: CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform -try: - from sphinx.ext.autodoc.mock import _MockModule - - if not isinstance(pynvml, _MockModule): - CudaPlatform.log_warnings() -except ModuleNotFoundError: - CudaPlatform.log_warnings() +CudaPlatform.log_warnings() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 6a78e00a90495..5df0e9d3d0728 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -146,7 +146,7 @@ class Platform: return self._enum == PlatformEnum.OOT def is_cuda_alike(self) -> bool: - """Stateless version of :func:`torch.cuda.is_available`.""" + """Stateless version of {func}`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) def is_sleep_mode_available(self) -> bool: @@ -165,7 +165,7 @@ class Platform: cls, device_id: int = 0, ) -> Optional[DeviceCapability]: - """Stateless version of :func:`torch.cuda.get_device_capability`.""" + """Stateless version of {func}`torch.cuda.get_device_capability`.""" return None @classmethod @@ -180,7 +180,7 @@ class Platform: The ``capability`` argument can either be: - A tuple ``(major, minor)``. - - An integer ````. (See :meth:`DeviceCapability.to_int`) + - An integer ````. (See {meth}`DeviceCapability.to_int`) """ current_capability = cls.get_device_capability(device_id=device_id) if current_capability is None: diff --git a/vllm/profiler/__init__.py b/vllm/profiler/__init__.py index 00af72b1d41fc..e69de29bb2d1d 100644 --- a/vllm/profiler/__init__.py +++ b/vllm/profiler/__init__.py @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from .layerwise_profile import layerwise_profile - -__all__ = [ - "layerwise_profile", -] diff --git a/vllm/sequence.py b/vllm/sequence.py index 5bc9b8a6fc82a..91f769d6dbd9f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1 def array_full(token_id: int, count: int): - """:class:`array` equivalent of :func:`numpy.full`.""" + """{class}`array` equivalent of {func}`numpy.full`.""" return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count @@ -192,11 +192,11 @@ class SequenceData(msgspec.Struct, def from_prompt_token_counts( *token_counts: tuple[int, int]) -> "SequenceData": """ - Construct a :class:`SequenceData` instance by concatenating + Construct a {class}`SequenceData` instance by concatenating prompt token sequences. Each tuple represents one token sequence, expressed in the form - :code:`(token_id, count)`. + `(token_id, count)`. 
""" if len(token_counts) == 0: return SequenceData.from_seqs([]) @@ -216,7 +216,7 @@ class SequenceData(msgspec.Struct, prompt_embeds: Optional[torch.Tensor] = None, ) -> "SequenceData": """ - Construct a :class:`SequenceData` instance from prompt and output + Construct a {class}`SequenceData` instance from prompt and output token sequences. """ prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, @@ -452,9 +452,9 @@ class SequenceData(msgspec.Struct, class Sequence: """Stores the data, status, and block information of a sequence. - The sequence is constructed from the :data:`DecoderOnlyInputs` - (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) - instance passed in through the :code:`inputs` constructor argument. + The sequence is constructed from the {data}`DecoderOnlyInputs` + (for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder) + instance passed in through the `inputs` constructor argument. Args: seq_id: The ID of the sequence. diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 6919562465097..ea3d91d7893bb 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -52,7 +52,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase): """Create a SmallerTpProposerWorker. Args: - worker (MultiStepWorker): an actual worker wrapped with this class + worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an + actual worker wrapped with this class draft_ranks (List[int]): if this value is given, only the GPU ranks written in this value participate in draft generation """ diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py index 8f40b2b7df7ab..bffa127fecb25 100644 --- a/vllm/transformers_utils/configs/dbrx.py +++ b/vllm/transformers_utils/configs/dbrx.py @@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig): initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. Enabling this will also - allow the model to output the auxiliary loss. See [here]() for more details + Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. router_aux_loss_coef (`float`, *optional*, defaults to 0.001): The aux loss factor for the total loss. diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 39364367e3031..8181604191a19 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig): Instantiating a configuration with the defaults will yield a similar configuration to that of the Exaone - Configuration objects inherit from :class:`~transformers.PretrainedConfig` + Configuration objects inherit from {class}`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from : class:`~transformers.PretrainedConfig` for more information. Args: - vocab_size (:obj:`int`, `optional`, defaults to 50257): + vocab_size ({obj}`int`, `optional`, defaults to 50257): Vocabulary size of the GPT Lingvo model. 
Defines the number of - different tokens that can be represented by the :obj:`inputs_ids` - passed when calling :class:`~transformers.ExaoneModel`. Vocabulary + different tokens that can be represented by the {obj}`inputs_ids` + passed when calling {class}`~transformers.ExaoneModel`. Vocabulary size of the model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class: `~transformers.EXAONEModel`. - hidden_size (:obj:`int`, `optional`, defaults to 2048): + hidden_size ({obj}`int`, `optional`, defaults to 2048): Dimensionality of the encoder layers and the pooler layer. - num_layers (:obj:`int`, `optional`, defaults to 24): + num_layers ({obj}`int`, `optional`, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the @@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig): specified, will default to `num_attention_heads`. rotary_pct (`float`, *optional*, defaults to 0.25): percentage of hidden dimensions to allocate to rotary embeddings - intermediate_size (:obj:`int`, `optional`, defaults to 8192): + intermediate_size ({obj}`int`, `optional`, defaults to 8192): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - activation_function (:obj:`str` or :obj:`function`, `optional`, - defaults to :obj:`"gelu_new"`): + activation_function ({obj}`str` or {obj}`function`, `optional`, + defaults to {obj}`"gelu_new"`): The non-linear activation function (function or string) in the - encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, - :obj:`"selu"` and :obj:`"gelu_new"` are supported. - embed_dropout (:obj:`float`, `optional`, defaults to 0.0): + encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`, + {obj}`"selu"` and {obj}`"gelu_new"` are supported. + embed_dropout ({obj}`float`, `optional`, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + attention_dropout ({obj}`float`, `optional`, defaults to 0.0): The dropout ratio for the attention probabilities. - max_position_embeddings (:obj:`int`, `optional`, defaults to 2048): + max_position_embeddings ({obj}`int`, `optional`, defaults to 2048): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling - :class:`~transformers.EXAONEModel`. - initializer_range (:obj:`float`, `optional`, defaults to 0.02): + type_vocab_size ({obj}`int`, `optional`, defaults to 2): + The vocabulary size of the {obj}`token_type_ids` passed when calling + {class}`~transformers.EXAONEModel`. + initializer_range ({obj}`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): + layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5): The epsilon used by the layer normalization layers. - use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`): Whether or not the model should return the last key/values attentions (not used by all models). 
Only relevant if ``config.is_decoder=True``. - gradient_checkpointing (:obj:`bool`, `optional`, - defaults to :obj:`False`): + gradient_checkpointing ({obj}`bool`, `optional`, + defaults to {obj}`False`): If True, use gradient checkpointing to save memory at the expense of slower backward pass. Example:: diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 57b9242b88ab5..e31580ede57ba 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -39,9 +39,9 @@ def decode_tokens( ) -> str: """ Backend-agnostic equivalent of HF's - :code:`tokenizer.decode(token_ids, ...)`. + `tokenizer.decode(token_ids, ...)`. - :code:`skip_special_tokens=None` means to use the backend's default + `skip_special_tokens=None` means to use the backend's default settings. """ if skip_special_tokens is not None: @@ -61,9 +61,9 @@ def encode_tokens( ) -> list[int]: """ Backend-agnostic equivalent of HF's - :code:`tokenizer.encode(text, ...)`. + `tokenizer.encode(text, ...)`. - :code:`add_special_tokens=None` means to use the backend's default + `add_special_tokens=None` means to use the backend's default settings. """ diff --git a/vllm/utils.py b/vllm/utils.py index f85bbe3a5990a..3f334f94bc2a8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -309,8 +309,8 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]): """ Gets the cumulative number of hits and queries against this cache. - If :code:`delta=True`, instead gets these statistics - since the last call that also passed :code:`delta=True`. + If `delta=True`, instead gets these statistics + since the last call that also passed `delta=True`. """ info = CacheInfo(hits=self._hits, total=self._total) @@ -983,7 +983,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]: def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]): """ - Unlike :class:`itertools.groupby`, groups are not broken by + Unlike {class}`itertools.groupby`, groups are not broken by non-contiguous data. """ groups = defaultdict[_K, list[_V]](list) @@ -1773,14 +1773,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor: return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor) -def is_in_doc_build() -> bool: - try: - from sphinx.ext.autodoc.mock import _MockModule - return isinstance(torch, _MockModule) - except ModuleNotFoundError: - return False - - def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): """ Import a Python file according to its file path. @@ -1820,10 +1812,11 @@ class _PlaceholderBase: Disallows downstream usage of placeholder modules. We need to explicitly override each dunder method because - :meth:`__getattr__` is not called when they are accessed. + {meth}`__getattr__` is not called when they are accessed. - See also: - [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) + :::{seealso} + [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) + ::: """ def __getattr__(self, key: str) -> Never: @@ -2052,9 +2045,6 @@ def direct_register_custom_op( library object. If you want to bind the operator to a different library, make sure the library object is alive when the operator is used. 
""" - if is_in_doc_build(): - return - if not supports_custom_op(): from vllm.platforms import current_platform assert not current_platform.is_cuda_alike(), ( diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 3e77555d7f942..8b1875e7356b2 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """ +# MLA Common Components + This file implements common components for MLA implementations. First we define: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index cb13a5b7a02fc..a2fa5825bb1af 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -180,6 +180,7 @@ class KVCacheManager: as eagle. Blocks layout: + ``` ----------------------------------------------------------------------- | < computed > | < new computed > | < new > | < pre-allocated > | ----------------------------------------------------------------------- @@ -189,6 +190,7 @@ class KVCacheManager: ------------------------------------------------ | | -------------- + ``` The following *_blocks are illustrated in this layout. Returns: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index f76c44cb8bca7..5f5ffe6e09db6 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -308,7 +308,7 @@ class OutputProcessor: * If there is no queue (for usage with LLMEngine), return a list of RequestOutput objects. - ****************** NOTE FOR DEVELOPERS ****************** + NOTE FOR DEVELOPERS vLLM V1 minimizes the number of python loops over the full batch to ensure system overheads are minimized. This is the @@ -316,8 +316,6 @@ class OutputProcessor: If you need to touch every element of the batch, do it from within the loop below. - - ********************************************************** """ request_outputs: list[RequestOutput] = [] diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 9061a64db57c9..b25443dd45edd 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -75,7 +75,7 @@ class RejectionSampler(nn.Module): outside of the rejection sampler with the default sampling strategy. It allows for more flexibility in the sampling process such as top_p, top_k sampling. - sampling_metadata (SamplingMetadata): + sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata): Additional metadata needed for sampling, such as temperature, top-k/top-p parameters, or other relevant information. Returns: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 68c4e94fcd73e..ac6861f93a832 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -170,9 +170,10 @@ class Worker(WorkerBase): Then, it calculate the free memory that can be used for KV cache in bytes. - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. + :::{tip} + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. 
+ ::: """ torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index e46ca0c90fe38..267754036b317 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs( ) -> None: """ Perform sanity checks for the result of - :meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`. + {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`. """ assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), ( "Expected multimodal embeddings to be a list/tuple of 2D tensors, " @@ -39,7 +39,7 @@ def scatter_mm_placeholders( Scatter the multimodal embeddings into a contiguous tensor that represents the placeholder tokens. - :class:`vllm.multimodal.processing.PromptUpdateDetails.is_embed`. + {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`. Args: embeds: The multimodal embeddings. @@ -66,7 +66,7 @@ def gather_mm_placeholders( """ Reconstructs the embeddings from the placeholder tokens. - This is the operation of :func:`scatter_mm_placeholders`. + This is the operation of {func}`scatter_mm_placeholders`. """ if is_embed is None: return placeholders diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 8d7d5d7adc105..7898c645d66af 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -201,9 +201,10 @@ class HPUWorker(LocalOrDistributedWorkerBase): Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. + :::{tip} + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + ::: """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 58bf31cf2f33d..0825abbed1437 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -734,11 +734,11 @@ def _pythonize_sampler_output( cache: Optional[PythonizationCache], ) -> None: """ This function is only called when the output tensors are ready. - See :class:`ModelOutput`. + See {class}`ModelOutput`. Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, adding a Pythonized output data structure - (:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`. + ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`. Args: model_input diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 78ea990de820c..1a14919ddfb26 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -230,9 +230,10 @@ class Worker(LocalOrDistributedWorkerBase): Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. + :::{tip} + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + ::: """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
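Note on the worker hunks above: they only restyle the reST `.. tip::` directives into MyST `:::{tip}` admonitions; the advice itself (capping memory via `gpu_memory_utilization`) is unchanged. For readers unfamiliar with that knob, here is a minimal usage sketch, assuming a vLLM install; the model name and the 0.80 value are illustrative placeholders and not part of this patch:

```python
# Hypothetical usage sketch: cap the fraction of GPU memory vLLM may claim
# when it profiles free memory for the KV cache, as the tips above describe.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",    # placeholder model, any supported model works
    gpu_memory_utilization=0.80,  # use at most ~80% of each GPU's memory
)

# Returns a list of RequestOutput objects for the single prompt.
print(llm.generate("Hello, world!"))
```
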
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 3aea0d7419d02..17f533525171b 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -26,7 +26,7 @@ logger = init_logger(__name__) class XPUWorker(LoRANotSupportedWorkerBase, Worker): """A worker class that executes (a partition of) the model on a GPU. - + Each worker is associated with a single XPU device. The worker is responsible for maintaining the KV cache and executing the model on the XPU. In case of distributed inference, each worker is assigned a partition @@ -93,9 +93,10 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. + :::{tip} + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + ::: """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. From 2858830c39da0ae153bc1328dbba7680f5fbebe1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 4 May 2025 20:43:05 +0800 Subject: [PATCH 3/3] [Bugfix] Prioritize dtype in root config before checking text config (#17629) Signed-off-by: DarkLight1337 --- vllm/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 3bac36fcbbeaa..40beace3040c5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2954,10 +2954,12 @@ def _get_and_verify_dtype( ) -> torch.dtype: # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct # because config.torch_dtype can be None. - config_dtype = getattr(config.get_text_config(), "torch_dtype", None) + config_dtype = getattr(config, "torch_dtype", None) - # Fallback for multi-modal models if the root config + # Fallbacks for multi-modal models if the root config # does not define torch_dtype + if config_dtype is None: + config_dtype = getattr(config.get_text_config(), "torch_dtype", None) if config_dtype is None and hasattr(config, "vision_config"): config_dtype = getattr(config.vision_config, "torch_dtype", None)
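Note on the final hunk: it reorders dtype resolution so the root config's `torch_dtype` takes priority, falling back to the text config and then to `vision_config` only when the root does not define it. A minimal standalone sketch of that fallback chain follows; the helper name and the duck-typed `config` argument are illustrative, not vLLM's actual API:

```python
# Sketch of the dtype fallback order the hunk above establishes:
# root config -> text config -> vision config -> None.
from typing import Optional

import torch


def resolve_config_dtype(config) -> Optional[torch.dtype]:
    """Return the first torch_dtype found, preferring the root config."""
    dtype = getattr(config, "torch_dtype", None)
    if dtype is None and hasattr(config, "get_text_config"):
        dtype = getattr(config.get_text_config(), "torch_dtype", None)
    if dtype is None and hasattr(config, "vision_config"):
        dtype = getattr(config.vision_config, "torch_dtype", None)
    return dtype
```
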