Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-09 05:34:55 +08:00)
Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 46fae69cf0
commit d6484ef3c3
@@ -39,7 +39,7 @@ steps:
- pip install -r ../../requirements/docs.txt
- SPHINXOPTS="-W" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep "sig sig-object py" build/html/api/inference_params.html
- grep "sig sig-object py" build/html/api/vllm/vllm.sampling_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 24min
  source_file_dependencies:
.gitignore (vendored, 1 change)

@@ -80,6 +80,7 @@ instance/
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/
docs/source/api/vllm

# PyBuilder
.pybuilder/
@@ -22,3 +22,4 @@ help:
clean:
	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
	rm -rf "$(SOURCEDIR)/getting_started/examples"
	rm -rf "$(SOURCEDIR)/api/vllm"
@@ -1,7 +0,0 @@
# AsyncLLMEngine

```{eval-rst}
.. autoclass:: vllm.AsyncLLMEngine
    :members:
    :show-inheritance:
```

@@ -1,17 +0,0 @@
# vLLM Engine

```{eval-rst}
.. automodule:: vllm.engine
```

```{eval-rst}
.. currentmodule:: vllm.engine
```

:::{toctree}
:caption: Engines
:maxdepth: 2

llm_engine
async_llm_engine
:::

@@ -1,7 +0,0 @@
# LLMEngine

```{eval-rst}
.. autoclass:: vllm.LLMEngine
    :members:
    :show-inheritance:
```

@@ -1,21 +0,0 @@
# Inference Parameters

Inference parameters for vLLM APIs.

(sampling-params)=

## Sampling Parameters

```{eval-rst}
.. autoclass:: vllm.SamplingParams
    :members:
```

(pooling-params)=

## Pooling Parameters

```{eval-rst}
.. autoclass:: vllm.PoolingParams
    :members:
```
@@ -1,9 +0,0 @@
# Model Adapters

## Module Contents

```{eval-rst}
.. automodule:: vllm.model_executor.models.adapters
    :members:
    :member-order: bysource
```

@@ -1,11 +0,0 @@
# Model Development

## Submodules

:::{toctree}
:maxdepth: 1

interfaces_base
interfaces
adapters
:::

@@ -1,9 +0,0 @@
# Optional Interfaces

## Module Contents

```{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces
    :members:
    :member-order: bysource
```

@@ -1,9 +0,0 @@
# Base Model Interfaces

## Module Contents

```{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces_base
    :members:
    :member-order: bysource
```
@@ -1,28 +0,0 @@
(multi-modality)=

# Multi-Modality

vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.

Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.

Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).

## Module Contents

```{eval-rst}
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
```

## Submodules

:::{toctree}
:maxdepth: 1

inputs
parse
processing
profiling
registry
:::

@@ -1,49 +0,0 @@
# Input Definitions

## User-facing inputs

```{eval-rst}
.. autodata:: vllm.multimodal.inputs.MultiModalDataDict
```

## Internal data structures

```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
    :members:
    :show-inheritance:
```

```{eval-rst}
.. autodata:: vllm.multimodal.inputs.NestedTensors
```

```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
    :members:
    :show-inheritance:
```

```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
    :members:
    :show-inheritance:
```

```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
    :members:
    :show-inheritance:
```

```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
    :members:
    :show-inheritance:
```

```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalInputs
    :members:
    :show-inheritance:
```
@@ -1,9 +0,0 @@
# Data Parsing

## Module Contents

```{eval-rst}
.. automodule:: vllm.multimodal.parse
    :members:
    :member-order: bysource
```

@@ -1,9 +0,0 @@
# Data Processing

## Module Contents

```{eval-rst}
.. automodule:: vllm.multimodal.processing
    :members:
    :member-order: bysource
```

@@ -1,9 +0,0 @@
# Memory Profiling

## Module Contents

```{eval-rst}
.. automodule:: vllm.multimodal.profiling
    :members:
    :member-order: bysource
```

@@ -1,9 +0,0 @@
# Registry

## Module Contents

```{eval-rst}
.. automodule:: vllm.multimodal.registry
    :members:
    :member-order: bysource
```
@@ -1,9 +0,0 @@
# Offline Inference

:::{toctree}
:caption: Contents
:maxdepth: 1

llm
llm_inputs
:::

@@ -1,7 +0,0 @@
# LLM Class

```{eval-rst}
.. autoclass:: vllm.LLM
    :members:
    :show-inheritance:
```

@@ -1,19 +0,0 @@
# LLM Inputs

```{eval-rst}
.. autodata:: vllm.inputs.PromptType
```

```{eval-rst}
.. autoclass:: vllm.inputs.TextPrompt
    :show-inheritance:
    :members:
    :member-order: bysource
```

```{eval-rst}
.. autoclass:: vllm.inputs.TokensPrompt
    :show-inheritance:
    :members:
    :member-order: bysource
```
docs/source/api/summary.md (new file, 133 lines)

@@ -0,0 +1,133 @@
# Summary

(configuration)=

## Configuration

API documentation for vLLM's configuration classes.

```{autodoc2-summary}
vllm.config.ModelConfig
vllm.config.CacheConfig
vllm.config.TokenizerPoolConfig
vllm.config.LoadConfig
vllm.config.ParallelConfig
vllm.config.SchedulerConfig
vllm.config.DeviceConfig
vllm.config.SpeculativeConfig
vllm.config.LoRAConfig
vllm.config.PromptAdapterConfig
vllm.config.MultiModalConfig
vllm.config.PoolerConfig
vllm.config.DecodingConfig
vllm.config.ObservabilityConfig
vllm.config.KVTransferConfig
vllm.config.CompilationConfig
vllm.config.VllmConfig
```

(offline-inference-api)=

## Offline Inference

LLM Class.

```{autodoc2-summary}
vllm.LLM
```

LLM Inputs.

```{autodoc2-summary}
vllm.inputs.PromptType
vllm.inputs.TextPrompt
vllm.inputs.TokensPrompt
```

## vLLM Engines

Engine classes for offline and online inference.

```{autodoc2-summary}
vllm.LLMEngine
vllm.AsyncLLMEngine
```

## Inference Parameters

Inference parameters for vLLM APIs.

(sampling-params)=
(pooling-params)=

```{autodoc2-summary}
vllm.SamplingParams
vllm.PoolingParams
```

(multi-modality)=

## Multi-Modality

vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.

Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.

Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).

```{autodoc2-summary}
vllm.multimodal.MULTIMODAL_REGISTRY
```

### Inputs

User-facing inputs.

```{autodoc2-summary}
vllm.multimodal.inputs.MultiModalDataDict
```

Internal data structures.

```{autodoc2-summary}
vllm.multimodal.inputs.PlaceholderRange
vllm.multimodal.inputs.NestedTensors
vllm.multimodal.inputs.MultiModalFieldElem
vllm.multimodal.inputs.MultiModalFieldConfig
vllm.multimodal.inputs.MultiModalKwargsItem
vllm.multimodal.inputs.MultiModalKwargs
vllm.multimodal.inputs.MultiModalInputs
```

### Data Parsing

```{autodoc2-summary}
vllm.multimodal.parse
```

### Data Processing

```{autodoc2-summary}
vllm.multimodal.processing
```

### Memory Profiling

```{autodoc2-summary}
vllm.multimodal.profiling
```

### Registry

```{autodoc2-summary}
vllm.multimodal.registry
```

## Model Development

```{autodoc2-summary}
vllm.model_executor.models.interfaces_base
vllm.model_executor.models.interfaces
vllm.model_executor.models.adapters
```
docs/source/autodoc2_docstring_parser.py (new file, 21 lines)

@@ -0,0 +1,21 @@
# SPDX-License-Identifier: Apache-2.0
from docutils import nodes
from myst_parser.parsers.sphinx_ import MystParser
from sphinx.ext.napoleon import docstring


class NapoleonParser(MystParser):

    def parse(self, input_string: str, document: nodes.document) -> None:
        # Get the Sphinx configuration
        config = document.settings.env.config

        parsed_content = str(
            docstring.GoogleDocstring(
                str(docstring.NumpyDocstring(input_string, config)),
                config,
            ))
        return super().parse(parsed_content, document)


Parser = NapoleonParser
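For context, the parser above funnels every docstring through Napoleon's NumPy and Google converters before MyST parses the result. A rough sketch of that pre-processing step, assuming only that `sphinx` is installed (the docstring and the default `Config()` are illustrative, not the project's real settings):

```python
# Illustrative sketch: Napoleon rewrites Google/NumPy-style sections into
# plain reST field lists, which MyST (with the "fieldlist" extension) renders.
from sphinx.ext.napoleon import Config, docstring

google_style = """Summary line.

Args:
    prompt (str): The input prompt.

Returns:
    str: The generated text.
"""

config = Config()  # assumed defaults; the real build passes the Sphinx app config
print(str(docstring.GoogleDocstring(google_style, config)))
# Emits reST field lists along the lines of ":param prompt:" / ":returns:".
```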
@@ -13,16 +13,17 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.

import datetime
import inspect
import logging
import os
import re
import sys
from pathlib import Path

import requests
from sphinx.ext import autodoc

logger = logging.getLogger(__name__)
sys.path.append(os.path.abspath("../.."))
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.append(os.path.abspath(REPO_ROOT))

# -- Project information -----------------------------------------------------
@@ -40,8 +41,7 @@ extensions = [
    "sphinx.ext.linkcode",
    "sphinx.ext.intersphinx",
    "sphinx_copybutton",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "autodoc2",
    "myst_parser",
    "sphinxarg.ext",
    "sphinx_design",

@@ -49,7 +49,22 @@
]
myst_enable_extensions = [
    "colon_fence",
    "fieldlist",
]
autodoc2_packages = [
    {
        "path": "../../vllm",
        "exclude_dirs": ["__pycache__", "third_party"],
    },
]
autodoc2_output_dir = "api"
autodoc2_render_plugin = "myst"
autodoc2_hidden_objects = ["dunder", "private", "inherited"]
autodoc2_docstring_parser_regexes = [
    (".*", "docs.source.autodoc2_docstring_parser"),
]
autodoc2_sort_names = True
autodoc2_index_template = None

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

@@ -77,6 +92,11 @@ html_theme_options = {
    'repository_url': 'https://github.com/vllm-project/vllm',
    'use_repository_button': True,
    'use_edit_page_button': True,
    # Prevents the full API being added to the left sidebar of every page.
    # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB.
    'collapse_navbar': True,
    # Makes API visible in the right sidebar on API reference pages.
    'show_toc_level': 3,
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
@@ -164,73 +184,64 @@ def linkcode_resolve(domain, info):
return None
if not info['module']:
return None
filename = info['module'].replace('.', '/')
module = info['module']

# try to determine the correct file and line number to link to
obj = sys.modules[module]
# Get path from module name
file = Path(f"{info['module'].replace('.', '/')}.py")
path = REPO_ROOT / file
if not path.exists():
path = REPO_ROOT / file.with_suffix("") / "__init__.py"
if not path.exists():
return None

# get as specific as we can
lineno: int = 0
filename: str = ""
try:
for part in info['fullname'].split('.'):
obj = getattr(obj, part)
# Get the line number of the object
with open(path) as f:
lines = f.readlines()
name = info['fullname'].split(".")[-1]
pattern = fr"^( {{4}})*((def|class) )?{name}\b.*"
for lineno, line in enumerate(lines, 1):
if not line or line.startswith("#"):
continue
if re.match(pattern, line):
break

# Skip decorator wrappers by checking if the object is a function
# and has a __wrapped__ attribute (which decorators typically set)
while hasattr(obj, '__wrapped__'):
obj = obj.__wrapped__
# If the line number is not found, return None
if lineno == len(lines):
return None

if not (inspect.isclass(obj) or inspect.isfunction(obj)
or inspect.ismethod(obj)):
obj = obj.__class__  # Get the class of the instance

lineno = inspect.getsourcelines(obj)[1]
filename = (inspect.getsourcefile(obj)
or f"{filename}.py").split("vllm/", 1)[1]
except Exception:
# For some things, like a class member, won't work, so
# we'll use the line number of the parent (the class)
pass

if filename.startswith("checkouts/"):
# If the line number is found, create the URL
filename = path.relative_to(REPO_ROOT)
if "checkouts" in path.parts:
# a PR build on readthedocs
pr_number = filename.split("/")[1]
filename = filename.split("/", 2)[2]
pr_number = REPO_ROOT.name
base, branch = get_repo_base_and_branch(pr_number)
if base and branch:
return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"

# Otherwise, link to the source file on the main branch
return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"

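As a rough illustration of the file-scanning approach the rewritten `linkcode_resolve` takes above, the following standalone sketch applies the same regex to invented source text (the module contents and object name are hypothetical):

```python
# Sketch of locating a definition's line number by scanning source text,
# mirroring the pattern used in linkcode_resolve; inputs are made up.
import re

source = """class SamplingParams:
    def clone(self):
        pass
"""

name = "clone"  # last component of the documented object's fullname
pattern = fr"^( {{4}})*((def|class) )?{name}\b.*"

for lineno, line in enumerate(source.splitlines(), 1):
    if not line or line.startswith("#"):
        continue
    if re.match(pattern, line):
        print(f"{name} found on line {lineno}")  # -> line 2
        break
```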
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
# Mock out external dependencies here, otherwise sphinx-argparse won't work.
autodoc_mock_imports = [
    "huggingface_hub",
    "pydantic",
    "zmq",
    "cloudpickle",
    "aiohttp",
    "starlette",
    "blake3",
    "compressed_tensors",
    "cpuinfo",
    "cv2",
    "torch",
    "transformers",
    "psutil",
    "prometheus_client",
    "sentencepiece",
    "vllm._C",
    "PIL",
    "numpy",
    'triton',
    "tqdm",
    "tensorizer",
    "pynvml",
    "outlines",
    "xgrammar",
    "librosa",
    "soundfile",
    "gguf",
    "lark",
    "decord",
    # The mocks below are required by
    # docs/source/serving/openai_compatible_server.md's
    # vllm.entrypoints.openai.cli_args
    "openai",
    "fastapi",
    "partial_json_parser",
]

for mock_target in autodoc_mock_imports:

@@ -241,18 +252,6 @@ for mock_target in autodoc_mock_imports:
"been loaded into sys.modules when the sphinx build starts.",
mock_target)


class MockedClassDocumenter(autodoc.ClassDocumenter):
"""Remove note about base class when a class is derived from object."""

def add_line(self, line: str, source: str, *lineno: int) -> None:
if line == " Bases: :py:class:`object`":
return
super().add_line(line, source, *lineno)


autodoc.ClassDocumenter = MockedClassDocumenter

intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"typing_extensions":

@@ -264,7 +263,4 @@ intersphinx_mapping = {
"psutil": ("https://psutil.readthedocs.io/en/stable", None),
}

autodoc_preserve_defaults = True
autodoc_warningiserror = True

navigation_with_keys = False
@@ -52,8 +52,8 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

More API details can be found in the {doc}`Offline Inference
</api/offline_inference/index>` section of the API docs.
More API details can be found in the [Offline Inference]
(#offline-inference-api) section of the API docs.

The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
@@ -42,7 +42,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
* [APC](#automatic-prefix-caching)
* [LoRA](#lora-adapter)
* <abbr title="Prompt Adapter">prmpt adptr</abbr>
* [SD](#spec_decode)
* [SD](#spec-decode)
* CUDA graph
* <abbr title="Pooling Models">pooling</abbr>
* <abbr title="Encoder-Decoder Models">enc-dec</abbr>

@@ -122,7 +122,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
*
*
*
- * [SD](#spec_decode)
- * [SD](#spec-decode)
* ✅
* ✅
* ❌

@@ -377,7 +377,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
* ✅
* [❌](gh-issue:8475)
* ✅
- * [SD](#spec_decode)
- * [SD](#spec-decode)
* ✅
* ✅
* ✅
@@ -194,11 +194,8 @@ contributing/vulnerability_management
:caption: API Reference
:maxdepth: 2

api/offline_inference/index
api/engine/index
api/inference_params
api/multimodal/index
api/model/index
api/summary
api/vllm/vllm
:::

% Latest news and acknowledgements
@@ -14,7 +14,7 @@ Usually, this is automatically inferred so you don't have to specify it.
## Offline Inference

The {class}`~vllm.LLM` class provides various methods for offline inference.
See [Engine Arguments](#engine-args) for a list of options when initializing the model.
See <project:#configuration> for a list of options when initializing the model.

### `LLM.generate`

@@ -60,7 +60,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference

The {class}`~vllm.LLM` class provides various methods for offline inference.
See [Engine Arguments](#engine-args) for a list of options when initializing the model.
See <project:#configuration> for a list of options when initializing the model.

### `LLM.encode`

@@ -25,7 +25,7 @@ The available APIs depend on the type of model that is being run:
Please refer to the above pages for more details about each API.

:::{seealso}
[API Reference](/api/offline_inference/index)
[API Reference](#offline-inference-api)
:::

(configuration-options)=

@@ -33,7 +33,7 @@ Please refer to the above pages for more details about each API.
## Configuration Options

This section lists the most common options for running the vLLM engine.
For a full list, refer to the [Engine Arguments](#engine-args) page.
For a full list, refer to the <project:#configuration> page.

(model-resolution)=
@@ -14,7 +14,7 @@ import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler import layerwise_profile
from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser

BATCH_SIZE_DEFAULT = 1
@@ -1,27 +1,15 @@
sphinx==8.2.3
sphinx-argparse==0.5.2
sphinx-autodoc2==0.5.0
sphinx-book-theme==1.1.4
sphinx-copybutton==0.5.2
sphinx-design==0.6.1
sphinx-togglebutton==0.3.2
myst-parser==4.0.1
msgspec
cloudpickle
commonmark # Required by sphinx-argparse when using :markdownhelp:

# packages to install to build the documentation
cachetools
pydantic >= 2.8
-f https://download.pytorch.org/whl/cpu
torch
py-cpuinfo
transformers
mistral_common >= 1.5.4
aiohttp
starlette
scipy
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
zmq
torch
@@ -112,11 +112,11 @@ class AudioTestAssets(list[AudioAsset]):


IMAGE_ASSETS = ImageTestAssets()
"""Singleton instance of :class:`ImageTestAssets`."""
"""Singleton instance of {class}`ImageTestAssets`."""
VIDEO_ASSETS = VideoTestAssets()
"""Singleton instance of :class:`VideoTestAssets`."""
"""Singleton instance of {class}`VideoTestAssets`."""
AUDIO_ASSETS = AudioTestAssets()
"""Singleton instance of :class:`AudioTestAssets`."""
"""Singleton instance of {class}`AudioTestAssets`."""


@pytest.fixture(scope="function", autouse=True)

@@ -724,7 +724,7 @@ def hf_runner():
class VllmRunner:
"""
The default value of some arguments have been modified from
:class:`~vllm.LLM` as follows:
{class}`~vllm.LLM` as follows:

- `trust_remote_code`: Set to `True` instead of `False` for convenience.
- `seed`: Set to `0` instead of `None` for test reproducibility.

@@ -2,7 +2,7 @@
"""
This test file includes some cases where it is inappropriate to
only get the `eos_token_id` from the tokenizer as defined by
:meth:`vllm.LLMEngine._get_eos_token_id`.
{meth}`vllm.LLMEngine._get_eos_token_id`.
"""
from vllm.transformers_utils.config import try_get_generation_config
from vllm.transformers_utils.tokenizer import get_tokenizer

@@ -952,7 +952,7 @@ def get_client_text_logprob_generations(
completions: list[Completion]) -> list[TextTextLogprobs]:
'''Operates on the output of a request made to an Open-AI-protocol
completions endpoint; obtains top-rank logprobs for each token in
each :class:`SequenceGroup`
each {class}`SequenceGroup`
'''
text_generations = get_client_text_generations(completions)
text = ''.join(text_generations)

@@ -44,7 +44,7 @@ def create_scheduler(
(None)

Returns:
:class:`Scheduler` instance
{class}`Scheduler` instance
'''
if max_model_len is None:
max_model_len = max_num_batched_tokens
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
"""
# MLA Common Components

This file implements common components for MLA implementations.

First we define:

@@ -550,7 +550,7 @@ def get_num_prefill_decode_query_kv_tokens(
based on the attention metadata and the specified attention type.

Args:
attn_metadata (FlashAttentionMetadata): Attention Metadata object.
attn_metadata (AttentionMetadata): Attention Metadata object.
attn_type (AttentionType): The type of attention being used.
Returns:
Tuple[int, int, int]: A tuple containing three integers:

@@ -39,7 +39,7 @@ class CompilerInterface:
Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model.

See :meth:`VllmConfig.compute_hash` to check what information
See {meth}`VllmConfig.compute_hash` to check what information
is already considered by default. This function should only
consider the information that is specific to the compiler.
"""

@@ -1911,10 +1911,10 @@ class SchedulerConfig:

cuda_graph_sizes: list[int] = field(default_factory=lambda: [512])
"""Cuda graph capture sizes, default is 512.
1. if one value is provided, then the capture list would follow the pattern:
[1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
2. more than one value (e.g. 1 2 128) is provided,
then the capture list will follow the provided list."""
1. if one value is provided, then the capture list would follow the
pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
2. more than one value (e.g. 1 2 128) is provided, then the capture list
will follow the provided list."""

delay_factor: float = 0.0
"""Apply a delay (of delay factor multiplied by previous
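To make the capture-list rule in that docstring concrete, here is a small sketch of the described pattern (the helper function is ours, not part of vLLM):

```python
# Illustrative only: expands cuda_graph_sizes the way the docstring describes.
def capture_sizes(cuda_graph_sizes: list[int]) -> list[int]:
    if len(cuda_graph_sizes) == 1:
        # One value: [1, 2, 4] plus multiples of 8 up to that value.
        return [1, 2, 4] + [i for i in range(8, cuda_graph_sizes[0] + 1, 8)]
    # Several values: the provided list is used as-is.
    return cuda_graph_sizes

print(capture_sizes([512])[:6])    # [1, 2, 4, 8, 16, 24]
print(capture_sizes([1, 2, 128]))  # [1, 2, 128]
```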
@@ -2888,7 +2888,7 @@ class PoolerConfig:
pooling_type: Optional[str] = None
"""
The pooling method of the pooling model. This should be a key in
:class:`vllm.model_executor.layers.pooler.PoolingType`.
{class}`vllm.model_executor.layers.pooler.PoolingType`.
"""

normalize: Optional[bool] = None

@@ -167,4 +167,4 @@ class HTTPConnection:


global_http_connection = HTTPConnection()
"""The global :class:`HTTPConnection` instance used by vLLM."""
"""The global {class}`HTTPConnection` instance used by vLLM."""

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0

from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
from vllm.distributed.kv_transfer.kv_transfer_state import (
ensure_kv_transfer_initialized, get_kv_transfer_group,
has_kv_transfer_group, is_v1_kv_transfer_group)

@@ -475,7 +475,7 @@ class _AsyncLLMEngine(LLMEngine):
*,
inputs: Optional[PromptType] = None,  # DEPRECATED
) -> None:
"""Async version of :meth:`add_request`."""
"""Async version of {meth}`add_request`."""
if inputs is not None:
prompt = inputs
assert prompt is not None and params is not None

@@ -582,20 +582,20 @@ async def build_guided_decoding_logits_processor_async(


class AsyncLLMEngine(EngineClient):
"""An asynchronous wrapper for :class:`LLMEngine`.
"""An asynchronous wrapper for {class}`LLMEngine`.

This class is used to wrap the :class:`LLMEngine` class to make it
This class is used to wrap the {class}`LLMEngine` class to make it
asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The :class:`LLMEngine` is kicked by the
processing incoming requests. The {class}`LLMEngine` is kicked by the
generate method when there are requests in the waiting queue. The generate
method yields the outputs from the :class:`LLMEngine` to the caller.
method yields the outputs from the {class}`LLMEngine` to the caller.

Args:
log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call.
*args: Arguments for :class:`LLMEngine`.
**kwargs: Arguments for :class:`LLMEngine`.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
"""

_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -985,7 +985,7 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.

@@ -1003,7 +1003,7 @@ class AsyncLLMEngine(EngineClient):
Details:
- If the engine is not running, start the background loop,
which iteratively invokes
:meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to

@@ -1075,7 +1075,7 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.

@@ -1089,46 +1089,48 @@ class AsyncLLMEngine(EngineClient):
for the request.

Details:
- If the engine is not running, start the background loop,
which iteratively invokes
:meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
the underlying engine.
Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them.
- If the engine is not running, start the background loop,
which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
the underlying engine.
Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them.

Example:
>>> # Please refer to entrypoints/api_server.py for
>>> # the complete example.
>>>
>>> # initialize the engine and the example input
>>> # note that engine_args here is AsyncEngineArgs instance
>>> engine = AsyncLLMEngine.from_engine_args(engine_args)
>>> example_input = {
>>> "input": "What is LLM?",
>>> "request_id": 0,
>>> }
>>>
>>> # start the generation
>>> results_generator = engine.encode(
>>> example_input["input"],
>>> PoolingParams(),
>>> example_input["request_id"])
>>>
>>> # get the results
>>> final_output = None
>>> async for request_output in results_generator:
>>> if await request.is_disconnected():
>>> # Abort the request if the client disconnects.
>>> await engine.abort(request_id)
>>> # Return or raise an error
>>> ...
>>> final_output = request_output
>>>
>>> # Process and return the final output
>>> ...
```
# Please refer to entrypoints/api_server.py for
# the complete example.

# initialize the engine and the example input
# note that engine_args here is AsyncEngineArgs instance
engine = AsyncLLMEngine.from_engine_args(engine_args)
example_input = {
"input": "What is LLM?",
"request_id": 0,
}

# start the generation
results_generator = engine.encode(
example_input["input"],
PoolingParams(),
example_input["request_id"])

# get the results
final_output = None
async for request_output in results_generator:
if await request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
# Return or raise an error
...
final_output = request_output

# Process and return the final output
...
```
"""
try:
async for output in await self.add_request(
@@ -130,11 +130,11 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the
serving throughput.

The :class:`~vllm.LLM` class wraps this class for offline batched inference
and the :class:`AsyncLLMEngine` class wraps this class for online serving.
The {class}`~vllm.LLM` class wraps this class for offline batched inference
and the {class}`AsyncLLMEngine` class wraps this class for online serving.

The config arguments are derived from :class:`~vllm.EngineArgs`. (See
:ref:`engine-args`)
The config arguments are derived from {class}`~vllm.EngineArgs`. (See
{ref}`engine-args`)

Args:
model_config: The configuration related to the LLM model.

@@ -694,11 +694,11 @@ class LLMEngine:

Args:
request_id: The unique ID of the request.
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
params: Parameters for sampling or pooling.
:class:`~vllm.SamplingParams` for text generation.
:class:`~vllm.PoolingParams` for pooling.
{class}`~vllm.SamplingParams` for text generation.
{class}`~vllm.PoolingParams` for pooling.
arrival_time: The arrival time of the request. If None, we use
the current monotonic time.
lora_request: The LoRA request to add.

@@ -710,10 +710,10 @@ class LLMEngine:
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of :class:`~vllm.Sequence` objects.
- Create a :class:`~vllm.SequenceGroup` object
from the list of :class:`~vllm.Sequence`.
- Add the :class:`~vllm.SequenceGroup` object to the scheduler.
- Create `n` number of {class}`~vllm.Sequence` objects.
- Create a {class}`~vllm.SequenceGroup` object
from the list of {class}`~vllm.Sequence`.
- Add the {class}`~vllm.SequenceGroup` object to the scheduler.

Example:
>>> # initialize engine

@@ -861,8 +861,8 @@ class LLMEngine:

Details:
- Refer to the
:meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class :class:`~vllm.core.scheduler.Scheduler`.
{meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class {class}`~vllm.core.scheduler.Scheduler`.

Example:
>>> # initialize engine and add a request with request_id

@@ -1258,53 +1258,56 @@ class LLMEngine:
def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
"""Performs one decoding iteration and returns newly generated results.

.. figure:: https://i.imgur.com/sv2HssD.png
:alt: Overview of the step function
:align: center
:::{figure} https://i.imgur.com/sv2HssD.png
:alt: Overview of the step function
:align: center

Overview of the step function.
Overview of the step function.
:::

Details:
- Step 1: Schedules the sequences to be executed in the next
iteration and the token blocks to be swapped in/out/copy.
- Step 1: Schedules the sequences to be executed in the next
iteration and the token blocks to be swapped in/out/copy.

- Depending on the scheduling policy,
sequences may be `preempted/reordered`.
- A Sequence Group (SG) refer to a group of sequences
that are generated from the same prompt.
- Depending on the scheduling policy,
sequences may be `preempted/reordered`.
- A Sequence Group (SG) refer to a group of sequences
that are generated from the same prompt.

- Step 2: Calls the distributed executor to execute the model.
- Step 3: Processes the model output. This mainly includes:
- Step 2: Calls the distributed executor to execute the model.
- Step 3: Processes the model output. This mainly includes:

- Decodes the relevant outputs.
- Updates the scheduled sequence groups with model outputs
based on its `sampling parameters` (`use_beam_search` or not).
- Frees the finished sequence groups.
- Decodes the relevant outputs.
- Updates the scheduled sequence groups with model outputs
based on its `sampling parameters` (`use_beam_search` or not).
- Frees the finished sequence groups.

- Finally, it creates and returns the newly generated results.
- Finally, it creates and returns the newly generated results.

Example:
>>> # Please see the example/ folder for more detailed examples.
>>>
>>> # initialize engine and request arguments
>>> engine = LLMEngine.from_engine_args(engine_args)
>>> example_inputs = [(0, "What is LLM?",
>>> SamplingParams(temperature=0.0))]
>>>
>>> # Start the engine with an event loop
>>> while True:
>>> if example_inputs:
>>> req_id, prompt, sampling_params = example_inputs.pop(0)
>>> engine.add_request(str(req_id),prompt,sampling_params)
>>>
>>> # continue the request processing
>>> request_outputs = engine.step()
>>> for request_output in request_outputs:
>>> if request_output.finished:
>>> # return or show the request output
>>>
>>> if not (engine.has_unfinished_requests() or example_inputs):
>>> break
```
# Please see the example/ folder for more detailed examples.

# initialize engine and request arguments
engine = LLMEngine.from_engine_args(engine_args)
example_inputs = [(0, "What is LLM?",
SamplingParams(temperature=0.0))]

# Start the engine with an event loop
while True:
if example_inputs:
req_id, prompt, sampling_params = example_inputs.pop(0)
engine.add_request(str(req_id),prompt,sampling_params)

# continue the request processing
request_outputs = engine.step()
for request_output in request_outputs:
if request_output.finished:
# return or show the request output

if not (engine.has_unfinished_requests() or example_inputs):
break
```
"""
if self.parallel_config.pipeline_parallel_size > 1:
raise NotImplementedError(
@@ -491,7 +491,7 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.

@@ -560,7 +560,7 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.

@@ -41,18 +41,18 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )


class MQLLMEngine:
"""A multiprocessing wrapper for :class:`LLMEngine`.
"""A multiprocessing wrapper for {class}`LLMEngine`.

This class is used to wrap the :class:`LLMEngine` class to enable use
This class is used to wrap the {class}`LLMEngine` class to enable use
in concurrnet manner. It runs a background loop and uses zeromq to
receive new requests and stream outputs incrementally via ipc.

The :class:`LLMEngine` generate or encode process is kicked off when a new
The {class}`LLMEngine` generate or encode process is kicked off when a new
RPCProcessRequest is received by the input_socket.

The self.engine_loop checks the input_socket for new requests,
adds them to the LLMEngine if there are any, calls the internal
:class:`LLMEngine.step()`, and sends the RequestOutputs back over
{class}`LLMEngine.step()`, and sends the RequestOutputs back over
the output_socket.

If use_async_sockets is set, the logic associated with reading new

@@ -64,8 +64,8 @@ class MQLLMEngine:
ipc_path: Base path for zeromq interprocess messaging
use_async_sockets: Whether to make send/recv async with GPU
log_requests: Whether to log the requests.
*args: Arguments for :class:`LLMEngine`.
**kwargs: Arguments for :class:`LLMEngine`.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
"""

def __init__(self,

@@ -56,8 +56,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation.

Args:
seq_group: the outputs are associated with this :class:`SequenceGroup`
outputs: the :class:`SequenceGroupOutput`s for all scheduler steps
seq_group: the outputs are associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
"""
for output in outputs:
# Concatenate single-step prompt logprob processing results.

@@ -19,7 +19,7 @@ logger = init_logger(__name__)
def single_step_process_prompt_logprob(
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
output: CompletionSequenceGroupOutput) -> None:
"""Process prompt logprobs associated with the :class:`SequenceGroupOutput`
"""Process prompt logprobs associated with the {class}`SequenceGroupOutput`
for a given step.

Do nothing if the output has no prompt logprobs.

@@ -27,9 +27,9 @@ def single_step_process_prompt_logprob(
Account for the fact that transformers do not compute first-token logprobs.

Args:
sg_output_proc: :class:`SequenceGroupOutputProcessor` instance
seq_group: the output is associated with this :class:`SequenceGroup`
output: the :class:`SequenceGroupOutput` for a single scheduler step
sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
seq_group: the output is associated with this {class}`SequenceGroup`
output: the {class}`SequenceGroupOutput` for a single scheduler step
"""
prompt_logprobs = output.prompt_logprobs

@@ -103,8 +103,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation.

Args:
seq_group: the output is associated with this :class:`SequenceGroup`
outputs: the :class:`SequenceGroupOutput` for a single scheduler step
seq_group: the output is associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput` for a single scheduler step
"""
assert len(outputs) == 1, "Single step should only have 1 output."
output = outputs[0]
@@ -115,7 +115,7 @@ class LLM:
to eager mode. Additionally for encoder-decoder models, if the
sequence length of the encoder input is larger than this, we fall
back to the eager mode.
disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
disable_async_output_proc: Disable async output processing.
This may result in lower performance.
hf_token: The token to use as HTTP bearer authorization for remote files

@@ -127,12 +127,13 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
:ref:`engine-args`)
**kwargs: Arguments for {class}`~vllm.EngineArgs`. (See
{ref}`engine-args`)

Note:
This class is intended to be used for offline inference. For online
serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
:::{note}
This class is intended to be used for offline inference. For online
serving, use the {class}`~vllm.AsyncLLMEngine` class instead.
:::
"""

DEPRECATE_LEGACY: ClassVar[bool] = True

@@ -141,7 +142,7 @@ class LLM:
DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
"""
A flag to toggle whether to deprecate positional arguments in
:meth:`LLM.__init__`.
{meth}`LLM.__init__`.
"""

@classmethod

@@ -398,7 +399,7 @@ class LLM:

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters.

@@ -413,13 +414,14 @@ class LLM:
Only applicable when priority scheduling policy is enabled.

Returns:
A list of ``RequestOutput`` objects containing the
A list of `RequestOutput` objects containing the
generated completions in the same order as the input prompts.

Note:
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
:::{note}
Using `prompts` and `prompt_token_ids` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the `inputs` parameter.
:::
"""
runner_type = self.llm_engine.model_config.runner_type
if runner_type not in ["generate", "transcription"]:

@@ -488,16 +490,17 @@ class LLM:
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
:exc:`TimeoutError` on timeout. `None` means wait indefinitely.
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.

Returns:
A list containing the results from each worker.

Note:
It is recommended to use this API to only pass control messages,
and set up data-plane communication to pass data.

:::{note}
It is recommended to use this API to only pass control messages,
and set up data-plane communication to pass data.
:::
"""

return self.llm_engine.collective_rpc(method, timeout, args, kwargs)

@@ -664,7 +667,7 @@ class LLM:
Generate responses for a chat conversation.

The chat conversation is converted into a text prompt using the
tokenizer and calls the :meth:`generate` method to generate the
tokenizer and calls the {meth}`generate` method to generate the
responses.

Multi-modal inputs can be passed in the same way you would pass them

@@ -903,7 +906,7 @@ class LLM:

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.

@@ -913,13 +916,14 @@ class LLM:
generation, if any.

Returns:
A list of ``PoolingRequestOutput`` objects containing the
A list of `PoolingRequestOutput` objects containing the
pooled hidden states in the same order as the input prompts.

Note:
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
:::{note}
Using `prompts` and `prompt_token_ids` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the `inputs` parameter.
:::
"""
runner_type = self.llm_engine.model_config.runner_type
if runner_type != "pooling":

@@ -992,7 +996,7 @@ class LLM:

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.

@@ -1036,7 +1040,7 @@ class LLM:

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.

@@ -1168,7 +1172,7 @@ class LLM:
text_1: can be a single prompt or a list of prompts, in which
case it has to have the same length as the ``text_2`` list
text_2: The texts to pair with the query to form the input
to the LLM. See :class:`~vllm.inputs.PromptType` for
to the LLM. See {class}`~vllm.inputs.PromptType` for
more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.

@@ -1277,7 +1281,7 @@ class LLM:

def wake_up(self, tags: Optional[list[str]] = None):
"""
Wake up the engine from sleep mode. See the :meth:`sleep` method
Wake up the engine from sleep mode. See the {meth}`sleep` method
for more details.

Args:
@@ -5,7 +5,6 @@
import json
import re
import time
from argparse import Namespace
from typing import Annotated, Any, ClassVar, Literal, Optional, Union

import torch

@@ -25,23 +24,7 @@ from vllm.utils import random_uuid, resolve_obj_by_qualname

logger = init_logger(__name__)

# torch is mocked during docs generation,
# so we have to provide the values as literals
_MOCK_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807)
_LONG_INFO: Union["torch.iinfo", Namespace]

try:
from sphinx.ext.autodoc.mock import _MockModule

if isinstance(torch, _MockModule):
_LONG_INFO = _MOCK_LONG_INFO
else:
_LONG_INFO = torch.iinfo(torch.long)
except ModuleNotFoundError:
_LONG_INFO = torch.iinfo(torch.long)

assert _LONG_INFO.min == _MOCK_LONG_INFO.min
assert _LONG_INFO.max == _MOCK_LONG_INFO.max
_LONG_INFO = torch.iinfo(torch.long)


class OpenAIBaseModel(BaseModel):
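The hunk above drops a workaround that was only needed while torch was mocked for the docs build; the hard-coded literals it kept in sync are simply the int64 bounds, which can be checked directly (requires torch; shown for illustration only):

```python
# Quick check that the removed literals match torch's int64 limits.
import torch

info = torch.iinfo(torch.long)
assert info.min == -9223372036854775808
assert info.max == 9223372036854775807
```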
@@ -275,7 +275,7 @@ class OpenAIServing:
add_special_tokens: bool = True,
) -> TextTokensPrompt:
"""
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
that assumes single input.
"""
return next(

@@ -296,7 +296,7 @@ class OpenAIServing:
add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]:
"""
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
that assumes multiple inputs.
"""
for text in prompt_inputs:

@@ -74,7 +74,7 @@ class ExecutorBase(ABC):
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
:exc:`TimeoutError` on timeout. `None` means wait indefinitely.
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.

@@ -10,7 +10,7 @@ from .registry import (DummyData, InputContext, InputProcessingContext,

INPUT_REGISTRY = InputRegistry()
"""
The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine`
to dispatch data processing according to the target model.
"""

@@ -80,22 +80,22 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
"""
Set of possible schemas for a single prompt:

- A text prompt (:class:`str` or :class:`TextPrompt`)
- A tokenized prompt (:class:`TokensPrompt`)
- An embeddings prompt (:class:`EmbedsPrompt`)
- A text prompt ({class}`str` or {class}`TextPrompt`)
- A tokenized prompt ({class}`TokensPrompt`)
- An embeddings prompt ({class}`EmbedsPrompt`)

Note that "singleton" is as opposed to a data structure
which encapsulates multiple prompts, i.e. of the sort
which may be utilized for encoder/decoder models when
the user desires to express both the encoder & decoder
prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt`
prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt`

A prompt of type :class:`SingletonPrompt` may be employed
A prompt of type {class}`SingletonPrompt` may be employed
as (1) input to a decoder-only model, (2) input to
the encoder of an encoder/decoder model, in the scenario
where the decoder-prompt is not specified explicitly, or
(3) as a member of a larger data structure encapsulating
more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt`
more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt`
"""

_T1_co = TypeVar("_T1_co",

@@ -115,18 +115,18 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
comprising an explicit encoder prompt and a decoder prompt.

The encoder and decoder prompts, respectively, may be formatted
according to any of the :class:`SingletonPrompt` schemas,
according to any of the {class}`SingletonPrompt` schemas,
and are not required to have the same schema.

Only the encoder prompt may have multi-modal data. mm_processor_kwargs
should be at the top-level, and should not be set in the encoder/decoder
prompts, since they are agnostic to the encoder/decoder.

Note that an :class:`ExplicitEncoderDecoderPrompt` may not
Note that an {class}`ExplicitEncoderDecoderPrompt` may not
be used as an input to a decoder-only model,
and that the :code:`encoder_prompt` and :code:`decoder_prompt`
and that the `encoder_prompt` and `decoder_prompt`
fields of this data structure themselves must be
:class:`SingletonPrompt` instances.
{class}`SingletonPrompt` instances.
"""

encoder_prompt: _T1_co

@@ -141,11 +141,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
Set of possible schemas for an LLM input, including
both decoder-only and encoder/decoder input types:

- A text prompt (:class:`str` or :class:`TextPrompt`)
- A tokenized prompt (:class:`TokensPrompt`)
- An embeddings prompt (:class:`EmbedsPrompt`)
- A text prompt ({class}`str` or {class}`TextPrompt`)
- A tokenized prompt ({class}`TokensPrompt`)
- An embeddings prompt ({class}`EmbedsPrompt`)
- A single data structure containing both an encoder and a decoder prompt
(:class:`ExplicitEncoderDecoderPrompt`)
({class}`ExplicitEncoderDecoderPrompt`)
"""

@@ -178,7 +178,7 @@ def token_inputs(
prompt: Optional[str] = None,
cache_salt: Optional[str] = None,
) -> TokenInputs:
"""Construct :class:`TokenInputs` from optional values."""
"""Construct {class}`TokenInputs` from optional values."""
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)

if prompt is not None:

@@ -221,7 +221,7 @@ def embeds_inputs(

DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
"""
The inputs in :class:`~vllm.LLMEngine` before they are
The inputs in {class}`~vllm.LLMEngine` before they are
passed to the model executor.
This specifies the data required for decoder-only models.
"""

@@ -229,7 +229,7 @@ This specifies the data required for decoder-only models.

class EncoderDecoderInputs(TypedDict):
"""
The inputs in :class:`~vllm.LLMEngine` before they are
The inputs in {class}`~vllm.LLMEngine` before they are
passed to the model executor.

This specifies the required data for encoder-decoder models.

@@ -243,13 +243,13 @@ class EncoderDecoderInputs(TypedDict):
|
||||
|
||||
SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
|
||||
"""
|
||||
A processed :class:`SingletonPrompt` which can be passed to
|
||||
:class:`vllm.sequence.Sequence`.
|
||||
A processed {class}`SingletonPrompt` which can be passed to
|
||||
{class}`vllm.sequence.Sequence`.
|
||||
"""
|
||||
|
||||
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
|
||||
"""
|
||||
The inputs to :data:`vllm.inputs.InputProcessor`.
|
||||
The inputs to {data}`vllm.inputs.InputProcessor`.
|
||||
"""
|
||||
|
||||
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
|
||||
@ -277,7 +277,7 @@ def zip_enc_dec_prompts(
|
||||
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
|
||||
"""
|
||||
Zip encoder and decoder prompts together into a list of
|
||||
:class:`ExplicitEncoderDecoderPrompt` instances.
|
||||
{class}`ExplicitEncoderDecoderPrompt` instances.
|
||||
|
||||
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
|
||||
dictionary will be used for every encoder/decoder prompt. If an iterable is
|
||||
|
||||
@ -224,7 +224,7 @@ class InputPreprocessor:
|
||||
lora_request: Optional[LoRARequest],
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
) -> list[int]:
|
||||
"""Async version of :meth:`_tokenize_prompt`."""
|
||||
"""Async version of {meth}`_tokenize_prompt`."""
|
||||
tokenizer = self.get_tokenizer_group()
|
||||
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
|
||||
|
||||
@ -287,7 +287,7 @@ class InputPreprocessor:
|
||||
lora_request: Optional[LoRARequest],
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
"""Async version of :meth:`_process_multimodal`."""
|
||||
"""Async version of {meth}`_process_multimodal`."""
|
||||
tokenizer = await self._get_mm_tokenizer_async(lora_request)
|
||||
|
||||
mm_processor = self.mm_registry.create_processor(self.model_config,
|
||||
@ -472,7 +472,7 @@ class InputPreprocessor:
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`SingletonInputs` instance
|
||||
* {class}`SingletonInputs` instance
|
||||
"""
|
||||
parsed = parse_singleton_prompt(prompt)
|
||||
|
||||
@ -508,7 +508,7 @@ class InputPreprocessor:
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> SingletonInputs:
|
||||
"""Async version of :meth:`_prompt_to_llm_inputs`."""
|
||||
"""Async version of {meth}`_prompt_to_llm_inputs`."""
|
||||
parsed = parse_singleton_prompt(prompt)
|
||||
|
||||
if parsed["type"] == "embeds":
|
||||
@ -644,7 +644,7 @@ class InputPreprocessor:
|
||||
) -> EncoderDecoderInputs:
|
||||
"""
|
||||
For encoder/decoder models only:
|
||||
Process an input prompt into an :class:`EncoderDecoderInputs` instance.
|
||||
Process an input prompt into an {class}`EncoderDecoderInputs` instance.
|
||||
|
||||
There are two types of input prompts:
|
||||
singleton prompts which carry only the
|
||||
@ -670,7 +670,7 @@ class InputPreprocessor:
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`EncoderDecoderInputs` instance
|
||||
* {class}`EncoderDecoderInputs` instance
|
||||
"""
|
||||
encoder_inputs: SingletonInputs
|
||||
decoder_inputs: Optional[SingletonInputs]
|
||||
@ -710,7 +710,7 @@ class InputPreprocessor:
|
||||
prompt: PromptType,
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
) -> EncoderDecoderInputs:
|
||||
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
|
||||
"""Async version of {meth}`_process_encoder_decoder_prompt`."""
|
||||
encoder_inputs: SingletonInputs
|
||||
decoder_inputs: Optional[SingletonInputs]
|
||||
|
||||
@ -778,7 +778,7 @@ class InputPreprocessor:
|
||||
) -> DecoderOnlyInputs:
|
||||
"""
|
||||
For decoder-only models:
|
||||
Process an input prompt into a :class:`DecoderOnlyInputs` instance.
|
||||
Process an input prompt into a {class}`DecoderOnlyInputs` instance.
|
||||
|
||||
Arguments:
|
||||
|
||||
@ -789,7 +789,7 @@ class InputPreprocessor:
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`DecoderOnlyInputs` instance
|
||||
* {class}`DecoderOnlyInputs` instance
|
||||
"""
|
||||
|
||||
prompt_comps = self._prompt_to_llm_inputs(
|
||||
@ -812,7 +812,7 @@ class InputPreprocessor:
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> DecoderOnlyInputs:
|
||||
"""Async version of :meth:`_process_decoder_only_prompt`."""
|
||||
"""Async version of {meth}`_process_decoder_only_prompt`."""
|
||||
prompt_comps = await self._prompt_to_llm_inputs_async(
|
||||
prompt,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
@ -863,7 +863,7 @@ class InputPreprocessor:
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> ProcessorInputs:
|
||||
"""Async version of :meth:`preprocess`."""
|
||||
"""Async version of {meth}`preprocess`."""
|
||||
if self.model_config.is_encoder_decoder:
|
||||
assert not return_mm_hashes, (
|
||||
"Multimodal hashes for encoder-decoder models should not be ",
|
||||
|
||||
@ -38,7 +38,7 @@ class InputContext:
|
||||
) -> _C:
|
||||
"""
|
||||
Get the HuggingFace configuration
|
||||
(:class:`transformers.PretrainedConfig`) of the model,
|
||||
({class}`transformers.PretrainedConfig`) of the model,
|
||||
additionally checking its type.
|
||||
|
||||
Raises:
|
||||
@ -79,7 +79,7 @@ class InputContext:
|
||||
) -> _P:
|
||||
"""
|
||||
Get the HuggingFace processor
|
||||
(:class:`transformers.ProcessorMixin`) of the model,
|
||||
({class}`transformers.ProcessorMixin`) of the model,
|
||||
additionally checking its type.
|
||||
|
||||
Raises:
|
||||
@ -135,8 +135,8 @@ class InputProcessingContext(InputContext):
|
||||
kwargs: Mapping[str, object] = {},
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Call :code:`hf_processor` on the prompt :code:`data`
|
||||
(text, image, audio...) with configurable options :code:`kwargs`.
|
||||
Call `hf_processor` on the prompt `data`
|
||||
(text, image, audio...) with configurable options `kwargs`.
|
||||
"""
|
||||
assert callable(hf_processor)
|
||||
|
||||
|
||||
@ -68,21 +68,21 @@ class _VllmLogger(Logger):
|
||||
"""
|
||||
Note:
|
||||
This class is just to provide type information.
|
||||
We actually patch the methods directly on the :class:`logging.Logger`
|
||||
We actually patch the methods directly on the {class}`logging.Logger`
|
||||
instance to avoid conflicting with other libraries such as
|
||||
`intel_extension_for_pytorch.utils._logger`.
|
||||
"""
|
||||
|
||||
def info_once(self, msg: str, *args: Hashable) -> None:
|
||||
"""
|
||||
As :meth:`info`, but subsequent calls with the same message
|
||||
As {meth}`info`, but subsequent calls with the same message
|
||||
are silently dropped.
|
||||
"""
|
||||
_print_info_once(self, msg, *args)
|
||||
|
||||
def warning_once(self, msg: str, *args: Hashable) -> None:
|
||||
"""
|
||||
As :meth:`warning`, but subsequent calls with the same message
|
||||
As {meth}`warning`, but subsequent calls with the same message
|
||||
are silently dropped.
|
||||
"""
|
||||
_print_warning_once(self, msg, *args)
|
||||
|
||||
@ -1,8 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

from vllm.lora.ops.triton_ops.lora_expand import lora_expand
from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
from vllm.lora.ops.triton_ops.lora_shrink import lora_shrink
from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink

__all__ = [
    "lora_expand",

@ -261,15 +261,16 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
|
||||
True, then a token can be accepted, else it should be
|
||||
rejected.
|
||||
|
||||
Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
|
||||
:math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according
|
||||
to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
|
||||
Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
|
||||
{math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according
|
||||
to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
|
||||
same conditional probability according to the draft model, the token
|
||||
is accepted with probability:
|
||||
|
||||
.. math::
|
||||
\min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
|
||||
{p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
|
||||
:::{math}
|
||||
\min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
|
||||
{p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
|
||||
:::
|
||||
|
||||
This implementation does not apply causality. When using the output,
|
||||
if a token is rejected, subsequent tokens should not be used.
|
||||
@ -312,18 +313,20 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
|
||||
target model is recovered (within hardware numerics).
|
||||
|
||||
The probability distribution used in this rejection case is constructed
|
||||
as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of
|
||||
:math:`x` given context :math:`x_1, \dots, x_n` according to the target
|
||||
model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability
|
||||
as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of
|
||||
{math}`x` given context {math}`x_1, \dots, x_n` according to the target
|
||||
model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability
|
||||
according to the draft model:
|
||||
|
||||
.. math::
|
||||
x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
|
||||
:::{math}
|
||||
x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
|
||||
:::
|
||||
|
||||
where :math:`(f(x))_+` is defined as:
|
||||
where {math}`(f(x))_+` is defined as:
|
||||
|
||||
.. math::
|
||||
(f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
|
||||
:::{math}
|
||||
(f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
|
||||
:::
|
||||
|
||||
See https://github.com/vllm-project/vllm/pull/2336 for a visualization
|
||||
of the draft, target, and recovered probability distributions.
|
||||
|
||||
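As a quick illustration of the acceptance rule and the recovered-distribution resampling described in the two docstrings above, here is a minimal sketch for a single draft token. The helper and its names (`q`, `p`, `draft_token_id`) are illustrative assumptions only; vLLM's actual `RejectionSampler` works on batched tensors and handles causality and numerics that are omitted here.

```python
import torch

def accept_or_recover(q: torch.Tensor, p: torch.Tensor,
                      draft_token_id: int) -> int:
    """q, p: target/draft probability vectors over the vocabulary."""
    # Accept the draft token with probability min(1, q(x) / p(x)).
    accept_prob = torch.clamp(q[draft_token_id] / p[draft_token_id], max=1.0)
    if torch.rand(()) < accept_prob:
        return draft_token_id
    # Otherwise resample from the recovered distribution (q - p)_+,
    # renormalized, so that the target distribution is preserved overall.
    recovered = torch.clamp(q - p, min=0.0)
    recovered = recovered / recovered.sum()
    return int(torch.multinomial(recovered, num_samples=1).item())
```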
@ -235,7 +235,7 @@ class Sampler(nn.Module):
|
||||
* Defer Pythonization of sampling result & logprobs
|
||||
tensor
|
||||
* Encapsulate arguments required for deferred Pythonization
|
||||
in the :class:`SamplerOutput` structure
|
||||
in the {class}`SamplerOutput` structure
|
||||
|
||||
Args:
|
||||
logits: (num_tokens, vocab_size).
|
||||
|
||||
@ -107,14 +107,15 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler):
|
||||
A draft token_id x_{n+k} is accepted if it satisfies the
|
||||
following condition
|
||||
|
||||
.. math::
|
||||
p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
|
||||
\min \left( \epsilon, \delta * \exp \left(
|
||||
-H(p_{\text{original}}(
|
||||
\cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
|
||||
:::{math}
|
||||
p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
|
||||
\min \left( \epsilon, \delta * \exp \left(
|
||||
-H(p_{\text{original}}(
|
||||
\cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
|
||||
:::
|
||||
|
||||
where :math:`p_{\text{original}}` corresponds to target_probs
|
||||
and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters
|
||||
where {math}`p_{\text{original}}` corresponds to target_probs
|
||||
and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters
|
||||
specified using self._posterior_threshold and self._posterior_alpha
|
||||
|
||||
This method computes the posterior probabilities for the given
|
||||
|
||||
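For reference, a small sketch of the typical-acceptance threshold described in the docstring above, where `posterior_threshold` stands in for epsilon and `posterior_alpha` for delta. The function is illustrative only and ignores the batching and edge cases handled by the real `TypicalAcceptanceSampler`.

```python
import torch

def typical_acceptance_mask(target_probs: torch.Tensor,
                            draft_token_ids: torch.Tensor,
                            posterior_threshold: float,
                            posterior_alpha: float) -> torch.Tensor:
    """target_probs: (batch, k, vocab); draft_token_ids: (batch, k)."""
    # H(p) = -sum p log p, computed per draft position.
    entropy = -(target_probs * torch.log(target_probs + 1e-10)).sum(dim=-1)
    # Threshold = min(epsilon, alpha * exp(-H(p))).
    threshold = torch.minimum(
        torch.full_like(entropy, posterior_threshold),
        posterior_alpha * torch.exp(-entropy),
    )
    # Probability the target model assigns to each draft token.
    draft_probs = target_probs.gather(-1, draft_token_ids.unsqueeze(-1)).squeeze(-1)
    return draft_probs > threshold
```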
@ -681,8 +681,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
batch.
|
||||
pixel_values: The pixels in each input image.
|
||||
|
||||
See also:
|
||||
:class:`Blip2ImageInputs`
|
||||
:::{seealso}
|
||||
{class}`Blip2ImageInputs`
|
||||
:::
|
||||
"""
|
||||
|
||||
if intermediate_tensors is not None:
|
||||
|
||||
@ -226,9 +226,9 @@ class SupportsPP(Protocol):
|
||||
intermediate_tensors: Optional["IntermediateTensors"],
|
||||
) -> Union[Tensor, "IntermediateTensors"]:
|
||||
"""
|
||||
Accept :class:`IntermediateTensors` when PP rank > 0.
|
||||
Accept {class}`IntermediateTensors` when PP rank > 0.
|
||||
|
||||
Return :class:`IntermediateTensors` only for the last PP rank.
|
||||
Return {class}`IntermediateTensors` only for the last PP rank.
|
||||
"""
|
||||
...
|
||||
|
||||
|
||||
@ -721,8 +721,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
batch.
|
||||
pixel_values: The pixels in each input image.
|
||||
|
||||
See also:
|
||||
:class:`LlavaImageInputs`
|
||||
:::{seealso}
|
||||
{class}`LlavaImageInputs`
|
||||
:::
|
||||
"""
|
||||
if intermediate_tensors is not None:
|
||||
inputs_embeds = None
|
||||
|
||||
@ -537,7 +537,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
Unlike in LLaVA-1.5, the number of image tokens inputted to the language
|
||||
model depends on the original size of the input image. Including the
|
||||
original image token in the input, the required number of image tokens
|
||||
is given by :func:`get_llava_next_image_feature_size`.
|
||||
is given by {func}`get_llava_next_image_feature_size`.
|
||||
|
||||
This way, the `positions` and `attn_metadata` are consistent
|
||||
with the `input_ids`.
|
||||
@ -548,8 +548,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
pixel_values: The pixels in each grid patch for each input image.
|
||||
image_sizes: The original `(height, width)` for each input image.
|
||||
|
||||
See also:
|
||||
:class:`LlavaNextImageInputs`
|
||||
:::{seealso}
|
||||
{class}`LlavaNextImageInputs`
|
||||
:::
|
||||
"""
|
||||
if intermediate_tensors is not None:
|
||||
inputs_embeds = None
|
||||
|
||||
@ -559,8 +559,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
|
||||
batch.
|
||||
pixel_values: The pixels in each input image.
|
||||
|
||||
See also:
|
||||
:class:`Mistral3ImagePixelInputs`
|
||||
:::{seealso}
|
||||
{class}`Mistral3ImagePixelInputs`
|
||||
:::
|
||||
"""
|
||||
if intermediate_tensors is not None:
|
||||
inputs_embeds = None
|
||||
|
||||
@ -965,7 +965,7 @@ def select_tiling(
|
||||
|
||||
class MolmoProcessorWrapper:
|
||||
"""
|
||||
Wraps :class:`MolmoProcessor` so that it can be called directly.
|
||||
Wraps {class}`MolmoProcessor` so that it can be called directly.
|
||||
|
||||
The original definition can be found here:
|
||||
https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
|
||||
|
||||
@ -12,7 +12,7 @@ import torch.nn.functional as F
|
||||
from torch import Tensor, nn
|
||||
|
||||
|
||||
class Block(nn.Module):
|
||||
class BlockBase(nn.Module):
|
||||
"""Block abstract module"""
|
||||
|
||||
def __init__(self, input_size, output_size):
|
||||
@ -1602,7 +1602,7 @@ class AttModule(nn.Module):
|
||||
return x, memory, pos_emb, att_mask
|
||||
|
||||
|
||||
class AttBlock(Block, AttModule):
|
||||
class AttBlock(BlockBase, AttModule):
|
||||
"""Attention Block module to support both Attention and Block module."""
|
||||
|
||||
def memory_dims(self, max_len=False):
|
||||
|
||||
@ -65,14 +65,14 @@ class PixtralImagePixelInputs(TypedDict):
|
||||
"""
|
||||
Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
|
||||
|
||||
The result of stacking :attr:`ImageEncoding.tokens` from each prompt.
|
||||
The result of stacking {attr}`ImageEncoding.tokens` from each prompt.
|
||||
"""
|
||||
|
||||
|
||||
class PixtralProcessorAdapter:
|
||||
"""
|
||||
Provide a HF-compatible interface for
|
||||
:class:`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
|
||||
{class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer: MistralTokenizer) -> None:
|
||||
|
||||
@ -383,7 +383,7 @@ def _get_tokenizer_without_image_pad(
|
||||
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
|
||||
"""
|
||||
The logic of adding image pad tokens should only be applied in
|
||||
:class:`QwenVLProcessor`, so they are patched out here.
|
||||
{class}`QwenVLProcessor`, so they are patched out here.
|
||||
|
||||
The definition of the wrapped tokenizer can be found here:
|
||||
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
|
||||
|
||||
@ -19,7 +19,6 @@ import cloudpickle
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import is_in_doc_build
|
||||
|
||||
from .interfaces import (has_inner_state, has_noops, is_attention_free,
|
||||
is_hybrid, supports_cross_encoding,
|
||||
@ -375,13 +374,13 @@ class _ModelRegistry:
|
||||
"""
|
||||
Register an external model to be used in vLLM.
|
||||
|
||||
:code:`model_cls` can be either:
|
||||
`model_cls` can be either:
|
||||
|
||||
- A :class:`torch.nn.Module` class directly referencing the model.
|
||||
- A string in the format :code:`<module>:<class>` which can be used to
|
||||
- A {class}`torch.nn.Module` class directly referencing the model.
|
||||
- A string in the format `<module>:<class>` which can be used to
|
||||
lazily import the model. This is useful to avoid initializing CUDA
|
||||
when importing the model and thus the related error
|
||||
:code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
|
||||
`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
|
||||
"""
|
||||
if not isinstance(model_arch, str):
|
||||
msg = f"`model_arch` should be a string, not a {type(model_arch)}"
|
||||
@ -400,8 +399,7 @@ class _ModelRegistry:
|
||||
raise ValueError(msg)
|
||||
|
||||
model = _LazyRegisteredModel(*split_str)
|
||||
elif isinstance(model_cls, type) and (is_in_doc_build() or issubclass(
|
||||
model_cls, nn.Module)):
|
||||
elif isinstance(model_cls, type) and issubclass(model_cls, nn.Module):
|
||||
model = _RegisteredModel.from_model_cls(model_cls)
|
||||
else:
|
||||
msg = ("`model_cls` should be a string or PyTorch model class, "
|
||||
|
||||
@ -66,7 +66,7 @@ class WeightsMapper:
|
||||
|
||||
class AutoWeightsLoader:
|
||||
"""
|
||||
Helper class to load weights into a :class:`torch.nn.Module`. It is able
|
||||
Helper class to load weights into a {class}`torch.nn.Module`. It is able
|
||||
to automatically detect child modules and parameters while iterating over
|
||||
the weights only once.
|
||||
|
||||
|
||||
@ -8,11 +8,12 @@ from .registry import MultiModalRegistry

MULTIMODAL_REGISTRY = MultiModalRegistry()
"""
The global :class:`~MultiModalRegistry` is used by model runners to
The global {class}`~MultiModalRegistry` is used by model runners to
dispatch data processing according to the target model.

See also:
    :ref:`mm-processing`
:::{seealso}
{ref}`mm-processing`
:::
"""

__all__ = [

@ -64,35 +64,35 @@ class MultiModalPlaceholderMap:
|
||||
|
||||
Examples:
|
||||
|
||||
.. code-block::
|
||||
```
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: |.................................|
|
||||
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: |.................................|
|
||||
images = [A, B]
|
||||
src_ranges = [(0, 4), (4, 8)]
|
||||
dest_ranges = [(0, 4), (5, 9)]
|
||||
|
||||
images = [A, B]
|
||||
src_ranges = [(0, 4), (4, 8)]
|
||||
dest_ranges = [(0, 4), (5, 9)]
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: | ..... |
|
||||
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: | ..... |
|
||||
images = [A, B]
|
||||
src_ranges = [(2, 4), (4, 6)]
|
||||
dest_ranges = [(0, 2), (3, 5)]
|
||||
|
||||
images = [A, B]
|
||||
src_ranges = [(2, 4), (4, 6)]
|
||||
dest_ranges = [(0, 2), (3, 5)]
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: | ......... |
|
||||
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: | ......... |
|
||||
images = [B]
|
||||
src_ranges = [(0, 4)]
|
||||
dest_ranges = [(0, 4)]
|
||||
|
||||
images = [B]
|
||||
src_ranges = [(0, 4)]
|
||||
dest_ranges = [(0, 4)]
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: | .......................|
|
||||
|
||||
Prompt: |AAAA BBBB What's in these images?|
|
||||
Positions: | .......................|
|
||||
|
||||
images = []
|
||||
src_ranges = []
|
||||
dest_ranges = []
|
||||
images = []
|
||||
src_ranges = []
|
||||
dest_ranges = []
|
||||
```
|
||||
"""
|
||||
seq_mm_data = seq_group.multi_modal_data
|
||||
seq_mm_placeholders = seq_group.multi_modal_placeholders
|
||||
|
||||
@ -26,27 +26,27 @@ _T = TypeVar("_T")
|
||||
|
||||
HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
|
||||
"""
|
||||
A :class:`transformers.image_utils.ImageInput` representing a single image
|
||||
item, which can be passed to a HuggingFace :code:`ImageProcessor`.
|
||||
A {class}`transformers.image_utils.ImageInput` representing a single image
|
||||
item, which can be passed to a HuggingFace `ImageProcessor`.
|
||||
"""
|
||||
|
||||
HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
|
||||
list[np.ndarray], list[torch.Tensor]]
|
||||
"""
|
||||
A :class:`transformers.image_utils.VideoInput` representing a single video
|
||||
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
|
||||
A {class}`transformers.image_utils.VideoInput` representing a single video
|
||||
item, which can be passed to a HuggingFace `VideoProcessor`.
|
||||
"""
|
||||
|
||||
HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
|
||||
"""
|
||||
Represents a single audio
|
||||
item, which can be passed to a HuggingFace :code:`AudioProcessor`.
|
||||
item, which can be passed to a HuggingFace `AudioProcessor`.
|
||||
"""
|
||||
|
||||
ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
|
||||
"""
|
||||
A :class:`transformers.image_utils.ImageInput` representing a single image
|
||||
item, which can be passed to a HuggingFace :code:`ImageProcessor`.
|
||||
A {class}`transformers.image_utils.ImageInput` representing a single image
|
||||
item, which can be passed to a HuggingFace `ImageProcessor`.
|
||||
|
||||
Alternatively, a 3-D tensor or batch of 2-D tensors,
|
||||
which are treated as image embeddings;
|
||||
@ -55,8 +55,8 @@ these are directly passed to the model without HF processing.
|
||||
|
||||
VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
|
||||
"""
|
||||
A :class:`transformers.image_utils.VideoInput` representing a single video
|
||||
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
|
||||
A {class}`transformers.image_utils.VideoInput` representing a single video
|
||||
item, which can be passed to a HuggingFace `VideoProcessor`.
|
||||
|
||||
Alternatively, a 3-D tensor or batch of 2-D tensors,
|
||||
which are treated as video embeddings;
|
||||
@ -67,7 +67,7 @@ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
|
||||
torch.Tensor]
|
||||
"""
|
||||
Represents a single audio
|
||||
item, which can be passed to a HuggingFace :code:`AudioProcessor`.
|
||||
item, which can be passed to a HuggingFace `AudioProcessor`.
|
||||
|
||||
Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
|
||||
is different from that expected by the model;
|
||||
@ -83,7 +83,7 @@ ModalityData: TypeAlias = Union[_T, list[_T]]
|
||||
Either a single data item, or a list of data items.
|
||||
|
||||
The number of data items allowed per modality is restricted by
|
||||
:code:`--limit-mm-per-prompt`.
|
||||
`--limit-mm-per-prompt`.
|
||||
"""
|
||||
|
||||
|
||||
@ -105,7 +105,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
|
||||
"""
|
||||
A dictionary containing an entry for each modality type to input.
|
||||
|
||||
The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
|
||||
The built-in modalities are defined by {class}`MultiModalDataBuiltins`.
|
||||
"""
|
||||
|
||||
|
||||
@ -116,14 +116,14 @@ class PlaceholderRange:
|
||||
|
||||
Example:
|
||||
|
||||
Prompt: :code:`AAAA BBBB What is in these images?`
|
||||
Prompt: `AAAA BBBB What is in these images?`
|
||||
|
||||
Images A and B will have:
|
||||
Images A and B will have:
|
||||
|
||||
.. code-block::
|
||||
|
||||
A: PlaceholderRange(offset=0, length=4)
|
||||
B: PlaceholderRange(offset=5, length=4)
|
||||
```
|
||||
A: PlaceholderRange(offset=0, length=4)
|
||||
B: PlaceholderRange(offset=5, length=4)
|
||||
```
|
||||
"""
|
||||
|
||||
offset: int
|
||||
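The placeholder layout from the docstring example above can be written out directly as a sketch. Constructing the ranges by hand like this is purely illustrative; the field names follow the dataclass shown in the hunk, and the import path is the one used at this revision.

```python
from vllm.multimodal.inputs import PlaceholderRange

# Prompt: "AAAA BBBB What is in these images?"
mm_placeholders = {
    "image": [
        PlaceholderRange(offset=0, length=4),  # image A -> the "AAAA" tokens
        PlaceholderRange(offset=5, length=4),  # image B -> the "BBBB" tokens
    ],
}
```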
@ -166,7 +166,7 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
|
||||
|
||||
|
||||
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
|
||||
"""Equality check between :data:`NestedTensors` objects."""
|
||||
"""Equality check between {data}`NestedTensors` objects."""
|
||||
if isinstance(a, torch.Tensor):
|
||||
return isinstance(b, torch.Tensor) and torch.equal(a, b)
|
||||
elif isinstance(b, torch.Tensor):
|
||||
@ -186,7 +186,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
|
||||
BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
|
||||
"""
|
||||
A dictionary containing nested tensors which have been batched via
|
||||
:meth:`MultiModalKwargs.batch`.
|
||||
{meth}`MultiModalKwargs.batch`.
|
||||
"""
|
||||
|
||||
|
||||
@ -194,7 +194,7 @@ A dictionary containing nested tensors which have been batched via
|
||||
class MultiModalFieldElem:
|
||||
"""
|
||||
Represents a keyword argument corresponding to a multi-modal item
|
||||
in :class:`MultiModalKwargs`.
|
||||
in {class}`MultiModalKwargs`.
|
||||
"""
|
||||
|
||||
modality: str
|
||||
@ -205,13 +205,13 @@ class MultiModalFieldElem:
|
||||
|
||||
key: str
|
||||
"""
|
||||
The key of this field in :class:`MultiModalKwargs`,
|
||||
The key of this field in {class}`MultiModalKwargs`,
|
||||
i.e. the name of the keyword argument to be passed to the model.
|
||||
"""
|
||||
|
||||
data: NestedTensors
|
||||
"""
|
||||
The tensor data of this field in :class:`MultiModalKwargs`,
|
||||
The tensor data of this field in {class}`MultiModalKwargs`,
|
||||
i.e. the value of the keyword argument to be passed to the model.
|
||||
"""
|
||||
|
||||
@ -234,7 +234,7 @@ class MultiModalFieldElem:
|
||||
class BaseMultiModalField(ABC):
|
||||
"""
|
||||
Defines how to interpret tensor data belonging to a keyword argument in
|
||||
:class:`MultiModalKwargs` for multiple multi-modal items, and vice versa.
|
||||
{class}`MultiModalKwargs` for multiple multi-modal items, and vice versa.
|
||||
"""
|
||||
|
||||
def _field_factory(self, *, modality: str, key: str):
|
||||
@ -259,10 +259,10 @@ class BaseMultiModalField(ABC):
|
||||
data: NestedTensors,
|
||||
) -> Sequence[MultiModalFieldElem]:
|
||||
"""
|
||||
Construct :class:`MultiModalFieldElem` instances to represent
|
||||
Construct {class}`MultiModalFieldElem` instances to represent
|
||||
the provided data.
|
||||
|
||||
This is the inverse of :meth:`reduce_data`.
|
||||
This is the inverse of {meth}`reduce_data`.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@ -272,9 +272,9 @@ class BaseMultiModalField(ABC):
|
||||
|
||||
def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
|
||||
"""
|
||||
Merge the data from multiple instances of :class:`MultiModalFieldElem`.
|
||||
Merge the data from multiple instances of {class}`MultiModalFieldElem`.
|
||||
|
||||
This is the inverse of :meth:`build_elems`.
|
||||
This is the inverse of {meth}`build_elems`.
|
||||
"""
|
||||
field_types = [type(item.field) for item in elems]
|
||||
if len(set(field_types)) > 1:
|
||||
@ -286,8 +286,9 @@ class BaseMultiModalField(ABC):
|
||||
@dataclass(frozen=True)
|
||||
class MultiModalBatchedField(BaseMultiModalField):
|
||||
"""
|
||||
See also:
|
||||
:func:`MultiModalFieldConfig.batched`
|
||||
:::{seealso}
|
||||
{func}`MultiModalFieldConfig.batched`
|
||||
:::
|
||||
"""
|
||||
|
||||
def build_elems(
|
||||
@ -316,9 +317,10 @@ class MultiModalBatchedField(BaseMultiModalField):
|
||||
@dataclass(frozen=True)
|
||||
class MultiModalFlatField(BaseMultiModalField):
|
||||
"""
|
||||
See also:
|
||||
:func:`MultiModalFieldConfig.flat`
|
||||
:func:`MultiModalFieldConfig.flat_from_sizes`
|
||||
:::{seealso}
|
||||
{func}`MultiModalFieldConfig.flat`
|
||||
{func}`MultiModalFieldConfig.flat_from_sizes`
|
||||
:::
|
||||
"""
|
||||
slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
|
||||
dim: int = 0
|
||||
@ -358,8 +360,9 @@ class MultiModalFlatField(BaseMultiModalField):
|
||||
@dataclass(frozen=True)
|
||||
class MultiModalSharedField(BaseMultiModalField):
|
||||
"""
|
||||
See also:
|
||||
:func:`MultiModalFieldConfig.shared`
|
||||
:::{seealso}
|
||||
{func}`MultiModalFieldConfig.shared`
|
||||
:::
|
||||
"""
|
||||
batch_size: int
|
||||
|
||||
@ -390,17 +393,17 @@ class MultiModalFieldConfig:
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block::
|
||||
```
|
||||
Input:
|
||||
Data: [[AAAA]
|
||||
[BBBB]
|
||||
[CCCC]]
|
||||
|
||||
Input:
|
||||
Data: [[AAAA]
|
||||
[BBBB]
|
||||
[CCCC]]
|
||||
|
||||
Output:
|
||||
Element 1: [AAAA]
|
||||
Element 2: [BBBB]
|
||||
Element 3: [CCCC]
|
||||
Output:
|
||||
Element 1: [AAAA]
|
||||
Element 2: [BBBB]
|
||||
Element 3: [CCCC]
|
||||
```
|
||||
"""
|
||||
return MultiModalFieldConfig(
|
||||
field=MultiModalBatchedField(),
|
||||
@ -425,35 +428,35 @@ class MultiModalFieldConfig:
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Given:
|
||||
slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
|
||||
```
|
||||
Given:
|
||||
slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
|
||||
|
||||
Input:
|
||||
Data: [AAABBBBCC]
|
||||
Input:
|
||||
Data: [AAABBBBCC]
|
||||
|
||||
Output:
|
||||
Element 1: [AAA]
|
||||
Element 2: [BBBB]
|
||||
Element 3: [CC]
|
||||
|
||||
.. code-block::
|
||||
Output:
|
||||
Element 1: [AAA]
|
||||
Element 2: [BBBB]
|
||||
Element 3: [CC]
|
||||
```
|
||||
|
||||
Given:
|
||||
slices: [
|
||||
(slice(None), slice(0, 3)),
|
||||
(slice(None), slice(3, 7)),
|
||||
(slice(None), slice(7, 9))]
|
||||
dim: 1
|
||||
```
|
||||
Given:
|
||||
slices: [
|
||||
(slice(None), slice(0, 3)),
|
||||
(slice(None), slice(3, 7)),
|
||||
(slice(None), slice(7, 9))]
|
||||
dim: 1
|
||||
|
||||
Input:
|
||||
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
||||
Input:
|
||||
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
||||
|
||||
Output:
|
||||
Element 1: [[A],[A],[A]]
|
||||
Element 2: [[B],[B],[B],[B]]
|
||||
Element 3: [[C],[C]]
|
||||
Output:
|
||||
Element 1: [[A],[A],[A]]
|
||||
Element 2: [[B],[B],[B],[B]]
|
||||
Element 3: [[C],[C]]
|
||||
```
|
||||
"""
|
||||
return MultiModalFieldConfig(
|
||||
field=MultiModalFlatField(slices=slices, dim=dim),
|
||||
@ -477,36 +480,36 @@ class MultiModalFieldConfig:
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Given:
|
||||
size_per_item: [3, 4, 2]
|
||||
```
|
||||
Given:
|
||||
size_per_item: [3, 4, 2]
|
||||
|
||||
Input:
|
||||
Data: [AAABBBBCC]
|
||||
Input:
|
||||
Data: [AAABBBBCC]
|
||||
|
||||
Output:
|
||||
Element 1: [AAA]
|
||||
Element 2: [BBBB]
|
||||
Element 3: [CC]
|
||||
Output:
|
||||
Element 1: [AAA]
|
||||
Element 2: [BBBB]
|
||||
Element 3: [CC]
|
||||
```
|
||||
|
||||
|
||||
.. code-block::
|
||||
```
|
||||
Given:
|
||||
slices: [3, 4, 2]
|
||||
dim: 1
|
||||
|
||||
Given:
|
||||
slices: [3, 4, 2]
|
||||
dim: 1
|
||||
Input:
|
||||
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
||||
|
||||
Input:
|
||||
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
||||
Output:
|
||||
Element 1: [[A],[A],[A]]
|
||||
Element 2: [[B],[B],[B],[B]]
|
||||
Element 3: [[C],[C]]
|
||||
```
|
||||
|
||||
Output:
|
||||
Element 1: [[A],[A],[A]]
|
||||
Element 2: [[B],[B],[B],[B]]
|
||||
Element 3: [[C],[C]]
|
||||
|
||||
See also:
|
||||
:func:`MultiModalFieldConfig.flat`
|
||||
:::{seealso}
|
||||
{func}`MultiModalFieldConfig.flat`
|
||||
:::
|
||||
"""
|
||||
|
||||
if size_per_item.ndim != 1:
|
||||
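As a companion to the `flat_from_sizes` example above, a minimal sketch of how a per-item size tensor reduces to the explicit slices consumed by `MultiModalFieldConfig.flat`; only the arithmetic is shown, and the real method also accounts for the `dim` argument.

```python
import torch

size_per_item = torch.tensor([3, 4, 2])  # items A, B and C from the example
starts = torch.cumsum(size_per_item, dim=0) - size_per_item
slices = [slice(int(start), int(start + size))
          for start, size in zip(starts, size_per_item)]
assert slices == [slice(0, 3), slice(3, 7), slice(7, 9)]
```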
@ -535,19 +538,19 @@ class MultiModalFieldConfig:
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Given:
|
||||
batch_size: 4
|
||||
```
|
||||
Given:
|
||||
batch_size: 4
|
||||
|
||||
Input:
|
||||
Data: [XYZ]
|
||||
Input:
|
||||
Data: [XYZ]
|
||||
|
||||
Output:
|
||||
Element 1: [XYZ]
|
||||
Element 2: [XYZ]
|
||||
Element 3: [XYZ]
|
||||
Element 4: [XYZ]
|
||||
Output:
|
||||
Element 1: [XYZ]
|
||||
Element 2: [XYZ]
|
||||
Element 3: [XYZ]
|
||||
Element 4: [XYZ]
|
||||
```
|
||||
"""
|
||||
return MultiModalFieldConfig(
|
||||
field=MultiModalSharedField(batch_size),
|
||||
@ -570,8 +573,8 @@ class MultiModalFieldConfig:
|
||||
|
||||
class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
|
||||
"""
|
||||
A collection of :class:`MultiModalFieldElem`
|
||||
corresponding to a data item in :class:`MultiModalDataItems`.
|
||||
A collection of {class}`MultiModalFieldElem`
|
||||
corresponding to a data item in {class}`MultiModalDataItems`.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -590,11 +593,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
|
||||
class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||
"""
|
||||
A dictionary that represents the keyword arguments to
|
||||
:meth:`~torch.nn.Module.forward`.
|
||||
{meth}`~torch.nn.Module.forward`.
|
||||
|
||||
The metadata :code:`items` enables us to obtain the keyword arguments
|
||||
corresponding to each data item in :class:`MultiModalDataItems`, via
|
||||
:meth:`get_item` and :meth:`get_items`.
|
||||
The metadata `items` enables us to obtain the keyword arguments
|
||||
corresponding to each data item in {class}`MultiModalDataItems`, via
|
||||
{meth}`get_item` and {meth}`get_items`.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -633,7 +636,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||
|
||||
@staticmethod
|
||||
def from_items(items: Sequence[MultiModalKwargsItem]):
|
||||
"""Construct a new :class:`MultiModalKwargs` from multiple items."""
|
||||
"""Construct a new {class}`MultiModalKwargs` from multiple items."""
|
||||
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
|
||||
for item in items:
|
||||
for key, elem in item.items():
|
||||
@ -798,7 +801,7 @@ A dictionary containing placeholder ranges for each modality.
|
||||
class MultiModalInputs(TypedDict):
|
||||
"""
|
||||
Represents the outputs of
|
||||
:class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
|
||||
{class}`vllm.multimodal.processing.BaseMultiModalProcessor`,
|
||||
ready to be passed to vLLM internals.
|
||||
"""
|
||||
|
||||
@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict):
|
||||
mm_placeholders: MultiModalPlaceholderDict
|
||||
"""
|
||||
For each modality, information about the placeholder tokens in
|
||||
:code:`prompt_token_ids`.
|
||||
`prompt_token_ids`.
|
||||
"""
|
||||
|
||||
cache_salt: NotRequired[str]
|
||||
@ -834,7 +837,7 @@ class MultiModalInputs(TypedDict):
|
||||
|
||||
class MultiModalEncDecInputs(MultiModalInputs):
|
||||
"""
|
||||
Represents the outputs of :class:`vllm.multimodal.EncDecMultiModalProcessor`
|
||||
Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor`
|
||||
ready to be passed to vLLM internals.
|
||||
"""
|
||||
|
||||
|
||||
@ -25,7 +25,7 @@ _I = TypeVar("_I")
|
||||
|
||||
class ModalityDataItems(ABC, Generic[_T, _I]):
|
||||
"""
|
||||
Represents data items for a modality in :class:`MultiModalDataItems`.
|
||||
Represents data items for a modality in {class}`MultiModalDataItems`.
|
||||
"""
|
||||
|
||||
def __init__(self, data: _T, modality: str) -> None:
|
||||
@ -246,7 +246,7 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
|
||||
|
||||
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
|
||||
"""
|
||||
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
|
||||
As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
|
||||
such that each entry corresponds to a list.
|
||||
"""
|
||||
|
||||
@ -254,7 +254,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
|
||||
"""
|
||||
Get the number of data items belonging to a modality.
|
||||
|
||||
If `strict=False`, return `0` instead of raising :exc:`KeyError`
|
||||
If `strict=False`, return `0` instead of raising {exc}`KeyError`
|
||||
even if the modality is not found.
|
||||
"""
|
||||
if modality not in self:
|
||||
@ -300,8 +300,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
|
||||
|
||||
class MultiModalDataParser:
|
||||
"""
|
||||
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
|
||||
:class:`MultiModalDataItems`.
|
||||
Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into
|
||||
{class}`MultiModalDataItems`.
|
||||
|
||||
Args:
|
||||
target_sr (float, optional): Enables automatic resampling of audio
|
||||
|
||||
@ -111,13 +111,13 @@ class PromptUpdateDetails(Generic[_S]):
|
||||
|
||||
is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
|
||||
"""
|
||||
Given :attr:`full`, return a boolean mask of shape `(len(full),)`
|
||||
Given {attr}`full`, return a boolean mask of shape `(len(full),)`
|
||||
indicating which positions of `full` to assign embeddings to.
|
||||
|
||||
`None` (default) means to assign embeddings to all positions of `full`.
|
||||
|
||||
The embeddings are obtained by calling
|
||||
:class:`SupportsMultiModal.get_multimodal_embeddings`.
|
||||
{class}`SupportsMultiModal.get_multimodal_embeddings`.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -156,13 +156,13 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
|
||||
The token sequence or text that are part of the update.
|
||||
|
||||
If only part of the content corresponds to feature placeholders, you can
|
||||
use :class:`PromptUpdateDetails` to specify which part.
|
||||
use {class}`PromptUpdateDetails` to specify which part.
|
||||
"""
|
||||
|
||||
PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
|
||||
PromptUpdateInfo]
|
||||
"""
|
||||
Given the index of the processed item within :attr:`modality`,
|
||||
Given the index of the processed item within {attr}`modality`,
|
||||
output the corresponding token sequence (or text).
|
||||
|
||||
For convenience, you can directly pass in the token sequence (or text)
|
||||
@ -213,52 +213,52 @@ class PromptInsertion(PromptUpdate):
|
||||
|
||||
Example:
|
||||
|
||||
For each image, insert a number of ``<image>`` feature placeholders
|
||||
equal to the feature size of the vision encoder after the ``<s>`` token:
|
||||
For each image, insert a number of ``<image>`` feature placeholders
|
||||
equal to the feature size of the vision encoder after the ``<s>`` token:
|
||||
|
||||
.. code-block:: python
|
||||
```python
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target="<s>",
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
```
|
||||
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target="<s>",
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
Insert these tokens at the start of the prompt:
|
||||
|
||||
Insert these tokens at the start of the prompt:
|
||||
```python
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.start(),
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
```
|
||||
|
||||
.. code-block:: python
|
||||
Insert these tokens after a prefix ``Images:``:
|
||||
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.start(),
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
```python
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.prefix("Images:"),
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
```
|
||||
|
||||
Insert these tokens after a prefix ``Images:``:
|
||||
Insert these tokens at the end of the prompt:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.prefix("Images:"),
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
|
||||
Insert these tokens at the end of the prompt:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.end(),
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
```python
|
||||
PromptInsertion(
|
||||
modality="image",
|
||||
target=PromptIndexTargets.end(),
|
||||
insertion="<image>" * image_feature_size,
|
||||
)
|
||||
```
|
||||
"""
|
||||
|
||||
insertion: PromptUpdateContent = field(repr=False)
|
||||
"""
|
||||
Given the index of the processed item within :attr:`modality`,
|
||||
output the token sequence (or text) to insert right after :attr:`target`.
|
||||
Given the index of the processed item within {attr}`modality`,
|
||||
output the token sequence (or text) to insert right after {attr}`target`.
|
||||
|
||||
For convenience, you can directly pass in the token sequence (or text)
|
||||
instead of a function if it does not depend on the input.
|
||||
@ -280,57 +280,57 @@ class PromptReplacement(PromptUpdate):
|
||||
|
||||
Example:
|
||||
|
||||
For each image, replace one ``<image>`` input placeholder in the prompt
|
||||
with a number of ``<image>`` feature placeholders
|
||||
equal to the feature size of the vision encoder:
|
||||
For each image, replace one ``<image>`` input placeholder in the prompt
|
||||
with a number of ``<image>`` feature placeholders
|
||||
equal to the feature size of the vision encoder:
|
||||
|
||||
.. code-block:: python
|
||||
```python
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target="<image>",
|
||||
replacement="<image>" * image_feature_size,
|
||||
)
|
||||
```
|
||||
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target="<image>",
|
||||
replacement="<image>" * image_feature_size,
|
||||
)
|
||||
As above, but further pad the feature placeholders with ``<image_bos>``
|
||||
and ``<image_eos>``, which are not supposed to be passed to the vision
|
||||
encoder:
|
||||
|
||||
As above, but further pad the feature placeholders with ``<image_bos>``
|
||||
and ``<image_eos>``, which are not supposed to be passed to the vision
|
||||
encoder:
|
||||
```python
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target="<image>",
|
||||
replacement=PromptUpdateDetails(
|
||||
full="".join([
|
||||
"<image_bos>",
|
||||
"<image>" * image_feature_size,
|
||||
"<image_eos>",
|
||||
]),
|
||||
features="<image>" * image_feature_size,
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
.. code-block:: python
|
||||
To avoid unnecessary tokenization during prompt replacement,
|
||||
we recommend passing token sequences instead of text:
|
||||
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target="<image>",
|
||||
replacement=PromptUpdateDetails(
|
||||
full="".join([
|
||||
"<image_bos>",
|
||||
"<image>" * image_feature_size,
|
||||
"<image_eos>",
|
||||
]),
|
||||
features="<image>" * image_feature_size,
|
||||
),
|
||||
)
|
||||
|
||||
To avoid unnecessary tokenization during prompt replacement,
|
||||
we recommend passing token sequences instead of text:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[image_token_id],
|
||||
replacement=PromptUpdateDetails(
|
||||
full=([image_bos_id] + [image_token_id] * image_feature_size
|
||||
+ [image_eos_id]),
|
||||
features=[image_token_id] * image_feature_size,
|
||||
),
|
||||
)
|
||||
```python
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[image_token_id],
|
||||
replacement=PromptUpdateDetails(
|
||||
full=([image_bos_id] + [image_token_id] * image_feature_size
|
||||
+ [image_eos_id]),
|
||||
features=[image_token_id] * image_feature_size,
|
||||
),
|
||||
)
|
||||
```
|
||||
"""
|
||||
|
||||
replacement: PromptUpdateContent = field(repr=False)
|
||||
"""
|
||||
Given the index of the processed item within :attr:`modality`,
|
||||
output the token sequence (or text) to replace :attr:`target`.
|
||||
Given the index of the processed item within {attr}`modality`,
|
||||
output the token sequence (or text) to replace {attr}`target`.
|
||||
|
||||
For convenience, you can directly pass in the token sequence (or text)
|
||||
instead of a function if it does not depend on the input.
|
||||
@ -384,14 +384,14 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
|
||||
|
||||
|
||||
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
|
||||
"""Convenience function to apply :func:`full_groupby` based on modality."""
|
||||
"""Convenience function to apply {func}`full_groupby` based on modality."""
|
||||
return full_groupby(values, key=lambda x: x.modality)
|
||||
|
||||
|
||||
@dataclass
|
||||
class _BoundPromptSequence:
|
||||
"""
|
||||
A :data:`_PromptSeq` bound to a tokenizer to automatically
|
||||
A {data}`_PromptSeq` bound to a tokenizer to automatically
|
||||
convert between token sequence and text representations.
|
||||
"""
|
||||
tokenizer: AnyTokenizer = field(repr=False)
|
||||
@ -443,8 +443,8 @@ class _BoundPromptContent:
|
||||
@dataclass
|
||||
class BoundPromptUpdate:
|
||||
"""
|
||||
A :class:`PromptUpdate` bound to a tokenizer to automatically convert
|
||||
:attr:`target` and the result of :meth:`get_content` between
|
||||
A {class}`PromptUpdate` bound to a tokenizer to automatically convert
|
||||
{attr}`target` and the result of {meth}`get_content` between
|
||||
token sequence and text representations.
|
||||
"""
|
||||
_origin: PromptUpdate
|
||||
@ -479,7 +479,7 @@ class BoundPromptUpdate:
|
||||
|
||||
def get_content(self, item_idx: int) -> _BoundPromptContent:
|
||||
"""
|
||||
Given the index of the processed item within :attr:`modality`,
|
||||
Given the index of the processed item within {attr}`modality`,
|
||||
output the token sequence (or text) to update.
|
||||
"""
|
||||
content = self.content
|
||||
@ -516,7 +516,7 @@ def iter_token_matches(
|
||||
match_ids: list[int],
|
||||
) -> Generator[_TokenMatch]:
|
||||
"""
|
||||
Yield each occurrence of :code:`match_ids` in :code:`token_ids`.
|
||||
Yield each occurrence of `match_ids` in `token_ids`.
|
||||
|
||||
Note that empty matches are ignored.
|
||||
"""
|
||||
@ -545,8 +545,8 @@ def replace_token_matches(
|
||||
new_ids: list[int],
|
||||
) -> list[int]:
|
||||
"""
|
||||
Replace each occurrence of :code:`match_ids` in :code:`token_ids`
|
||||
with :code:`new_ids`.
|
||||
Replace each occurrence of `match_ids` in `token_ids`
|
||||
with `new_ids`.
|
||||
|
||||
Note that empty matches are ignored.
|
||||
"""
|
||||
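The matching behaviour the two docstrings above describe (find every non-overlapping occurrence of `match_ids` inside `token_ids`, ignoring empty matches) can be sketched in a few lines. This is a self-contained illustration, not vLLM's actual implementation.

```python
def iter_token_matches_sketch(token_ids: list[int], match_ids: list[int]):
    """Yield (start, end) for each non-overlapping occurrence of match_ids."""
    if not match_ids:  # empty matches are ignored
        return
    n = len(match_ids)
    start = 0
    while start + n <= len(token_ids):
        if token_ids[start:start + n] == match_ids:
            yield (start, start + n)
            start += n  # skip past the match so matches never overlap
        else:
            start += 1

# list(iter_token_matches_sketch([1, 2, 3, 2, 3], [2, 3])) == [(1, 3), (3, 5)]
```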
@ -654,7 +654,7 @@ def find_token_matches(
|
||||
prompt: list[int],
|
||||
prompt_updates: Sequence[BoundPromptUpdate],
|
||||
) -> Sequence[PromptTargetMatch]:
|
||||
"""Return each target of :code:`prompt_updates` found in :code:`prompt`."""
|
||||
"""Return each target of `prompt_updates` found in `prompt`."""
|
||||
|
||||
def get_matches(update: BoundPromptUpdate):
|
||||
target = update.target
|
||||
@ -680,7 +680,7 @@ def find_text_matches(
|
||||
prompt: str,
|
||||
prompt_updates: Sequence[BoundPromptUpdate],
|
||||
) -> Sequence[PromptTargetMatch]:
|
||||
"""Return each target of :code:`prompt_updates` found in :code:`prompt`."""
|
||||
"""Return each target of `prompt_updates` found in `prompt`."""
|
||||
|
||||
def get_matches(update: BoundPromptUpdate):
|
||||
target = update.target
|
||||
@ -707,7 +707,7 @@ def _resolve_matches(
|
||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||
) -> list[PromptTargetMatch]:
|
||||
"""
|
||||
Resolve :code:`mm_matches` to ensure that there are no overlapping matches,
|
||||
Resolve `mm_matches` to ensure that there are no overlapping matches,
|
||||
and sort them such that earlier matches take priority over later ones.
|
||||
"""
|
||||
matches = [m for matches in mm_matches.values() for m in matches]
|
||||
@ -731,7 +731,7 @@ def _apply_matches(
|
||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||
mm_item_counts: Mapping[str, int],
|
||||
) -> list[_S]:
|
||||
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
|
||||
"""Apply the updates in `mm_matches` to `prompt`."""
|
||||
out_seqs = list[Union[str, list[int]]]()
|
||||
prev_end_idx = 0
|
||||
next_idx_by_modality = defaultdict[str, int](lambda: 0)
|
||||
@ -780,7 +780,7 @@ def apply_token_matches(
|
||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||
mm_item_counts: Mapping[str, int],
|
||||
) -> list[int]:
|
||||
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
|
||||
"""Apply the updates in `mm_matches` to `prompt`."""
|
||||
if not mm_matches:
|
||||
return prompt
|
||||
|
||||
@ -794,7 +794,7 @@ def apply_text_matches(
|
||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||
mm_item_counts: Mapping[str, int],
|
||||
) -> str:
|
||||
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
|
||||
"""Apply the updates in `mm_matches` to `prompt`."""
|
||||
if not mm_matches:
|
||||
return prompt
|
||||
|
||||
@ -809,7 +809,7 @@ def _iter_placeholders(
|
||||
mm_item_counts: Mapping[str, int],
|
||||
) -> Iterable[PlaceholderFeaturesInfo]:
|
||||
"""
|
||||
Yield each set of placeholder tokens found in :code:`prompt`.
|
||||
Yield each set of placeholder tokens found in `prompt`.
|
||||
|
||||
Matches are exclusive even when multiple modalities share
|
||||
the same placeholder tokens. In that case, the modality that
|
||||
@ -1016,7 +1016,7 @@ class ProcessingCache:
|
||||
) -> None:
|
||||
"""
|
||||
Put a processed multi-modal item into the cache
|
||||
according to its dependencies (see :meth:`get`).
|
||||
according to its dependencies (see {meth}`get`).
|
||||
"""
|
||||
cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
|
||||
**{modality: input_item},
|
||||
@ -1083,7 +1083,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)

MultiModalHashes = dict[str, list[str]]
"""
A collection of hashes with a similar structure as :class:`MultiModalKwargs`.
A collection of hashes with a similar structure as {class}`MultiModalKwargs`.
"""

@ -1091,7 +1091,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
"""
Abstract base class to process multi-modal inputs to be used in vLLM.

Not to be confused with :class:`transformers.ProcessorMixin`.
Not to be confused with {class}`transformers.ProcessorMixin`.
"""

def __init__(self,

@ -1118,10 +1118,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
def _get_data_parser(self) -> MultiModalDataParser:
"""
Construct a parser to preprocess multi-modal data items
before passing them to :meth:`_get_hf_mm_data`.
before passing them to {meth}`_get_hf_mm_data`.

You can support additional modalities by creating a subclass
of :class:`MultiModalDataParser` that has additional subparsers.
of {class}`MultiModalDataParser` that has additional subparsers.
"""
return MultiModalDataParser()

@ -1130,8 +1130,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data: MultiModalDataDict,
) -> MultiModalDataItems:
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
before passing them to :meth:`_get_hf_mm_data`.
Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems`
before passing them to {meth}`_get_hf_mm_data`.
"""
mm_items = self.data_parser.parse_mm_data(mm_data)
supported_mm_limits = self.info.get_supported_mm_limits()

@ -1183,7 +1183,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
inputs.

Moreover, this information is critical to determine the token positions
in order to construct :class:`~vllm-multimodal.input.PlaceholderRange`
in order to construct {class}`~vllm-multimodal.input.PlaceholderRange`
for each multi-modal item.
"""
raise NotImplementedError

@ -1237,8 +1237,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
"""
Return whether the HF processor applies prompt updates.

For most HF processors, this should be :code:`True` when multi-modal
data items are passed, but :code:`False` when multi-modal embeddings
For most HF processors, this should be `True` when multi-modal
data items are passed, but `False` when multi-modal embeddings
are passed.
"""
return not any(

@ -1307,7 +1307,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Most HF processors accept prompt text but not prompt tokens.
If the HF processor adds or removes tokens that are not related to
multi-modal data, you should override this method so it is consistent
with the output of :meth:`_apply_hf_processor_text_only` on the
with the output of {meth}`_apply_hf_processor_text_only` on the
corresponding text.
"""
return prompt_tokens

@ -1322,7 +1322,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):

Since HF processor requires that text and multi-modal items
correspond to each other, we generate dummy text using
:class:`DummyInputsBuilder` to go along with the multi-modal data.
{class}`DummyInputsBuilder` to go along with the multi-modal data.
"""
mm_counts = mm_items.get_all_counts()

@ -1346,10 +1346,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Apply the HF processor on the prompt text and multi-modal data.

In addition, return whether prompt updates have been applied
(for most HF processors, this should be :code:`True`).
(for most HF processors, this should be `True`).

Note:
If :code:`enable_hf_prompt_update=False`, we use HF processor
If `enable_hf_prompt_update=False`, we use HF processor
to perform prompt updates if available; HF processor requires
that the prompt corresponds to multi-modal items.
"""
@ -25,7 +25,7 @@ logger = init_logger(__name__)
class ProcessorInputs:
"""
Represents the keyword arguments to
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
{meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
"""
prompt_text: str
mm_data: MultiModalDataDict

@ -63,7 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
# TODO: @abstractmethod after transition
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
"""
Build the text input corresponding to :code:`mm_counts`.
Build the text input corresponding to `mm_counts`.
"""
if (type(self).get_dummy_processor_inputs ==
BaseDummyInputsBuilder.get_dummy_processor_inputs):

@ -29,7 +29,7 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)

class ProcessingInfoFactory(Protocol[_I_co]):
"""Constructs a :class:`MultiModalProcessor` instance from the context."""
"""Constructs a {class}`MultiModalProcessor` instance from the context."""

def __call__(
self,

@ -40,7 +40,7 @@ class ProcessingInfoFactory(Protocol[_I_co]):

class DummyInputsBuilderFactory(Protocol[_I]):
"""
Constructs a :class:`BaseDummyInputsBuilder` instance from the context.
Constructs a {class}`BaseDummyInputsBuilder` instance from the context.
"""

def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:

@ -48,7 +48,7 @@ class DummyInputsBuilderFactory(Protocol[_I]):

class MultiModalProcessorFactory(Protocol[_I]):
"""Constructs a :class:`MultiModalProcessor` instance from the context."""
"""Constructs a {class}`MultiModalProcessor` instance from the context."""

def __call__(
self,

@ -150,7 +150,7 @@ class MultiModalRegistry:
Get the maximum number of tokens from each modality
for profiling the memory usage of a model.

See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
"""
mm_limits = self.get_mm_limits_per_prompt(model_config)

@ -165,7 +165,7 @@ class MultiModalRegistry:
Get the maximum number of multi-modal tokens
for profiling the memory usage of a model.

See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
"""
return sum(self.get_max_tokens_by_modality(model_config).values())

@ -208,8 +208,9 @@ class MultiModalRegistry:
When the model receives multi-modal data, the provided function is
invoked to transform the data into a dictionary of model inputs.

See also:
:ref:`mm-processing`
:::{seealso}
{ref}`mm-processing`
:::
"""

def wrapper(model_cls: N) -> N:

@ -253,8 +254,9 @@ class MultiModalRegistry:
"""
Create a multi-modal processor for a specific model and tokenizer.

See also:
:ref:`mm-processing`
:::{seealso}
{ref}`mm-processing`
:::
"""
if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model")

@ -2,7 +2,7 @@

from itertools import groupby
from pathlib import Path
from typing import TYPE_CHECKING, Optional, TypeVar, Union
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
from urllib.parse import ParseResult, urlparse

import numpy as np

@ -24,6 +24,10 @@ _M = TypeVar("_M")
if TYPE_CHECKING:
from .hasher import MultiModalHashDict
from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
else:
MultiModalHashDict = Any
MultiModalKwargs = Any
MultiModalPlaceholderDict = Any


class MediaConnector:

@ -255,7 +259,7 @@ class MediaConnector:


global_media_connector = MediaConnector()
"""The global :class:`MediaConnector` instance used by vLLM."""
"""The global {class}`MediaConnector` instance used by vLLM."""

fetch_audio = global_media_connector.fetch_audio
fetch_image = global_media_connector.fetch_image
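As a rough usage sketch of the module-level helper bound above (the URL is a placeholder, network access is assumed, and the return type is assumed to be a PIL image):

```python
from vllm.multimodal.utils import fetch_image

image = fetch_image("https://example.com/cat.jpg")  # assumed: returns a PIL image
print(image.size)
```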
@ -293,24 +297,24 @@ def encode_video_base64(frames: npt.NDArray) -> str:


def merge_and_sort_multimodal_metadata(
mm_positions: "MultiModalPlaceholderDict",
mm_hashes: Optional["MultiModalHashDict"],
mm_positions: MultiModalPlaceholderDict,
mm_hashes: Optional[MultiModalHashDict],
) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
"""Given a MultiModalPlaceholderDict, merge all PlaceholderRange
objects from all available modalities into a single list of
PlaceholderRange, sorted by their offset (starting index in the input
PlaceholderRange, sorted by their offset (starting index in the input
sequence) in the ascending order.

Optionally if a MultiModalHashDict is given, same operation will be
Optionally if a `MultiModalHashDict` is given, same operation will be
applied to the object and the sorted list of hashes will be returned.

Returns:
list[str]: List of item modalities in order of their positions in
the input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if
given, None otherwise.
list[str]: List of item modalities in order of their positions in the
input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
None otherwise.
"""

modalities = list(mm_positions.keys())
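A rough sketch of the merge-and-sort behaviour this docstring describes, using plain `(offset, length)` tuples in place of `PlaceholderRange` (simplified, not the vLLM implementation):

```python
from typing import Optional

def merge_and_sort(
    mm_positions: dict[str, list[tuple[int, int]]],
    mm_hashes: Optional[dict[str, list[str]]] = None,
):
    """Flatten per-modality (offset, length) ranges and sort them by offset."""
    entries = []
    for modality, ranges in mm_positions.items():
        hashes = mm_hashes[modality] if mm_hashes else [None] * len(ranges)
        entries.extend(zip((r[0] for r in ranges), ranges, hashes,
                           [modality] * len(ranges)))
    entries.sort(key=lambda e: e[0])  # ascending by offset
    modalities = [m for _, _, _, m in entries]
    sorted_ranges = [r for _, r, _, _ in entries]
    sorted_hashes = [h for _, _, h, _ in entries] if mm_hashes else None
    return modalities, sorted_ranges, sorted_hashes

print(merge_and_sort({"image": [(10, 4)], "audio": [(2, 3), (20, 5)]}))
# (['audio', 'image', 'audio'], [(2, 3), (10, 4), (20, 5)], None)
```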
@ -352,22 +356,23 @@ def merge_and_sort_multimodal_metadata(


def group_mm_inputs_by_modality(
mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]:
"""Group consecutive MultiModalKwargs from mm_inputs with the same modality
together into the same list for batching purpose. For MultiModalKwargs with
mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]:
"""Group consecutive MultiModalKwargs from mm_inputs with the same modality
together into the same list for batching purpose. For MultiModalKwargs with
multiple modalities, put them into their own list.

Args:
mm_inputs: List of MultiModalKwargs.

Returns:
list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each
inner list contains consecutive MultiModalKwargs with same modality.
list[list[vllm.multimodal.MultiModalKwargs]]: List of list of
`MultiModalKwargs`, each inner list contains consecutive
`MultiModalKwargs` with same modality.
"""
if not mm_inputs:
return []

def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]:
def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]:
# If the input has multiple modalities, return an id as the unique key
# for the mm_input input.
if len(mm_input.modalities) > 1:
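For illustration, a self-contained sketch of this consecutive-grouping behaviour, using a stand-in object that exposes a `.modalities` set (not the actual `MultiModalKwargs`):

```python
from dataclasses import dataclass, field
from itertools import groupby

@dataclass
class FakeMMInput:  # stand-in for MultiModalKwargs
    modalities: set[str] = field(default_factory=set)

def group_by_modality(mm_inputs: list[FakeMMInput]) -> list[list[FakeMMInput]]:
    def key(mm_input):
        # Multi-modality inputs get a unique key so they form their own group.
        if len(mm_input.modalities) > 1:
            return id(mm_input)
        return next(iter(mm_input.modalities))
    return [list(group) for _, group in groupby(mm_inputs, key=key)]

inputs = [FakeMMInput({"image"}), FakeMMInput({"image"}),
          FakeMMInput({"audio"}), FakeMMInput({"image", "audio"})]
print([len(g) for g in group_by_modality(inputs)])  # [2, 1, 1]
```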
@ -19,8 +19,6 @@ if TYPE_CHECKING:
else:
VllmConfig = None

logger = init_logger(__name__)


class CpuPlatform(Platform):
_enum = PlatformEnum.CPU

@ -454,10 +454,4 @@ finally:

CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform

try:
from sphinx.ext.autodoc.mock import _MockModule

if not isinstance(pynvml, _MockModule):
CudaPlatform.log_warnings()
except ModuleNotFoundError:
CudaPlatform.log_warnings()
CudaPlatform.log_warnings()

@ -146,7 +146,7 @@ class Platform:
return self._enum == PlatformEnum.OOT

def is_cuda_alike(self) -> bool:
"""Stateless version of :func:`torch.cuda.is_available`."""
"""Stateless version of {func}`torch.cuda.is_available`."""
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

def is_sleep_mode_available(self) -> bool:

@ -165,7 +165,7 @@ class Platform:
cls,
device_id: int = 0,
) -> Optional[DeviceCapability]:
"""Stateless version of :func:`torch.cuda.get_device_capability`."""
"""Stateless version of {func}`torch.cuda.get_device_capability`."""
return None

@classmethod

@ -180,7 +180,7 @@ class Platform:
The ``capability`` argument can either be:

- A tuple ``(major, minor)``.
- An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
- An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`)
"""
current_capability = cls.get_device_capability(device_id=device_id)
if current_capability is None:
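A small sketch of the two accepted forms of the capability check described above, assuming `to_int` maps `(major, minor)` to `major * 10 + minor`:

```python
def to_int(major: int, minor: int) -> int:
    # Assumed encoding: (8, 6) -> 86, matching the "<major><minor>" form.
    return major * 10 + minor

def has_device_capability(current: tuple[int, int], required) -> bool:
    if isinstance(required, tuple):
        required = to_int(*required)
    return to_int(*current) >= required

print(has_device_capability((8, 6), (8, 0)))  # True
print(has_device_capability((7, 5), 80))      # False
```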
@ -1,7 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from .layerwise_profile import layerwise_profile

__all__ = [
"layerwise_profile",
]

@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1


def array_full(token_id: int, count: int):
""":class:`array` equivalent of :func:`numpy.full`."""
"""{class}`array` equivalent of {func}`numpy.full`."""
return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
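A runnable illustration of the `array` counterpart to `numpy.full`; the `"l"` type code for token ids is an assumption here:

```python
from array import array

import numpy as np

VLLM_TOKEN_ID_ARRAY_TYPE = "l"  # assumed: signed-long token ids

def array_full(token_id: int, count: int):
    """array equivalent of numpy.full."""
    return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count

print(list(array_full(7, 3)))   # [7, 7, 7]
print(np.full(3, 7).tolist())   # [7, 7, 7] -- same values via numpy
```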
@ -192,11 +192,11 @@ class SequenceData(msgspec.Struct,
def from_prompt_token_counts(
*token_counts: tuple[int, int]) -> "SequenceData":
"""
Construct a :class:`SequenceData` instance by concatenating
Construct a {class}`SequenceData` instance by concatenating
prompt token sequences.

Each tuple represents one token sequence, expressed in the form
:code:`(token_id, count)`.
`(token_id, count)`.
"""
if len(token_counts) == 0:
return SequenceData.from_seqs([])
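A hypothetical call based on the signature above, building a dummy prompt of 512 copies of token 0 followed by 256 copies of token 1 (the import path and the `prompt_token_ids` attribute are assumptions):

```python
from vllm.sequence import SequenceData  # assumed import path

dummy = SequenceData.from_prompt_token_counts((0, 512), (1, 256))
print(len(dummy.prompt_token_ids))  # 768
```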
@ -216,7 +216,7 @@ class SequenceData(msgspec.Struct,
prompt_embeds: Optional[torch.Tensor] = None,
) -> "SequenceData":
"""
Construct a :class:`SequenceData` instance from prompt and output
Construct a {class}`SequenceData` instance from prompt and output
token sequences.
"""
prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,

@ -452,9 +452,9 @@ class SequenceData(msgspec.Struct,
class Sequence:
"""Stores the data, status, and block information of a sequence.

The sequence is constructed from the :data:`DecoderOnlyInputs`
(for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder)
instance passed in through the :code:`inputs` constructor argument.
The sequence is constructed from the {data}`DecoderOnlyInputs`
(for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder)
instance passed in through the `inputs` constructor argument.

Args:
seq_id: The ID of the sequence.

@ -52,7 +52,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase):
"""Create a SmallerTpProposerWorker.

Args:
worker (MultiStepWorker): an actual worker wrapped with this class
worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an
actual worker wrapped with this class
draft_ranks (List[int]): if this value is given, only the GPU ranks
written in this value participate in draft generation
"""

@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig):
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
output_router_logits (`bool`, *optional*, defaults to `False`):
Whether or not the router logits should be returned by the model. Enabling this will also
allow the model to output the auxiliary loss. See [here]() for more details
Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
The aux loss factor for the total loss.
@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig):
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Exaone

Configuration objects inherit from :class:`~transformers.PretrainedConfig`
Configuration objects inherit from {class}`~transformers.PretrainedConfig`
and can be used to control the model outputs. Read the documentation from :
class:`~transformers.PretrainedConfig` for more information.

Args:
vocab_size (:obj:`int`, `optional`, defaults to 50257):
vocab_size ({obj}`int`, `optional`, defaults to 50257):
Vocabulary size of the GPT Lingvo model. Defines the number of
different tokens that can be represented by the :obj:`inputs_ids`
passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
different tokens that can be represented by the {obj}`inputs_ids`
passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
size of the model.
Defines the different tokens that can be represented by the
`inputs_ids` passed to the forward method of :class:
`~transformers.EXAONEModel`.
hidden_size (:obj:`int`, `optional`, defaults to 2048):
hidden_size ({obj}`int`, `optional`, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
num_layers (:obj:`int`, `optional`, defaults to 24):
num_layers ({obj}`int`, `optional`, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the

@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig):
specified, will default to `num_attention_heads`.
rotary_pct (`float`, *optional*, defaults to 0.25):
percentage of hidden dimensions to allocate to rotary embeddings
intermediate_size (:obj:`int`, `optional`, defaults to 8192):
intermediate_size ({obj}`int`, `optional`, defaults to 8192):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
the Transformer encoder.
activation_function (:obj:`str` or :obj:`function`, `optional`,
defaults to :obj:`"gelu_new"`):
activation_function ({obj}`str` or {obj}`function`, `optional`,
defaults to {obj}`"gelu_new"`):
The non-linear activation function (function or string) in the
encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
:obj:`"selu"` and :obj:`"gelu_new"` are supported.
embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
{obj}`"selu"` and {obj}`"gelu_new"` are supported.
embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
The dropout probability for all fully connected layers in the
embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling
:class:`~transformers.EXAONEModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
type_vocab_size ({obj}`int`, `optional`, defaults to 2):
The vocabulary size of the {obj}`token_type_ids` passed when calling
{class}`~transformers.EXAONEModel`.
initializer_range ({obj}`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
Whether or not the model should return the last key/values
attentions (not used by all models).
Only relevant if ``config.is_decoder=True``.
gradient_checkpointing (:obj:`bool`, `optional`,
defaults to :obj:`False`):
gradient_checkpointing ({obj}`bool`, `optional`,
defaults to {obj}`False`):
If True, use gradient checkpointing to save memory at the expense
of slower backward pass.
Example::
@ -39,9 +39,9 @@ def decode_tokens(
) -> str:
"""
Backend-agnostic equivalent of HF's
:code:`tokenizer.decode(token_ids, ...)`.
`tokenizer.decode(token_ids, ...)`.

:code:`skip_special_tokens=None` means to use the backend's default
`skip_special_tokens=None` means to use the backend's default
settings.
"""
if skip_special_tokens is not None:

@ -61,9 +61,9 @@ def encode_tokens(
) -> list[int]:
"""
Backend-agnostic equivalent of HF's
:code:`tokenizer.encode(text, ...)`.
`tokenizer.encode(text, ...)`.

:code:`add_special_tokens=None` means to use the backend's default
`add_special_tokens=None` means to use the backend's default
settings.
"""
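For reference, the HF calls these wrappers mirror look like this (illustrative only; requires `transformers` and a tokenizer download):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
ids = tok.encode("Hello world", add_special_tokens=False)
text = tok.decode(ids, skip_special_tokens=True)
print(ids, text)
```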
@ -309,8 +309,8 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
"""
Gets the cumulative number of hits and queries against this cache.

If :code:`delta=True`, instead gets these statistics
since the last call that also passed :code:`delta=True`.
If `delta=True`, instead gets these statistics
since the last call that also passed `delta=True`.
"""
info = CacheInfo(hits=self._hits, total=self._total)
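A minimal sketch of the delta-style hit statistics described above (simplified counters only, not the vLLM `LRUCache`):

```python
class HitCounter:
    def __init__(self) -> None:
        self.hits = self.total = 0
        self._last_hits = self._last_total = 0

    def record(self, hit: bool) -> None:
        self.total += 1
        self.hits += int(hit)

    def stat(self, *, delta: bool = False) -> tuple[int, int]:
        if not delta:
            return self.hits, self.total
        # Hits/queries since the last call that also passed delta=True.
        out = (self.hits - self._last_hits, self.total - self._last_total)
        self._last_hits, self._last_total = self.hits, self.total
        return out

c = HitCounter()
for hit in (True, False, True):
    c.record(hit)
print(c.stat())            # (2, 3)
print(c.stat(delta=True))  # (2, 3) -- first delta window
print(c.stat(delta=True))  # (0, 0) -- nothing new since the previous delta call
```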
@ -983,7 +983,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:

def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
"""
Unlike :class:`itertools.groupby`, groups are not broken by
Unlike {class}`itertools.groupby`, groups are not broken by
non-contiguous data.
"""
groups = defaultdict[_K, list[_V]](list)
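The difference from `itertools.groupby` in one runnable sketch (a simplified `full_groupby`, not the vLLM helper itself):

```python
from collections import defaultdict
from itertools import groupby

def full_groupby(values, *, key):
    groups = defaultdict(list)
    for value in values:
        groups[key(value)].append(value)
    return list(groups.items())

data = ["apple", "avocado", "banana", "apricot"]
print([(k, len(list(g))) for k, g in groupby(data, key=lambda s: s[0])])
# [('a', 2), ('b', 1), ('a', 1)]  <- 'a' group is split by the 'b' item
print([(k, len(v)) for k, v in full_groupby(data, key=lambda s: s[0])])
# [('a', 3), ('b', 1)]            <- non-contiguous items stay in one group
```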
@ -1773,14 +1773,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)


def is_in_doc_build() -> bool:
try:
from sphinx.ext.autodoc.mock import _MockModule
return isinstance(torch, _MockModule)
except ModuleNotFoundError:
return False


def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
"""
Import a Python file according to its file path.

@ -1820,10 +1812,11 @@ class _PlaceholderBase:
Disallows downstream usage of placeholder modules.

We need to explicitly override each dunder method because
:meth:`__getattr__` is not called when they are accessed.
{meth}`__getattr__` is not called when they are accessed.

See also:
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
:::{seealso}
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
:::
"""

def __getattr__(self, key: str) -> Never:
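Why `__getattr__` alone is not enough: special methods are looked up on the type, not the instance, so a placeholder must override each dunder explicitly. A small demonstration:

```python
class Placeholder:
    def __getattr__(self, key):
        raise RuntimeError("optional dependency is not installed")

p = Placeholder()

try:
    p.anything            # normal attribute access goes through __getattr__
except RuntimeError as e:
    print("caught:", e)

try:
    len(p)                # special method lookup bypasses __getattr__ ...
except TypeError as e:
    print("caught:", e)   # ... so this raises TypeError instead
```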
@ -2052,9 +2045,6 @@ def direct_register_custom_op(
library object. If you want to bind the operator to a different library,
make sure the library object is alive when the operator is used.
"""
if is_in_doc_build():
return

if not supports_custom_op():
from vllm.platforms import current_platform
assert not current_platform.is_cuda_alike(), (

@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
"""
# MLA Common Components

This file implements common components for MLA implementations.

First we define:

@ -180,6 +180,7 @@ class KVCacheManager:
as eagle.

Blocks layout:
```
-----------------------------------------------------------------------
| < computed > | < new computed > | < new > | < pre-allocated > |
-----------------------------------------------------------------------

@ -189,6 +190,7 @@ class KVCacheManager:
------------------------------------------------
| <new full> |
--------------
```
The following *_blocks are illustrated in this layout.

Returns:
@ -308,7 +308,7 @@ class OutputProcessor:
* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.

****************** NOTE FOR DEVELOPERS ******************
NOTE FOR DEVELOPERS

vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the

@ -316,8 +316,6 @@ class OutputProcessor:

If you need to touch every element of the batch, do it from
within the loop below.

**********************************************************
"""

request_outputs: list[RequestOutput] = []

@ -75,7 +75,7 @@ class RejectionSampler(nn.Module):
outside of the rejection sampler with the default sampling
strategy. It allows for more flexibility in the sampling
process such as top_p, top_k sampling.
sampling_metadata (SamplingMetadata):
sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
Additional metadata needed for sampling, such as temperature,
top-k/top-p parameters, or other relevant information.
Returns:
@ -170,9 +170,10 @@ class Worker(WorkerBase):
Then, it calculates the free memory that can be used for KV cache in
bytes.

.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
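The `gpu_memory_utilization` knob referenced in this tip is set when constructing the engine; for example (model name is illustrative, a CUDA-capable GPU is assumed):

```python
from vllm import LLM

# Cap vLLM at ~70% of the GPU's memory instead of the default 0.9.
llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.7)
print(llm.generate(["Hello"])[0].outputs[0].text)
```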
@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
) -> None:
"""
Perform sanity checks for the result of
:meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
{meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
"""
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
"Expected multimodal embeddings to be a list/tuple of 2D tensors, "

@ -39,7 +39,7 @@ def scatter_mm_placeholders(
Scatter the multimodal embeddings into a contiguous tensor that represents
the placeholder tokens.

:class:`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
{class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.

Args:
embeds: The multimodal embeddings.

@ -66,7 +66,7 @@ def gather_mm_placeholders(
"""
Reconstructs the embeddings from the placeholder tokens.

This is the operation of :func:`scatter_mm_placeholders`.
This is the operation of {func}`scatter_mm_placeholders`.
"""
if is_embed is None:
return placeholders
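A simplified scatter/gather round trip with an `is_embed` mask, to illustrate how the two helpers above relate (shapes and mask values are made up):

```python
import torch

embeds = torch.arange(6.0).reshape(3, 2)            # 3 multimodal embeddings
is_embed = torch.tensor([True, False, True, True, False])

placeholders = torch.zeros(5, 2)
placeholders[is_embed] = embeds                      # "scatter" into the slots

recovered = placeholders[is_embed]                   # "gather" them back out
assert torch.equal(recovered, embeds)
```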
@ -201,9 +201,10 @@ class HPUWorker(LocalOrDistributedWorkerBase):
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.

.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
@ -734,11 +734,11 @@ def _pythonize_sampler_output(
cache: Optional[PythonizationCache],
) -> None:
""" This function is only called when the output tensors are ready.
See :class:`ModelOutput`.
See {class}`ModelOutput`.

Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
adding a Pythonized output data structure
(:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`.
({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.

Args:
model_input

@ -230,9 +230,10 @@ class Worker(LocalOrDistributedWorkerBase):
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.

.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.