mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-17 15:10:12 +08:00
Merge branch 'main' into woosuk-jf
This commit is contained in:
commit
bcf3c8230d
@ -39,7 +39,7 @@ steps:
|
|||||||
- pip install -r ../../requirements/docs.txt
|
- pip install -r ../../requirements/docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
- SPHINXOPTS=\"-W\" make html
|
||||||
# Check API reference (if it fails, you may have missing mock imports)
|
# Check API reference (if it fails, you may have missing mock imports)
|
||||||
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -80,6 +80,7 @@ instance/
|
|||||||
# Sphinx documentation
|
# Sphinx documentation
|
||||||
docs/_build/
|
docs/_build/
|
||||||
docs/source/getting_started/examples/
|
docs/source/getting_started/examples/
|
||||||
|
docs/source/api/vllm
|
||||||
|
|
||||||
# PyBuilder
|
# PyBuilder
|
||||||
.pybuilder/
|
.pybuilder/
|
||||||
|
|||||||
@ -22,3 +22,4 @@ help:
|
|||||||
clean:
|
clean:
|
||||||
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
rm -rf "$(SOURCEDIR)/getting_started/examples"
|
rm -rf "$(SOURCEDIR)/getting_started/examples"
|
||||||
|
rm -rf "$(SOURCEDIR)/api/vllm"
|
||||||
|
|||||||
@ -1,7 +0,0 @@
|
|||||||
# AsyncLLMEngine
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.AsyncLLMEngine
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
@ -1,17 +0,0 @@
|
|||||||
# vLLM Engine
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.engine
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. currentmodule:: vllm.engine
|
|
||||||
```
|
|
||||||
|
|
||||||
:::{toctree}
|
|
||||||
:caption: Engines
|
|
||||||
:maxdepth: 2
|
|
||||||
|
|
||||||
llm_engine
|
|
||||||
async_llm_engine
|
|
||||||
:::
|
|
||||||
@ -1,7 +0,0 @@
|
|||||||
# LLMEngine
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.LLMEngine
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
# Inference Parameters
|
|
||||||
|
|
||||||
Inference parameters for vLLM APIs.
|
|
||||||
|
|
||||||
(sampling-params)=
|
|
||||||
|
|
||||||
## Sampling Parameters
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.SamplingParams
|
|
||||||
:members:
|
|
||||||
```
|
|
||||||
|
|
||||||
(pooling-params)=
|
|
||||||
|
|
||||||
## Pooling Parameters
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.PoolingParams
|
|
||||||
:members:
|
|
||||||
```
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Model Adapters
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.model_executor.models.adapters
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
# Model Development
|
|
||||||
|
|
||||||
## Submodules
|
|
||||||
|
|
||||||
:::{toctree}
|
|
||||||
:maxdepth: 1
|
|
||||||
|
|
||||||
interfaces_base
|
|
||||||
interfaces
|
|
||||||
adapters
|
|
||||||
:::
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Optional Interfaces
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.model_executor.models.interfaces
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Base Model Interfaces
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.model_executor.models.interfaces_base
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
@ -1,28 +0,0 @@
|
|||||||
(multi-modality)=
|
|
||||||
|
|
||||||
# Multi-Modality
|
|
||||||
|
|
||||||
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
|
|
||||||
|
|
||||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
|
|
||||||
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
|
|
||||||
|
|
||||||
Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
|
|
||||||
```
|
|
||||||
|
|
||||||
## Submodules
|
|
||||||
|
|
||||||
:::{toctree}
|
|
||||||
:maxdepth: 1
|
|
||||||
|
|
||||||
inputs
|
|
||||||
parse
|
|
||||||
processing
|
|
||||||
profiling
|
|
||||||
registry
|
|
||||||
:::
|
|
||||||
@ -1,49 +0,0 @@
|
|||||||
# Input Definitions
|
|
||||||
|
|
||||||
## User-facing inputs
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autodata:: vllm.multimodal.inputs.MultiModalDataDict
|
|
||||||
```
|
|
||||||
|
|
||||||
## Internal data structures
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autodata:: vllm.multimodal.inputs.NestedTensors
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.multimodal.inputs.MultiModalInputs
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Data Parsing
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.multimodal.parse
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Data Processing
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.multimodal.processing
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Memory Profiling
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.multimodal.profiling
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Registry
|
|
||||||
|
|
||||||
## Module Contents
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. automodule:: vllm.multimodal.registry
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# Offline Inference
|
|
||||||
|
|
||||||
:::{toctree}
|
|
||||||
:caption: Contents
|
|
||||||
:maxdepth: 1
|
|
||||||
|
|
||||||
llm
|
|
||||||
llm_inputs
|
|
||||||
:::
|
|
||||||
@ -1,7 +0,0 @@
|
|||||||
# LLM Class
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.LLM
|
|
||||||
:members:
|
|
||||||
:show-inheritance:
|
|
||||||
```
|
|
||||||
@ -1,19 +0,0 @@
|
|||||||
# LLM Inputs
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autodata:: vllm.inputs.PromptType
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.inputs.TextPrompt
|
|
||||||
:show-inheritance:
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
|
||||||
.. autoclass:: vllm.inputs.TokensPrompt
|
|
||||||
:show-inheritance:
|
|
||||||
:members:
|
|
||||||
:member-order: bysource
|
|
||||||
```
|
|
||||||
133
docs/source/api/summary.md
Normal file
133
docs/source/api/summary.md
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
# Summary
|
||||||
|
|
||||||
|
(configuration)=
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
API documentation for vLLM's configuration classes.
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.config.ModelConfig
|
||||||
|
vllm.config.CacheConfig
|
||||||
|
vllm.config.TokenizerPoolConfig
|
||||||
|
vllm.config.LoadConfig
|
||||||
|
vllm.config.ParallelConfig
|
||||||
|
vllm.config.SchedulerConfig
|
||||||
|
vllm.config.DeviceConfig
|
||||||
|
vllm.config.SpeculativeConfig
|
||||||
|
vllm.config.LoRAConfig
|
||||||
|
vllm.config.PromptAdapterConfig
|
||||||
|
vllm.config.MultiModalConfig
|
||||||
|
vllm.config.PoolerConfig
|
||||||
|
vllm.config.DecodingConfig
|
||||||
|
vllm.config.ObservabilityConfig
|
||||||
|
vllm.config.KVTransferConfig
|
||||||
|
vllm.config.CompilationConfig
|
||||||
|
vllm.config.VllmConfig
|
||||||
|
```
|
||||||
|
|
||||||
|
(offline-inference-api)=
|
||||||
|
|
||||||
|
## Offline Inference
|
||||||
|
|
||||||
|
LLM Class.
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.LLM
|
||||||
|
```
|
||||||
|
|
||||||
|
LLM Inputs.
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.inputs.PromptType
|
||||||
|
vllm.inputs.TextPrompt
|
||||||
|
vllm.inputs.TokensPrompt
|
||||||
|
```
|
||||||
|
|
||||||
|
## vLLM Engines
|
||||||
|
|
||||||
|
Engine classes for offline and online inference.
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.LLMEngine
|
||||||
|
vllm.AsyncLLMEngine
|
||||||
|
```
|
||||||
|
|
||||||
|
## Inference Parameters
|
||||||
|
|
||||||
|
Inference parameters for vLLM APIs.
|
||||||
|
|
||||||
|
(sampling-params)=
|
||||||
|
(pooling-params)=
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.SamplingParams
|
||||||
|
vllm.PoolingParams
|
||||||
|
```
|
||||||
|
|
||||||
|
(multi-modality)=
|
||||||
|
|
||||||
|
## Multi-Modality
|
||||||
|
|
||||||
|
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
|
||||||
|
|
||||||
|
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
|
||||||
|
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
|
||||||
|
|
||||||
|
Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.multimodal.MULTIMODAL_REGISTRY
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inputs
|
||||||
|
|
||||||
|
User-facing inputs.
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.multimodal.inputs.MultiModalDataDict
|
||||||
|
```
|
||||||
|
|
||||||
|
Internal data structures.
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.multimodal.inputs.PlaceholderRange
|
||||||
|
vllm.multimodal.inputs.NestedTensors
|
||||||
|
vllm.multimodal.inputs.MultiModalFieldElem
|
||||||
|
vllm.multimodal.inputs.MultiModalFieldConfig
|
||||||
|
vllm.multimodal.inputs.MultiModalKwargsItem
|
||||||
|
vllm.multimodal.inputs.MultiModalKwargs
|
||||||
|
vllm.multimodal.inputs.MultiModalInputs
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Parsing
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.multimodal.parse
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Processing
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.multimodal.processing
|
||||||
|
```
|
||||||
|
|
||||||
|
### Memory Profiling
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.multimodal.profiling
|
||||||
|
```
|
||||||
|
|
||||||
|
### Registry
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.multimodal.registry
|
||||||
|
```
|
||||||
|
|
||||||
|
## Model Development
|
||||||
|
|
||||||
|
```{autodoc2-summary}
|
||||||
|
vllm.model_executor.models.interfaces_base
|
||||||
|
vllm.model_executor.models.interfaces
|
||||||
|
vllm.model_executor.models.adapters
|
||||||
|
```
|
||||||
21
docs/source/autodoc2_docstring_parser.py
Normal file
21
docs/source/autodoc2_docstring_parser.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
from docutils import nodes
|
||||||
|
from myst_parser.parsers.sphinx_ import MystParser
|
||||||
|
from sphinx.ext.napoleon import docstring
|
||||||
|
|
||||||
|
|
||||||
|
class NapoleonParser(MystParser):
|
||||||
|
|
||||||
|
def parse(self, input_string: str, document: nodes.document) -> None:
|
||||||
|
# Get the Sphinx configuration
|
||||||
|
config = document.settings.env.config
|
||||||
|
|
||||||
|
parsed_content = str(
|
||||||
|
docstring.GoogleDocstring(
|
||||||
|
str(docstring.NumpyDocstring(input_string, config)),
|
||||||
|
config,
|
||||||
|
))
|
||||||
|
return super().parse(parsed_content, document)
|
||||||
|
|
||||||
|
|
||||||
|
Parser = NapoleonParser
|
||||||
@ -13,16 +13,17 @@
|
|||||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import inspect
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from sphinx.ext import autodoc
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
sys.path.append(os.path.abspath("../.."))
|
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||||
|
sys.path.append(os.path.abspath(REPO_ROOT))
|
||||||
|
|
||||||
# -- Project information -----------------------------------------------------
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
@ -40,8 +41,7 @@ extensions = [
|
|||||||
"sphinx.ext.linkcode",
|
"sphinx.ext.linkcode",
|
||||||
"sphinx.ext.intersphinx",
|
"sphinx.ext.intersphinx",
|
||||||
"sphinx_copybutton",
|
"sphinx_copybutton",
|
||||||
"sphinx.ext.autodoc",
|
"autodoc2",
|
||||||
"sphinx.ext.autosummary",
|
|
||||||
"myst_parser",
|
"myst_parser",
|
||||||
"sphinxarg.ext",
|
"sphinxarg.ext",
|
||||||
"sphinx_design",
|
"sphinx_design",
|
||||||
@ -49,7 +49,22 @@ extensions = [
|
|||||||
]
|
]
|
||||||
myst_enable_extensions = [
|
myst_enable_extensions = [
|
||||||
"colon_fence",
|
"colon_fence",
|
||||||
|
"fieldlist",
|
||||||
]
|
]
|
||||||
|
autodoc2_packages = [
|
||||||
|
{
|
||||||
|
"path": "../../vllm",
|
||||||
|
"exclude_dirs": ["__pycache__", "third_party"],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
autodoc2_output_dir = "api"
|
||||||
|
autodoc2_render_plugin = "myst"
|
||||||
|
autodoc2_hidden_objects = ["dunder", "private", "inherited"]
|
||||||
|
autodoc2_docstring_parser_regexes = [
|
||||||
|
(".*", "docs.source.autodoc2_docstring_parser"),
|
||||||
|
]
|
||||||
|
autodoc2_sort_names = True
|
||||||
|
autodoc2_index_template = None
|
||||||
|
|
||||||
# Add any paths that contain templates here, relative to this directory.
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
templates_path = ['_templates']
|
templates_path = ['_templates']
|
||||||
@ -77,6 +92,11 @@ html_theme_options = {
|
|||||||
'repository_url': 'https://github.com/vllm-project/vllm',
|
'repository_url': 'https://github.com/vllm-project/vllm',
|
||||||
'use_repository_button': True,
|
'use_repository_button': True,
|
||||||
'use_edit_page_button': True,
|
'use_edit_page_button': True,
|
||||||
|
# Prevents the full API being added to the left sidebar of every page.
|
||||||
|
# Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB.
|
||||||
|
'collapse_navbar': True,
|
||||||
|
# Makes API visible in the right sidebar on API reference pages.
|
||||||
|
'show_toc_level': 3,
|
||||||
}
|
}
|
||||||
# Add any paths that contain custom static files (such as style sheets) here,
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
# relative to this directory. They are copied after the builtin static files,
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
@ -164,73 +184,64 @@ def linkcode_resolve(domain, info):
|
|||||||
return None
|
return None
|
||||||
if not info['module']:
|
if not info['module']:
|
||||||
return None
|
return None
|
||||||
filename = info['module'].replace('.', '/')
|
|
||||||
module = info['module']
|
|
||||||
|
|
||||||
# try to determine the correct file and line number to link to
|
# Get path from module name
|
||||||
obj = sys.modules[module]
|
file = Path(f"{info['module'].replace('.', '/')}.py")
|
||||||
|
path = REPO_ROOT / file
|
||||||
|
if not path.exists():
|
||||||
|
path = REPO_ROOT / file.with_suffix("") / "__init__.py"
|
||||||
|
if not path.exists():
|
||||||
|
return None
|
||||||
|
|
||||||
# get as specific as we can
|
# Get the line number of the object
|
||||||
lineno: int = 0
|
with open(path) as f:
|
||||||
filename: str = ""
|
lines = f.readlines()
|
||||||
try:
|
name = info['fullname'].split(".")[-1]
|
||||||
for part in info['fullname'].split('.'):
|
pattern = fr"^( {{4}})*((def|class) )?{name}\b.*"
|
||||||
obj = getattr(obj, part)
|
for lineno, line in enumerate(lines, 1):
|
||||||
|
if not line or line.startswith("#"):
|
||||||
|
continue
|
||||||
|
if re.match(pattern, line):
|
||||||
|
break
|
||||||
|
|
||||||
# Skip decorator wrappers by checking if the object is a function
|
# If the line number is not found, return None
|
||||||
# and has a __wrapped__ attribute (which decorators typically set)
|
if lineno == len(lines):
|
||||||
while hasattr(obj, '__wrapped__'):
|
return None
|
||||||
obj = obj.__wrapped__
|
|
||||||
|
|
||||||
if not (inspect.isclass(obj) or inspect.isfunction(obj)
|
# If the line number is found, create the URL
|
||||||
or inspect.ismethod(obj)):
|
filename = path.relative_to(REPO_ROOT)
|
||||||
obj = obj.__class__ # Get the class of the instance
|
if "checkouts" in path.parts:
|
||||||
|
|
||||||
lineno = inspect.getsourcelines(obj)[1]
|
|
||||||
filename = (inspect.getsourcefile(obj)
|
|
||||||
or f"{filename}.py").split("vllm/", 1)[1]
|
|
||||||
except Exception:
|
|
||||||
# For some things, like a class member, won't work, so
|
|
||||||
# we'll use the line number of the parent (the class)
|
|
||||||
pass
|
|
||||||
|
|
||||||
if filename.startswith("checkouts/"):
|
|
||||||
# a PR build on readthedocs
|
# a PR build on readthedocs
|
||||||
pr_number = filename.split("/")[1]
|
pr_number = REPO_ROOT.name
|
||||||
filename = filename.split("/", 2)[2]
|
|
||||||
base, branch = get_repo_base_and_branch(pr_number)
|
base, branch = get_repo_base_and_branch(pr_number)
|
||||||
if base and branch:
|
if base and branch:
|
||||||
return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
|
return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
|
||||||
|
|
||||||
# Otherwise, link to the source file on the main branch
|
# Otherwise, link to the source file on the main branch
|
||||||
return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
|
return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
|
||||||
|
|
||||||
|
|
||||||
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
|
# Mock out external dependencies here, otherwise sphinx-argparse won't work.
|
||||||
autodoc_mock_imports = [
|
autodoc_mock_imports = [
|
||||||
|
"huggingface_hub",
|
||||||
|
"pydantic",
|
||||||
|
"zmq",
|
||||||
|
"cloudpickle",
|
||||||
|
"aiohttp",
|
||||||
|
"starlette",
|
||||||
"blake3",
|
"blake3",
|
||||||
"compressed_tensors",
|
|
||||||
"cpuinfo",
|
"cpuinfo",
|
||||||
"cv2",
|
|
||||||
"torch",
|
|
||||||
"transformers",
|
"transformers",
|
||||||
"psutil",
|
"psutil",
|
||||||
"prometheus_client",
|
|
||||||
"sentencepiece",
|
|
||||||
"vllm._C",
|
"vllm._C",
|
||||||
"PIL",
|
"PIL",
|
||||||
"numpy",
|
"numpy",
|
||||||
'triton',
|
|
||||||
"tqdm",
|
"tqdm",
|
||||||
"tensorizer",
|
# The mocks below are required by
|
||||||
"pynvml",
|
# docs/source/serving/openai_compatible_server.md's
|
||||||
"outlines",
|
# vllm.entrypoints.openai.cli_args
|
||||||
"xgrammar",
|
"openai",
|
||||||
"librosa",
|
"fastapi",
|
||||||
"soundfile",
|
"partial_json_parser",
|
||||||
"gguf",
|
|
||||||
"lark",
|
|
||||||
"decord",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for mock_target in autodoc_mock_imports:
|
for mock_target in autodoc_mock_imports:
|
||||||
@ -241,18 +252,6 @@ for mock_target in autodoc_mock_imports:
|
|||||||
"been loaded into sys.modules when the sphinx build starts.",
|
"been loaded into sys.modules when the sphinx build starts.",
|
||||||
mock_target)
|
mock_target)
|
||||||
|
|
||||||
|
|
||||||
class MockedClassDocumenter(autodoc.ClassDocumenter):
|
|
||||||
"""Remove note about base class when a class is derived from object."""
|
|
||||||
|
|
||||||
def add_line(self, line: str, source: str, *lineno: int) -> None:
|
|
||||||
if line == " Bases: :py:class:`object`":
|
|
||||||
return
|
|
||||||
super().add_line(line, source, *lineno)
|
|
||||||
|
|
||||||
|
|
||||||
autodoc.ClassDocumenter = MockedClassDocumenter
|
|
||||||
|
|
||||||
intersphinx_mapping = {
|
intersphinx_mapping = {
|
||||||
"python": ("https://docs.python.org/3", None),
|
"python": ("https://docs.python.org/3", None),
|
||||||
"typing_extensions":
|
"typing_extensions":
|
||||||
@ -264,7 +263,4 @@ intersphinx_mapping = {
|
|||||||
"psutil": ("https://psutil.readthedocs.io/en/stable", None),
|
"psutil": ("https://psutil.readthedocs.io/en/stable", None),
|
||||||
}
|
}
|
||||||
|
|
||||||
autodoc_preserve_defaults = True
|
|
||||||
autodoc_warningiserror = True
|
|
||||||
|
|
||||||
navigation_with_keys = False
|
navigation_with_keys = False
|
||||||
|
|||||||
@ -52,8 +52,8 @@ for output in outputs:
|
|||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
```
|
```
|
||||||
|
|
||||||
More API details can be found in the {doc}`Offline Inference
|
More API details can be found in the [Offline Inference]
|
||||||
</api/offline_inference/index>` section of the API docs.
|
(#offline-inference-api) section of the API docs.
|
||||||
|
|
||||||
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
||||||
|
|
||||||
|
|||||||
@ -42,7 +42,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
|
|||||||
* [APC](#automatic-prefix-caching)
|
* [APC](#automatic-prefix-caching)
|
||||||
* [LoRA](#lora-adapter)
|
* [LoRA](#lora-adapter)
|
||||||
* <abbr title="Prompt Adapter">prmpt adptr</abbr>
|
* <abbr title="Prompt Adapter">prmpt adptr</abbr>
|
||||||
* [SD](#spec_decode)
|
* [SD](#spec-decode)
|
||||||
* CUDA graph
|
* CUDA graph
|
||||||
* <abbr title="Pooling Models">pooling</abbr>
|
* <abbr title="Pooling Models">pooling</abbr>
|
||||||
* <abbr title="Encoder-Decoder Models">enc-dec</abbr>
|
* <abbr title="Encoder-Decoder Models">enc-dec</abbr>
|
||||||
@ -122,7 +122,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
- * [SD](#spec_decode)
|
- * [SD](#spec-decode)
|
||||||
* ✅
|
* ✅
|
||||||
* ✅
|
* ✅
|
||||||
* ❌
|
* ❌
|
||||||
@ -377,7 +377,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
|
|||||||
* ✅
|
* ✅
|
||||||
* [❌](gh-issue:8475)
|
* [❌](gh-issue:8475)
|
||||||
* ✅
|
* ✅
|
||||||
- * [SD](#spec_decode)
|
- * [SD](#spec-decode)
|
||||||
* ✅
|
* ✅
|
||||||
* ✅
|
* ✅
|
||||||
* ✅
|
* ✅
|
||||||
|
|||||||
@ -194,11 +194,8 @@ contributing/vulnerability_management
|
|||||||
:caption: API Reference
|
:caption: API Reference
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
api/offline_inference/index
|
api/summary
|
||||||
api/engine/index
|
api/vllm/vllm
|
||||||
api/inference_params
|
|
||||||
api/multimodal/index
|
|
||||||
api/model/index
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
% Latest news and acknowledgements
|
% Latest news and acknowledgements
|
||||||
|
|||||||
@ -14,7 +14,7 @@ Usually, this is automatically inferred so you don't have to specify it.
|
|||||||
## Offline Inference
|
## Offline Inference
|
||||||
|
|
||||||
The {class}`~vllm.LLM` class provides various methods for offline inference.
|
The {class}`~vllm.LLM` class provides various methods for offline inference.
|
||||||
See [Engine Arguments](#engine-args) for a list of options when initializing the model.
|
See <project:#configuration> for a list of options when initializing the model.
|
||||||
|
|
||||||
### `LLM.generate`
|
### `LLM.generate`
|
||||||
|
|
||||||
|
|||||||
@ -60,7 +60,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
|
|||||||
## Offline Inference
|
## Offline Inference
|
||||||
|
|
||||||
The {class}`~vllm.LLM` class provides various methods for offline inference.
|
The {class}`~vllm.LLM` class provides various methods for offline inference.
|
||||||
See [Engine Arguments](#engine-args) for a list of options when initializing the model.
|
See <project:#configuration> for a list of options when initializing the model.
|
||||||
|
|
||||||
### `LLM.encode`
|
### `LLM.encode`
|
||||||
|
|
||||||
|
|||||||
@ -25,7 +25,7 @@ The available APIs depend on the type of model that is being run:
|
|||||||
Please refer to the above pages for more details about each API.
|
Please refer to the above pages for more details about each API.
|
||||||
|
|
||||||
:::{seealso}
|
:::{seealso}
|
||||||
[API Reference](/api/offline_inference/index)
|
[API Reference](#offline-inference-api)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
(configuration-options)=
|
(configuration-options)=
|
||||||
@ -33,7 +33,7 @@ Please refer to the above pages for more details about each API.
|
|||||||
## Configuration Options
|
## Configuration Options
|
||||||
|
|
||||||
This section lists the most common options for running the vLLM engine.
|
This section lists the most common options for running the vLLM engine.
|
||||||
For a full list, refer to the [Engine Arguments](#engine-args) page.
|
For a full list, refer to the <project:#configuration> page.
|
||||||
|
|
||||||
(model-resolution)=
|
(model-resolution)=
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@ import tqdm
|
|||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.profiler import layerwise_profile
|
from vllm.profiler.layerwise_profile import layerwise_profile
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
BATCH_SIZE_DEFAULT = 1
|
BATCH_SIZE_DEFAULT = 1
|
||||||
|
|||||||
@ -1,27 +1,15 @@
|
|||||||
sphinx==8.2.3
|
sphinx==8.2.3
|
||||||
sphinx-argparse==0.5.2
|
sphinx-argparse==0.5.2
|
||||||
|
sphinx-autodoc2==0.5.0
|
||||||
sphinx-book-theme==1.1.4
|
sphinx-book-theme==1.1.4
|
||||||
sphinx-copybutton==0.5.2
|
sphinx-copybutton==0.5.2
|
||||||
sphinx-design==0.6.1
|
sphinx-design==0.6.1
|
||||||
sphinx-togglebutton==0.3.2
|
sphinx-togglebutton==0.3.2
|
||||||
myst-parser==4.0.1
|
myst-parser==4.0.1
|
||||||
msgspec
|
msgspec
|
||||||
cloudpickle
|
|
||||||
commonmark # Required by sphinx-argparse when using :markdownhelp:
|
commonmark # Required by sphinx-argparse when using :markdownhelp:
|
||||||
|
|
||||||
# packages to install to build the documentation
|
# packages to install to build the documentation
|
||||||
cachetools
|
cachetools
|
||||||
pydantic >= 2.8
|
|
||||||
-f https://download.pytorch.org/whl/cpu
|
-f https://download.pytorch.org/whl/cpu
|
||||||
torch
|
torch
|
||||||
py-cpuinfo
|
|
||||||
transformers
|
|
||||||
mistral_common >= 1.5.4
|
|
||||||
aiohttp
|
|
||||||
starlette
|
|
||||||
scipy
|
|
||||||
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
|
||||||
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
|
||||||
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
|
||||||
requests
|
|
||||||
zmq
|
|
||||||
@ -112,11 +112,11 @@ class AudioTestAssets(list[AudioAsset]):
|
|||||||
|
|
||||||
|
|
||||||
IMAGE_ASSETS = ImageTestAssets()
|
IMAGE_ASSETS = ImageTestAssets()
|
||||||
"""Singleton instance of :class:`ImageTestAssets`."""
|
"""Singleton instance of {class}`ImageTestAssets`."""
|
||||||
VIDEO_ASSETS = VideoTestAssets()
|
VIDEO_ASSETS = VideoTestAssets()
|
||||||
"""Singleton instance of :class:`VideoTestAssets`."""
|
"""Singleton instance of {class}`VideoTestAssets`."""
|
||||||
AUDIO_ASSETS = AudioTestAssets()
|
AUDIO_ASSETS = AudioTestAssets()
|
||||||
"""Singleton instance of :class:`AudioTestAssets`."""
|
"""Singleton instance of {class}`AudioTestAssets`."""
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
@ -724,7 +724,7 @@ def hf_runner():
|
|||||||
class VllmRunner:
|
class VllmRunner:
|
||||||
"""
|
"""
|
||||||
The default value of some arguments have been modified from
|
The default value of some arguments have been modified from
|
||||||
:class:`~vllm.LLM` as follows:
|
{class}`~vllm.LLM` as follows:
|
||||||
|
|
||||||
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
|
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
|
||||||
- `seed`: Set to `0` instead of `None` for test reproducibility.
|
- `seed`: Set to `0` instead of `None` for test reproducibility.
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
This test file includes some cases where it is inappropriate to
|
This test file includes some cases where it is inappropriate to
|
||||||
only get the `eos_token_id` from the tokenizer as defined by
|
only get the `eos_token_id` from the tokenizer as defined by
|
||||||
:meth:`vllm.LLMEngine._get_eos_token_id`.
|
{meth}`vllm.LLMEngine._get_eos_token_id`.
|
||||||
"""
|
"""
|
||||||
from vllm.transformers_utils.config import try_get_generation_config
|
from vllm.transformers_utils.config import try_get_generation_config
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
|||||||
@ -952,7 +952,7 @@ def get_client_text_logprob_generations(
|
|||||||
completions: list[Completion]) -> list[TextTextLogprobs]:
|
completions: list[Completion]) -> list[TextTextLogprobs]:
|
||||||
'''Operates on the output of a request made to an Open-AI-protocol
|
'''Operates on the output of a request made to an Open-AI-protocol
|
||||||
completions endpoint; obtains top-rank logprobs for each token in
|
completions endpoint; obtains top-rank logprobs for each token in
|
||||||
each :class:`SequenceGroup`
|
each {class}`SequenceGroup`
|
||||||
'''
|
'''
|
||||||
text_generations = get_client_text_generations(completions)
|
text_generations = get_client_text_generations(completions)
|
||||||
text = ''.join(text_generations)
|
text = ''.join(text_generations)
|
||||||
|
|||||||
@ -44,7 +44,7 @@ def create_scheduler(
|
|||||||
(None)
|
(None)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
:class:`Scheduler` instance
|
{class}`Scheduler` instance
|
||||||
'''
|
'''
|
||||||
if max_model_len is None:
|
if max_model_len is None:
|
||||||
max_model_len = max_num_batched_tokens
|
max_model_len = max_num_batched_tokens
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
"""
|
"""
|
||||||
|
# MLA Common Components
|
||||||
|
|
||||||
This file implements common components for MLA implementations.
|
This file implements common components for MLA implementations.
|
||||||
|
|
||||||
First we define:
|
First we define:
|
||||||
|
|||||||
@ -550,7 +550,7 @@ def get_num_prefill_decode_query_kv_tokens(
|
|||||||
based on the attention metadata and the specified attention type.
|
based on the attention metadata and the specified attention type.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
attn_metadata (FlashAttentionMetadata): Attention Metadata object.
|
attn_metadata (AttentionMetadata): Attention Metadata object.
|
||||||
attn_type (AttentionType): The type of attention being used.
|
attn_type (AttentionType): The type of attention being used.
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[int, int, int]: A tuple containing three integers:
|
Tuple[int, int, int]: A tuple containing three integers:
|
||||||
|
|||||||
@ -39,7 +39,7 @@ class CompilerInterface:
|
|||||||
Gather all the relevant information from the vLLM config,
|
Gather all the relevant information from the vLLM config,
|
||||||
to compute a hash so that we can cache the compiled model.
|
to compute a hash so that we can cache the compiled model.
|
||||||
|
|
||||||
See :meth:`VllmConfig.compute_hash` to check what information
|
See {meth}`VllmConfig.compute_hash` to check what information
|
||||||
is already considered by default. This function should only
|
is already considered by default. This function should only
|
||||||
consider the information that is specific to the compiler.
|
consider the information that is specific to the compiler.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -1911,10 +1911,10 @@ class SchedulerConfig:
|
|||||||
|
|
||||||
cuda_graph_sizes: list[int] = field(default_factory=lambda: [512])
|
cuda_graph_sizes: list[int] = field(default_factory=lambda: [512])
|
||||||
"""Cuda graph capture sizes, default is 512.
|
"""Cuda graph capture sizes, default is 512.
|
||||||
1. if one value is provided, then the capture list would follow the pattern:
|
1. if one value is provided, then the capture list would follow the
|
||||||
[1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
|
pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
|
||||||
2. more than one value (e.g. 1 2 128) is provided,
|
2. more than one value (e.g. 1 2 128) is provided, then the capture list
|
||||||
then the capture list will follow the provided list."""
|
will follow the provided list."""
|
||||||
|
|
||||||
delay_factor: float = 0.0
|
delay_factor: float = 0.0
|
||||||
"""Apply a delay (of delay factor multiplied by previous
|
"""Apply a delay (of delay factor multiplied by previous
|
||||||
@ -2888,7 +2888,7 @@ class PoolerConfig:
|
|||||||
pooling_type: Optional[str] = None
|
pooling_type: Optional[str] = None
|
||||||
"""
|
"""
|
||||||
The pooling method of the pooling model. This should be a key in
|
The pooling method of the pooling model. This should be a key in
|
||||||
:class:`vllm.model_executor.layers.pooler.PoolingType`.
|
{class}`vllm.model_executor.layers.pooler.PoolingType`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
normalize: Optional[bool] = None
|
normalize: Optional[bool] = None
|
||||||
@ -2954,10 +2954,12 @@ def _get_and_verify_dtype(
|
|||||||
) -> torch.dtype:
|
) -> torch.dtype:
|
||||||
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
|
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
|
||||||
# because config.torch_dtype can be None.
|
# because config.torch_dtype can be None.
|
||||||
config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
|
config_dtype = getattr(config, "torch_dtype", None)
|
||||||
|
|
||||||
# Fallback for multi-modal models if the root config
|
# Fallbacks for multi-modal models if the root config
|
||||||
# does not define torch_dtype
|
# does not define torch_dtype
|
||||||
|
if config_dtype is None:
|
||||||
|
config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
|
||||||
if config_dtype is None and hasattr(config, "vision_config"):
|
if config_dtype is None and hasattr(config, "vision_config"):
|
||||||
config_dtype = getattr(config.vision_config, "torch_dtype", None)
|
config_dtype = getattr(config.vision_config, "torch_dtype", None)
|
||||||
|
|
||||||
|
|||||||
@ -167,4 +167,4 @@ class HTTPConnection:
|
|||||||
|
|
||||||
|
|
||||||
global_http_connection = HTTPConnection()
|
global_http_connection = HTTPConnection()
|
||||||
"""The global :class:`HTTPConnection` instance used by vLLM."""
|
"""The global {class}`HTTPConnection` instance used by vLLM."""
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
|
||||||
from vllm.distributed.kv_transfer.kv_transfer_state import (
|
from vllm.distributed.kv_transfer.kv_transfer_state import (
|
||||||
ensure_kv_transfer_initialized, get_kv_transfer_group,
|
ensure_kv_transfer_initialized, get_kv_transfer_group,
|
||||||
has_kv_transfer_group, is_v1_kv_transfer_group)
|
has_kv_transfer_group, is_v1_kv_transfer_group)
|
||||||
|
|||||||
@ -1237,6 +1237,12 @@ class EngineArgs:
|
|||||||
recommend_to_remove=False)
|
recommend_to_remove=False)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# No text embedding inputs so far.
|
||||||
|
if self.enable_prompt_embeds:
|
||||||
|
_raise_or_fallback(feature_name="--enable-prompt-embeds",
|
||||||
|
recommend_to_remove=False)
|
||||||
|
return False
|
||||||
|
|
||||||
# Only Fp16 and Bf16 dtypes since we only support FA.
|
# Only Fp16 and Bf16 dtypes since we only support FA.
|
||||||
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
|
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
|
||||||
if model_config.dtype not in V1_SUPPORTED_DTYPES:
|
if model_config.dtype not in V1_SUPPORTED_DTYPES:
|
||||||
|
|||||||
@ -475,7 +475,7 @@ class _AsyncLLMEngine(LLMEngine):
|
|||||||
*,
|
*,
|
||||||
inputs: Optional[PromptType] = None, # DEPRECATED
|
inputs: Optional[PromptType] = None, # DEPRECATED
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Async version of :meth:`add_request`."""
|
"""Async version of {meth}`add_request`."""
|
||||||
if inputs is not None:
|
if inputs is not None:
|
||||||
prompt = inputs
|
prompt = inputs
|
||||||
assert prompt is not None and params is not None
|
assert prompt is not None and params is not None
|
||||||
@ -582,20 +582,20 @@ async def build_guided_decoding_logits_processor_async(
|
|||||||
|
|
||||||
|
|
||||||
class AsyncLLMEngine(EngineClient):
|
class AsyncLLMEngine(EngineClient):
|
||||||
"""An asynchronous wrapper for :class:`LLMEngine`.
|
"""An asynchronous wrapper for {class}`LLMEngine`.
|
||||||
|
|
||||||
This class is used to wrap the :class:`LLMEngine` class to make it
|
This class is used to wrap the {class}`LLMEngine` class to make it
|
||||||
asynchronous. It uses asyncio to create a background loop that keeps
|
asynchronous. It uses asyncio to create a background loop that keeps
|
||||||
processing incoming requests. The :class:`LLMEngine` is kicked by the
|
processing incoming requests. The {class}`LLMEngine` is kicked by the
|
||||||
generate method when there are requests in the waiting queue. The generate
|
generate method when there are requests in the waiting queue. The generate
|
||||||
method yields the outputs from the :class:`LLMEngine` to the caller.
|
method yields the outputs from the {class}`LLMEngine` to the caller.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
log_requests: Whether to log the requests.
|
log_requests: Whether to log the requests.
|
||||||
start_engine_loop: If True, the background task to run the engine
|
start_engine_loop: If True, the background task to run the engine
|
||||||
will be automatically started in the generate call.
|
will be automatically started in the generate call.
|
||||||
*args: Arguments for :class:`LLMEngine`.
|
*args: Arguments for {class}`LLMEngine`.
|
||||||
**kwargs: Arguments for :class:`LLMEngine`.
|
**kwargs: Arguments for {class}`LLMEngine`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
|
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
|
||||||
@ -985,7 +985,7 @@ class AsyncLLMEngine(EngineClient):
|
|||||||
from the LLMEngine to the caller.
|
from the LLMEngine to the caller.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
|
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each input.
|
for more details about the format of each input.
|
||||||
sampling_params: The sampling parameters of the request.
|
sampling_params: The sampling parameters of the request.
|
||||||
request_id: The unique id of the request.
|
request_id: The unique id of the request.
|
||||||
@ -1003,7 +1003,7 @@ class AsyncLLMEngine(EngineClient):
|
|||||||
Details:
|
Details:
|
||||||
- If the engine is not running, start the background loop,
|
- If the engine is not running, start the background loop,
|
||||||
which iteratively invokes
|
which iteratively invokes
|
||||||
:meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
|
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
|
||||||
to process the waiting requests.
|
to process the waiting requests.
|
||||||
- Add the request to the engine's `RequestTracker`.
|
- Add the request to the engine's `RequestTracker`.
|
||||||
On the next background loop, this request will be sent to
|
On the next background loop, this request will be sent to
|
||||||
@ -1075,7 +1075,7 @@ class AsyncLLMEngine(EngineClient):
|
|||||||
from the LLMEngine to the caller.
|
from the LLMEngine to the caller.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
|
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each input.
|
for more details about the format of each input.
|
||||||
pooling_params: The pooling parameters of the request.
|
pooling_params: The pooling parameters of the request.
|
||||||
request_id: The unique id of the request.
|
request_id: The unique id of the request.
|
||||||
@ -1089,46 +1089,48 @@ class AsyncLLMEngine(EngineClient):
|
|||||||
for the request.
|
for the request.
|
||||||
|
|
||||||
Details:
|
Details:
|
||||||
- If the engine is not running, start the background loop,
|
- If the engine is not running, start the background loop,
|
||||||
which iteratively invokes
|
which iteratively invokes
|
||||||
:meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
|
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
|
||||||
to process the waiting requests.
|
to process the waiting requests.
|
||||||
- Add the request to the engine's `RequestTracker`.
|
- Add the request to the engine's `RequestTracker`.
|
||||||
On the next background loop, this request will be sent to
|
On the next background loop, this request will be sent to
|
||||||
the underlying engine.
|
the underlying engine.
|
||||||
Also, a corresponding `AsyncStream` will be created.
|
Also, a corresponding `AsyncStream` will be created.
|
||||||
- Wait for the request outputs from `AsyncStream` and yield them.
|
- Wait for the request outputs from `AsyncStream` and yield them.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
>>> # Please refer to entrypoints/api_server.py for
|
```
|
||||||
>>> # the complete example.
|
# Please refer to entrypoints/api_server.py for
|
||||||
>>>
|
# the complete example.
|
||||||
>>> # initialize the engine and the example input
|
|
||||||
>>> # note that engine_args here is AsyncEngineArgs instance
|
# initialize the engine and the example input
|
||||||
>>> engine = AsyncLLMEngine.from_engine_args(engine_args)
|
# note that engine_args here is AsyncEngineArgs instance
|
||||||
>>> example_input = {
|
engine = AsyncLLMEngine.from_engine_args(engine_args)
|
||||||
>>> "input": "What is LLM?",
|
example_input = {
|
||||||
>>> "request_id": 0,
|
"input": "What is LLM?",
|
||||||
>>> }
|
"request_id": 0,
|
||||||
>>>
|
}
|
||||||
>>> # start the generation
|
|
||||||
>>> results_generator = engine.encode(
|
# start the generation
|
||||||
>>> example_input["input"],
|
results_generator = engine.encode(
|
||||||
>>> PoolingParams(),
|
example_input["input"],
|
||||||
>>> example_input["request_id"])
|
PoolingParams(),
|
||||||
>>>
|
example_input["request_id"])
|
||||||
>>> # get the results
|
|
||||||
>>> final_output = None
|
# get the results
|
||||||
>>> async for request_output in results_generator:
|
final_output = None
|
||||||
>>> if await request.is_disconnected():
|
async for request_output in results_generator:
|
||||||
>>> # Abort the request if the client disconnects.
|
if await request.is_disconnected():
|
||||||
>>> await engine.abort(request_id)
|
# Abort the request if the client disconnects.
|
||||||
>>> # Return or raise an error
|
await engine.abort(request_id)
|
||||||
>>> ...
|
# Return or raise an error
|
||||||
>>> final_output = request_output
|
...
|
||||||
>>>
|
final_output = request_output
|
||||||
>>> # Process and return the final output
|
|
||||||
>>> ...
|
# Process and return the final output
|
||||||
|
...
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
async for output in await self.add_request(
|
async for output in await self.add_request(
|
||||||
|
|||||||
@ -130,11 +130,11 @@ class LLMEngine:
|
|||||||
iteration-level scheduling and efficient memory management to maximize the
|
iteration-level scheduling and efficient memory management to maximize the
|
||||||
serving throughput.
|
serving throughput.
|
||||||
|
|
||||||
The :class:`~vllm.LLM` class wraps this class for offline batched inference
|
The {class}`~vllm.LLM` class wraps this class for offline batched inference
|
||||||
and the :class:`AsyncLLMEngine` class wraps this class for online serving.
|
and the {class}`AsyncLLMEngine` class wraps this class for online serving.
|
||||||
|
|
||||||
The config arguments are derived from :class:`~vllm.EngineArgs`. (See
|
The config arguments are derived from {class}`~vllm.EngineArgs`. (See
|
||||||
:ref:`engine-args`)
|
{ref}`engine-args`)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model_config: The configuration related to the LLM model.
|
model_config: The configuration related to the LLM model.
|
||||||
@ -694,11 +694,11 @@ class LLMEngine:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
request_id: The unique ID of the request.
|
request_id: The unique ID of the request.
|
||||||
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
|
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each input.
|
for more details about the format of each input.
|
||||||
params: Parameters for sampling or pooling.
|
params: Parameters for sampling or pooling.
|
||||||
:class:`~vllm.SamplingParams` for text generation.
|
{class}`~vllm.SamplingParams` for text generation.
|
||||||
:class:`~vllm.PoolingParams` for pooling.
|
{class}`~vllm.PoolingParams` for pooling.
|
||||||
arrival_time: The arrival time of the request. If None, we use
|
arrival_time: The arrival time of the request. If None, we use
|
||||||
the current monotonic time.
|
the current monotonic time.
|
||||||
lora_request: The LoRA request to add.
|
lora_request: The LoRA request to add.
|
||||||
@ -710,10 +710,10 @@ class LLMEngine:
|
|||||||
Details:
|
Details:
|
||||||
- Set arrival_time to the current time if it is None.
|
- Set arrival_time to the current time if it is None.
|
||||||
- Set prompt_token_ids to the encoded prompt if it is None.
|
- Set prompt_token_ids to the encoded prompt if it is None.
|
||||||
- Create `n` number of :class:`~vllm.Sequence` objects.
|
- Create `n` number of {class}`~vllm.Sequence` objects.
|
||||||
- Create a :class:`~vllm.SequenceGroup` object
|
- Create a {class}`~vllm.SequenceGroup` object
|
||||||
from the list of :class:`~vllm.Sequence`.
|
from the list of {class}`~vllm.Sequence`.
|
||||||
- Add the :class:`~vllm.SequenceGroup` object to the scheduler.
|
- Add the {class}`~vllm.SequenceGroup` object to the scheduler.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
>>> # initialize engine
|
>>> # initialize engine
|
||||||
@ -861,8 +861,8 @@ class LLMEngine:
|
|||||||
|
|
||||||
Details:
|
Details:
|
||||||
- Refer to the
|
- Refer to the
|
||||||
:meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
|
{meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
|
||||||
from class :class:`~vllm.core.scheduler.Scheduler`.
|
from class {class}`~vllm.core.scheduler.Scheduler`.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
>>> # initialize engine and add a request with request_id
|
>>> # initialize engine and add a request with request_id
|
||||||
@ -1258,53 +1258,56 @@ class LLMEngine:
|
|||||||
def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
|
def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
|
||||||
"""Performs one decoding iteration and returns newly generated results.
|
"""Performs one decoding iteration and returns newly generated results.
|
||||||
|
|
||||||
.. figure:: https://i.imgur.com/sv2HssD.png
|
:::{figure} https://i.imgur.com/sv2HssD.png
|
||||||
:alt: Overview of the step function
|
:alt: Overview of the step function
|
||||||
:align: center
|
:align: center
|
||||||
|
|
||||||
Overview of the step function.
|
Overview of the step function.
|
||||||
|
:::
|
||||||
|
|
||||||
Details:
|
Details:
|
||||||
- Step 1: Schedules the sequences to be executed in the next
|
- Step 1: Schedules the sequences to be executed in the next
|
||||||
iteration and the token blocks to be swapped in/out/copy.
|
iteration and the token blocks to be swapped in/out/copy.
|
||||||
|
|
||||||
- Depending on the scheduling policy,
|
- Depending on the scheduling policy,
|
||||||
sequences may be `preempted/reordered`.
|
sequences may be `preempted/reordered`.
|
||||||
- A Sequence Group (SG) refers to a group of sequences
|
- A Sequence Group (SG) refers to a group of sequences
|
||||||
that are generated from the same prompt.
|
that are generated from the same prompt.
|
||||||
|
|
||||||
- Step 2: Calls the distributed executor to execute the model.
|
- Step 2: Calls the distributed executor to execute the model.
|
||||||
- Step 3: Processes the model output. This mainly includes:
|
- Step 3: Processes the model output. This mainly includes:
|
||||||
|
|
||||||
- Decodes the relevant outputs.
|
- Decodes the relevant outputs.
|
||||||
- Updates the scheduled sequence groups with model outputs
|
- Updates the scheduled sequence groups with model outputs
|
||||||
based on its `sampling parameters` (`use_beam_search` or not).
|
based on its `sampling parameters` (`use_beam_search` or not).
|
||||||
- Frees the finished sequence groups.
|
- Frees the finished sequence groups.
|
||||||
|
|
||||||
- Finally, it creates and returns the newly generated results.
|
- Finally, it creates and returns the newly generated results.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
>>> # Please see the example/ folder for more detailed examples.
|
```
|
||||||
>>>
|
# Please see the example/ folder for more detailed examples.
|
||||||
>>> # initialize engine and request arguments
|
|
||||||
>>> engine = LLMEngine.from_engine_args(engine_args)
|
# initialize engine and request arguments
|
||||||
>>> example_inputs = [(0, "What is LLM?",
|
engine = LLMEngine.from_engine_args(engine_args)
|
||||||
>>> SamplingParams(temperature=0.0))]
|
example_inputs = [(0, "What is LLM?",
|
||||||
>>>
|
SamplingParams(temperature=0.0))]
|
||||||
>>> # Start the engine with an event loop
|
|
||||||
>>> while True:
|
# Start the engine with an event loop
|
||||||
>>> if example_inputs:
|
while True:
|
||||||
>>> req_id, prompt, sampling_params = example_inputs.pop(0)
|
if example_inputs:
|
||||||
>>> engine.add_request(str(req_id),prompt,sampling_params)
|
req_id, prompt, sampling_params = example_inputs.pop(0)
|
||||||
>>>
|
engine.add_request(str(req_id),prompt,sampling_params)
|
||||||
>>> # continue the request processing
|
|
||||||
>>> request_outputs = engine.step()
|
# continue the request processing
|
||||||
>>> for request_output in request_outputs:
|
request_outputs = engine.step()
|
||||||
>>> if request_output.finished:
|
for request_output in request_outputs:
|
||||||
>>> # return or show the request output
|
if request_output.finished:
|
||||||
>>>
|
# return or show the request output
|
||||||
>>> if not (engine.has_unfinished_requests() or example_inputs):
|
|
||||||
>>> break
|
if not (engine.has_unfinished_requests() or example_inputs):
|
||||||
|
break
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
if self.parallel_config.pipeline_parallel_size > 1:
|
if self.parallel_config.pipeline_parallel_size > 1:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
|
|||||||
@ -491,7 +491,7 @@ class MQLLMEngineClient(EngineClient):
|
|||||||
from the LLMEngine to the caller.
|
from the LLMEngine to the caller.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
|
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each input.
|
for more details about the format of each input.
|
||||||
sampling_params: The sampling parameters of the request.
|
sampling_params: The sampling parameters of the request.
|
||||||
request_id: The unique id of the request.
|
request_id: The unique id of the request.
|
||||||
@ -560,7 +560,7 @@ class MQLLMEngineClient(EngineClient):
|
|||||||
from the LLMEngine to the caller.
|
from the LLMEngine to the caller.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
|
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each input.
|
for more details about the format of each input.
|
||||||
pooling_params: The pooling parameters of the request.
|
pooling_params: The pooling parameters of the request.
|
||||||
request_id: The unique id of the request.
|
request_id: The unique id of the request.
|
||||||
|
|||||||
@ -41,18 +41,18 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )
|
|||||||
|
|
||||||
|
|
||||||
class MQLLMEngine:
|
class MQLLMEngine:
|
||||||
"""A multiprocessing wrapper for :class:`LLMEngine`.
|
"""A multiprocessing wrapper for {class}`LLMEngine`.
|
||||||
|
|
||||||
This class is used to wrap the :class:`LLMEngine` class to enable use
|
This class is used to wrap the {class}`LLMEngine` class to enable use
|
||||||
in a concurrent manner. It runs a background loop and uses zeromq to
|
in a concurrent manner. It runs a background loop and uses zeromq to
|
||||||
receive new requests and stream outputs incrementally via ipc.
|
receive new requests and stream outputs incrementally via ipc.
|
||||||
|
|
||||||
The :class:`LLMEngine` generate or encode process is kicked off when a new
|
The {class}`LLMEngine` generate or encode process is kicked off when a new
|
||||||
RPCProcessRequest is received by the input_socket.
|
RPCProcessRequest is received by the input_socket.
|
||||||
|
|
||||||
The self.engine_loop checks the input_socket for new requests,
|
The self.engine_loop checks the input_socket for new requests,
|
||||||
adds them to the LLMEngine if there are any, calls the internal
|
adds them to the LLMEngine if there are any, calls the internal
|
||||||
:class:`LLMEngine.step()`, and sends the RequestOutputs back over
|
{meth}`LLMEngine.step()`, and sends the RequestOutputs back over
|
||||||
the output_socket.
|
the output_socket.
|
||||||
|
|
||||||
If use_async_sockets is set, the logic associated with reading new
|
If use_async_sockets is set, the logic associated with reading new
|
||||||
@ -64,8 +64,8 @@ class MQLLMEngine:
|
|||||||
ipc_path: Base path for zeromq interprocess messaging
|
ipc_path: Base path for zeromq interprocess messaging
|
||||||
use_async_sockets: Whether to make send/recv async with GPU
|
use_async_sockets: Whether to make send/recv async with GPU
|
||||||
log_requests: Whether to log the requests.
|
log_requests: Whether to log the requests.
|
||||||
*args: Arguments for :class:`LLMEngine`.
|
*args: Arguments for {class}`LLMEngine`.
|
||||||
**kwargs: Arguments for :class:`LLMEngine`.
|
**kwargs: Arguments for {class}`LLMEngine`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
|||||||
@ -56,8 +56,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
|
|||||||
scheduled computation.
|
scheduled computation.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
seq_group: the outputs are associated with this :class:`SequenceGroup`
|
seq_group: the outputs are associated with this {class}`SequenceGroup`
|
||||||
outputs: the :class:`SequenceGroupOutput`s for all scheduler steps
|
outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
|
||||||
"""
|
"""
|
||||||
for output in outputs:
|
for output in outputs:
|
||||||
# Concatenate single-step prompt logprob processing results.
|
# Concatenate single-step prompt logprob processing results.
|
||||||
|
|||||||
@ -19,7 +19,7 @@ logger = init_logger(__name__)
|
|||||||
def single_step_process_prompt_logprob(
|
def single_step_process_prompt_logprob(
|
||||||
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
|
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
|
||||||
output: CompletionSequenceGroupOutput) -> None:
|
output: CompletionSequenceGroupOutput) -> None:
|
||||||
"""Process prompt logprobs associated with the :class:`SequenceGroupOutput`
|
"""Process prompt logprobs associated with the {class}`SequenceGroupOutput`
|
||||||
for a given step.
|
for a given step.
|
||||||
|
|
||||||
Do nothing if the output has no prompt logprobs.
|
Do nothing if the output has no prompt logprobs.
|
||||||
@ -27,9 +27,9 @@ def single_step_process_prompt_logprob(
|
|||||||
Account for the fact that transformers do not compute first-token logprobs.
|
Account for the fact that transformers do not compute first-token logprobs.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sg_output_proc: :class:`SequenceGroupOutputProcessor` instance
|
sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
|
||||||
seq_group: the output is associated with this :class:`SequenceGroup`
|
seq_group: the output is associated with this {class}`SequenceGroup`
|
||||||
output: the :class:`SequenceGroupOutput` for a single scheduler step
|
output: the {class}`SequenceGroupOutput` for a single scheduler step
|
||||||
"""
|
"""
|
||||||
prompt_logprobs = output.prompt_logprobs
|
prompt_logprobs = output.prompt_logprobs
|
||||||
|
|
||||||
@ -103,8 +103,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
|
|||||||
scheduled computation.
|
scheduled computation.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
seq_group: the output is associated with this :class:`SequenceGroup`
|
seq_group: the output is associated with this {class}`SequenceGroup`
|
||||||
outputs: the :class:`SequenceGroupOutput` for a single scheduler step
|
outputs: the {class}`SequenceGroupOutput` for a single scheduler step
|
||||||
"""
|
"""
|
||||||
assert len(outputs) == 1, "Single step should only have 1 output."
|
assert len(outputs) == 1, "Single step should only have 1 output."
|
||||||
output = outputs[0]
|
output = outputs[0]
|
||||||
|
|||||||
@ -115,7 +115,7 @@ class LLM:
|
|||||||
to eager mode. Additionally for encoder-decoder models, if the
|
to eager mode. Additionally for encoder-decoder models, if the
|
||||||
sequence length of the encoder input is larger than this, we fall
|
sequence length of the encoder input is larger than this, we fall
|
||||||
back to the eager mode.
|
back to the eager mode.
|
||||||
disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
|
disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
|
||||||
disable_async_output_proc: Disable async output processing.
|
disable_async_output_proc: Disable async output processing.
|
||||||
This may result in lower performance.
|
This may result in lower performance.
|
||||||
hf_token: The token to use as HTTP bearer authorization for remote files
|
hf_token: The token to use as HTTP bearer authorization for remote files
|
||||||
@ -127,12 +127,13 @@ class LLM:
|
|||||||
compilation_config: Either an integer or a dictionary. If it is an
|
compilation_config: Either an integer or a dictionary. If it is an
|
||||||
integer, it is used as the level of compilation optimization. If it
|
integer, it is used as the level of compilation optimization. If it
|
||||||
is a dictionary, it can specify the full compilation configuration.
|
is a dictionary, it can specify the full compilation configuration.
|
||||||
**kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
|
**kwargs: Arguments for {class}`~vllm.EngineArgs`. (See
|
||||||
:ref:`engine-args`)
|
{ref}`engine-args`)
|
||||||
|
|
||||||
Note:
|
:::{note}
|
||||||
This class is intended to be used for offline inference. For online
|
This class is intended to be used for offline inference. For online
|
||||||
serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
|
serving, use the {class}`~vllm.AsyncLLMEngine` class instead.
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
|
|
||||||
DEPRECATE_LEGACY: ClassVar[bool] = True
|
DEPRECATE_LEGACY: ClassVar[bool] = True
|
||||||
@ -141,7 +142,7 @@ class LLM:
|
|||||||
DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
|
DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
|
||||||
"""
|
"""
|
||||||
A flag to toggle whether to deprecate positional arguments in
|
A flag to toggle whether to deprecate positional arguments in
|
||||||
:meth:`LLM.__init__`.
|
{meth}`LLM.__init__`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -398,7 +399,7 @@ class LLM:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each prompt.
|
for more details about the format of each prompt.
|
||||||
sampling_params: The sampling parameters for text generation. If
|
sampling_params: The sampling parameters for text generation. If
|
||||||
None, we use the default sampling parameters.
|
None, we use the default sampling parameters.
|
||||||
@ -413,13 +414,14 @@ class LLM:
|
|||||||
Only applicable when priority scheduling policy is enabled.
|
Only applicable when priority scheduling policy is enabled.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of ``RequestOutput`` objects containing the
|
A list of `RequestOutput` objects containing the
|
||||||
generated completions in the same order as the input prompts.
|
generated completions in the same order as the input prompts.
|
||||||
|
|
||||||
Note:
|
:::{note}
|
||||||
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
|
Using `prompts` and `prompt_token_ids` as keyword parameters is
|
||||||
considered legacy and may be deprecated in the future. You should
|
considered legacy and may be deprecated in the future. You should
|
||||||
instead pass them via the ``inputs`` parameter.
|
instead pass them via the `inputs` parameter.
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
runner_type = self.llm_engine.model_config.runner_type
|
runner_type = self.llm_engine.model_config.runner_type
|
||||||
if runner_type not in ["generate", "transcription"]:
|
if runner_type not in ["generate", "transcription"]:
|
||||||
@ -488,16 +490,17 @@ class LLM:
|
|||||||
`self` argument, in addition to the arguments passed in `args`
|
`self` argument, in addition to the arguments passed in `args`
|
||||||
and `kwargs`. The `self` argument will be the worker object.
|
and `kwargs`. The `self` argument will be the worker object.
|
||||||
timeout: Maximum time in seconds to wait for execution. Raises a
|
timeout: Maximum time in seconds to wait for execution. Raises a
|
||||||
:exc:`TimeoutError` on timeout. `None` means wait indefinitely.
|
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
|
||||||
args: Positional arguments to pass to the worker method.
|
args: Positional arguments to pass to the worker method.
|
||||||
kwargs: Keyword arguments to pass to the worker method.
|
kwargs: Keyword arguments to pass to the worker method.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list containing the results from each worker.
|
A list containing the results from each worker.
|
||||||
|
|
||||||
Note:
|
:::{note}
|
||||||
It is recommended to use this API to only pass control messages,
|
It is recommended to use this API to only pass control messages,
|
||||||
and set up data-plane communication to pass data.
|
and set up data-plane communication to pass data.
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
|
return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
|
||||||
@ -664,7 +667,7 @@ class LLM:
|
|||||||
Generate responses for a chat conversation.
|
Generate responses for a chat conversation.
|
||||||
|
|
||||||
The chat conversation is converted into a text prompt using the
|
The chat conversation is converted into a text prompt using the
|
||||||
tokenizer and calls the :meth:`generate` method to generate the
|
tokenizer and calls the {meth}`generate` method to generate the
|
||||||
responses.
|
responses.
|
||||||
|
|
||||||
Multi-modal inputs can be passed in the same way you would pass them
|
Multi-modal inputs can be passed in the same way you would pass them
|
||||||
@ -903,7 +906,7 @@ class LLM:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each prompt.
|
for more details about the format of each prompt.
|
||||||
pooling_params: The pooling parameters for pooling. If None, we
|
pooling_params: The pooling parameters for pooling. If None, we
|
||||||
use the default pooling parameters.
|
use the default pooling parameters.
|
||||||
@ -913,13 +916,14 @@ class LLM:
|
|||||||
generation, if any.
|
generation, if any.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of ``PoolingRequestOutput`` objects containing the
|
A list of `PoolingRequestOutput` objects containing the
|
||||||
pooled hidden states in the same order as the input prompts.
|
pooled hidden states in the same order as the input prompts.
|
||||||
|
|
||||||
Note:
|
:::{note}
|
||||||
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
|
Using `prompts` and `prompt_token_ids` as keyword parameters is
|
||||||
considered legacy and may be deprecated in the future. You should
|
considered legacy and may be deprecated in the future. You should
|
||||||
instead pass them via the ``inputs`` parameter.
|
instead pass them via the `inputs` parameter.
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
runner_type = self.llm_engine.model_config.runner_type
|
runner_type = self.llm_engine.model_config.runner_type
|
||||||
if runner_type != "pooling":
|
if runner_type != "pooling":
|
||||||
@ -992,7 +996,7 @@ class LLM:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each prompt.
|
for more details about the format of each prompt.
|
||||||
pooling_params: The pooling parameters for pooling. If None, we
|
pooling_params: The pooling parameters for pooling. If None, we
|
||||||
use the default pooling parameters.
|
use the default pooling parameters.
|
||||||
@ -1036,7 +1040,7 @@ class LLM:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||||
for more details about the format of each prompt.
|
for more details about the format of each prompt.
|
||||||
use_tqdm: Whether to use tqdm to display the progress bar.
|
use_tqdm: Whether to use tqdm to display the progress bar.
|
||||||
lora_request: LoRA request to use for generation, if any.
|
lora_request: LoRA request to use for generation, if any.
|
||||||
@ -1168,7 +1172,7 @@ class LLM:
|
|||||||
text_1: can be a single prompt or a list of prompts, in which
|
text_1: can be a single prompt or a list of prompts, in which
|
||||||
case it has to have the same length as the ``text_2`` list
|
case it has to have the same length as the ``text_2`` list
|
||||||
text_2: The texts to pair with the query to form the input
|
text_2: The texts to pair with the query to form the input
|
||||||
to the LLM. See :class:`~vllm.inputs.PromptType` for
|
to the LLM. See {class}`~vllm.inputs.PromptType` for
|
||||||
more details about the format of each prompt.
|
more details about the format of each prompt.
|
||||||
use_tqdm: Whether to use tqdm to display the progress bar.
|
use_tqdm: Whether to use tqdm to display the progress bar.
|
||||||
lora_request: LoRA request to use for generation, if any.
|
lora_request: LoRA request to use for generation, if any.
|
||||||
@ -1277,7 +1281,7 @@ class LLM:
|
|||||||
|
|
||||||
def wake_up(self, tags: Optional[list[str]] = None):
|
def wake_up(self, tags: Optional[list[str]] = None):
|
||||||
"""
|
"""
|
||||||
Wake up the engine from sleep mode. See the :meth:`sleep` method
|
Wake up the engine from sleep mode. See the {meth}`sleep` method
|
||||||
for more details.
|
for more details.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@ -5,7 +5,6 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from argparse import Namespace
|
|
||||||
from typing import Annotated, Any, ClassVar, Literal, Optional, Union
|
from typing import Annotated, Any, ClassVar, Literal, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@ -25,23 +24,7 @@ from vllm.utils import random_uuid, resolve_obj_by_qualname
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
# torch is mocked during docs generation,
|
_LONG_INFO = torch.iinfo(torch.long)
|
||||||
# so we have to provide the values as literals
|
|
||||||
_MOCK_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807)
|
|
||||||
_LONG_INFO: Union["torch.iinfo", Namespace]
|
|
||||||
|
|
||||||
try:
|
|
||||||
from sphinx.ext.autodoc.mock import _MockModule
|
|
||||||
|
|
||||||
if isinstance(torch, _MockModule):
|
|
||||||
_LONG_INFO = _MOCK_LONG_INFO
|
|
||||||
else:
|
|
||||||
_LONG_INFO = torch.iinfo(torch.long)
|
|
||||||
except ModuleNotFoundError:
|
|
||||||
_LONG_INFO = torch.iinfo(torch.long)
|
|
||||||
|
|
||||||
assert _LONG_INFO.min == _MOCK_LONG_INFO.min
|
|
||||||
assert _LONG_INFO.max == _MOCK_LONG_INFO.max
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAIBaseModel(BaseModel):
|
class OpenAIBaseModel(BaseModel):
|
||||||
|
|||||||
@ -275,7 +275,7 @@ class OpenAIServing:
|
|||||||
add_special_tokens: bool = True,
|
add_special_tokens: bool = True,
|
||||||
) -> TextTokensPrompt:
|
) -> TextTokensPrompt:
|
||||||
"""
|
"""
|
||||||
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
|
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
|
||||||
that assumes single input.
|
that assumes single input.
|
||||||
"""
|
"""
|
||||||
return next(
|
return next(
|
||||||
@ -296,7 +296,7 @@ class OpenAIServing:
|
|||||||
add_special_tokens: bool = True,
|
add_special_tokens: bool = True,
|
||||||
) -> Iterator[TextTokensPrompt]:
|
) -> Iterator[TextTokensPrompt]:
|
||||||
"""
|
"""
|
||||||
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
|
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
|
||||||
that assumes multiple inputs.
|
that assumes multiple inputs.
|
||||||
"""
|
"""
|
||||||
for text in prompt_inputs:
|
for text in prompt_inputs:
|
||||||
|
|||||||
@ -74,7 +74,7 @@ class ExecutorBase(ABC):
|
|||||||
`self` argument, in addition to the arguments passed in `args`
|
`self` argument, in addition to the arguments passed in `args`
|
||||||
and `kwargs`. The `self` argument will be the worker object.
|
and `kwargs`. The `self` argument will be the worker object.
|
||||||
timeout: Maximum time in seconds to wait for execution. Raises a
|
timeout: Maximum time in seconds to wait for execution. Raises a
|
||||||
:exc:`TimeoutError` on timeout. `None` means wait indefinitely.
|
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
|
||||||
args: Positional arguments to pass to the worker method.
|
args: Positional arguments to pass to the worker method.
|
||||||
kwargs: Keyword arguments to pass to the worker method.
|
kwargs: Keyword arguments to pass to the worker method.
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,7 @@ from .registry import (DummyData, InputContext, InputProcessingContext,
|
|||||||
|
|
||||||
INPUT_REGISTRY = InputRegistry()
|
INPUT_REGISTRY = InputRegistry()
|
||||||
"""
|
"""
|
||||||
The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
|
The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine`
|
||||||
to dispatch data processing according to the target model.
|
to dispatch data processing according to the target model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -80,22 +80,22 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
|
|||||||
"""
|
"""
|
||||||
Set of possible schemas for a single prompt:
|
Set of possible schemas for a single prompt:
|
||||||
|
|
||||||
- A text prompt (:class:`str` or :class:`TextPrompt`)
|
- A text prompt ({class}`str` or {class}`TextPrompt`)
|
||||||
- A tokenized prompt (:class:`TokensPrompt`)
|
- A tokenized prompt ({class}`TokensPrompt`)
|
||||||
- An embeddings prompt (:class:`EmbedsPrompt`)
|
- An embeddings prompt ({class}`EmbedsPrompt`)
|
||||||
|
|
||||||
Note that "singleton" is as opposed to a data structure
|
Note that "singleton" is as opposed to a data structure
|
||||||
which encapsulates multiple prompts, i.e. of the sort
|
which encapsulates multiple prompts, i.e. of the sort
|
||||||
which may be utilized for encoder/decoder models when
|
which may be utilized for encoder/decoder models when
|
||||||
the user desires to express both the encoder & decoder
|
the user desires to express both the encoder & decoder
|
||||||
prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt`
|
prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt`
|
||||||
|
|
||||||
A prompt of type :class:`SingletonPrompt` may be employed
|
A prompt of type {class}`SingletonPrompt` may be employed
|
||||||
as (1) input to a decoder-only model, (2) input to
|
as (1) input to a decoder-only model, (2) input to
|
||||||
the encoder of an encoder/decoder model, in the scenario
|
the encoder of an encoder/decoder model, in the scenario
|
||||||
where the decoder-prompt is not specified explicitly, or
|
where the decoder-prompt is not specified explicitly, or
|
||||||
(3) as a member of a larger data structure encapsulating
|
(3) as a member of a larger data structure encapsulating
|
||||||
more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt`
|
more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_T1_co = TypeVar("_T1_co",
|
_T1_co = TypeVar("_T1_co",
|
||||||
@ -115,18 +115,18 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
|
|||||||
comprising an explicit encoder prompt and a decoder prompt.
|
comprising an explicit encoder prompt and a decoder prompt.
|
||||||
|
|
||||||
The encoder and decoder prompts, respectively, may be formatted
|
The encoder and decoder prompts, respectively, may be formatted
|
||||||
according to any of the :class:`SingletonPrompt` schemas,
|
according to any of the {class}`SingletonPrompt` schemas,
|
||||||
and are not required to have the same schema.
|
and are not required to have the same schema.
|
||||||
|
|
||||||
Only the encoder prompt may have multi-modal data. mm_processor_kwargs
|
Only the encoder prompt may have multi-modal data. mm_processor_kwargs
|
||||||
should be at the top-level, and should not be set in the encoder/decoder
|
should be at the top-level, and should not be set in the encoder/decoder
|
||||||
prompts, since they are agnostic to the encoder/decoder.
|
prompts, since they are agnostic to the encoder/decoder.
|
||||||
|
|
||||||
Note that an :class:`ExplicitEncoderDecoderPrompt` may not
|
Note that an {class}`ExplicitEncoderDecoderPrompt` may not
|
||||||
be used as an input to a decoder-only model,
|
be used as an input to a decoder-only model,
|
||||||
and that the :code:`encoder_prompt` and :code:`decoder_prompt`
|
and that the `encoder_prompt` and `decoder_prompt`
|
||||||
fields of this data structure themselves must be
|
fields of this data structure themselves must be
|
||||||
:class:`SingletonPrompt` instances.
|
{class}`SingletonPrompt` instances.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
encoder_prompt: _T1_co
|
encoder_prompt: _T1_co
|
||||||
@ -141,11 +141,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
|
|||||||
Set of possible schemas for an LLM input, including
|
Set of possible schemas for an LLM input, including
|
||||||
both decoder-only and encoder/decoder input types:
|
both decoder-only and encoder/decoder input types:
|
||||||
|
|
||||||
- A text prompt (:class:`str` or :class:`TextPrompt`)
|
- A text prompt ({class}`str` or {class}`TextPrompt`)
|
||||||
- A tokenized prompt (:class:`TokensPrompt`)
|
- A tokenized prompt ({class}`TokensPrompt`)
|
||||||
- An embeddings prompt (:class:`EmbedsPrompt`)
|
- An embeddings prompt ({class}`EmbedsPrompt`)
|
||||||
- A single data structure containing both an encoder and a decoder prompt
|
- A single data structure containing both an encoder and a decoder prompt
|
||||||
(:class:`ExplicitEncoderDecoderPrompt`)
|
({class}`ExplicitEncoderDecoderPrompt`)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -178,7 +178,7 @@ def token_inputs(
|
|||||||
prompt: Optional[str] = None,
|
prompt: Optional[str] = None,
|
||||||
cache_salt: Optional[str] = None,
|
cache_salt: Optional[str] = None,
|
||||||
) -> TokenInputs:
|
) -> TokenInputs:
|
||||||
"""Construct :class:`TokenInputs` from optional values."""
|
"""Construct {class}`TokenInputs` from optional values."""
|
||||||
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
|
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
|
||||||
|
|
||||||
if prompt is not None:
|
if prompt is not None:
|
||||||
@ -221,7 +221,7 @@ def embeds_inputs(
|
|||||||
|
|
||||||
DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
|
DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
|
||||||
"""
|
"""
|
||||||
The inputs in :class:`~vllm.LLMEngine` before they are
|
The inputs in {class}`~vllm.LLMEngine` before they are
|
||||||
passed to the model executor.
|
passed to the model executor.
|
||||||
This specifies the data required for decoder-only models.
|
This specifies the data required for decoder-only models.
|
||||||
"""
|
"""
|
||||||
@ -229,7 +229,7 @@ This specifies the data required for decoder-only models.
|
|||||||
|
|
||||||
class EncoderDecoderInputs(TypedDict):
|
class EncoderDecoderInputs(TypedDict):
|
||||||
"""
|
"""
|
||||||
The inputs in :class:`~vllm.LLMEngine` before they are
|
The inputs in {class}`~vllm.LLMEngine` before they are
|
||||||
passed to the model executor.
|
passed to the model executor.
|
||||||
|
|
||||||
This specifies the required data for encoder-decoder models.
|
This specifies the required data for encoder-decoder models.
|
||||||
@ -243,13 +243,13 @@ class EncoderDecoderInputs(TypedDict):
|
|||||||
|
|
||||||
SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
|
SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
|
||||||
"""
|
"""
|
||||||
A processed :class:`SingletonPrompt` which can be passed to
|
A processed {class}`SingletonPrompt` which can be passed to
|
||||||
:class:`vllm.sequence.Sequence`.
|
{class}`vllm.sequence.Sequence`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
|
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
|
||||||
"""
|
"""
|
||||||
The inputs to :data:`vllm.inputs.InputProcessor`.
|
The inputs to {data}`vllm.inputs.InputProcessor`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
|
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
|
||||||
@ -277,7 +277,7 @@ def zip_enc_dec_prompts(
|
|||||||
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
|
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
|
||||||
"""
|
"""
|
||||||
Zip encoder and decoder prompts together into a list of
|
Zip encoder and decoder prompts together into a list of
|
||||||
:class:`ExplicitEncoderDecoderPrompt` instances.
|
{class}`ExplicitEncoderDecoderPrompt` instances.
|
||||||
|
|
||||||
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
|
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
|
||||||
dictionary will be used for every encoder/decoder prompt. If an iterable is
|
dictionary will be used for every encoder/decoder prompt. If an iterable is
|
||||||
|
|||||||
@ -6,7 +6,6 @@ from typing import Any, Optional, Union, cast
|
|||||||
|
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
from vllm import envs
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
@ -225,7 +224,7 @@ class InputPreprocessor:
|
|||||||
lora_request: Optional[LoRARequest],
|
lora_request: Optional[LoRARequest],
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
"""Async version of :meth:`_tokenize_prompt`."""
|
"""Async version of {meth}`_tokenize_prompt`."""
|
||||||
tokenizer = self.get_tokenizer_group()
|
tokenizer = self.get_tokenizer_group()
|
||||||
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
|
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
|
||||||
|
|
||||||
@ -288,7 +287,7 @@ class InputPreprocessor:
|
|||||||
lora_request: Optional[LoRARequest],
|
lora_request: Optional[LoRARequest],
|
||||||
return_mm_hashes: bool = False,
|
return_mm_hashes: bool = False,
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
"""Async version of :meth:`_process_multimodal`."""
|
"""Async version of {meth}`_process_multimodal`."""
|
||||||
tokenizer = await self._get_mm_tokenizer_async(lora_request)
|
tokenizer = await self._get_mm_tokenizer_async(lora_request)
|
||||||
|
|
||||||
mm_processor = self.mm_registry.create_processor(self.model_config,
|
mm_processor = self.mm_registry.create_processor(self.model_config,
|
||||||
@ -306,8 +305,6 @@ class InputPreprocessor:
|
|||||||
if not self.model_config.enable_prompt_embeds:
|
if not self.model_config.enable_prompt_embeds:
|
||||||
raise ValueError("You must set `--enable-prompt-embeds` to input "
|
raise ValueError("You must set `--enable-prompt-embeds` to input "
|
||||||
"`prompt_embeds`.")
|
"`prompt_embeds`.")
|
||||||
if envs.VLLM_USE_V1:
|
|
||||||
raise ValueError("`prompt_embeds` is only available in V0.")
|
|
||||||
|
|
||||||
prompt_embeds = parsed_content["prompt_embeds"]
|
prompt_embeds = parsed_content["prompt_embeds"]
|
||||||
|
|
||||||
@ -475,7 +472,7 @@ class InputPreprocessor:
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
* :class:`SingletonInputs` instance
|
* {class}`SingletonInputs` instance
|
||||||
"""
|
"""
|
||||||
parsed = parse_singleton_prompt(prompt)
|
parsed = parse_singleton_prompt(prompt)
|
||||||
|
|
||||||
@ -511,7 +508,7 @@ class InputPreprocessor:
|
|||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
return_mm_hashes: bool = False,
|
||||||
) -> SingletonInputs:
|
) -> SingletonInputs:
|
||||||
"""Async version of :meth:`_prompt_to_llm_inputs`."""
|
"""Async version of {meth}`_prompt_to_llm_inputs`."""
|
||||||
parsed = parse_singleton_prompt(prompt)
|
parsed = parse_singleton_prompt(prompt)
|
||||||
|
|
||||||
if parsed["type"] == "embeds":
|
if parsed["type"] == "embeds":
|
||||||
@ -647,7 +644,7 @@ class InputPreprocessor:
|
|||||||
) -> EncoderDecoderInputs:
|
) -> EncoderDecoderInputs:
|
||||||
"""
|
"""
|
||||||
For encoder/decoder models only:
|
For encoder/decoder models only:
|
||||||
Process an input prompt into an :class:`EncoderDecoderInputs` instance.
|
Process an input prompt into an {class}`EncoderDecoderInputs` instance.
|
||||||
|
|
||||||
There are two types of input prompts:
|
There are two types of input prompts:
|
||||||
singleton prompts which carry only the
|
singleton prompts which carry only the
|
||||||
@ -673,7 +670,7 @@ class InputPreprocessor:
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
* :class:`EncoderDecoderInputs` instance
|
* {class}`EncoderDecoderInputs` instance
|
||||||
"""
|
"""
|
||||||
encoder_inputs: SingletonInputs
|
encoder_inputs: SingletonInputs
|
||||||
decoder_inputs: Optional[SingletonInputs]
|
decoder_inputs: Optional[SingletonInputs]
|
||||||
@ -713,7 +710,7 @@ class InputPreprocessor:
|
|||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
) -> EncoderDecoderInputs:
|
) -> EncoderDecoderInputs:
|
||||||
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
|
"""Async version of {meth}`_process_encoder_decoder_prompt`."""
|
||||||
encoder_inputs: SingletonInputs
|
encoder_inputs: SingletonInputs
|
||||||
decoder_inputs: Optional[SingletonInputs]
|
decoder_inputs: Optional[SingletonInputs]
|
||||||
|
|
||||||
@ -781,7 +778,7 @@ class InputPreprocessor:
|
|||||||
) -> DecoderOnlyInputs:
|
) -> DecoderOnlyInputs:
|
||||||
"""
|
"""
|
||||||
For decoder-only models:
|
For decoder-only models:
|
||||||
Process an input prompt into a :class:`DecoderOnlyInputs` instance.
|
Process an input prompt into a {class}`DecoderOnlyInputs` instance.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
|
||||||
@ -792,7 +789,7 @@ class InputPreprocessor:
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
* :class:`DecoderOnlyInputs` instance
|
* {class}`DecoderOnlyInputs` instance
|
||||||
"""
|
"""
|
||||||
|
|
||||||
prompt_comps = self._prompt_to_llm_inputs(
|
prompt_comps = self._prompt_to_llm_inputs(
|
||||||
@ -815,7 +812,7 @@ class InputPreprocessor:
|
|||||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
return_mm_hashes: bool = False,
|
||||||
) -> DecoderOnlyInputs:
|
) -> DecoderOnlyInputs:
|
||||||
"""Async version of :meth:`_process_decoder_only_prompt`."""
|
"""Async version of {meth}`_process_decoder_only_prompt`."""
|
||||||
prompt_comps = await self._prompt_to_llm_inputs_async(
|
prompt_comps = await self._prompt_to_llm_inputs_async(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
@ -866,7 +863,7 @@ class InputPreprocessor:
|
|||||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
return_mm_hashes: bool = False,
|
||||||
) -> ProcessorInputs:
|
) -> ProcessorInputs:
|
||||||
"""Async version of :meth:`preprocess`."""
|
"""Async version of {meth}`preprocess`."""
|
||||||
if self.model_config.is_encoder_decoder:
|
if self.model_config.is_encoder_decoder:
|
||||||
assert not return_mm_hashes, (
|
assert not return_mm_hashes, (
|
||||||
"Multimodal hashes for encoder-decoder models should not be ",
|
"Multimodal hashes for encoder-decoder models should not be ",
|
||||||
|
|||||||
@ -38,7 +38,7 @@ class InputContext:
|
|||||||
) -> _C:
|
) -> _C:
|
||||||
"""
|
"""
|
||||||
Get the HuggingFace configuration
|
Get the HuggingFace configuration
|
||||||
(:class:`transformers.PretrainedConfig`) of the model,
|
({class}`transformers.PretrainedConfig`) of the model,
|
||||||
additionally checking its type.
|
additionally checking its type.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
@ -79,7 +79,7 @@ class InputContext:
|
|||||||
) -> _P:
|
) -> _P:
|
||||||
"""
|
"""
|
||||||
Get the HuggingFace processor
|
Get the HuggingFace processor
|
||||||
(:class:`transformers.ProcessorMixin`) of the model,
|
({class}`transformers.ProcessorMixin`) of the model,
|
||||||
additionally checking its type.
|
additionally checking its type.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
@ -135,8 +135,8 @@ class InputProcessingContext(InputContext):
|
|||||||
kwargs: Mapping[str, object] = {},
|
kwargs: Mapping[str, object] = {},
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
Call :code:`hf_processor` on the prompt :code:`data`
|
Call `hf_processor` on the prompt `data`
|
||||||
(text, image, audio...) with configurable options :code:`kwargs`.
|
(text, image, audio...) with configurable options `kwargs`.
|
||||||
"""
|
"""
|
||||||
assert callable(hf_processor)
|
assert callable(hf_processor)
|
||||||
|
|
||||||
|
|||||||
@ -68,21 +68,21 @@ class _VllmLogger(Logger):
|
|||||||
"""
|
"""
|
||||||
Note:
|
Note:
|
||||||
This class is just to provide type information.
|
This class is just to provide type information.
|
||||||
We actually patch the methods directly on the :class:`logging.Logger`
|
We actually patch the methods directly on the {class}`logging.Logger`
|
||||||
instance to avoid conflicting with other libraries such as
|
instance to avoid conflicting with other libraries such as
|
||||||
`intel_extension_for_pytorch.utils._logger`.
|
`intel_extension_for_pytorch.utils._logger`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def info_once(self, msg: str, *args: Hashable) -> None:
|
def info_once(self, msg: str, *args: Hashable) -> None:
|
||||||
"""
|
"""
|
||||||
As :meth:`info`, but subsequent calls with the same message
|
As {meth}`info`, but subsequent calls with the same message
|
||||||
are silently dropped.
|
are silently dropped.
|
||||||
"""
|
"""
|
||||||
_print_info_once(self, msg, *args)
|
_print_info_once(self, msg, *args)
|
||||||
|
|
||||||
def warning_once(self, msg: str, *args: Hashable) -> None:
|
def warning_once(self, msg: str, *args: Hashable) -> None:
|
||||||
"""
|
"""
|
||||||
As :meth:`warning`, but subsequent calls with the same message
|
As {meth}`warning`, but subsequent calls with the same message
|
||||||
are silently dropped.
|
are silently dropped.
|
||||||
"""
|
"""
|
||||||
_print_warning_once(self, msg, *args)
|
_print_warning_once(self, msg, *args)
|
||||||
|
|||||||
@ -1,8 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
from vllm.lora.ops.triton_ops.lora_expand import lora_expand
|
from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
|
||||||
from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
|
from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
|
||||||
from vllm.lora.ops.triton_ops.lora_shrink import lora_shrink
|
from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"lora_expand",
|
"lora_expand",
|
||||||
|
|||||||
@ -261,15 +261,16 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
|
|||||||
True, then a token can be accepted, else it should be
|
True, then a token can be accepted, else it should be
|
||||||
rejected.
|
rejected.
|
||||||
|
|
||||||
Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
|
Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
|
||||||
:math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according
|
{math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according
|
||||||
to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
|
to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
|
||||||
same conditional probability according to the draft model, the token
|
same conditional probability according to the draft model, the token
|
||||||
is accepted with probability:
|
is accepted with probability:
|
||||||
|
|
||||||
.. math::
|
:::{math}
|
||||||
\min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
|
\min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
|
||||||
{p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
|
{p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
|
||||||
|
:::
|
||||||
|
|
||||||
This implementation does not apply causality. When using the output,
|
This implementation does not apply causality. When using the output,
|
||||||
if a token is rejected, subsequent tokens should not be used.
|
if a token is rejected, subsequent tokens should not be used.
|
||||||
@ -312,18 +313,20 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
|
|||||||
target model is recovered (within hardware numerics).
|
target model is recovered (within hardware numerics).
|
||||||
|
|
||||||
The probability distribution used in this rejection case is constructed
|
The probability distribution used in this rejection case is constructed
|
||||||
as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of
|
as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of
|
||||||
:math:`x` given context :math:`x_1, \dots, x_n` according to the target
|
{math}`x` given context {math}`x_1, \dots, x_n` according to the target
|
||||||
model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability
|
model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability
|
||||||
according to the draft model:
|
according to the draft model:
|
||||||
|
|
||||||
.. math::
|
:::{math}
|
||||||
x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
|
x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
|
||||||
|
:::
|
||||||
|
|
||||||
where :math:`(f(x))_+` is defined as:
|
where {math}`(f(x))_+` is defined as:
|
||||||
|
|
||||||
.. math::
|
:::{math}
|
||||||
(f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
|
(f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
|
||||||
|
:::
|
||||||
|
|
||||||
See https://github.com/vllm-project/vllm/pull/2336 for a visualization
|
See https://github.com/vllm-project/vllm/pull/2336 for a visualization
|
||||||
of the draft, target, and recovered probability distributions.
|
of the draft, target, and recovered probability distributions.
|
||||||
|
|||||||
@ -235,7 +235,7 @@ class Sampler(nn.Module):
|
|||||||
* Defer Pythonization of sampling result & logprobs
|
* Defer Pythonization of sampling result & logprobs
|
||||||
tensor
|
tensor
|
||||||
* Encapsulate arguments required for deferred Pythonization
|
* Encapsulate arguments required for deferred Pythonization
|
||||||
in the :class:`SamplerOutput` structure
|
in the {class}`SamplerOutput` structure
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
logits: (num_tokens, vocab_size).
|
logits: (num_tokens, vocab_size).
|
||||||
|
|||||||
@ -107,14 +107,15 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler):
|
|||||||
A draft token_id x_{n+k} is accepted if it satisfies the
|
A draft token_id x_{n+k} is accepted if it satisfies the
|
||||||
following condition
|
following condition
|
||||||
|
|
||||||
.. math::
|
:::{math}
|
||||||
p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
|
p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
|
||||||
\min \left( \epsilon, \delta * \exp \left(
|
\min \left( \epsilon, \delta * \exp \left(
|
||||||
-H(p_{\text{original}}(
|
-H(p_{\text{original}}(
|
||||||
\cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
|
\cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
|
||||||
|
:::
|
||||||
|
|
||||||
where :math:`p_{\text{original}}` corresponds to target_probs
|
where {math}`p_{\text{original}}` corresponds to target_probs
|
||||||
and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters
|
and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters
|
||||||
specified using self._posterior_threshold and self._posterior_alpha
|
specified using self._posterior_threshold and self._posterior_alpha
|
||||||
|
|
||||||
This method computes the posterior probabilities for the given
|
This method computes the posterior probabilities for the given
|
||||||
|
|||||||
@ -681,8 +681,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
|||||||
batch.
|
batch.
|
||||||
pixel_values: The pixels in each input image.
|
pixel_values: The pixels in each input image.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
:class:`Blip2ImageInputs`
|
{class}`Blip2ImageInputs`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if intermediate_tensors is not None:
|
if intermediate_tensors is not None:
|
||||||
|
|||||||
@ -226,9 +226,9 @@ class SupportsPP(Protocol):
|
|||||||
intermediate_tensors: Optional["IntermediateTensors"],
|
intermediate_tensors: Optional["IntermediateTensors"],
|
||||||
) -> Union[Tensor, "IntermediateTensors"]:
|
) -> Union[Tensor, "IntermediateTensors"]:
|
||||||
"""
|
"""
|
||||||
Accept :class:`IntermediateTensors` when PP rank > 0.
|
Accept {class}`IntermediateTensors` when PP rank > 0.
|
||||||
|
|
||||||
Return :class:`IntermediateTensors` only for the last PP rank.
|
Return {class}`IntermediateTensors` only for the last PP rank.
|
||||||
"""
|
"""
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|||||||
@ -721,8 +721,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
|||||||
batch.
|
batch.
|
||||||
pixel_values: The pixels in each input image.
|
pixel_values: The pixels in each input image.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
:class:`LlavaImageInputs`
|
{class}`LlavaImageInputs`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
if intermediate_tensors is not None:
|
if intermediate_tensors is not None:
|
||||||
inputs_embeds = None
|
inputs_embeds = None
|
||||||
|
|||||||
@ -537,7 +537,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
|||||||
Unlike in LLaVA-1.5, the number of image tokens inputted to the language
|
Unlike in LLaVA-1.5, the number of image tokens inputted to the language
|
||||||
model depends on the original size of the input image. Including the
|
model depends on the original size of the input image. Including the
|
||||||
original image token in the input, the required number of image tokens
|
original image token in the input, the required number of image tokens
|
||||||
is given by :func:`get_llava_next_image_feature_size`.
|
is given by {func}`get_llava_next_image_feature_size`.
|
||||||
|
|
||||||
This way, the `positions` and `attn_metadata` are consistent
|
This way, the `positions` and `attn_metadata` are consistent
|
||||||
with the `input_ids`.
|
with the `input_ids`.
|
||||||
@ -548,8 +548,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
|||||||
pixel_values: The pixels in each grid patch for each input image.
|
pixel_values: The pixels in each grid patch for each input image.
|
||||||
image_sizes: The original `(height, width)` for each input image.
|
image_sizes: The original `(height, width)` for each input image.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
:class:`LlavaNextImageInputs`
|
{class}`LlavaNextImageInputs`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
if intermediate_tensors is not None:
|
if intermediate_tensors is not None:
|
||||||
inputs_embeds = None
|
inputs_embeds = None
|
||||||
|
|||||||
@ -559,8 +559,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
|
|||||||
batch.
|
batch.
|
||||||
pixel_values: The pixels in each input image.
|
pixel_values: The pixels in each input image.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
:class:`Mistral3ImagePixelInputs`
|
{class}`Mistral3ImagePixelInputs`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
if intermediate_tensors is not None:
|
if intermediate_tensors is not None:
|
||||||
inputs_embeds = None
|
inputs_embeds = None
|
||||||
|
|||||||
@ -965,7 +965,7 @@ def select_tiling(
|
|||||||
|
|
||||||
class MolmoProcessorWrapper:
|
class MolmoProcessorWrapper:
|
||||||
"""
|
"""
|
||||||
Wraps :class:`MolmoProcessor` so that it can be called directly.
|
Wraps {class}`MolmoProcessor` so that it can be called directly.
|
||||||
|
|
||||||
The original definition can be found here:
|
The original definition can be found here:
|
||||||
https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
|
https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
|
||||||
|
|||||||
@ -12,7 +12,7 @@ import torch.nn.functional as F
|
|||||||
from torch import Tensor, nn
|
from torch import Tensor, nn
|
||||||
|
|
||||||
|
|
||||||
class Block(nn.Module):
|
class BlockBase(nn.Module):
|
||||||
"""Block abstract module"""
|
"""Block abstract module"""
|
||||||
|
|
||||||
def __init__(self, input_size, output_size):
|
def __init__(self, input_size, output_size):
|
||||||
@ -1602,7 +1602,7 @@ class AttModule(nn.Module):
|
|||||||
return x, memory, pos_emb, att_mask
|
return x, memory, pos_emb, att_mask
|
||||||
|
|
||||||
|
|
||||||
class AttBlock(Block, AttModule):
|
class AttBlock(BlockBase, AttModule):
|
||||||
"""Attention Block module to support both Attention and Block module."""
|
"""Attention Block module to support both Attention and Block module."""
|
||||||
|
|
||||||
def memory_dims(self, max_len=False):
|
def memory_dims(self, max_len=False):
|
||||||
|
|||||||
@ -65,14 +65,14 @@ class PixtralImagePixelInputs(TypedDict):
|
|||||||
"""
|
"""
|
||||||
Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
|
Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
|
||||||
|
|
||||||
The result of stacking :attr:`ImageEncoding.tokens` from each prompt.
|
The result of stacking {attr}`ImageEncoding.tokens` from each prompt.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class PixtralProcessorAdapter:
|
class PixtralProcessorAdapter:
|
||||||
"""
|
"""
|
||||||
Provide a HF-compatible interface for
|
Provide a HF-compatible interface for
|
||||||
:class:`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
|
{class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, tokenizer: MistralTokenizer) -> None:
|
def __init__(self, tokenizer: MistralTokenizer) -> None:
|
||||||
|
|||||||
@ -383,7 +383,7 @@ def _get_tokenizer_without_image_pad(
|
|||||||
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
|
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
|
||||||
"""
|
"""
|
||||||
The logic of adding image pad tokens should only be applied in
|
The logic of adding image pad tokens should only be applied in
|
||||||
:class:`QwenVLProcessor`, so they are patched out here.
|
{class}`QwenVLProcessor`, so they are patched out here.
|
||||||
|
|
||||||
The definition of the wrapped tokenizer can be found here:
|
The definition of the wrapped tokenizer can be found here:
|
||||||
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
|
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
|
||||||
|
|||||||
@ -19,7 +19,6 @@ import cloudpickle
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils import is_in_doc_build
|
|
||||||
|
|
||||||
from .interfaces import (has_inner_state, has_noops, is_attention_free,
|
from .interfaces import (has_inner_state, has_noops, is_attention_free,
|
||||||
is_hybrid, supports_cross_encoding,
|
is_hybrid, supports_cross_encoding,
|
||||||
@ -375,13 +374,13 @@ class _ModelRegistry:
|
|||||||
"""
|
"""
|
||||||
Register an external model to be used in vLLM.
|
Register an external model to be used in vLLM.
|
||||||
|
|
||||||
:code:`model_cls` can be either:
|
`model_cls` can be either:
|
||||||
|
|
||||||
- A :class:`torch.nn.Module` class directly referencing the model.
|
- A {class}`torch.nn.Module` class directly referencing the model.
|
||||||
- A string in the format :code:`<module>:<class>` which can be used to
|
- A string in the format `<module>:<class>` which can be used to
|
||||||
lazily import the model. This is useful to avoid initializing CUDA
|
lazily import the model. This is useful to avoid initializing CUDA
|
||||||
when importing the model and thus the related error
|
when importing the model and thus the related error
|
||||||
:code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
|
`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
|
||||||
"""
|
"""
|
||||||
if not isinstance(model_arch, str):
|
if not isinstance(model_arch, str):
|
||||||
msg = f"`model_arch` should be a string, not a {type(model_arch)}"
|
msg = f"`model_arch` should be a string, not a {type(model_arch)}"
|
||||||
@ -400,8 +399,7 @@ class _ModelRegistry:
|
|||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
|
|
||||||
model = _LazyRegisteredModel(*split_str)
|
model = _LazyRegisteredModel(*split_str)
|
||||||
elif isinstance(model_cls, type) and (is_in_doc_build() or issubclass(
|
elif isinstance(model_cls, type) and issubclass(model_cls, nn.Module):
|
||||||
model_cls, nn.Module)):
|
|
||||||
model = _RegisteredModel.from_model_cls(model_cls)
|
model = _RegisteredModel.from_model_cls(model_cls)
|
||||||
else:
|
else:
|
||||||
msg = ("`model_cls` should be a string or PyTorch model class, "
|
msg = ("`model_cls` should be a string or PyTorch model class, "
|
||||||
|
|||||||
@ -66,7 +66,7 @@ class WeightsMapper:
|
|||||||
|
|
||||||
class AutoWeightsLoader:
|
class AutoWeightsLoader:
|
||||||
"""
|
"""
|
||||||
Helper class to load weights into a :class:`torch.nn.Module`. It is able
|
Helper class to load weights into a {class}`torch.nn.Module`. It is able
|
||||||
to automatically detect child modules and parameters while iterating over
|
to automatically detect child modules and parameters while iterating over
|
||||||
the weights only once.
|
the weights only once.
|
||||||
|
|
||||||
|
|||||||
@ -8,11 +8,12 @@ from .registry import MultiModalRegistry
|
|||||||
|
|
||||||
MULTIMODAL_REGISTRY = MultiModalRegistry()
|
MULTIMODAL_REGISTRY = MultiModalRegistry()
|
||||||
"""
|
"""
|
||||||
The global :class:`~MultiModalRegistry` is used by model runners to
|
The global {class}`~MultiModalRegistry` is used by model runners to
|
||||||
dispatch data processing according to the target model.
|
dispatch data processing according to the target model.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
:ref:`mm-processing`
|
{ref}`mm-processing`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
|||||||
@ -64,35 +64,35 @@ class MultiModalPlaceholderMap:
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
.. code-block::
|
```
|
||||||
|
Prompt: |AAAA BBBB What's in these images?|
|
||||||
|
Positions: |.................................|
|
||||||
|
|
||||||
Prompt: |AAAA BBBB What's in these images?|
|
images = [A, B]
|
||||||
Positions: |.................................|
|
src_ranges = [(0, 4), (4, 8)]
|
||||||
|
dest_ranges = [(0, 4), (5, 9)]
|
||||||
|
|
||||||
images = [A, B]
|
Prompt: |AAAA BBBB What's in these images?|
|
||||||
src_ranges = [(0, 4), (4, 8)]
|
Positions: | ..... |
|
||||||
dest_ranges = [(0, 4), (5, 9)]
|
|
||||||
|
|
||||||
Prompt: |AAAA BBBB What's in these images?|
|
images = [A, B]
|
||||||
Positions: | ..... |
|
src_ranges = [(2, 4), (4, 6)]
|
||||||
|
dest_ranges = [(0, 2), (3, 5)]
|
||||||
|
|
||||||
images = [A, B]
|
Prompt: |AAAA BBBB What's in these images?|
|
||||||
src_ranges = [(2, 4), (4, 6)]
|
Positions: | ......... |
|
||||||
dest_ranges = [(0, 2), (3, 5)]
|
|
||||||
|
|
||||||
Prompt: |AAAA BBBB What's in these images?|
|
images = [B]
|
||||||
Positions: | ......... |
|
src_ranges = [(0, 4)]
|
||||||
|
dest_ranges = [(0, 4)]
|
||||||
|
|
||||||
images = [B]
|
Prompt: |AAAA BBBB What's in these images?|
|
||||||
src_ranges = [(0, 4)]
|
Positions: | .......................|
|
||||||
dest_ranges = [(0, 4)]
|
|
||||||
|
|
||||||
Prompt: |AAAA BBBB What's in these images?|
|
images = []
|
||||||
Positions: | .......................|
|
src_ranges = []
|
||||||
|
dest_ranges = []
|
||||||
images = []
|
```
|
||||||
src_ranges = []
|
|
||||||
dest_ranges = []
|
|
||||||
"""
|
"""
|
||||||
seq_mm_data = seq_group.multi_modal_data
|
seq_mm_data = seq_group.multi_modal_data
|
||||||
seq_mm_placeholders = seq_group.multi_modal_placeholders
|
seq_mm_placeholders = seq_group.multi_modal_placeholders
|
||||||
|
|||||||
@ -26,27 +26,27 @@ _T = TypeVar("_T")
|
|||||||
|
|
||||||
HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
|
HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
|
||||||
"""
|
"""
|
||||||
A :class:`transformers.image_utils.ImageInput` representing a single image
|
A {class}`transformers.image_utils.ImageInput` representing a single image
|
||||||
item, which can be passed to a HuggingFace :code:`ImageProcessor`.
|
item, which can be passed to a HuggingFace `ImageProcessor`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
|
HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
|
||||||
list[np.ndarray], list[torch.Tensor]]
|
list[np.ndarray], list[torch.Tensor]]
|
||||||
"""
|
"""
|
||||||
A :class:`transformers.image_utils.VideoInput` representing a single video
|
A {class}`transformers.image_utils.VideoInput` representing a single video
|
||||||
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
|
item, which can be passed to a HuggingFace `VideoProcessor`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
|
HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
|
||||||
"""
|
"""
|
||||||
Represents a single audio
|
Represents a single audio
|
||||||
item, which can be passed to a HuggingFace :code:`AudioProcessor`.
|
item, which can be passed to a HuggingFace `AudioProcessor`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
|
ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
|
||||||
"""
|
"""
|
||||||
A :class:`transformers.image_utils.ImageInput` representing a single image
|
A {class}`transformers.image_utils.ImageInput` representing a single image
|
||||||
item, which can be passed to a HuggingFace :code:`ImageProcessor`.
|
item, which can be passed to a HuggingFace `ImageProcessor`.
|
||||||
|
|
||||||
Alternatively, a 3-D tensor or batch of 2-D tensors,
|
Alternatively, a 3-D tensor or batch of 2-D tensors,
|
||||||
which are treated as image embeddings;
|
which are treated as image embeddings;
|
||||||
@ -55,8 +55,8 @@ these are directly passed to the model without HF processing.
|
|||||||
|
|
||||||
VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
|
VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
|
||||||
"""
|
"""
|
||||||
A :class:`transformers.image_utils.VideoInput` representing a single video
|
A {class}`transformers.image_utils.VideoInput` representing a single video
|
||||||
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
|
item, which can be passed to a HuggingFace `VideoProcessor`.
|
||||||
|
|
||||||
Alternatively, a 3-D tensor or batch of 2-D tensors,
|
Alternatively, a 3-D tensor or batch of 2-D tensors,
|
||||||
which are treated as video embeddings;
|
which are treated as video embeddings;
|
||||||
@ -67,7 +67,7 @@ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
|
|||||||
torch.Tensor]
|
torch.Tensor]
|
||||||
"""
|
"""
|
||||||
Represents a single audio
|
Represents a single audio
|
||||||
item, which can be passed to a HuggingFace :code:`AudioProcessor`.
|
item, which can be passed to a HuggingFace `AudioProcessor`.
|
||||||
|
|
||||||
Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
|
Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
|
||||||
is different from that expected by the model;
|
is different from that expected by the model;
|
||||||
@ -83,7 +83,7 @@ ModalityData: TypeAlias = Union[_T, list[_T]]
|
|||||||
Either a single data item, or a list of data items.
|
Either a single data item, or a list of data items.
|
||||||
|
|
||||||
The number of data items allowed per modality is restricted by
|
The number of data items allowed per modality is restricted by
|
||||||
:code:`--limit-mm-per-prompt`.
|
`--limit-mm-per-prompt`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -105,7 +105,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
|
|||||||
"""
|
"""
|
||||||
A dictionary containing an entry for each modality type to input.
|
A dictionary containing an entry for each modality type to input.
|
||||||
|
|
||||||
The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
|
The built-in modalities are defined by {class}`MultiModalDataBuiltins`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -116,14 +116,14 @@ class PlaceholderRange:
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
Prompt: :code:`AAAA BBBB What is in these images?`
|
Prompt: `AAAA BBBB What is in these images?`
|
||||||
|
|
||||||
Images A and B will have:
|
Images A and B will have:
|
||||||
|
|
||||||
.. code-block::
|
```
|
||||||
|
A: PlaceholderRange(offset=0, length=4)
|
||||||
A: PlaceholderRange(offset=0, length=4)
|
B: PlaceholderRange(offset=5, length=4)
|
||||||
B: PlaceholderRange(offset=5, length=4)
|
```
|
||||||
"""
|
"""
|
||||||
|
|
||||||
offset: int
|
offset: int
|
||||||
@ -166,7 +166,7 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
|
|||||||
|
|
||||||
|
|
||||||
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
|
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
|
||||||
"""Equality check between :data:`NestedTensors` objects."""
|
"""Equality check between {data}`NestedTensors` objects."""
|
||||||
if isinstance(a, torch.Tensor):
|
if isinstance(a, torch.Tensor):
|
||||||
return isinstance(b, torch.Tensor) and torch.equal(a, b)
|
return isinstance(b, torch.Tensor) and torch.equal(a, b)
|
||||||
elif isinstance(b, torch.Tensor):
|
elif isinstance(b, torch.Tensor):
|
||||||
@ -186,7 +186,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
|
|||||||
BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
|
BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
|
||||||
"""
|
"""
|
||||||
A dictionary containing nested tensors which have been batched via
|
A dictionary containing nested tensors which have been batched via
|
||||||
:meth:`MultiModalKwargs.batch`.
|
{meth}`MultiModalKwargs.batch`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -194,7 +194,7 @@ A dictionary containing nested tensors which have been batched via
|
|||||||
class MultiModalFieldElem:
|
class MultiModalFieldElem:
|
||||||
"""
|
"""
|
||||||
Represents a keyword argument corresponding to a multi-modal item
|
Represents a keyword argument corresponding to a multi-modal item
|
||||||
in :class:`MultiModalKwargs`.
|
in {class}`MultiModalKwargs`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
modality: str
|
modality: str
|
||||||
@ -205,13 +205,13 @@ class MultiModalFieldElem:
|
|||||||
|
|
||||||
key: str
|
key: str
|
||||||
"""
|
"""
|
||||||
The key of this field in :class:`MultiModalKwargs`,
|
The key of this field in {class}`MultiModalKwargs`,
|
||||||
i.e. the name of the keyword argument to be passed to the model.
|
i.e. the name of the keyword argument to be passed to the model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
data: NestedTensors
|
data: NestedTensors
|
||||||
"""
|
"""
|
||||||
The tensor data of this field in :class:`MultiModalKwargs`,
|
The tensor data of this field in {class}`MultiModalKwargs`,
|
||||||
i.e. the value of the keyword argument to be passed to the model.
|
i.e. the value of the keyword argument to be passed to the model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -234,7 +234,7 @@ class MultiModalFieldElem:
|
|||||||
class BaseMultiModalField(ABC):
|
class BaseMultiModalField(ABC):
|
||||||
"""
|
"""
|
||||||
Defines how to interpret tensor data belonging to a keyword argument in
|
Defines how to interpret tensor data belonging to a keyword argument in
|
||||||
:class:`MultiModalKwargs` for multiple multi-modal items, and vice versa.
|
{class}`MultiModalKwargs` for multiple multi-modal items, and vice versa.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _field_factory(self, *, modality: str, key: str):
|
def _field_factory(self, *, modality: str, key: str):
|
||||||
@ -259,10 +259,10 @@ class BaseMultiModalField(ABC):
|
|||||||
data: NestedTensors,
|
data: NestedTensors,
|
||||||
) -> Sequence[MultiModalFieldElem]:
|
) -> Sequence[MultiModalFieldElem]:
|
||||||
"""
|
"""
|
||||||
Construct :class:`MultiModalFieldElem` instances to represent
|
Construct {class}`MultiModalFieldElem` instances to represent
|
||||||
the provided data.
|
the provided data.
|
||||||
|
|
||||||
This is the inverse of :meth:`reduce_data`.
|
This is the inverse of {meth}`reduce_data`.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@ -272,9 +272,9 @@ class BaseMultiModalField(ABC):
|
|||||||
|
|
||||||
def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
|
def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
|
||||||
"""
|
"""
|
||||||
Merge the data from multiple instances of :class:`MultiModalFieldElem`.
|
Merge the data from multiple instances of {class}`MultiModalFieldElem`.
|
||||||
|
|
||||||
This is the inverse of :meth:`build_elems`.
|
This is the inverse of {meth}`build_elems`.
|
||||||
"""
|
"""
|
||||||
field_types = [type(item.field) for item in elems]
|
field_types = [type(item.field) for item in elems]
|
||||||
if len(set(field_types)) > 1:
|
if len(set(field_types)) > 1:
|
||||||
@ -286,8 +286,9 @@ class BaseMultiModalField(ABC):
|
|||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class MultiModalBatchedField(BaseMultiModalField):
|
class MultiModalBatchedField(BaseMultiModalField):
|
||||||
"""
|
"""
|
||||||
See also:
|
:::{seealso}
|
||||||
:func:`MultiModalFieldConfig.batched`
|
{func}`MultiModalFieldConfig.batched`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def build_elems(
|
def build_elems(
|
||||||
@ -316,9 +317,10 @@ class MultiModalBatchedField(BaseMultiModalField):
|
|||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class MultiModalFlatField(BaseMultiModalField):
|
class MultiModalFlatField(BaseMultiModalField):
|
||||||
"""
|
"""
|
||||||
See also:
|
:::{seealso}
|
||||||
:func:`MultiModalFieldConfig.flat`
|
{func}`MultiModalFieldConfig.flat`
|
||||||
:func:`MultiModalFieldConfig.flat_from_sizes`
|
{func}`MultiModalFieldConfig.flat_from_sizes`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
|
slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
|
||||||
dim: int = 0
|
dim: int = 0
|
||||||
@ -358,8 +360,9 @@ class MultiModalFlatField(BaseMultiModalField):
|
|||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class MultiModalSharedField(BaseMultiModalField):
|
class MultiModalSharedField(BaseMultiModalField):
|
||||||
"""
|
"""
|
||||||
See also:
|
:::{seealso}
|
||||||
:func:`MultiModalFieldConfig.shared`
|
{func}`MultiModalFieldConfig.shared`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
batch_size: int
|
batch_size: int
|
||||||
|
|
||||||
@ -390,17 +393,17 @@ class MultiModalFieldConfig:
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
.. code-block::
|
```
|
||||||
|
Input:
|
||||||
|
Data: [[AAAA]
|
||||||
|
[BBBB]
|
||||||
|
[CCCC]]
|
||||||
|
|
||||||
Input:
|
Output:
|
||||||
Data: [[AAAA]
|
Element 1: [AAAA]
|
||||||
[BBBB]
|
Element 2: [BBBB]
|
||||||
[CCCC]]
|
Element 3: [CCCC]
|
||||||
|
```
|
||||||
Output:
|
|
||||||
Element 1: [AAAA]
|
|
||||||
Element 2: [BBBB]
|
|
||||||
Element 3: [CCCC]
|
|
||||||
"""
|
"""
|
||||||
return MultiModalFieldConfig(
|
return MultiModalFieldConfig(
|
||||||
field=MultiModalBatchedField(),
|
field=MultiModalBatchedField(),
|
||||||
@ -425,35 +428,35 @@ class MultiModalFieldConfig:
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
.. code-block::
|
```
|
||||||
|
Given:
|
||||||
Given:
|
slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
|
||||||
slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
|
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
Data: [AAABBBBCC]
|
Data: [AAABBBBCC]
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
Element 1: [AAA]
|
Element 1: [AAA]
|
||||||
Element 2: [BBBB]
|
Element 2: [BBBB]
|
||||||
Element 3: [CC]
|
Element 3: [CC]
|
||||||
|
```
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
Given:
|
```
|
||||||
slices: [
|
Given:
|
||||||
(slice(None), slice(0, 3)),
|
slices: [
|
||||||
(slice(None), slice(3, 7)),
|
(slice(None), slice(0, 3)),
|
||||||
(slice(None), slice(7, 9))]
|
(slice(None), slice(3, 7)),
|
||||||
dim: 1
|
(slice(None), slice(7, 9))]
|
||||||
|
dim: 1
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
Element 1: [[A],[A],[A]]
|
Element 1: [[A],[A],[A]]
|
||||||
Element 2: [[B],[B],[B],[B]]
|
Element 2: [[B],[B],[B],[B]]
|
||||||
Element 3: [[C],[C]]
|
Element 3: [[C],[C]]
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
return MultiModalFieldConfig(
|
return MultiModalFieldConfig(
|
||||||
field=MultiModalFlatField(slices=slices, dim=dim),
|
field=MultiModalFlatField(slices=slices, dim=dim),
|
||||||
@ -477,36 +480,36 @@ class MultiModalFieldConfig:
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
.. code-block::
|
```
|
||||||
|
Given:
|
||||||
Given:
|
size_per_item: [3, 4, 2]
|
||||||
size_per_item: [3, 4, 2]
|
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
Data: [AAABBBBCC]
|
Data: [AAABBBBCC]
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
Element 1: [AAA]
|
Element 1: [AAA]
|
||||||
Element 2: [BBBB]
|
Element 2: [BBBB]
|
||||||
Element 3: [CC]
|
Element 3: [CC]
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
.. code-block::
|
Given:
|
||||||
|
size_per_item: [3, 4, 2]
|
||||||
|
dim: 1
|
||||||
|
|
||||||
Given:
|
Input:
|
||||||
slices: [3, 4, 2]
|
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
||||||
dim: 1
|
|
||||||
|
|
||||||
Input:
|
Output:
|
||||||
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
|
Element 1: [[A],[A],[A]]
|
||||||
|
Element 2: [[B],[B],[B],[B]]
|
||||||
|
Element 3: [[C],[C]]
|
||||||
|
```
|
||||||
|
|
||||||
Output:
|
:::{seealso}
|
||||||
Element 1: [[A],[A],[A]]
|
{func}`MultiModalFieldConfig.flat`
|
||||||
Element 2: [[B],[B],[B],[B]]
|
:::
|
||||||
Element 3: [[C],[C]]
|
|
||||||
|
|
||||||
See also:
|
|
||||||
:func:`MultiModalFieldConfig.flat`
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if size_per_item.ndim != 1:
|
if size_per_item.ndim != 1:
|
||||||
@ -535,19 +538,19 @@ class MultiModalFieldConfig:
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
.. code-block::
|
```
|
||||||
|
Given:
|
||||||
Given:
|
batch_size: 4
|
||||||
batch_size: 4
|
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
Data: [XYZ]
|
Data: [XYZ]
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
Element 1: [XYZ]
|
Element 1: [XYZ]
|
||||||
Element 2: [XYZ]
|
Element 2: [XYZ]
|
||||||
Element 3: [XYZ]
|
Element 3: [XYZ]
|
||||||
Element 4: [XYZ]
|
Element 4: [XYZ]
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
return MultiModalFieldConfig(
|
return MultiModalFieldConfig(
|
||||||
field=MultiModalSharedField(batch_size),
|
field=MultiModalSharedField(batch_size),
|
||||||
@ -570,8 +573,8 @@ class MultiModalFieldConfig:
|
|||||||
|
|
||||||
class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
|
class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
|
||||||
"""
|
"""
|
||||||
A collection of :class:`MultiModalFieldElem`
|
A collection of {class}`MultiModalFieldElem`
|
||||||
corresponding to a data item in :class:`MultiModalDataItems`.
|
corresponding to a data item in {class}`MultiModalDataItems`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -590,11 +593,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
|
|||||||
class MultiModalKwargs(UserDict[str, NestedTensors]):
|
class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||||
"""
|
"""
|
||||||
A dictionary that represents the keyword arguments to
|
A dictionary that represents the keyword arguments to
|
||||||
:meth:`~torch.nn.Module.forward`.
|
{meth}`~torch.nn.Module.forward`.
|
||||||
|
|
||||||
The metadata :code:`items` enables us to obtain the keyword arguments
|
The metadata `items` enables us to obtain the keyword arguments
|
||||||
corresponding to each data item in :class:`MultiModalDataItems`, via
|
corresponding to each data item in {class}`MultiModalDataItems`, via
|
||||||
:meth:`get_item` and :meth:`get_items`.
|
{meth}`get_item` and {meth}`get_items`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -633,7 +636,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_items(items: Sequence[MultiModalKwargsItem]):
|
def from_items(items: Sequence[MultiModalKwargsItem]):
|
||||||
"""Construct a new :class:`MultiModalKwargs` from multiple items."""
|
"""Construct a new {class}`MultiModalKwargs` from multiple items."""
|
||||||
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
|
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
|
||||||
for item in items:
|
for item in items:
|
||||||
for key, elem in item.items():
|
for key, elem in item.items():
|
||||||
@ -798,7 +801,7 @@ A dictionary containing placeholder ranges for each modality.
|
|||||||
class MultiModalInputs(TypedDict):
|
class MultiModalInputs(TypedDict):
|
||||||
"""
|
"""
|
||||||
Represents the outputs of
|
Represents the outputs of
|
||||||
:class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
|
{class}`vllm.multimodal.processing.BaseMultiModalProcessor`,
|
||||||
ready to be passed to vLLM internals.
|
ready to be passed to vLLM internals.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict):
|
|||||||
mm_placeholders: MultiModalPlaceholderDict
|
mm_placeholders: MultiModalPlaceholderDict
|
||||||
"""
|
"""
|
||||||
For each modality, information about the placeholder tokens in
|
For each modality, information about the placeholder tokens in
|
||||||
:code:`prompt_token_ids`.
|
`prompt_token_ids`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cache_salt: NotRequired[str]
|
cache_salt: NotRequired[str]
|
||||||
@ -834,7 +837,7 @@ class MultiModalInputs(TypedDict):
|
|||||||
|
|
||||||
class MultiModalEncDecInputs(MultiModalInputs):
|
class MultiModalEncDecInputs(MultiModalInputs):
|
||||||
"""
|
"""
|
||||||
Represents the outputs of :class:`vllm.multimodal.EncDecMultiModalProcessor`
|
Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor`
|
||||||
ready to be passed to vLLM internals.
|
ready to be passed to vLLM internals.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -25,7 +25,7 @@ _I = TypeVar("_I")
|
|||||||
|
|
||||||
class ModalityDataItems(ABC, Generic[_T, _I]):
|
class ModalityDataItems(ABC, Generic[_T, _I]):
|
||||||
"""
|
"""
|
||||||
Represents data items for a modality in :class:`MultiModalDataItems`.
|
Represents data items for a modality in {class}`MultiModalDataItems`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, data: _T, modality: str) -> None:
|
def __init__(self, data: _T, modality: str) -> None:
|
||||||
@ -246,7 +246,7 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
|
|||||||
|
|
||||||
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
|
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
|
||||||
"""
|
"""
|
||||||
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
|
As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
|
||||||
such that each entry corresponds to a list.
|
such that each entry corresponds to a list.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -254,7 +254,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
|
|||||||
"""
|
"""
|
||||||
Get the number of data items belonging to a modality.
|
Get the number of data items belonging to a modality.
|
||||||
|
|
||||||
If `strict=False`, return `0` instead of raising :exc:`KeyError`
|
If `strict=False`, return `0` instead of raising {exc}`KeyError`
|
||||||
even if the modality is not found.
|
even if the modality is not found.
|
||||||
"""
|
"""
|
||||||
if modality not in self:
|
if modality not in self:
|
||||||
@ -300,8 +300,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
|
|||||||
|
|
||||||
class MultiModalDataParser:
|
class MultiModalDataParser:
|
||||||
"""
|
"""
|
||||||
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
|
Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into
|
||||||
:class:`MultiModalDataItems`.
|
{class}`MultiModalDataItems`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
target_sr (float, optional): Enables automatic resampling of audio
|
target_sr (float, optional): Enables automatic resampling of audio
|
||||||
|
|||||||
@ -111,13 +111,13 @@ class PromptUpdateDetails(Generic[_S]):
|
|||||||
|
|
||||||
is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
|
is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
|
||||||
"""
|
"""
|
||||||
Given :attr:`full`, return a boolean mask of shape `(len(full),)`
|
Given {attr}`full`, return a boolean mask of shape `(len(full),)`
|
||||||
indicating which positions of `full` to assign embeddings to.
|
indicating which positions of `full` to assign embeddings to.
|
||||||
|
|
||||||
`None` (default) means to assign embeddings to all positions of `full`.
|
`None` (default) means to assign embeddings to all positions of `full`.
|
||||||
|
|
||||||
The embeddings are obtained by calling
|
The embeddings are obtained by calling
|
||||||
:class:`SupportsMultiModal.get_multimodal_embeddings`.
|
{class}`SupportsMultiModal.get_multimodal_embeddings`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -156,13 +156,13 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
|
|||||||
The token sequence or text that are part of the update.
|
The token sequence or text that are part of the update.
|
||||||
|
|
||||||
If only part of the content corresponds to feature placeholders, you can
|
If only part of the content corresponds to feature placeholders, you can
|
||||||
use :class:`PromptUpdateDetails` to specify which part.
|
use {class}`PromptUpdateDetails` to specify which part.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
|
PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
|
||||||
PromptUpdateInfo]
|
PromptUpdateInfo]
|
||||||
"""
|
"""
|
||||||
Given the index of the processed item within :attr:`modality`,
|
Given the index of the processed item within {attr}`modality`,
|
||||||
output the corresponding token sequence (or text).
|
output the corresponding token sequence (or text).
|
||||||
|
|
||||||
For convenience, you can directly pass in the token sequence (or text)
|
For convenience, you can directly pass in the token sequence (or text)
|
||||||
@ -213,52 +213,52 @@ class PromptInsertion(PromptUpdate):
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
For each image, insert a number of ``<image>`` feature placeholders
|
For each image, insert a number of ``<image>`` feature placeholders
|
||||||
equal to the feature size of the vision encoder after the ``<s>`` token:
|
equal to the feature size of the vision encoder after the ``<s>`` token:
|
||||||
|
|
||||||
.. code-block:: python
|
```python
|
||||||
|
PromptInsertion(
|
||||||
|
modality="image",
|
||||||
|
target="<s>",
|
||||||
|
insertion="<image>" * image_feature_size,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
PromptInsertion(
|
Insert these tokens at the start of the prompt:
|
||||||
modality="image",
|
|
||||||
target="<s>",
|
|
||||||
insertion="<image>" * image_feature_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
Insert these tokens at the start of the prompt:
|
```python
|
||||||
|
PromptInsertion(
|
||||||
|
modality="image",
|
||||||
|
target=PromptIndexTargets.start(),
|
||||||
|
insertion="<image>" * image_feature_size,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
.. code-block:: python
|
Insert these tokens after a prefix ``Images:``:
|
||||||
|
|
||||||
PromptInsertion(
|
```python
|
||||||
modality="image",
|
PromptInsertion(
|
||||||
target=PromptIndexTargets.start(),
|
modality="image",
|
||||||
insertion="<image>" * image_feature_size,
|
target=PromptIndexTargets.prefix("Images:"),
|
||||||
)
|
insertion="<image>" * image_feature_size,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
Insert these tokens after a prefix ``Images:``:
|
Insert these tokens at the end of the prompt:
|
||||||
|
|
||||||
.. code-block:: python
|
```python
|
||||||
|
PromptInsertion(
|
||||||
PromptInsertion(
|
modality="image",
|
||||||
modality="image",
|
target=PromptIndexTargets.end(),
|
||||||
target=PromptIndexTargets.prefix("Images:"),
|
insertion="<image>" * image_feature_size,
|
||||||
insertion="<image>" * image_feature_size,
|
)
|
||||||
)
|
```
|
||||||
|
|
||||||
Insert these tokens at the end of the prompt:
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
PromptInsertion(
|
|
||||||
modality="image",
|
|
||||||
target=PromptIndexTargets.end(),
|
|
||||||
insertion="<image>" * image_feature_size,
|
|
||||||
)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
insertion: PromptUpdateContent = field(repr=False)
|
insertion: PromptUpdateContent = field(repr=False)
|
||||||
"""
|
"""
|
||||||
Given the index of the processed item within :attr:`modality`,
|
Given the index of the processed item within {attr}`modality`,
|
||||||
output the token sequence (or text) to insert right after :attr:`target`.
|
output the token sequence (or text) to insert right after {attr}`target`.
|
||||||
|
|
||||||
For convenience, you can directly pass in the token sequence (or text)
|
For convenience, you can directly pass in the token sequence (or text)
|
||||||
instead of a function if it does not depend on the input.
|
instead of a function if it does not depend on the input.
|
||||||
@ -280,57 +280,57 @@ class PromptReplacement(PromptUpdate):
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
For each image, replace one ``<image>`` input placeholder in the prompt
|
For each image, replace one ``<image>`` input placeholder in the prompt
|
||||||
with a number of ``<image>`` feature placeholders
|
with a number of ``<image>`` feature placeholders
|
||||||
equal to the feature size of the vision encoder:
|
equal to the feature size of the vision encoder:
|
||||||
|
|
||||||
.. code-block:: python
|
```python
|
||||||
|
PromptReplacement(
|
||||||
|
modality="image",
|
||||||
|
target="<image>",
|
||||||
|
replacement="<image>" * image_feature_size,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
PromptReplacement(
|
As above, but further pad the feature placeholders with ``<image_bos>``
|
||||||
modality="image",
|
and ``<image_eos>``, which are not supposed to be passed to the vision
|
||||||
target="<image>",
|
encoder:
|
||||||
replacement="<image>" * image_feature_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
As above, but further pad the feature placeholders with ``<image_bos>``
|
```python
|
||||||
and ``<image_eos>``, which are not supposed to be passed to the vision
|
PromptReplacement(
|
||||||
encoder:
|
modality="image",
|
||||||
|
target="<image>",
|
||||||
|
replacement=PromptUpdateDetails(
|
||||||
|
full="".join([
|
||||||
|
"<image_bos>",
|
||||||
|
"<image>" * image_feature_size,
|
||||||
|
"<image_eos>",
|
||||||
|
]),
|
||||||
|
features="<image>" * image_feature_size,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
.. code-block:: python
|
To avoid unnecessary tokenization during prompt replacement,
|
||||||
|
we recommend passing token sequences instead of text:
|
||||||
|
|
||||||
PromptReplacement(
|
```python
|
||||||
modality="image",
|
PromptReplacement(
|
||||||
target="<image>",
|
modality="image",
|
||||||
replacement=PromptUpdateDetails(
|
target=[image_token_id],
|
||||||
full="".join([
|
replacement=PromptUpdateDetails(
|
||||||
"<image_bos>",
|
full=([image_bos_id] + [image_token_id] * image_feature_size
|
||||||
"<image>" * image_feature_size,
|
+ [image_eos_id]),
|
||||||
"<image_eos>",
|
features=[image_token_id] * image_feature_size,
|
||||||
]),
|
),
|
||||||
features="<image>" * image_feature_size,
|
)
|
||||||
),
|
```
|
||||||
)
|
|
||||||
|
|
||||||
To avoid unnecessary tokenization during prompt replacement,
|
|
||||||
we recommend passing token sequences instead of text:
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
PromptReplacement(
|
|
||||||
modality="image",
|
|
||||||
target=[image_token_id],
|
|
||||||
replacement=PromptUpdateDetails(
|
|
||||||
full=([image_bos_id] + [image_token_id] * image_feature_size
|
|
||||||
+ [image_eos_id]),
|
|
||||||
features=[image_token_id] * image_feature_size,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
replacement: PromptUpdateContent = field(repr=False)
|
replacement: PromptUpdateContent = field(repr=False)
|
||||||
"""
|
"""
|
||||||
Given the index of the processed item within :attr:`modality`,
|
Given the index of the processed item within {attr}`modality`,
|
||||||
output the token sequence (or text) to replace :attr:`target`.
|
output the token sequence (or text) to replace {attr}`target`.
|
||||||
|
|
||||||
For convenience, you can directly pass in the token sequence (or text)
|
For convenience, you can directly pass in the token sequence (or text)
|
||||||
instead of a function if it does not depend on the input.
|
instead of a function if it does not depend on the input.
|
||||||
@ -384,14 +384,14 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
|
|||||||
|
|
||||||
|
|
||||||
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
|
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
|
||||||
"""Convenience function to apply :func:`full_groupby` based on modality."""
|
"""Convenience function to apply {func}`full_groupby` based on modality."""
|
||||||
return full_groupby(values, key=lambda x: x.modality)
|
return full_groupby(values, key=lambda x: x.modality)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class _BoundPromptSequence:
|
class _BoundPromptSequence:
|
||||||
"""
|
"""
|
||||||
A :data:`_PromptSeq` bound to a tokenizer to automatically
|
A {data}`_PromptSeq` bound to a tokenizer to automatically
|
||||||
convert between token sequence and text representations.
|
convert between token sequence and text representations.
|
||||||
"""
|
"""
|
||||||
tokenizer: AnyTokenizer = field(repr=False)
|
tokenizer: AnyTokenizer = field(repr=False)
|
||||||
@ -443,8 +443,8 @@ class _BoundPromptContent:
|
|||||||
@dataclass
|
@dataclass
|
||||||
class BoundPromptUpdate:
|
class BoundPromptUpdate:
|
||||||
"""
|
"""
|
||||||
A :class:`PromptUpdate` bound to a tokenizer to automatically convert
|
A {class}`PromptUpdate` bound to a tokenizer to automatically convert
|
||||||
:attr:`target` and the result of :meth:`get_content` between
|
{attr}`target` and the result of {meth}`get_content` between
|
||||||
token sequence and text representations.
|
token sequence and text representations.
|
||||||
"""
|
"""
|
||||||
_origin: PromptUpdate
|
_origin: PromptUpdate
|
||||||
@ -479,7 +479,7 @@ class BoundPromptUpdate:
|
|||||||
|
|
||||||
def get_content(self, item_idx: int) -> _BoundPromptContent:
|
def get_content(self, item_idx: int) -> _BoundPromptContent:
|
||||||
"""
|
"""
|
||||||
Given the index of the processed item within :attr:`modality`,
|
Given the index of the processed item within {attr}`modality`,
|
||||||
output the token sequence (or text) to update.
|
output the token sequence (or text) to update.
|
||||||
"""
|
"""
|
||||||
content = self.content
|
content = self.content
|
||||||
@ -516,7 +516,7 @@ def iter_token_matches(
|
|||||||
match_ids: list[int],
|
match_ids: list[int],
|
||||||
) -> Generator[_TokenMatch]:
|
) -> Generator[_TokenMatch]:
|
||||||
"""
|
"""
|
||||||
Yield each occurrence of :code:`match_ids` in :code:`token_ids`.
|
Yield each occurrence of `match_ids` in `token_ids`.
|
||||||
|
|
||||||
Note that empty matches are ignored.
|
Note that empty matches are ignored.
|
||||||
"""
|
"""
|
||||||
@ -545,8 +545,8 @@ def replace_token_matches(
|
|||||||
new_ids: list[int],
|
new_ids: list[int],
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
"""
|
"""
|
||||||
Replace each occurrence of :code:`match_ids` in :code:`token_ids`
|
Replace each occurrence of `match_ids` in `token_ids`
|
||||||
with :code:`new_ids`.
|
with `new_ids`.
|
||||||
|
|
||||||
Note that empty matches are ignored.
|
Note that empty matches are ignored.
|
||||||
"""
|
"""
|
||||||
@ -654,7 +654,7 @@ def find_token_matches(
|
|||||||
prompt: list[int],
|
prompt: list[int],
|
||||||
prompt_updates: Sequence[BoundPromptUpdate],
|
prompt_updates: Sequence[BoundPromptUpdate],
|
||||||
) -> Sequence[PromptTargetMatch]:
|
) -> Sequence[PromptTargetMatch]:
|
||||||
"""Return each target of :code:`prompt_updates` found in :code:`prompt`."""
|
"""Return each target of `prompt_updates` found in `prompt`."""
|
||||||
|
|
||||||
def get_matches(update: BoundPromptUpdate):
|
def get_matches(update: BoundPromptUpdate):
|
||||||
target = update.target
|
target = update.target
|
||||||
@ -680,7 +680,7 @@ def find_text_matches(
|
|||||||
prompt: str,
|
prompt: str,
|
||||||
prompt_updates: Sequence[BoundPromptUpdate],
|
prompt_updates: Sequence[BoundPromptUpdate],
|
||||||
) -> Sequence[PromptTargetMatch]:
|
) -> Sequence[PromptTargetMatch]:
|
||||||
"""Return each target of :code:`prompt_updates` found in :code:`prompt`."""
|
"""Return each target of `prompt_updates` found in `prompt`."""
|
||||||
|
|
||||||
def get_matches(update: BoundPromptUpdate):
|
def get_matches(update: BoundPromptUpdate):
|
||||||
target = update.target
|
target = update.target
|
||||||
@ -707,7 +707,7 @@ def _resolve_matches(
|
|||||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||||
) -> list[PromptTargetMatch]:
|
) -> list[PromptTargetMatch]:
|
||||||
"""
|
"""
|
||||||
Resolve :code:`mm_matches` to ensure that there are no overlapping matches,
|
Resolve `mm_matches` to ensure that there are no overlapping matches,
|
||||||
and sort them such that earlier matches take priority over later ones.
|
and sort them such that earlier matches take priority over later ones.
|
||||||
"""
|
"""
|
||||||
matches = [m for matches in mm_matches.values() for m in matches]
|
matches = [m for matches in mm_matches.values() for m in matches]
|
||||||
@ -731,7 +731,7 @@ def _apply_matches(
|
|||||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||||
mm_item_counts: Mapping[str, int],
|
mm_item_counts: Mapping[str, int],
|
||||||
) -> list[_S]:
|
) -> list[_S]:
|
||||||
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
|
"""Apply the updates in `mm_matches` to `prompt`."""
|
||||||
out_seqs = list[Union[str, list[int]]]()
|
out_seqs = list[Union[str, list[int]]]()
|
||||||
prev_end_idx = 0
|
prev_end_idx = 0
|
||||||
next_idx_by_modality = defaultdict[str, int](lambda: 0)
|
next_idx_by_modality = defaultdict[str, int](lambda: 0)
|
||||||
@ -780,7 +780,7 @@ def apply_token_matches(
|
|||||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||||
mm_item_counts: Mapping[str, int],
|
mm_item_counts: Mapping[str, int],
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
|
"""Apply the updates in `mm_matches` to `prompt`."""
|
||||||
if not mm_matches:
|
if not mm_matches:
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
@ -794,7 +794,7 @@ def apply_text_matches(
|
|||||||
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
|
||||||
mm_item_counts: Mapping[str, int],
|
mm_item_counts: Mapping[str, int],
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
|
"""Apply the updates in `mm_matches` to `prompt`."""
|
||||||
if not mm_matches:
|
if not mm_matches:
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
@ -809,7 +809,7 @@ def _iter_placeholders(
|
|||||||
mm_item_counts: Mapping[str, int],
|
mm_item_counts: Mapping[str, int],
|
||||||
) -> Iterable[PlaceholderFeaturesInfo]:
|
) -> Iterable[PlaceholderFeaturesInfo]:
|
||||||
"""
|
"""
|
||||||
Yield each set of placeholder tokens found in :code:`prompt`.
|
Yield each set of placeholder tokens found in `prompt`.
|
||||||
|
|
||||||
Matches are exclusive even when multiple modalities share
|
Matches are exclusive even when multiple modalities share
|
||||||
the same placeholder tokens. In that case, the modality that
|
the same placeholder tokens. In that case, the modality that
|
||||||
@ -1016,7 +1016,7 @@ class ProcessingCache:
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Put a processed multi-modal item into the cache
|
Put a processed multi-modal item into the cache
|
||||||
according to its dependencies (see :meth:`get`).
|
according to its dependencies (see {meth}`get`).
|
||||||
"""
|
"""
|
||||||
cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
|
cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
|
||||||
**{modality: input_item},
|
**{modality: input_item},
|
||||||
@ -1083,7 +1083,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)
|
|||||||
|
|
||||||
MultiModalHashes = dict[str, list[str]]
|
MultiModalHashes = dict[str, list[str]]
|
||||||
"""
|
"""
|
||||||
A collection of hashes with a similar structure as :class:`MultiModalKwargs`.
|
A collection of hashes with a similar structure as {class}`MultiModalKwargs`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -1091,7 +1091,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
"""
|
"""
|
||||||
Abstract base class to process multi-modal inputs to be used in vLLM.
|
Abstract base class to process multi-modal inputs to be used in vLLM.
|
||||||
|
|
||||||
Not to be confused with :class:`transformers.ProcessorMixin`.
|
Not to be confused with {class}`transformers.ProcessorMixin`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
@ -1118,10 +1118,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
def _get_data_parser(self) -> MultiModalDataParser:
|
def _get_data_parser(self) -> MultiModalDataParser:
|
||||||
"""
|
"""
|
||||||
Construct a parser to preprocess multi-modal data items
|
Construct a parser to preprocess multi-modal data items
|
||||||
before passing them to :meth:`_get_hf_mm_data`.
|
before passing them to {meth}`_get_hf_mm_data`.
|
||||||
|
|
||||||
You can support additional modalities by creating a subclass
|
You can support additional modalities by creating a subclass
|
||||||
of :class:`MultiModalDataParser` that has additional subparsers.
|
of {class}`MultiModalDataParser` that has additional subparsers.
|
||||||
"""
|
"""
|
||||||
return MultiModalDataParser()
|
return MultiModalDataParser()
|
||||||
|
|
||||||
@ -1130,8 +1130,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
) -> MultiModalDataItems:
|
) -> MultiModalDataItems:
|
||||||
"""
|
"""
|
||||||
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
|
Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems`
|
||||||
before passing them to :meth:`_get_hf_mm_data`.
|
before passing them to {meth}`_get_hf_mm_data`.
|
||||||
"""
|
"""
|
||||||
mm_items = self.data_parser.parse_mm_data(mm_data)
|
mm_items = self.data_parser.parse_mm_data(mm_data)
|
||||||
supported_mm_limits = self.info.get_supported_mm_limits()
|
supported_mm_limits = self.info.get_supported_mm_limits()
|
||||||
@ -1183,7 +1183,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
inputs.
|
inputs.
|
||||||
|
|
||||||
Moreover, this information is critical to determine the token positions
|
Moreover, this information is critical to determine the token positions
|
||||||
in order to construct :class:`~vllm.multimodal.inputs.PlaceholderRange`
|
in order to construct {class}`~vllm.multimodal.inputs.PlaceholderRange`
|
||||||
for each multi-modal item.
|
for each multi-modal item.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
@ -1237,8 +1237,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
"""
|
"""
|
||||||
Return whether the HF processor applies prompt updates.
|
Return whether the HF processor applies prompt updates.
|
||||||
|
|
||||||
For most HF processors, this should be :code:`True` when multi-modal
|
For most HF processors, this should be `True` when multi-modal
|
||||||
data items are passed, but :code:`False` when multi-modal embeddings
|
data items are passed, but `False` when multi-modal embeddings
|
||||||
are passed.
|
are passed.
|
||||||
"""
|
"""
|
||||||
return not any(
|
return not any(
|
||||||
@ -1307,7 +1307,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
Most HF processors accept prompt text but not prompt tokens.
|
Most HF processors accept prompt text but not prompt tokens.
|
||||||
If the HF processor adds or removes tokens that are not related to
|
If the HF processor adds or removes tokens that are not related to
|
||||||
multi-modal data, you should override this method so it is consistent
|
multi-modal data, you should override this method so it is consistent
|
||||||
with the output of :meth:`_apply_hf_processor_text_only` on the
|
with the output of {meth}`_apply_hf_processor_text_only` on the
|
||||||
corresponding text.
|
corresponding text.
|
||||||
"""
|
"""
|
||||||
return prompt_tokens
|
return prompt_tokens
|
||||||
@ -1322,7 +1322,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
|
|
||||||
Since HF processor requires that text and multi-modal items
|
Since HF processor requires that text and multi-modal items
|
||||||
correspond to each other, we generate dummy text using
|
correspond to each other, we generate dummy text using
|
||||||
:class:`DummyInputsBuilder` to go along with the multi-modal data.
|
{class}`DummyInputsBuilder` to go along with the multi-modal data.
|
||||||
"""
|
"""
|
||||||
mm_counts = mm_items.get_all_counts()
|
mm_counts = mm_items.get_all_counts()
|
||||||
|
|
||||||
@ -1346,10 +1346,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
Apply the HF processor on the prompt text and multi-modal data.
|
Apply the HF processor on the prompt text and multi-modal data.
|
||||||
|
|
||||||
In addition, return whether prompt updates have been applied
|
In addition, return whether prompt updates have been applied
|
||||||
(for most HF processors, this should be :code:`True`).
|
(for most HF processors, this should be `True`).
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
If :code:`enable_hf_prompt_update=False`, we use HF processor
|
If `enable_hf_prompt_update=False`, we use HF processor
|
||||||
to perform prompt updates if available; HF processor requires
|
to perform prompt updates if available; HF processor requires
|
||||||
that the prompt corresponds to multi-modal items.
|
that the prompt corresponds to multi-modal items.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -25,7 +25,7 @@ logger = init_logger(__name__)
|
|||||||
class ProcessorInputs:
|
class ProcessorInputs:
|
||||||
"""
|
"""
|
||||||
Represents the keyword arguments to
|
Represents the keyword arguments to
|
||||||
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
|
{meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
|
||||||
"""
|
"""
|
||||||
prompt_text: str
|
prompt_text: str
|
||||||
mm_data: MultiModalDataDict
|
mm_data: MultiModalDataDict
|
||||||
@ -63,7 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
|
|||||||
# TODO: @abstractmethod after transition
|
# TODO: @abstractmethod after transition
|
||||||
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
"""
|
"""
|
||||||
Build the text input corresponding to :code:`mm_counts`.
|
Build the text input corresponding to `mm_counts`.
|
||||||
"""
|
"""
|
||||||
if (type(self).get_dummy_processor_inputs ==
|
if (type(self).get_dummy_processor_inputs ==
|
||||||
BaseDummyInputsBuilder.get_dummy_processor_inputs):
|
BaseDummyInputsBuilder.get_dummy_processor_inputs):
|
||||||
|
|||||||
@ -29,7 +29,7 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
|
|||||||
|
|
||||||
|
|
||||||
class ProcessingInfoFactory(Protocol[_I_co]):
|
class ProcessingInfoFactory(Protocol[_I_co]):
|
||||||
"""Constructs a :class:`MultiModalProcessor` instance from the context."""
|
"""Constructs a {class}`MultiModalProcessor` instance from the context."""
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
@ -40,7 +40,7 @@ class ProcessingInfoFactory(Protocol[_I_co]):
|
|||||||
|
|
||||||
class DummyInputsBuilderFactory(Protocol[_I]):
|
class DummyInputsBuilderFactory(Protocol[_I]):
|
||||||
"""
|
"""
|
||||||
Constructs a :class:`BaseDummyInputsBuilder` instance from the context.
|
Constructs a {class}`BaseDummyInputsBuilder` instance from the context.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
|
def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
|
||||||
@ -48,7 +48,7 @@ class DummyInputsBuilderFactory(Protocol[_I]):
|
|||||||
|
|
||||||
|
|
||||||
class MultiModalProcessorFactory(Protocol[_I]):
|
class MultiModalProcessorFactory(Protocol[_I]):
|
||||||
"""Constructs a :class:`MultiModalProcessor` instance from the context."""
|
"""Constructs a {class}`MultiModalProcessor` instance from the context."""
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
@ -150,7 +150,7 @@ class MultiModalRegistry:
|
|||||||
Get the maximum number of tokens from each modality
|
Get the maximum number of tokens from each modality
|
||||||
for profiling the memory usage of a model.
|
for profiling the memory usage of a model.
|
||||||
|
|
||||||
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
|
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
|
||||||
"""
|
"""
|
||||||
mm_limits = self.get_mm_limits_per_prompt(model_config)
|
mm_limits = self.get_mm_limits_per_prompt(model_config)
|
||||||
|
|
||||||
@ -165,7 +165,7 @@ class MultiModalRegistry:
|
|||||||
Get the maximum number of multi-modal tokens
|
Get the maximum number of multi-modal tokens
|
||||||
for profiling the memory usage of a model.
|
for profiling the memory usage of a model.
|
||||||
|
|
||||||
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
|
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
|
||||||
"""
|
"""
|
||||||
return sum(self.get_max_tokens_by_modality(model_config).values())
|
return sum(self.get_max_tokens_by_modality(model_config).values())
|
||||||
|
|
||||||
@ -208,8 +208,9 @@ class MultiModalRegistry:
|
|||||||
When the model receives multi-modal data, the provided function is
|
When the model receives multi-modal data, the provided function is
|
||||||
invoked to transform the data into a dictionary of model inputs.
|
invoked to transform the data into a dictionary of model inputs.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
:ref:`mm-processing`
|
{ref}`mm-processing`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def wrapper(model_cls: N) -> N:
|
def wrapper(model_cls: N) -> N:
|
||||||
@ -253,8 +254,9 @@ class MultiModalRegistry:
|
|||||||
"""
|
"""
|
||||||
Create a multi-modal processor for a specific model and tokenizer.
|
Create a multi-modal processor for a specific model and tokenizer.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
:ref:`mm-processing`
|
{ref}`mm-processing`
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
raise ValueError(f"{model_config.model} is not a multimodal model")
|
raise ValueError(f"{model_config.model} is not a multimodal model")
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Optional, TypeVar, Union
|
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
|
||||||
from urllib.parse import ParseResult, urlparse
|
from urllib.parse import ParseResult, urlparse
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -24,6 +24,10 @@ _M = TypeVar("_M")
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .hasher import MultiModalHashDict
|
from .hasher import MultiModalHashDict
|
||||||
from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
|
from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
|
||||||
|
else:
|
||||||
|
MultiModalHashDict = Any
|
||||||
|
MultiModalKwargs = Any
|
||||||
|
MultiModalPlaceholderDict = Any
|
||||||
|
|
||||||
|
|
||||||
class MediaConnector:
|
class MediaConnector:
|
||||||
@ -255,7 +259,7 @@ class MediaConnector:
|
|||||||
|
|
||||||
|
|
||||||
global_media_connector = MediaConnector()
|
global_media_connector = MediaConnector()
|
||||||
"""The global :class:`MediaConnector` instance used by vLLM."""
|
"""The global {class}`MediaConnector` instance used by vLLM."""
|
||||||
|
|
||||||
fetch_audio = global_media_connector.fetch_audio
|
fetch_audio = global_media_connector.fetch_audio
|
||||||
fetch_image = global_media_connector.fetch_image
|
fetch_image = global_media_connector.fetch_image
|
||||||
@ -293,24 +297,24 @@ def encode_video_base64(frames: npt.NDArray) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def merge_and_sort_multimodal_metadata(
|
def merge_and_sort_multimodal_metadata(
|
||||||
mm_positions: "MultiModalPlaceholderDict",
|
mm_positions: MultiModalPlaceholderDict,
|
||||||
mm_hashes: Optional["MultiModalHashDict"],
|
mm_hashes: Optional[MultiModalHashDict],
|
||||||
) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
|
) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
|
||||||
"""Given a MultiModalPlaceholderDict, merge all PlaceholderRange
|
"""Given a MultiModalPlaceholderDict, merge all PlaceholderRange
|
||||||
objects from all available modalities into a single list of
|
objects from all available modalities into a single list of
|
||||||
PlaceholderRange, sorted by their offset (starting index in the input
|
PlaceholderRange, sorted by their offset (starting index in the input
|
||||||
sequence) in the ascending order.
|
sequence) in the ascending order.
|
||||||
|
|
||||||
Optionally if a MultiModalHashDict is given, same operation will be
|
Optionally if a `MultiModalHashDict` is given, same operation will be
|
||||||
applied to the object and the sorted list of hashes will be returned.
|
applied to the object and the sorted list of hashes will be returned.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[str]: List of item modalities in order of their positions in
|
list[str]: List of item modalities in order of their positions in the
|
||||||
the input sequence.
|
input sequence.
|
||||||
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
|
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
|
||||||
mm_positions.
|
mm_positions.
|
||||||
Optional[list[str]]: Sorted list of all hashes from mm_hashes if
|
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
|
||||||
given, None otherwise.
|
None otherwise.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
modalities = list(mm_positions.keys())
|
modalities = list(mm_positions.keys())
|
||||||
@ -352,22 +356,23 @@ def merge_and_sort_multimodal_metadata(
|
|||||||
|
|
||||||
|
|
||||||
def group_mm_inputs_by_modality(
|
def group_mm_inputs_by_modality(
|
||||||
mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]:
|
mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]:
|
||||||
"""Group consecutive MultiModalKwargs from mm_inputs with the same modality
|
"""Group consecutive MultiModalKwargs from mm_inputs with the same modality
|
||||||
together into the same list for batching purpose. For MultiModalKwargs with
|
together into the same list for batching purpose. For MultiModalKwargs with
|
||||||
multiple modalities, put them into their own list.
|
multiple modalities, put them into their own list.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
mm_inputs: List of MultiModalKwargs.
|
mm_inputs: List of MultiModalKwargs.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each
|
list[list[vllm.multimodal.MultiModalKwargs]]: List of list of
|
||||||
inner list contains consecutive MultiModalKwargs with same modality.
|
`MultiModalKwargs`, each inner list contains consecutive
|
||||||
|
`MultiModalKwargs` with same modality.
|
||||||
"""
|
"""
|
||||||
if not mm_inputs:
|
if not mm_inputs:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]:
|
def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]:
|
||||||
# If the input has multiple modalities, return an id as the unique key
|
# If the input has multiple modalities, return an id as the unique key
|
||||||
# for the mm_input.
|
# for the mm_input.
|
||||||
if len(mm_input.modalities) > 1:
|
if len(mm_input.modalities) > 1:
|
||||||
|
|||||||
@ -19,8 +19,6 @@ if TYPE_CHECKING:
|
|||||||
else:
|
else:
|
||||||
VllmConfig = None
|
VllmConfig = None
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class CpuPlatform(Platform):
|
class CpuPlatform(Platform):
|
||||||
_enum = PlatformEnum.CPU
|
_enum = PlatformEnum.CPU
|
||||||
|
|||||||
@ -454,10 +454,4 @@ finally:
|
|||||||
|
|
||||||
CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
|
CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
|
||||||
|
|
||||||
try:
|
CudaPlatform.log_warnings()
|
||||||
from sphinx.ext.autodoc.mock import _MockModule
|
|
||||||
|
|
||||||
if not isinstance(pynvml, _MockModule):
|
|
||||||
CudaPlatform.log_warnings()
|
|
||||||
except ModuleNotFoundError:
|
|
||||||
CudaPlatform.log_warnings()
|
|
||||||
|
|||||||
@ -146,7 +146,7 @@ class Platform:
|
|||||||
return self._enum == PlatformEnum.OOT
|
return self._enum == PlatformEnum.OOT
|
||||||
|
|
||||||
def is_cuda_alike(self) -> bool:
|
def is_cuda_alike(self) -> bool:
|
||||||
"""Stateless version of :func:`torch.cuda.is_available`."""
|
"""Stateless version of {func}`torch.cuda.is_available`."""
|
||||||
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
|
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
|
||||||
|
|
||||||
def is_sleep_mode_available(self) -> bool:
|
def is_sleep_mode_available(self) -> bool:
|
||||||
@ -165,7 +165,7 @@ class Platform:
|
|||||||
cls,
|
cls,
|
||||||
device_id: int = 0,
|
device_id: int = 0,
|
||||||
) -> Optional[DeviceCapability]:
|
) -> Optional[DeviceCapability]:
|
||||||
"""Stateless version of :func:`torch.cuda.get_device_capability`."""
|
"""Stateless version of {func}`torch.cuda.get_device_capability`."""
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -180,7 +180,7 @@ class Platform:
|
|||||||
The ``capability`` argument can either be:
|
The ``capability`` argument can either be:
|
||||||
|
|
||||||
- A tuple ``(major, minor)``.
|
- A tuple ``(major, minor)``.
|
||||||
- An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
|
- An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`)
|
||||||
"""
|
"""
|
||||||
current_capability = cls.get_device_capability(device_id=device_id)
|
current_capability = cls.get_device_capability(device_id=device_id)
|
||||||
if current_capability is None:
|
if current_capability is None:
|
||||||
|
|||||||
@ -1,7 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
from .layerwise_profile import layerwise_profile
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"layerwise_profile",
|
|
||||||
]
|
|
||||||
@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1
|
|||||||
|
|
||||||
|
|
||||||
def array_full(token_id: int, count: int):
|
def array_full(token_id: int, count: int):
|
||||||
""":class:`array` equivalent of :func:`numpy.full`."""
|
"""{class}`array` equivalent of {func}`numpy.full`."""
|
||||||
return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
|
return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
|
||||||
|
|
||||||
|
|
||||||
@ -192,11 +192,11 @@ class SequenceData(msgspec.Struct,
|
|||||||
def from_prompt_token_counts(
|
def from_prompt_token_counts(
|
||||||
*token_counts: tuple[int, int]) -> "SequenceData":
|
*token_counts: tuple[int, int]) -> "SequenceData":
|
||||||
"""
|
"""
|
||||||
Construct a :class:`SequenceData` instance by concatenating
|
Construct a {class}`SequenceData` instance by concatenating
|
||||||
prompt token sequences.
|
prompt token sequences.
|
||||||
|
|
||||||
Each tuple represents one token sequence, expressed in the form
|
Each tuple represents one token sequence, expressed in the form
|
||||||
:code:`(token_id, count)`.
|
`(token_id, count)`.
|
||||||
"""
|
"""
|
||||||
if len(token_counts) == 0:
|
if len(token_counts) == 0:
|
||||||
return SequenceData.from_seqs([])
|
return SequenceData.from_seqs([])
|
||||||
@ -216,7 +216,7 @@ class SequenceData(msgspec.Struct,
|
|||||||
prompt_embeds: Optional[torch.Tensor] = None,
|
prompt_embeds: Optional[torch.Tensor] = None,
|
||||||
) -> "SequenceData":
|
) -> "SequenceData":
|
||||||
"""
|
"""
|
||||||
Construct a :class:`SequenceData` instance from prompt and output
|
Construct a {class}`SequenceData` instance from prompt and output
|
||||||
token sequences.
|
token sequences.
|
||||||
"""
|
"""
|
||||||
prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
|
prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
|
||||||
@ -452,9 +452,9 @@ class SequenceData(msgspec.Struct,
|
|||||||
class Sequence:
|
class Sequence:
|
||||||
"""Stores the data, status, and block information of a sequence.
|
"""Stores the data, status, and block information of a sequence.
|
||||||
|
|
||||||
The sequence is constructed from the :data:`DecoderOnlyInputs`
|
The sequence is constructed from the {data}`DecoderOnlyInputs`
|
||||||
(for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder)
|
(for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder)
|
||||||
instance passed in through the :code:`inputs` constructor argument.
|
instance passed in through the `inputs` constructor argument.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
seq_id: The ID of the sequence.
|
seq_id: The ID of the sequence.
|
||||||
|
|||||||
@ -52,7 +52,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase):
|
|||||||
"""Create a SmallerTpProposerWorker.
|
"""Create a SmallerTpProposerWorker.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
worker (MultiStepWorker): an actual worker wrapped with this class
|
worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an
|
||||||
|
actual worker wrapped with this class
|
||||||
draft_ranks (List[int]): if this value is given, only the GPU ranks
|
draft_ranks (List[int]): if this value is given, only the GPU ranks
|
||||||
written in this value participate in draft generation
|
written in this value participate in draft generation
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig):
|
|||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
output_router_logits (`bool`, *optional*, defaults to `False`):
|
output_router_logits (`bool`, *optional*, defaults to `False`):
|
||||||
Whether or not the router logits should be returned by the model. Enabling this will also
|
Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
|
||||||
allow the model to output the auxiliary loss. See [here]() for more details
|
|
||||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||||
The aux loss factor for the total loss.
|
The aux loss factor for the total loss.
|
||||||
|
|
||||||
|
|||||||
@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig):
|
|||||||
Instantiating a configuration with the defaults will yield a similar
|
Instantiating a configuration with the defaults will yield a similar
|
||||||
configuration to that of the Exaone
|
configuration to that of the Exaone
|
||||||
|
|
||||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
|
Configuration objects inherit from {class}`~transformers.PretrainedConfig`
|
||||||
and can be used to control the model outputs. Read the documentation from
|
and can be used to control the model outputs. Read the documentation from
|
||||||
:class:`~transformers.PretrainedConfig` for more information.
|
{class}`~transformers.PretrainedConfig` for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size (:obj:`int`, `optional`, defaults to 50257):
|
vocab_size ({obj}`int`, `optional`, defaults to 50257):
|
||||||
Vocabulary size of the GPT Lingvo model. Defines the number of
|
Vocabulary size of the GPT Lingvo model. Defines the number of
|
||||||
different tokens that can be represented by the :obj:`inputs_ids`
|
different tokens that can be represented by the {obj}`inputs_ids`
|
||||||
passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
|
passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
|
||||||
size of the model.
|
size of the model.
|
||||||
Defines the different tokens that can be represented by the
|
Defines the different tokens that can be represented by the
|
||||||
`inputs_ids` passed to the forward method of
|
`inputs_ids` passed to the forward method of
|
||||||
:class:`~transformers.EXAONEModel`.
|
{class}`~transformers.EXAONEModel`.
|
||||||
hidden_size (:obj:`int`, `optional`, defaults to 2048):
|
hidden_size ({obj}`int`, `optional`, defaults to 2048):
|
||||||
Dimensionality of the encoder layers and the pooler layer.
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
num_layers (:obj:`int`, `optional`, defaults to 24):
|
num_layers ({obj}`int`, `optional`, defaults to 24):
|
||||||
Number of hidden layers in the Transformer encoder.
|
Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||||
Number of attention heads for each attention layer in the
|
Number of attention heads for each attention layer in the
|
||||||
@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig):
|
|||||||
specified, will default to `num_attention_heads`.
|
specified, will default to `num_attention_heads`.
|
||||||
rotary_pct (`float`, *optional*, defaults to 0.25):
|
rotary_pct (`float`, *optional*, defaults to 0.25):
|
||||||
percentage of hidden dimensions to allocate to rotary embeddings
|
percentage of hidden dimensions to allocate to rotary embeddings
|
||||||
intermediate_size (:obj:`int`, `optional`, defaults to 8192):
|
intermediate_size ({obj}`int`, `optional`, defaults to 8192):
|
||||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
|
||||||
the Transformer encoder.
|
the Transformer encoder.
|
||||||
activation_function (:obj:`str` or :obj:`function`, `optional`,
|
activation_function ({obj}`str` or {obj}`function`, `optional`,
|
||||||
defaults to :obj:`"gelu_new"`):
|
defaults to {obj}`"gelu_new"`):
|
||||||
The non-linear activation function (function or string) in the
|
The non-linear activation function (function or string) in the
|
||||||
encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
|
encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
|
||||||
:obj:`"selu"` and :obj:`"gelu_new"` are supported.
|
{obj}`"selu"` and {obj}`"gelu_new"` are supported.
|
||||||
embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
|
||||||
The dropout probability for all fully connected layers in the
|
The dropout probability for all fully connected layers in the
|
||||||
embeddings, encoder, and pooler.
|
embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
|
max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
|
||||||
The maximum sequence length that this model might ever be used with.
|
The maximum sequence length that this model might ever be used with.
|
||||||
Typically set this to something large just in case
|
Typically set this to something large just in case
|
||||||
(e.g., 512 or 1024 or 2048).
|
(e.g., 512 or 1024 or 2048).
|
||||||
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
|
type_vocab_size ({obj}`int`, `optional`, defaults to 2):
|
||||||
The vocabulary size of the :obj:`token_type_ids` passed when calling
|
The vocabulary size of the {obj}`token_type_ids` passed when calling
|
||||||
:class:`~transformers.EXAONEModel`.
|
{class}`~transformers.EXAONEModel`.
|
||||||
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
|
initializer_range ({obj}`float`, `optional`, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for
|
The standard deviation of the truncated_normal_initializer for
|
||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
|
layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
|
||||||
Whether or not the model should return the last key/values
|
Whether or not the model should return the last key/values
|
||||||
attentions (not used by all models).
|
attentions (not used by all models).
|
||||||
Only relevant if ``config.is_decoder=True``.
|
Only relevant if ``config.is_decoder=True``.
|
||||||
gradient_checkpointing (:obj:`bool`, `optional`,
|
gradient_checkpointing ({obj}`bool`, `optional`,
|
||||||
defaults to :obj:`False`):
|
defaults to {obj}`False`):
|
||||||
If True, use gradient checkpointing to save memory at the expense
|
If True, use gradient checkpointing to save memory at the expense
|
||||||
of slower backward pass.
|
of slower backward pass.
|
||||||
Example::
|
Example::
|
||||||
|
|||||||
@ -39,9 +39,9 @@ def decode_tokens(
|
|||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Backend-agnostic equivalent of HF's
|
Backend-agnostic equivalent of HF's
|
||||||
:code:`tokenizer.decode(token_ids, ...)`.
|
`tokenizer.decode(token_ids, ...)`.
|
||||||
|
|
||||||
:code:`skip_special_tokens=None` means to use the backend's default
|
`skip_special_tokens=None` means to use the backend's default
|
||||||
settings.
|
settings.
|
||||||
"""
|
"""
|
||||||
if skip_special_tokens is not None:
|
if skip_special_tokens is not None:
|
||||||
@ -61,9 +61,9 @@ def encode_tokens(
|
|||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
"""
|
"""
|
||||||
Backend-agnostic equivalent of HF's
|
Backend-agnostic equivalent of HF's
|
||||||
:code:`tokenizer.encode(text, ...)`.
|
`tokenizer.encode(text, ...)`.
|
||||||
|
|
||||||
:code:`add_special_tokens=None` means to use the backend's default
|
`add_special_tokens=None` means to use the backend's default
|
||||||
settings.
|
settings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -309,8 +309,8 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
|
|||||||
"""
|
"""
|
||||||
Gets the cumulative number of hits and queries against this cache.
|
Gets the cumulative number of hits and queries against this cache.
|
||||||
|
|
||||||
If :code:`delta=True`, instead gets these statistics
|
If `delta=True`, instead gets these statistics
|
||||||
since the last call that also passed :code:`delta=True`.
|
since the last call that also passed `delta=True`.
|
||||||
"""
|
"""
|
||||||
info = CacheInfo(hits=self._hits, total=self._total)
|
info = CacheInfo(hits=self._hits, total=self._total)
|
||||||
|
|
||||||
@ -983,7 +983,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
|
|||||||
|
|
||||||
def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
|
def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
|
||||||
"""
|
"""
|
||||||
Unlike :class:`itertools.groupby`, groups are not broken by
|
Unlike {class}`itertools.groupby`, groups are not broken by
|
||||||
non-contiguous data.
|
non-contiguous data.
|
||||||
"""
|
"""
|
||||||
groups = defaultdict[_K, list[_V]](list)
|
groups = defaultdict[_K, list[_V]](list)
|
||||||
@ -1773,14 +1773,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
|
|||||||
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
|
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
|
||||||
|
|
||||||
|
|
||||||
def is_in_doc_build() -> bool:
|
|
||||||
try:
|
|
||||||
from sphinx.ext.autodoc.mock import _MockModule
|
|
||||||
return isinstance(torch, _MockModule)
|
|
||||||
except ModuleNotFoundError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
|
def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
|
||||||
"""
|
"""
|
||||||
Import a Python file according to its file path.
|
Import a Python file according to its file path.
|
||||||
@ -1820,10 +1812,11 @@ class _PlaceholderBase:
|
|||||||
Disallows downstream usage of placeholder modules.
|
Disallows downstream usage of placeholder modules.
|
||||||
|
|
||||||
We need to explicitly override each dunder method because
|
We need to explicitly override each dunder method because
|
||||||
:meth:`__getattr__` is not called when they are accessed.
|
{meth}`__getattr__` is not called when they are accessed.
|
||||||
|
|
||||||
See also:
|
:::{seealso}
|
||||||
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
|
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __getattr__(self, key: str) -> Never:
|
def __getattr__(self, key: str) -> Never:
|
||||||
@ -2052,9 +2045,6 @@ def direct_register_custom_op(
|
|||||||
library object. If you want to bind the operator to a different library,
|
library object. If you want to bind the operator to a different library,
|
||||||
make sure the library object is alive when the operator is used.
|
make sure the library object is alive when the operator is used.
|
||||||
"""
|
"""
|
||||||
if is_in_doc_build():
|
|
||||||
return
|
|
||||||
|
|
||||||
if not supports_custom_op():
|
if not supports_custom_op():
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
assert not current_platform.is_cuda_alike(), (
|
assert not current_platform.is_cuda_alike(), (
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
"""
|
"""
|
||||||
|
# MLA Common Components
|
||||||
|
|
||||||
This file implements common components for MLA implementations.
|
This file implements common components for MLA implementations.
|
||||||
|
|
||||||
First we define:
|
First we define:
|
||||||
|
|||||||
@ -180,6 +180,7 @@ class KVCacheManager:
|
|||||||
as eagle.
|
as eagle.
|
||||||
|
|
||||||
Blocks layout:
|
Blocks layout:
|
||||||
|
```
|
||||||
-----------------------------------------------------------------------
|
-----------------------------------------------------------------------
|
||||||
| < computed > | < new computed > | < new > | < pre-allocated > |
|
| < computed > | < new computed > | < new > | < pre-allocated > |
|
||||||
-----------------------------------------------------------------------
|
-----------------------------------------------------------------------
|
||||||
@ -189,6 +190,7 @@ class KVCacheManager:
|
|||||||
------------------------------------------------
|
------------------------------------------------
|
||||||
| <new full> |
|
| <new full> |
|
||||||
--------------
|
--------------
|
||||||
|
```
|
||||||
The following *_blocks are illustrated in this layout.
|
The following *_blocks are illustrated in this layout.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|||||||
@ -308,7 +308,7 @@ class OutputProcessor:
|
|||||||
* If there is no queue (for usage with LLMEngine),
|
* If there is no queue (for usage with LLMEngine),
|
||||||
return a list of RequestOutput objects.
|
return a list of RequestOutput objects.
|
||||||
|
|
||||||
****************** NOTE FOR DEVELOPERS ******************
|
NOTE FOR DEVELOPERS
|
||||||
|
|
||||||
vLLM V1 minimizes the number of python loops over the full
|
vLLM V1 minimizes the number of python loops over the full
|
||||||
batch to ensure system overheads are minimized. This is the
|
batch to ensure system overheads are minimized. This is the
|
||||||
@ -316,8 +316,6 @@ class OutputProcessor:
|
|||||||
|
|
||||||
If you need to touch every element of the batch, do it from
|
If you need to touch every element of the batch, do it from
|
||||||
within the loop below.
|
within the loop below.
|
||||||
|
|
||||||
**********************************************************
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
request_outputs: list[RequestOutput] = []
|
request_outputs: list[RequestOutput] = []
|
||||||
|
|||||||
@ -75,7 +75,7 @@ class RejectionSampler(nn.Module):
|
|||||||
outside of the rejection sampler with the default sampling
|
outside of the rejection sampler with the default sampling
|
||||||
strategy. It allows for more flexibility in the sampling
|
strategy. It allows for more flexibility in the sampling
|
||||||
process such as top_p, top_k sampling.
|
process such as top_p, top_k sampling.
|
||||||
sampling_metadata (SamplingMetadata):
|
sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
|
||||||
Additional metadata needed for sampling, such as temperature,
|
Additional metadata needed for sampling, such as temperature,
|
||||||
top-k/top-p parameters, or other relevant information.
|
top-k/top-p parameters, or other relevant information.
|
||||||
Returns:
|
Returns:
|
||||||
|
|||||||
@ -170,9 +170,10 @@ class Worker(WorkerBase):
|
|||||||
Then, it calculates the free memory that can be used for KV cache in
|
Then, it calculates the free memory that can be used for KV cache in
|
||||||
bytes.
|
bytes.
|
||||||
|
|
||||||
.. tip::
|
:::{tip}
|
||||||
You may limit the usage of GPU memory
|
You may limit the usage of GPU memory
|
||||||
by adjusting the `gpu_memory_utilization` parameter.
|
by adjusting the `gpu_memory_utilization` parameter.
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
torch.cuda.reset_peak_memory_stats()
|
torch.cuda.reset_peak_memory_stats()
|
||||||
|
|||||||
@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Perform sanity checks for the result of
|
Perform sanity checks for the result of
|
||||||
:meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
|
{meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
|
||||||
"""
|
"""
|
||||||
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
|
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
|
||||||
"Expected multimodal embeddings to be a list/tuple of 2D tensors, "
|
"Expected multimodal embeddings to be a list/tuple of 2D tensors, "
|
||||||
@ -39,7 +39,7 @@ def scatter_mm_placeholders(
|
|||||||
Scatter the multimodal embeddings into a contiguous tensor that represents
|
Scatter the multimodal embeddings into a contiguous tensor that represents
|
||||||
the placeholder tokens.
|
the placeholder tokens.
|
||||||
|
|
||||||
:class:`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
|
{class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
embeds: The multimodal embeddings.
|
embeds: The multimodal embeddings.
|
||||||
@ -66,7 +66,7 @@ def gather_mm_placeholders(
|
|||||||
"""
|
"""
|
||||||
Reconstructs the embeddings from the placeholder tokens.
|
Reconstructs the embeddings from the placeholder tokens.
|
||||||
|
|
||||||
This is the inverse operation of :func:`scatter_mm_placeholders`.
|
This is the inverse operation of {func}`scatter_mm_placeholders`.
|
||||||
"""
|
"""
|
||||||
if is_embed is None:
|
if is_embed is None:
|
||||||
return placeholders
|
return placeholders
|
||||||
|
|||||||
@ -201,9 +201,10 @@ class HPUWorker(LocalOrDistributedWorkerBase):
|
|||||||
Then, it calculates the maximum possible number of GPU and CPU blocks
|
Then, it calculates the maximum possible number of GPU and CPU blocks
|
||||||
that can be allocated with the remaining free memory.
|
that can be allocated with the remaining free memory.
|
||||||
|
|
||||||
.. tip::
|
:::{tip}
|
||||||
You may limit the usage of GPU memory
|
You may limit the usage of GPU memory
|
||||||
by adjusting the `gpu_memory_utilization` parameter.
|
by adjusting the `gpu_memory_utilization` parameter.
|
||||||
|
:::
|
||||||
"""
|
"""
|
||||||
# Profile the memory usage of the model and get the maximum number of
|
# Profile the memory usage of the model and get the maximum number of
|
||||||
# cache blocks that can be allocated with the remaining free memory.
|
# cache blocks that can be allocated with the remaining free memory.
|
||||||
|
|||||||
@ -734,11 +734,11 @@ def _pythonize_sampler_output(
|
|||||||
cache: Optional[PythonizationCache],
|
cache: Optional[PythonizationCache],
|
||||||
) -> None:
|
) -> None:
|
||||||
""" This function is only called when the output tensors are ready.
|
""" This function is only called when the output tensors are ready.
|
||||||
See :class:`ModelOutput`.
|
See {class}`ModelOutput`.
|
||||||
|
|
||||||
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
|
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
|
||||||
adding a Pythonized output data structure
|
adding a Pythonized output data structure
|
||||||
(:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`.
|
({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model_input
|
model_input
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user