[Doc] Improve documentation for multimodal CLI args (#16960)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Authored by Cyrus Leung on 2025-04-22 16:35:35 +08:00; committed by GitHub
parent e4d6144232
commit 8f7bace7c3
2 changed files with 17 additions and 10 deletions

vllm/config.py

@@ -54,13 +54,15 @@ if TYPE_CHECKING:
     from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
         BaseTokenizerGroup)
-    Config = TypeVar("Config", bound=DataclassInstance)
+    ConfigType = type[DataclassInstance]
 else:
     QuantizationConfig = None
-    Config = TypeVar("Config")
+    ConfigType = type
 logger = init_logger(__name__)
+ConfigT = TypeVar("ConfigT", bound=ConfigType)
 # This value is chosen to have a balance between ITL and TTFT. Note it is
 # not optimized for throughput.
 _DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
@@ -162,7 +164,7 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
     return out
-def config(cls: type[Config]) -> type[Config]:
+def config(cls: ConfigT) -> ConfigT:
     """
     A decorator that ensures all fields in a dataclass have default values
     and that each field has a docstring.
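
The hunk above switches the decorator from the plain `Config` TypeVar to `ConfigT`, which is bound to a dataclass type so that `config` hands back the same concrete class it receives. A minimal sketch of that pattern, with illustrative names (`ExampleConfig` is hypothetical and the validation shown is simplified, not vLLM's actual checks):

```python
from dataclasses import MISSING, dataclass, fields
from typing import TypeVar

# A TypeVar bound to `type` lets the decorator return the same concrete class
# it receives, so type checkers keep the subclass type (simplified sketch).
ConfigT = TypeVar("ConfigT", bound=type)


def config(cls: ConfigT) -> ConfigT:
    """Require that every field of the decorated dataclass has a default."""
    for f in fields(cls):
        if f.default is MISSING and f.default_factory is MISSING:
            raise ValueError(f"Field '{f.name}' of {cls.__name__} needs a default")
    return cls


@config
@dataclass
class ExampleConfig:  # hypothetical config class for illustration
    batch_size: int = 8
```

Because the return type is the same TypeVar as the parameter, `ExampleConfig` keeps its own type after decoration instead of collapsing to a generic dataclass type.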
@@ -181,7 +183,7 @@ def config(cls: type[Config]) -> type[Config]:
     return cls
-def get_field(cls: type[Config], name: str) -> Field:
+def get_field(cls: ConfigType, name: str) -> Field:
     """Get the default factory field of a dataclass by name. Used for getting
     default factory fields in `EngineArgs`."""
     if not is_dataclass(cls):
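
`get_field` itself only gains the new annotation here; for readers unfamiliar with it, this is roughly the kind of lookup it performs (the helper name and demo dataclass are illustrative, not vLLM's implementation):

```python
from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass


def get_field_sketch(cls: type, name: str) -> Field:
    """Look up a dataclass field by name so its default_factory can be
    reused, e.g. as an argparse default (simplified sketch)."""
    if not is_dataclass(cls):
        raise TypeError(f"{cls} is not a dataclass")
    for f in fields(cls):
        if f.name == name:
            return f
    raise ValueError(f"{cls.__name__} has no field named {name!r}")


@dataclass
class DemoConfig:  # hypothetical config class for illustration
    limits: dict[str, int] = field(default_factory=dict)


print(get_field_sketch(DemoConfig, "limits").default_factory())  # -> {}
```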
@@ -2749,6 +2751,9 @@ class MultiModalConfig:
     The maximum number of input items allowed per prompt for each modality.
     This should be a JSON string that will be parsed into a dictionary.
     Defaults to 1 (V0) or 999 (V1) for each modality.
+    For example, to allow up to 16 images and 2 videos per prompt:
+    ``{"images": 16, "videos": 2}``
     """
     def compute_hash(self) -> str:
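
The new docstring example maps directly onto the corresponding engine argument. A usage sketch, assuming this config field is set through `limit_mm_per_prompt` on the `LLM` constructor (the model name is only a placeholder; on the command line the same value is passed as a JSON string, as the docstring notes):

```python
from vllm import LLM

# Illustrative only: allow up to 16 images and 2 videos per prompt,
# mirroring the docstring's example value.
llm = LLM(
    model="microsoft/Phi-3-vision-128k-instruct",  # placeholder multimodal model
    trust_remote_code=True,
    limit_mm_per_prompt={"images": 16, "videos": 2},
)
```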

vllm/engine/arg_utils.py

@@ -17,7 +17,7 @@ from typing_extensions import TypeIs
 import vllm.envs as envs
 from vllm import version
 from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
-                         Config, ConfigFormat, DecodingConfig, Device,
+                         ConfigFormat, ConfigType, DecodingConfig, Device,
                          DeviceConfig, DistributedExecutorBackend, HfOverrides,
                          KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
                          ModelConfig, ModelImpl, MultiModalConfig,
@@ -304,7 +304,7 @@ class EngineArgs:
             """Check if the class is a custom type."""
             return cls.__module__ != "builtins"
-        def get_kwargs(cls: type[Config]) -> dict[str, Any]:
+        def get_kwargs(cls: ConfigType) -> dict[str, Any]:
             cls_docs = get_attr_docs(cls)
             kwargs = {}
             for field in fields(cls):
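
`get_kwargs` is the helper that turns each config dataclass into `parser.add_argument` keyword arguments; only its annotation changes in this commit. A simplified sketch of the pattern it follows (the helper name and the exact kwargs emitted are illustrative):

```python
from dataclasses import MISSING, fields, is_dataclass
from typing import Any


def get_kwargs_sketch(cls: type) -> dict[str, dict[str, Any]]:
    """Derive per-field argparse kwargs from a config dataclass, using each
    field's default (or default_factory) as the CLI default (sketch only)."""
    assert is_dataclass(cls)
    kwargs: dict[str, dict[str, Any]] = {}
    for f in fields(cls):
        if f.default is not MISSING:
            default = f.default
        elif f.default_factory is not MISSING:
            default = f.default_factory()
        else:
            default = None
        kwargs[f.name] = {"default": default}
    return kwargs
```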
@ -678,13 +678,15 @@ class EngineArgs:
'--mm-processor-kwargs',
default=None,
type=json.loads,
help=('Overrides for the multimodal input mapping/processing, '
'e.g., image processor. For example: ``{"num_crops": 4}``.'))
help=('Overrides for the multi-modal processor obtained from '
'``AutoProcessor.from_pretrained``. The available overrides '
'depend on the model that is being run.'
'For example, for Phi-3-Vision: ``{"num_crops": 4}``.'))
parser.add_argument(
'--disable-mm-preprocessor-cache',
action='store_true',
help='If true, then disables caching of the multi-modal '
'preprocessor/mapper. (not recommended)')
help='If True, disable caching of the processed multi-modal '
'inputs.')
# LoRA related configs
parser.add_argument('--enable-lora',
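
For readers mapping the reworded help text back to usage, here is a sketch of the equivalent offline-inference call, assuming these engine arguments are forwarded through the `LLM` constructor (the model and override values simply mirror the Phi-3-Vision example in the help string):

```python
from vllm import LLM

# Illustrative only: pass processor overrides for Phi-3-Vision and disable the
# multi-modal preprocessor cache, mirroring the two CLI flags documented above.
llm = LLM(
    model="microsoft/Phi-3-vision-128k-instruct",
    trust_remote_code=True,  # Phi-3-Vision ships custom code on the Hub
    mm_processor_kwargs={"num_crops": 4},
    disable_mm_preprocessor_cache=True,
)
```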