change

Signed-off-by: Reagan Lee <reaganjlee@gmail.com> Signed-off-by: Reagan <reaganjlee@gmail.com>
2026-07-06 02:47:09 +08:00 · 2025-11-20 13:06:52 -05:00 · 2025-11-20 13:06:52 -05:00 · e3dd9108cb
commit e3dd9108cb
parent e087fbc393
14 changed files with 858 additions and 81 deletions
--- a/docs/cli/bench/multimodal_processor.md
+++ b/docs/cli/bench/multimodal_processor.md
@ -0,0 +1,9 @@
 # vllm bench multimodal-processor
 ## JSON CLI Arguments
 --8<-- "docs/cli/json_tip.inc.md"
 ## Arguments
 --8<-- "docs/argparse/bench_multimodal_processor.inc.md"
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@ -92,6 +92,7 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
 bench_latency = auto_mock("vllm.benchmarks", "latency")
 bench_multimodal_processor = auto_mock("vllm.benchmarks", "multimodal_processor")
 bench_serve = auto_mock("vllm.benchmarks", "serve")
 bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
 bench_sweep_plot_pareto = auto_mock(
@ -222,6 +223,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
        "run-batch": create_parser(openai_run_batch.make_arg_parser),
        # Benchmark CLI
        "bench_latency": create_parser(bench_latency.add_cli_args),
        "bench_multimodal_processor": create_parser(bench_multimodal_processor.add_cli_args),
        "bench_serve": create_parser(bench_serve.add_cli_args),
        "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
        "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@ -12,7 +12,11 @@ import torch
 import torch.nn as nn
 from PIL import Image
-from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
+from vllm.config import (
    ModelConfig,
    VllmConfig,
    set_current_vllm_config,
 )
 from vllm.config.multimodal import (
    AudioDummyOptions,
    BaseDummyOptions,
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@ -1908,7 +1908,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
        try:
            # Enforce endpoint compatibility for multimodal datasets.
-            if args.dataset_name == "random-mm" and args.backend not in ["openai-chat"]:
+            backend = getattr(args, "backend", "openai-chat")
            if args.dataset_name == "random-mm" and backend not in ["openai-chat"]:
                raise ValueError(
                    "Multi-modal content (images) is only supported on "
                    "'openai-chat' backend."
--- a/vllm/benchmarks/multimodal_processor.py
+++ b/vllm/benchmarks/multimodal_processor.py
@ -0,0 +1,464 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark multimodal processor latency.
 This benchmark measures the latency of the multimodal processor module
 using randomly generated multimodal prompts with synthetic images.
 MM processor stats are automatically enabled.
 Run:
    vllm bench multimodal-processor \
        --model <your_model> \
        --num-prompts 10 \
        --input-len 1024 \
        --output-len 128 \
        --num-images 1
 """
 import argparse
 import dataclasses
 import json
 import time
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any
 import numpy as np
 from vllm.engine.arg_utils import EngineArgs
 from vllm.multimodal.processing import (
    get_timing_stats_from_engine_client,
 )
 from vllm.utils.gc_utils import freeze_gc_heap
 from vllm.utils.import_utils import PlaceholderModule
 try:
    import pandas as pd
 except ImportError:
    pd = PlaceholderModule("pandas")
@dataclass
 class MultimodalProcessorBenchmarkMetrics:
    """Metrics for multimodal processor benchmark."""
    completed: int
    failed: int
    mean_e2el_ms: float
    median_e2el_ms: float
    std_e2el_ms: float
    percentiles_e2el_ms: list[tuple[float, float]]
    """Per-stage timing stats: mean, median, std, percentiles for each stage."""
    mm_processor_stats: dict[str, dict[str, float]]
 def collect_mm_processor_stats(
    llm_engine: Any,
    debug: bool = False,
 ) -> dict[str, list[float]]:
    """
    Collect multimodal processor timing stats.
    Returns a dictionary mapping stage names to lists of timing values (in seconds).
    """
    all_stats = get_timing_stats_from_engine_client(llm_engine)
    stats_by_stage = {
        "hf_processor_time": [],
        "hashing_time": [],
        "cache_lookup_time": [],
        "prompt_update_time": [],
        "total_time": [],
    }
    for stats_dict in all_stats.values():
        stats_by_stage["hf_processor_time"].append(
            stats_dict.get("hf_processor_time", 0.0)
        )
        stats_by_stage["hashing_time"].append(stats_dict.get("hashing_time", 0.0))
        stats_by_stage["cache_lookup_time"].append(
            stats_dict.get("cache_lookup_time", 0.0)
        )
        stats_by_stage["prompt_update_time"].append(
            stats_dict.get("prompt_update_time", 0.0)
        )
        stats_by_stage["total_time"].append(stats_dict.get("total_time", 0.0))
    if debug and not any(stats_by_stage.values()):
        print(
            "Warning: No MM processor stats found. Ensure --enable-mm-processor-stats is set."
        )
    return stats_by_stage
 def calculate_mm_processor_metrics(
    stats_by_stage: dict[str, list[float]],
    selected_percentiles: list[float],
 ) -> dict[str, dict[str, float]]:
    """
    Calculate aggregate metrics from stats by stage.
    """
    metrics = {}
    for stage_name, times in stats_by_stage.items():
        if not times:
            metrics[stage_name] = {
                "mean": 0.0,
                "median": 0.0,
                "std": 0.0,
                **{f"p{p}": 0.0 for p in selected_percentiles},
            }
            continue
        times_ms = [t * 1000 for t in times]
        metrics[stage_name] = {
            "mean": float(np.mean(times_ms)),
            "median": float(np.median(times_ms)),
            "std": float(np.std(times_ms)),
            **{
                f"p{p}": float(np.percentile(times_ms, p)) for p in selected_percentiles
            },
        }
    return metrics
 def generate_random_multimodal_prompts(
    num_prompts: int,
    input_len: int,
    output_len: int,
    tokenizer: Any,
    num_images: int = 1,
    image_width: int = 256,
    image_height: int = 256,
    seed: int = 0,
 ) -> tuple[list[list[dict]], list[int]]:
    """
    Generate random multimodal prompts with synthetic images and text tokens.
    Returns:
        tuple: (prompts, expected_output_lens)
            - prompts: List of OpenAI chat format messages with text and images
            - expected_output_lens: List of expected output lengths
    """
    from PIL import Image
    from vllm.benchmarks.datasets import process_image
    rng = np.random.default_rng(seed)
    prompts = []
    expected_output_lens = []
    for i in range(num_prompts):
        vocab_size = tokenizer.vocab_size
        prompt_token_ids = rng.integers(
            0, vocab_size, size=input_len
        ).tolist()
        text_prompt = tokenizer.decode(prompt_token_ids)
        mm_items = []
        for _ in range(num_images):
            # Generate random RGB image
            random_pixels = rng.integers(
                0, 256, (image_height, image_width, 3), dtype=np.uint8
            )
            image = Image.fromarray(random_pixels)
            # Process to OpenAI format
            mm_item = process_image(image)
            mm_items.append(mm_item)
        # Create chat format: text + images
        content = [{"type": "text", "text": text_prompt}]
        content.extend(mm_items)
        prompts.append([{"role": "user", "content": content}])
        expected_output_lens.append(output_len)
    return prompts, expected_output_lens
 def benchmark_multimodal_processor(
    args: argparse.Namespace,
 ) -> dict[str, Any]:
    """
    Run the multimodal processor benchmark.
    """
    from vllm import LLM, SamplingParams
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))
    # Validate max_model_len
    assert llm.llm_engine.model_config.max_model_len >= (
        args.input_len + args.output_len
    ), (
        "Please ensure that max_model_len is greater than "
        "the sum of input_len and output_len."
    )
    # Generate random multimodal prompts
    seed = getattr(args, "seed", 0)
    tokenizer = llm.get_tokenizer()
    prompts, expected_output_lens = generate_random_multimodal_prompts(
        num_prompts=args.num_prompts,
        input_len=args.input_len,
        output_len=args.output_len,
        tokenizer=tokenizer,
        num_images=args.num_images,
        image_width=args.image_width,
        image_height=args.image_height,
        seed=seed,
    )
    # Create sampling params
    sampling_params = [
        SamplingParams(
            n=1,
            temperature=0.0,  # Greedy sampling for deterministic speed benchmarks
            max_tokens=output_len,
            detokenize=True,
        )
        for output_len in expected_output_lens
    ]
    selected_percentiles = [
        float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
    ]
    freeze_gc_heap()
    # MM processor stats are automatically enabled via set_defaults
    # No need to check or raise error
    debug = getattr(args, "debug_mm_stats", False)
    print(f"Processing {len(prompts)} requests...")
    start_time = time.perf_counter()
    outputs = llm.chat(
        prompts, sampling_params, use_tqdm=not getattr(args, "disable_tqdm", False)
    )
    end_time = time.perf_counter()
    total_time = end_time - start_time
    mm_stats_by_stage = collect_mm_processor_stats(
        llm.llm_engine,
        debug=debug,
    )
    if not any(mm_stats_by_stage.values()):
        print(
            "\n⚠️  Warning: No MM processor stats found in registry.\n"
            "   This may indicate that:\n"
            "   - No multimodal requests were processed\n"
            "   - Stats were already retrieved (registry is cleared after retrieval)\n"
        )
    mm_processor_metrics = calculate_mm_processor_metrics(
        mm_stats_by_stage, selected_percentiles
    )
    completed = len([o for o in outputs if o.finished])
    failed = len(outputs) - completed
    e2el_times = []
    for output in outputs:
        if not output.finished or output.metrics is None:
            continue
        metrics = output.metrics
        for attr in ("finished_time", "last_token_time"):
            if (
                getattr(metrics, attr, None) is not None
                and getattr(metrics, "arrival_time", None) is not None
            ):
                e2el_times.append(
                    (getattr(metrics, attr) - metrics.arrival_time) * 1000
                )
                break
    if not e2el_times and completed > 0:
        avg_time_per_request = total_time / completed
        e2el_times = [avg_time_per_request * 1000] * completed
    if e2el_times:
        mean_e2el_ms = float(np.mean(e2el_times))
        median_e2el_ms = float(np.median(e2el_times))
        std_e2el_ms = float(np.std(e2el_times))
        percentiles_e2el_ms = [
            (p, float(np.percentile(e2el_times, p))) for p in selected_percentiles
        ]
    else:
        mean_e2el_ms = 0.0
        median_e2el_ms = 0.0
        std_e2el_ms = 0.0
        percentiles_e2el_ms = [(p, 0.0) for p in selected_percentiles]
    benchmark_result = {
        "completed": completed,
        "failed": failed,
        "mean_e2el_ms": mean_e2el_ms,
        "median_e2el_ms": median_e2el_ms,
        "std_e2el_ms": std_e2el_ms,
        "percentiles_e2el_ms": percentiles_e2el_ms,
        "mm_processor_stats": mm_processor_metrics,
    }
    return benchmark_result
 def add_cli_args(parser: argparse.ArgumentParser) -> None:
    """Add CLI arguments for the multimodal processor benchmark."""
    from vllm.engine.arg_utils import EngineArgs
    # Add EngineArgs (no conflict since we removed dataset parser)
    EngineArgs.add_cli_args(parser)
    # Automatically enable MM processor stats (required for this benchmark)
    parser.set_defaults(enable_mm_processor_stats=True)
    # Random generation arguments (similar to latency.py)
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=10,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=1024,
        help="Number of input tokens per request.",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=128,
        help="Number of output tokens per request.",
    )
    parser.add_argument(
        "--num-images",
        type=int,
        default=1,
        help="Number of images per prompt.",
    )
    parser.add_argument(
        "--image-width",
        type=int,
        default=256,
        help="Width of generated images in pixels.",
    )
    parser.add_argument(
        "--image-height",
        type=int,
        default=256,
        help="Height of generated images in pixels.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the benchmark results in JSON format.",
    )
    parser.add_argument(
        "--debug-mm-stats",
        action="store_true",
        help="Enable debug logging for MM processor stats collection.",
    )
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles to calculate (e.g., '50,90,99').",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Disable tqdm progress bar.",
    )
 def main(args: argparse.Namespace) -> None:
    """Main entry point for the multimodal processor benchmark."""
    from datetime import datetime
    print("Starting multimodal processor benchmark...")
    result = benchmark_multimodal_processor(args)
    print("\n" + "=" * 80)
    print("Multimodal Processor Benchmark Results")
    print("=" * 80)
    if "mm_processor_stats" in result:
        print("\nMM Processor Timing (ms):")
        selected_percentiles = [
            float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
        ]
        mm_data = []
        for stage, metrics in result["mm_processor_stats"].items():
            row = {
                "Stage": stage,
                "Mean": f"{metrics['mean']:.2f}",
                "Median": f"{metrics['median']:.2f}",
                "Std": f"{metrics['std']:.2f}",
            }
            for p in selected_percentiles:
                row[f"P{p}"] = f"{metrics.get(f'p{p}', 0.0):.2f}"
            mm_data.append(row)
        mm_df = pd.DataFrame(mm_data)
        print(mm_df.to_string(index=False))
    if "mean_e2el_ms" in result:
        print("\nEnd-to-End Latency (ms):")
        selected_percentiles = [
            float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
        ]
        e2el_data = [
            {"Metric": "Mean", "Value (ms)": f"{result['mean_e2el_ms']:.2f}"},
            {"Metric": "Median", "Value (ms)": f"{result['median_e2el_ms']:.2f}"},
            {"Metric": "Std", "Value (ms)": f"{result['std_e2el_ms']:.2f}"},
        ]
        for p in selected_percentiles:
            percentile_value = next(
                (val for pct, val in result["percentiles_e2el_ms"] if pct == p),
                0.0,
            )
            e2el_data.append(
                {
                    "Metric": f"P{p}",
                    "Value (ms)": f"{percentile_value:.2f}",
                }
            )
        e2el_df = pd.DataFrame(e2el_data)
        print(e2el_df.to_string(index=False))
    if args.output_json:
        result["config"] = {
            "model": args.model,
            "num_prompts": args.num_prompts,
            "input_len": args.input_len,
            "output_len": args.output_len,
            "num_images": args.num_images,
            "image_width": args.image_width,
            "image_height": args.image_height,
        }
        result["timestamp"] = datetime.now().isoformat()
        with open(args.output_json, "w") as f:
            json.dump(result, f, indent=2)
        print(f"\nResults saved to {args.output_json}")
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark multimodal processor latency"
    )
    add_cli_args(parser)
    args = parser.parse_args()
    main(args)
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@ -64,6 +64,11 @@ class ObservabilityConfig:
    module in the model and attach informations such as input/output shapes to
    nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
    enable_mm_processor_stats: bool = False
    """Enable collection of timing statistics for multimodal processor operations.
    This can be useful for performance analysis and debugging. Defaults to `False`
    (disabled)."""
    @cached_property
    def collect_model_forward_time(self) -> bool:
        """Whether to collect model forward time for the request."""
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -521,6 +521,7 @@ class EngineArgs:
    enable_layerwise_nvtx_tracing: bool = (
        ObservabilityConfig.enable_layerwise_nvtx_tracing
    )
    enable_mm_processor_stats: bool = ObservabilityConfig.enable_mm_processor_stats
    scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
    scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
@ -1040,6 +1041,10 @@ class EngineArgs:
            "--enable-layerwise-nvtx-tracing",
            **observability_kwargs["enable_layerwise_nvtx_tracing"],
        )
        observability_group.add_argument(
            "--enable-mm-processor-stats",
            **observability_kwargs["enable_mm_processor_stats"],
        )
        # Scheduler arguments
        scheduler_kwargs = get_kwargs(SchedulerConfig)
@ -1682,6 +1687,7 @@ class EngineArgs:
            kv_cache_metrics_sample=self.kv_cache_metrics_sample,
            cudagraph_metrics=self.cudagraph_metrics,
            enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
            enable_mm_processor_stats=self.enable_mm_processor_stats,
        )
        # Compilation config overrides
--- a/vllm/entrypoints/cli/init.py
+++ b/vllm/entrypoints/cli/init.py
@ -1,6 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
 from vllm.entrypoints.cli.benchmark.multimodal_processor import (
    BenchmarkMultimodalProcessorSubcommand,
 )
 from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
 from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
 from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
@ -8,6 +11,7 @@ from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcomm
 __all__: list[str] = [
    "BenchmarkLatencySubcommand",
    "BenchmarkMultimodalProcessorSubcommand",
    "BenchmarkServingSubcommand",
    "BenchmarkStartupSubcommand",
    "BenchmarkSweepSubcommand",
--- a/vllm/entrypoints/cli/benchmark/multimodal_processor.py
+++ b/vllm/entrypoints/cli/benchmark/multimodal_processor.py
@ -0,0 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 from vllm.benchmarks.multimodal_processor import add_cli_args, main
 from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
 class BenchmarkMultimodalProcessorSubcommand(BenchmarkSubcommandBase):
    """The `multimodal-processor` subcommand for `vllm bench`."""
    name = "multimodal-processor"
    help = "Benchmark multimodal processor latency across different configurations."
    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        add_cli_args(parser)
    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        main(args)
--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -651,9 +651,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0",
    # If set, vllm will log stats at this interval in seconds
    # If not set, vllm will log stats every 10 seconds.
-    "VLLM_LOG_STATS_INTERVAL": lambda: val
+    "VLLM_LOG_STATS_INTERVAL": lambda: (
-    if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
+        val
-    else 10.0,
+        if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
        else 10.0
    ),
    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
@ -678,28 +680,30 @@ environment_variables: dict[str, Callable[[], Any]] = {
        ),
    ),
    # If set, vllm will use flashinfer sampler
-    "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
+    "VLLM_USE_FLASHINFER_SAMPLER": lambda: (
-        int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
+        bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
-    )
+        if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
-    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
+        else None
-    else None,
+    ),
    # Pipeline stage partition strategy
    "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
    # (CPU backend only) CPU key-value cache space.
    # default is None and will be set as 4 GB
-    "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
+    "VLLM_CPU_KVCACHE_SPACE": lambda: (
-    if "VLLM_CPU_KVCACHE_SPACE" in os.environ
+        int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
-    else None,
+        if "VLLM_CPU_KVCACHE_SPACE" in os.environ
        else None
    ),
    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
    "VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
    # (CPU backend only) CPU cores not used by OMP threads .
    # Those CPU cores will not be used by OMP threads of a rank.
-    "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
+    "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: (
-        os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
+        int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
-    )
+        if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
-    if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
+        else None
-    else None,
+    ),
    # (CPU backend only) whether to use SGL kernels, optimized for small batch.
    "VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
    # If the env var is set, Ray Compiled Graph uses the specified
@ -843,9 +847,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # a list of plugin names to load, separated by commas.
    # if this is not set, it means all plugins will be loaded
    # if this is set to an empty string, no plugins will be loaded
-    "VLLM_PLUGINS": lambda: None
+    "VLLM_PLUGINS": lambda: (
-    if "VLLM_PLUGINS" not in os.environ
+        None
-    else os.environ["VLLM_PLUGINS"].split(","),
+        if "VLLM_PLUGINS" not in os.environ
        else os.environ["VLLM_PLUGINS"].split(",")
    ),
    # a local directory to look in for unrecognized LoRA adapters.
    # only works if plugins are enabled and
    # VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
@ -917,9 +923,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # and performance comparisons. Currently only affects MPLinearKernel
    # selection
    # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
-    "VLLM_DISABLED_KERNELS": lambda: []
+    "VLLM_DISABLED_KERNELS": lambda: (
-    if "VLLM_DISABLED_KERNELS" not in os.environ
+        []
-    else os.environ["VLLM_DISABLED_KERNELS"].split(","),
+        if "VLLM_DISABLED_KERNELS" not in os.environ
        else os.environ["VLLM_DISABLED_KERNELS"].split(",")
    ),
    # Disable pynccl (using torch.distributed instead)
    "VLLM_DISABLE_PYNCCL": lambda: (
        os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
@ -1155,11 +1163,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    == "1",
    # Gap between padding buckets for the forward pass. So we have
    # 8, we will run forward pass with [16, 24, 32, ...].
-    "VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
+    "VLLM_TPU_BUCKET_PADDING_GAP": lambda: (
-        os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]
+        int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
-    )
+        if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
-    if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
+        else 0
-    else 0,
+    ),
    "VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
        os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)
    ),
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@ -6,7 +6,7 @@ from typing import Any, cast
 from typing_extensions import assert_never
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, ObservabilityConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
@ -47,6 +47,7 @@ class InputPreprocessor:
        self,
        model_config: ModelConfig,
        tokenizer: TokenizerLike | None,
        observability_config: ObservabilityConfig | None = None,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
        mm_processor_cache: BaseMultiModalProcessorCache | None = None,
    ) -> None:
@ -54,6 +55,7 @@ class InputPreprocessor:
        self.model_config = model_config
        self.tokenizer = tokenizer
        self.observability_config = observability_config
        self.mm_registry = mm_registry
        self.mm_processor_cache = mm_processor_cache
@ -232,6 +234,7 @@ class InputPreprocessor:
        if not hasattr(self, "_mm_processor"):
            self._mm_processor = self.mm_registry.create_processor(
                self.model_config,
                self.observability_config,
                tokenizer=self.tokenizer,
                cache=self.mm_processor_cache,
            )
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@ -1,9 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextvars
 import threading
 import time
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from collections.abc import Callable, Generator, ItemsView, Iterable, Mapping, Sequence
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
 from enum import Enum
 from functools import lru_cache
@ -22,6 +25,7 @@ import regex as re
 import torch
 from typing_extensions import TypeVar, assert_never
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike
 from vllm.transformers_utils.processor import cached_processor_from_config
@ -53,7 +57,7 @@ if TYPE_CHECKING:
    from transformers.feature_extraction_utils import BatchFeature
    from transformers.processing_utils import ProcessorMixin
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, ObservabilityConfig
    from .cache import BaseMultiModalProcessorCache
    from .profiling import BaseDummyInputsBuilder
@ -63,6 +67,7 @@ else:
    ProcessorMixin = object
    ModelConfig = object
    ObservabilityConfig = object
    BaseMultiModalProcessorCache = object
@ -70,6 +75,125 @@ logger = init_logger(__name__)
 _S = TypeVar("_S", str, list[int])
 # Context variable to store the current request_id during preprocessing
 _request_id_context: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "_request_id_context", default=None
 )
 def get_current_request_id() -> str | None:
    """Get the current request_id from the context, if available."""
    return _request_id_context.get()
@contextmanager
 def set_request_id(request_id: str) -> Generator[None, None, None]:
    """Context manager to set the request_id for the current context."""
    token = _request_id_context.set(request_id)
    try:
        yield
    finally:
        _request_id_context.reset(token)
@dataclass
 class MultiModalProcessorTimingStats:
    """Per-request timing statistics for multimodal processor stages."""
    hf_processor_time: float = 0.0
    """Time spent in HuggingFace processor calls (seconds)."""
    hashing_time: float = 0.0
    """Time spent computing multimodal item hashes (seconds)."""
    cache_lookup_time: float = 0.0
    """Time spent in cache lookups and merges (seconds)."""
    prompt_update_time: float = 0.0
    """Time spent applying prompt updates and finding placeholders (seconds)."""
    total_time: float = 0.0
    """Total processing time (seconds)."""
    def to_dict(self) -> dict[str, float]:
        """Convert stats to a dictionary for JSON serialization."""
        return {
            "hf_processor_time": self.hf_processor_time,
            "hashing_time": self.hashing_time,
            "cache_lookup_time": self.cache_lookup_time,
            "prompt_update_time": self.prompt_update_time,
            "total_time": self.total_time,
        }
 def get_timing_stats_from_engine_client(engine_client: Any) -> dict[str, dict[str, float]]:
    """
    Get all timing stats from the context associated with the engine client.
    Args:
        engine_client: The engine client that has input_processor.
    Returns:
        A dictionary mapping request_id to stats dict.
    """
    try:
        if not engine_client.vllm_config.observability_config.enable_mm_processor_stats:
            return {}
    except (AttributeError, RuntimeError):
        return {}
    try:
        input_processor = engine_client.input_processor
        input_preprocessor = input_processor.input_preprocessor
        if hasattr(input_preprocessor, "_get_mm_processor"):
            mm_processor = input_preprocessor._get_mm_processor()
            if mm_processor is not None and hasattr(mm_processor, "info"):
                ctx = mm_processor.info.ctx
                return ctx.get_all_timing_stats()
    except (AttributeError, RuntimeError):
        pass
    return {}
@contextmanager
 def _timed_operation(ctx: "InputProcessingContext", stage_name: str):
    """
    Context manager to time an operation using the context's timing stats.
    The request_id is automatically retrieved from the context variable,
    so it doesn't need to be passed as a parameter.
    Args:
        ctx: The InputProcessingContext containing the timing stats registry.
        stage_name: Name of the stage being timed.
    """
    request_id = get_current_request_id()
    if ctx is None or request_id is None:
        yield
        return
    stats = ctx.get_timing_stats(request_id)
    if stats is None:
        yield
        return
    start_time = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start_time
        if stage_name == "hf_processor":
            stats.hf_processor_time += elapsed
        elif stage_name == "hashing":
            stats.hashing_time += elapsed
        elif stage_name == "cache_lookup":
            stats.cache_lookup_time += elapsed
        elif stage_name == "prompt_update":
            stats.prompt_update_time += elapsed
 PromptSeq: TypeAlias = str | list[int]
 """A token sequence (list of token IDs) or text."""
@ -951,6 +1075,21 @@ class InputProcessingContext:
    tokenizer: TokenizerLike | None
    """The tokenizer used to tokenize the inputs."""
    observability_config: "ObservabilityConfig | None" = field(
        default=None, compare=False, repr=False
    )
    """Configuration for observability features."""
    timing_stats_registry: dict[str, MultiModalProcessorTimingStats] = field(
        default_factory=dict, compare=False, repr=False
    )
    """Registry for storing timing stats keyed by request_id."""
    _timing_stats_registry_lock: threading.Lock = field(
        default_factory=threading.Lock, compare=False, repr=False
    )
    """Lock for thread-safe access to timing_stats_registry."""
    def get_tokenizer(self) -> TokenizerLike:
        if self.tokenizer is None:
            raise ValueError(
@ -1159,6 +1298,71 @@ class InputProcessingContext:
        return self._postprocess_output(output)
    def get_timing_stats(
        self, request_id: str
    ) -> MultiModalProcessorTimingStats | None:
        """
        Get timing stats for a request.
        """
        if (
            self.observability_config is None
            or not self.observability_config.enable_mm_processor_stats
        ):
            return None
        with self._timing_stats_registry_lock:
            return self.timing_stats_registry.get(request_id)
    def create_timing_stats(self, request_id: str) -> MultiModalProcessorTimingStats:
        """
        Create and store timing stats in the registry for a request.
        This should be called at the start of processing for a request.
        The stats object is created immediately and stored in the registry.
        """
        if (
            self.observability_config is None
            or not self.observability_config.enable_mm_processor_stats
        ):
            return MultiModalProcessorTimingStats()
        with self._timing_stats_registry_lock:
            if request_id in self.timing_stats_registry:
                raise ValueError(
                    f"Timing stats already exist for request_id: {request_id}"
                )
            stats = MultiModalProcessorTimingStats()
            self.timing_stats_registry[request_id] = stats
            return stats
    def clear_timing_stats_registry(self) -> int:
        """
        Clear all stats from the registry. Returns the number of stats cleared.
        """
        if (
            self.observability_config is None
            or not self.observability_config.enable_mm_processor_stats
        ):
            return 0
        with self._timing_stats_registry_lock:
            count = len(self.timing_stats_registry)
            self.timing_stats_registry.clear()
            return count
    def get_all_timing_stats(self) -> dict[str, dict[str, float]]:
        """
        Get all timing stats as a dictionary for API endpoints.
        """
        if (
            self.observability_config is None
            or not self.observability_config.enable_mm_processor_stats
        ):
            return {}
        with self._timing_stats_registry_lock:
            return {
                rid: stats.to_dict()
                for rid, stats in self.timing_stats_registry.items()
            }
 class BaseProcessingInfo:
    """Base class to provide the information necessary for data processing."""
@ -1494,11 +1698,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        Call the HF processor on the prompt text and
        associated multi-modal data.
        """
-        return self.info.ctx.call_hf_processor(
+        with _timed_operation(self.info.ctx, "hf_processor"):
-            self.info.get_hf_processor(**mm_kwargs),
+            return self.info.ctx.call_hf_processor(
-            dict(text=prompt, **mm_data),
+                self.info.get_hf_processor(**mm_kwargs),
-            dict(**mm_kwargs, **tok_kwargs),
+                dict(text=prompt, **mm_data),
-        )
+                dict(**mm_kwargs, **tok_kwargs),
            )
    def _hf_processor_applies_updates(
        self,
@ -1846,12 +2051,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        )
        # Use overrides if provided; fallback to data-dependent hashing.
-        mm_hashes = self._hash_mm_items(
+        with _timed_operation(self.info.ctx, "hashing"):
-            mm_data_items,
+            mm_hashes = self._hash_mm_items(
-            hf_processor_mm_kwargs,
+                mm_data_items,
-            tokenization_kwargs,
+                hf_processor_mm_kwargs,
-            mm_uuids=mm_uuids,
+                tokenization_kwargs,
-        )
+                mm_uuids=mm_uuids,
            )
        mm_prompt_updates = self._get_mm_prompt_updates(
            mm_data_items,
@ -1892,18 +2098,20 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                mm_uuids=mm_uuids,
            )
-        mm_hashes = self._hash_mm_items(
+        with _timed_operation(self.info.ctx, "hashing"):
-            mm_data_items,
+            mm_hashes = self._hash_mm_items(
-            hf_processor_mm_kwargs,
+                mm_data_items,
-            tokenization_kwargs,
+                hf_processor_mm_kwargs,
-            mm_uuids=mm_uuids,
+                tokenization_kwargs,
-        )
+                mm_uuids=mm_uuids,
            )
-        mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
+        with _timed_operation(self.info.ctx, "cache_lookup"):
-            cache=cache,
+            mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
-            mm_data_items=mm_data_items,
+                cache=cache,
-            mm_hashes=mm_hashes,
+                mm_data_items=mm_data_items,
-        )
+                mm_hashes=mm_hashes,
            )
        # NOTE: `prompt` does not correspond to `mm_missing_data_items`,
        # so we can't apply prompt updates until the new multimodal
@ -1933,13 +2141,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
            mm_missing_kwargs,
        )
-        mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs(
+        with _timed_operation(self.info.ctx, "cache_lookup"):
-            cache,
+            mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs(
-            mm_hashes=mm_hashes,
+                cache,
-            mm_is_cached=mm_is_cached,
+                mm_hashes=mm_hashes,
-            mm_missing_kwargs=mm_missing_kwargs,
+                mm_is_cached=mm_is_cached,
-            mm_missing_prompt_updates=mm_missing_prompt_updates,
+                mm_missing_kwargs=mm_missing_kwargs,
-        )
+                mm_missing_prompt_updates=mm_missing_prompt_updates,
            )
        mm_info = MultiModalProcessingInfo(
            kwargs=mm_kwargs,
@ -2121,6 +2330,15 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        3. Extract information about the placeholder tokens from the
           processed token IDs.
        """
        request_id = get_current_request_id()
        if request_id is not None:
            self.info.ctx.create_timing_stats(request_id)
        stats = (
            self.info.ctx.get_timing_stats(request_id)
            if request_id is not None
            else None
        )
        mm_items = self._to_mm_items(mm_data)
        if tokenization_kwargs is None:
@ -2139,13 +2357,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        )
        # NOTE: tokenization_kwargs are not required to init processor
-        prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
+        with _timed_operation(self.info.ctx, "prompt_update"):
-            mm_items=mm_items,
+            prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
-            prompt_ids=prompt_ids,
+                mm_items=mm_items,
-            mm_kwargs=mm_info.kwargs,
+                prompt_ids=prompt_ids,
-            mm_prompt_updates=mm_info.prompt_updates,
+                mm_kwargs=mm_info.kwargs,
-            is_update_applied=is_update_applied,
+                mm_prompt_updates=mm_info.prompt_updates,
-        )
+                is_update_applied=is_update_applied,
            )
        mm_placeholder_ranges = {
            modality: [item.to_range() for item in placeholders]
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@ -5,6 +5,7 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.config.observability import ObservabilityConfig
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
@ -22,7 +23,7 @@ from .profiling import (
 )
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, ObservabilityConfig
    from vllm.model_executor.models.interfaces import SupportsMultiModal
 logger = init_logger(__name__)
@ -148,6 +149,7 @@ class MultiModalRegistry:
        *,
        cache: BaseMultiModalProcessorCache | None = None,
        profiler_limits: Mapping[str, int] | None = None,
        observability_config: ObservabilityConfig | None = None,
    ) -> Mapping[str, int]:
        """
        Get the maximum number of tokens per data item from each modality based
@ -156,7 +158,11 @@ class MultiModalRegistry:
        if not model_config.is_multimodal_model:
            return {}
-        processor = self.create_processor(model_config, cache=cache)
+        if observability_config is None:
            observability_config = ObservabilityConfig()
        processor = self.create_processor(
            model_config, observability_config, cache=cache
        )
        profiler: MultiModalProfiler = MultiModalProfiler(processor)
        seq_len = model_config.max_model_len
@ -174,6 +180,7 @@ class MultiModalRegistry:
        model_config: "ModelConfig",
        *,
        cache: BaseMultiModalProcessorCache | None = None,
        observability_config: ObservabilityConfig | None = None,
    ) -> Mapping[str, int]:
        """
        Get the maximum number of multi-modal input instances for each modality
@ -182,7 +189,11 @@ class MultiModalRegistry:
        if not model_config.is_multimodal_model:
            return {}
-        processor = self.create_processor(model_config, cache=cache)
+        if observability_config is None:
            observability_config = ObservabilityConfig()
        processor = self.create_processor(
            model_config, observability_config, cache=cache
        )
        profiler: MultiModalProfiler = MultiModalProfiler(processor)
        return profiler.get_mm_limits()
@ -231,27 +242,32 @@ class MultiModalRegistry:
    def _create_processing_ctx(
        self,
        model_config: "ModelConfig",
        observability_config: "ObservabilityConfig | None" = None,
        tokenizer: TokenizerLike | None = None,
    ) -> InputProcessingContext:
        if tokenizer is None and not model_config.skip_tokenizer_init:
            tokenizer = cached_tokenizer_from_config(model_config)
-        return InputProcessingContext(model_config, tokenizer)
+        return InputProcessingContext(
            model_config, tokenizer, observability_config=observability_config
        )
    def _create_processing_info(
        self,
        model_config: "ModelConfig",
        observability_config: "ObservabilityConfig | None" = None,
        *,
        tokenizer: TokenizerLike | None = None,
    ) -> BaseProcessingInfo:
        model_cls = self._get_model_cls(model_config)
        factories = model_cls._processor_factory
-        ctx = self._create_processing_ctx(model_config, tokenizer)
+        ctx = self._create_processing_ctx(model_config, observability_config, tokenizer)
        return factories.info(ctx)
    def create_processor(
        self,
        model_config: "ModelConfig",
        observability_config: "ObservabilityConfig | None" = None,
        *,
        tokenizer: TokenizerLike | None = None,
        cache: BaseMultiModalProcessorCache | None = None,
@ -265,7 +281,7 @@ class MultiModalRegistry:
        model_cls = self._get_model_cls(model_config)
        factories = model_cls._processor_factory
-        ctx = self._create_processing_ctx(model_config, tokenizer)
+        ctx = self._create_processing_ctx(model_config, observability_config, tokenizer)
        return factories.build_processor(ctx, cache=cache)
@ -276,13 +292,18 @@ class MultiModalRegistry:
        mm_counts: Mapping[str, int] | None = None,
        *,
        cache: BaseMultiModalProcessorCache | None = None,
        observability_config: ObservabilityConfig | None = None,
    ) -> DummyDecoderData:
        """
        Create dummy data for profiling the memory usage of a model.
        The model is identified by `model_config`.
        """
-        processor = self.create_processor(model_config, cache=cache)
+        if observability_config is None:
            observability_config = ObservabilityConfig()
        processor = self.create_processor(
            model_config, observability_config, cache=cache
        )
        profiler: MultiModalProfiler = MultiModalProfiler(processor)
        # Extract configurable options from multimodal config.
@ -309,13 +330,18 @@ class MultiModalRegistry:
        mm_counts: Mapping[str, int] | None = None,
        *,
        cache: BaseMultiModalProcessorCache | None = None,
        observability_config: ObservabilityConfig | None = None,
    ) -> DummyEncoderData:
        """
        Create dummy data for profiling the memory usage of a model.
        The model is identified by `model_config`.
        """
-        processor = self.create_processor(model_config, cache=cache)
+        if observability_config is None:
            observability_config = ObservabilityConfig()
        processor = self.create_processor(
            model_config, observability_config, cache=cache
        )
        profiler: MultiModalProfiler = MultiModalProfiler(processor)
        # Extract configurable options from multimodal config.
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@ -15,7 +15,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import processor_cache_from_config
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
 from vllm.multimodal.parse import MultiModalDataParser
-from vllm.multimodal.processing import EncDecMultiModalProcessor
+from vllm.multimodal.processing import EncDecMultiModalProcessor, set_request_id
 from vllm.multimodal.utils import argsort_mm_positions
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
@ -57,6 +57,7 @@ class InputProcessor:
        self.input_preprocessor = InputPreprocessor(
            self.model_config,
            tokenizer,
            self.vllm_config.observability_config,
            mm_registry,
            mm_processor_cache=self.mm_processor_cache,
        )
@ -445,11 +446,13 @@ class InputProcessor:
        # 1. Tokenize text prompt, with LoRA request if one exists.
        # 2. For multimodal models with a merged preprocessor, preprocess
        #   multimodal data and expand prompt token ids accordingly.
-        processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
+        with set_request_id(request_id):
-            prompt,
+            processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
-            tokenization_kwargs=tokenization_kwargs,
+                prompt,
-            mm_uuids=mm_uuids,
+                tokenization_kwargs=tokenization_kwargs,
-        )
+                mm_uuids=mm_uuids,
            )
        from vllm.platforms import current_platform
        current_platform.validate_request(
@ -590,6 +593,7 @@ class InputProcessor:
                mm_registry = self.input_preprocessor.mm_registry
                mm_processor = mm_registry.create_processor(
                    model_config,
                    self.vllm_config.observability_config,
                    tokenizer=tokenizer,
                )
                assert isinstance(mm_processor, EncDecMultiModalProcessor)