diff --git a/vllm/benchmarks/mm_processor.py b/vllm/benchmarks/mm_processor.py index 889e2272c3cb8..32e7075fb6408 100644 --- a/vllm/benchmarks/mm_processor.py +++ b/vllm/benchmarks/mm_processor.py @@ -2,12 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark multimodal processor latency. -This benchmark measures the latency of the multimodal processor module +This benchmark measures the latency of the mm processor module using randomly generated multimodal prompts with synthetic images. MM processor stats are automatically enabled. Run: - vllm bench multimodal-processor \ + vllm bench mm-processor \ --model <your_model> \ --num-prompts 10 \ --input-len 1024 \ @@ -55,7 +55,6 @@ class MultimodalProcessorBenchmarkMetrics: def collect_mm_processor_stats( llm_engine: Any, - debug: bool = False, ) -> dict[str, list[float]]: """ Collect multimodal processor timing stats. @@ -84,12 +83,6 @@ def collect_mm_processor_stats( ) stats_by_stage["total_time"].append(stats_dict.get("total_time", 0.0)) - if debug and not any(stats_by_stage.values()): - print( - "Warning: No MM processor stats found. " - "Ensure --enable-mm-processor-stats is set." 
- ) - return stats_by_stage @@ -222,8 +215,6 @@ def benchmark_multimodal_processor( freeze_gc_heap() - debug = getattr(args, "debug_mm_stats", False) - print(f"Processing {len(prompts)} requests...") start_time = time.perf_counter() @@ -236,7 +227,6 @@ def benchmark_multimodal_processor( mm_stats_by_stage = collect_mm_processor_stats( llm.llm_engine, - debug=debug, ) if not any(mm_stats_by_stage.values()): @@ -350,11 +340,6 @@ def add_cli_args(parser: argparse.ArgumentParser) -> None: default=None, help="Path to save the benchmark results in JSON format.", ) - parser.add_argument( - "--debug-mm-stats", - action="store_true", - help="Enable debug logging for MM processor stats collection.", - ) parser.add_argument( "--metric-percentiles", type=str, @@ -444,7 +429,7 @@ def main(args: argparse.Namespace) -> None: if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Benchmark multimodal processor latency" + description="Benchmark mm processor latency" ) add_cli_args(parser) args = parser.parse_args() diff --git a/vllm/config/observability.py b/vllm/config/observability.py index 3333b661a759c..eb4c1482f35cf 100644 --- a/vllm/config/observability.py +++ b/vllm/config/observability.py @@ -66,8 +66,8 @@ class ObservabilityConfig: enable_mm_processor_stats: bool = False """Enable collection of timing statistics for multimodal processor operations. - This can be useful for performance analysis and debugging. Defaults to `False` - (disabled).""" + This is for internal use only (e.g., benchmarks) and is not exposed as a CLI + argument. 
Defaults to `False` (disabled).""" @cached_property def collect_model_forward_time(self) -> bool: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ee2b6d0457c4d..143849c3bc970 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1041,10 +1041,9 @@ class EngineArgs: "--enable-layerwise-nvtx-tracing", **observability_kwargs["enable_layerwise_nvtx_tracing"], ) - observability_group.add_argument( - "--enable-mm-processor-stats", - **observability_kwargs["enable_mm_processor_stats"], - ) + # Note: --enable-mm-processor-stats is intentionally not exposed as a CLI + # argument. It can be set programmatically via parser.set_defaults() for + # internal use (e.g., benchmarks), but is not part of the public API. # Scheduler arguments scheduler_kwargs = get_kwargs(SchedulerConfig) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 67a1d05facb2b..b141125969fad 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -158,8 +158,6 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return {} - if observability_config is None: - observability_config = ObservabilityConfig() processor = self.create_processor( model_config, observability_config, cache=cache ) @@ -189,8 +187,6 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return {} - if observability_config is None: - observability_config = ObservabilityConfig() processor = self.create_processor( model_config, observability_config, cache=cache ) @@ -299,8 +295,6 @@ class MultiModalRegistry: The model is identified by `model_config`. """ - if observability_config is None: - observability_config = ObservabilityConfig() processor = self.create_processor( model_config, observability_config, cache=cache ) @@ -337,8 +331,6 @@ class MultiModalRegistry: The model is identified by `model_config`. 
""" - if observability_config is None: - observability_config = ObservabilityConfig() processor = self.create_processor( model_config, observability_config, cache=cache )