Signed-off-by: Reagan <reaganjlee@gmail.com>
This commit is contained in:
Reagan 2025-12-18 18:36:58 -08:00
parent a81128725c
commit ae0c59e57e
4 changed files with 8 additions and 32 deletions

View File

@@ -2,12 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark multimodal processor latency.
This benchmark measures the latency of the multimodal processor module
This benchmark measures the latency of the mm processor module
using randomly generated multimodal prompts with synthetic images.
MM processor stats are automatically enabled.
Run:
vllm bench multimodal-processor \
vllm bench mm-processor \
--model <your_model> \
--num-prompts 10 \
--input-len 1024 \
@@ -55,7 +55,6 @@ class MultimodalProcessorBenchmarkMetrics:
def collect_mm_processor_stats(
llm_engine: Any,
debug: bool = False,
) -> dict[str, list[float]]:
"""
Collect multimodal processor timing stats.
@@ -84,12 +83,6 @@ def collect_mm_processor_stats(
)
stats_by_stage["total_time"].append(stats_dict.get("total_time", 0.0))
if debug and not any(stats_by_stage.values()):
print(
"Warning: No MM processor stats found. "
"Ensure --enable-mm-processor-stats is set."
)
return stats_by_stage
@@ -222,8 +215,6 @@ def benchmark_multimodal_processor(
freeze_gc_heap()
debug = getattr(args, "debug_mm_stats", False)
print(f"Processing {len(prompts)} requests...")
start_time = time.perf_counter()
@@ -236,7 +227,6 @@ def benchmark_multimodal_processor(
mm_stats_by_stage = collect_mm_processor_stats(
llm.llm_engine,
debug=debug,
)
if not any(mm_stats_by_stage.values()):
@@ -350,11 +340,6 @@ def add_cli_args(parser: argparse.ArgumentParser) -> None:
default=None,
help="Path to save the benchmark results in JSON format.",
)
parser.add_argument(
"--debug-mm-stats",
action="store_true",
help="Enable debug logging for MM processor stats collection.",
)
parser.add_argument(
"--metric-percentiles",
type=str,
@@ -444,7 +429,7 @@ def main(args: argparse.Namespace) -> None:
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark multimodal processor latency"
description="Benchmark mm processor latency"
)
add_cli_args(parser)
args = parser.parse_args()

View File

@@ -66,8 +66,8 @@ class ObservabilityConfig:
enable_mm_processor_stats: bool = False
"""Enable collection of timing statistics for multimodal processor operations.
This can be useful for performance analysis and debugging. Defaults to `False`
(disabled)."""
This is for internal use only (e.g., benchmarks) and is not exposed as a CLI
argument. Defaults to `False` (disabled)."""
@cached_property
def collect_model_forward_time(self) -> bool:

View File

@@ -1041,10 +1041,9 @@ class EngineArgs:
"--enable-layerwise-nvtx-tracing",
**observability_kwargs["enable_layerwise_nvtx_tracing"],
)
observability_group.add_argument(
"--enable-mm-processor-stats",
**observability_kwargs["enable_mm_processor_stats"],
)
# Note: --enable-mm-processor-stats is intentionally not exposed as a CLI
# argument. It can be set programmatically via parser.set_defaults() for
# internal use (e.g., benchmarks), but is not part of the public API.
# Scheduler arguments
scheduler_kwargs = get_kwargs(SchedulerConfig)

View File

@@ -158,8 +158,6 @@ class MultiModalRegistry:
if not model_config.is_multimodal_model:
return {}
if observability_config is None:
observability_config = ObservabilityConfig()
processor = self.create_processor(
model_config, observability_config, cache=cache
)
@@ -189,8 +187,6 @@ class MultiModalRegistry:
if not model_config.is_multimodal_model:
return {}
if observability_config is None:
observability_config = ObservabilityConfig()
processor = self.create_processor(
model_config, observability_config, cache=cache
)
@@ -299,8 +295,6 @@ class MultiModalRegistry:
The model is identified by `model_config`.
"""
if observability_config is None:
observability_config = ObservabilityConfig()
processor = self.create_processor(
model_config, observability_config, cache=cache
)
@@ -337,8 +331,6 @@ class MultiModalRegistry:
The model is identified by `model_config`.
"""
if observability_config is None:
observability_config = ObservabilityConfig()
processor = self.create_processor(
model_config, observability_config, cache=cache
)