mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-10 18:50:15 +08:00
change
Signed-off-by: Reagan Lee <reaganjlee@gmail.com> Signed-off-by: Reagan <reaganjlee@gmail.com>
This commit is contained in:
parent
e087fbc393
commit
e3dd9108cb
9
docs/cli/bench/multimodal_processor.md
Normal file
9
docs/cli/bench/multimodal_processor.md
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# vllm bench multimodal-processor
|
||||||
|
|
||||||
|
## JSON CLI Arguments
|
||||||
|
|
||||||
|
--8<-- "docs/cli/json_tip.inc.md"
|
||||||
|
|
||||||
|
## Arguments
|
||||||
|
|
||||||
|
--8<-- "docs/argparse/bench_multimodal_processor.inc.md"
|
||||||
@ -92,6 +92,7 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
|
|||||||
|
|
||||||
|
|
||||||
bench_latency = auto_mock("vllm.benchmarks", "latency")
|
bench_latency = auto_mock("vllm.benchmarks", "latency")
|
||||||
|
bench_multimodal_processor = auto_mock("vllm.benchmarks", "multimodal_processor")
|
||||||
bench_serve = auto_mock("vllm.benchmarks", "serve")
|
bench_serve = auto_mock("vllm.benchmarks", "serve")
|
||||||
bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
|
bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
|
||||||
bench_sweep_plot_pareto = auto_mock(
|
bench_sweep_plot_pareto = auto_mock(
|
||||||
@ -222,6 +223,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
|||||||
"run-batch": create_parser(openai_run_batch.make_arg_parser),
|
"run-batch": create_parser(openai_run_batch.make_arg_parser),
|
||||||
# Benchmark CLI
|
# Benchmark CLI
|
||||||
"bench_latency": create_parser(bench_latency.add_cli_args),
|
"bench_latency": create_parser(bench_latency.add_cli_args),
|
||||||
|
"bench_multimodal_processor": create_parser(bench_multimodal_processor.add_cli_args),
|
||||||
"bench_serve": create_parser(bench_serve.add_cli_args),
|
"bench_serve": create_parser(bench_serve.add_cli_args),
|
||||||
"bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
|
"bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
|
||||||
"bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
|
"bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
|
||||||
|
|||||||
@ -12,7 +12,11 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
from vllm.config import (
|
||||||
|
ModelConfig,
|
||||||
|
VllmConfig,
|
||||||
|
set_current_vllm_config,
|
||||||
|
)
|
||||||
from vllm.config.multimodal import (
|
from vllm.config.multimodal import (
|
||||||
AudioDummyOptions,
|
AudioDummyOptions,
|
||||||
BaseDummyOptions,
|
BaseDummyOptions,
|
||||||
|
|||||||
@ -1908,7 +1908,8 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Enforce endpoint compatibility for multimodal datasets.
|
# Enforce endpoint compatibility for multimodal datasets.
|
||||||
if args.dataset_name == "random-mm" and args.backend not in ["openai-chat"]:
|
backend = getattr(args, "backend", "openai-chat")
|
||||||
|
if args.dataset_name == "random-mm" and backend not in ["openai-chat"]:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Multi-modal content (images) is only supported on "
|
"Multi-modal content (images) is only supported on "
|
||||||
"'openai-chat' backend."
|
"'openai-chat' backend."
|
||||||
|
|||||||
464
vllm/benchmarks/multimodal_processor.py
Normal file
464
vllm/benchmarks/multimodal_processor.py
Normal file
@ -0,0 +1,464 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
r"""Benchmark multimodal processor latency.
|
||||||
|
|
||||||
|
This benchmark measures the latency of the multimodal processor module
|
||||||
|
using randomly generated multimodal prompts with synthetic images.
|
||||||
|
MM processor stats are automatically enabled.
|
||||||
|
|
||||||
|
Run:
|
||||||
|
vllm bench multimodal-processor \
|
||||||
|
--model <your_model> \
|
||||||
|
--num-prompts 10 \
|
||||||
|
--input-len 1024 \
|
||||||
|
--output-len 128 \
|
||||||
|
--num-images 1
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import dataclasses
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
|
from vllm.multimodal.processing import (
|
||||||
|
get_timing_stats_from_engine_client,
|
||||||
|
)
|
||||||
|
from vllm.utils.gc_utils import freeze_gc_heap
|
||||||
|
from vllm.utils.import_utils import PlaceholderModule
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
except ImportError:
|
||||||
|
pd = PlaceholderModule("pandas")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MultimodalProcessorBenchmarkMetrics:
    """Metrics for multimodal processor benchmark.

    The field names match the keys of the result dictionary built by
    `benchmark_multimodal_processor`.

    Attributes:
        completed: Number of requests whose output is marked finished.
        failed: Number of requests that did not finish.
        mean_e2el_ms: Mean end-to-end latency in milliseconds.
        median_e2el_ms: Median end-to-end latency in milliseconds.
        std_e2el_ms: Standard deviation of end-to-end latency in milliseconds.
        percentiles_e2el_ms: `(percentile, latency_ms)` pairs, one per
            requested percentile.
    """

    completed: int
    failed: int
    mean_e2el_ms: float
    median_e2el_ms: float
    std_e2el_ms: float
    percentiles_e2el_ms: list[tuple[float, float]]

    mm_processor_stats: dict[str, dict[str, float]]
    """Per-stage timing stats: mean, median, std, percentiles for each stage."""
|
||||||
|
|
||||||
|
|
||||||
|
def collect_mm_processor_stats(
    llm_engine: Any,
    debug: bool = False,
) -> dict[str, list[float]]:
    """
    Gather multimodal processor timings and regroup them by stage.

    The stats come from `get_timing_stats_from_engine_client(llm_engine)`,
    which is iterated as a mapping of per-entry stat dictionaries. Every
    entry contributes one value to each stage list; a stage missing from an
    entry is recorded as 0.0.

    Args:
        llm_engine: Engine object handed to the stats getter.
        debug: When true, print a warning if no stats were collected.

    Returns:
        A dictionary mapping stage names to lists of timing values
        (in seconds).
    """
    stage_names = (
        "hf_processor_time",
        "hashing_time",
        "cache_lookup_time",
        "prompt_update_time",
        "total_time",
    )

    per_entry_stats = list(get_timing_stats_from_engine_client(llm_engine).values())

    # One list per stage, each holding one value per collected entry.
    stats_by_stage = {
        stage: [entry.get(stage, 0.0) for entry in per_entry_stats]
        for stage in stage_names
    }

    nothing_collected = not any(stats_by_stage.values())
    if debug and nothing_collected:
        print(
            "Warning: No MM processor stats found. Ensure --enable-mm-processor-stats is set."
        )

    return stats_by_stage
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_mm_processor_metrics(
    stats_by_stage: dict[str, list[float]],
    selected_percentiles: list[float],
) -> dict[str, dict[str, float]]:
    """
    Summarize per-stage timings into aggregate statistics.

    Input values are multiplied by 1000 before aggregation, so the reported
    numbers are in milliseconds when the inputs are in seconds.

    Args:
        stats_by_stage: Stage name mapped to its list of timing values.
        selected_percentiles: Percentiles to report; each one is stored
            under the key ``f"p{percentile}"``.

    Returns:
        Stage name mapped to a dictionary with "mean", "median", "std" and
        one entry per percentile. A stage with no samples reports 0.0 for
        every statistic.
    """

    def summarize(samples: list[float]) -> dict[str, float]:
        # No samples: emit the same keys with zeroed values.
        if not samples:
            summary = {"mean": 0.0, "median": 0.0, "std": 0.0}
            for pct in selected_percentiles:
                summary[f"p{pct}"] = 0.0
            return summary

        samples_ms = [value * 1000 for value in samples]
        summary = {
            "mean": float(np.mean(samples_ms)),
            "median": float(np.median(samples_ms)),
            "std": float(np.std(samples_ms)),
        }
        for pct in selected_percentiles:
            summary[f"p{pct}"] = float(np.percentile(samples_ms, pct))
        return summary

    return {
        stage_name: summarize(samples)
        for stage_name, samples in stats_by_stage.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def generate_random_multimodal_prompts(
    num_prompts: int,
    input_len: int,
    output_len: int,
    tokenizer: Any,
    num_images: int = 1,
    image_width: int = 256,
    image_height: int = 256,
    seed: int = 0,
) -> tuple[list[list[dict]], list[int]]:
    """
    Generate random multimodal prompts with synthetic images and text tokens.

    Each prompt is a single user message whose content is one text part
    (random token ids decoded by `tokenizer`) followed by `num_images`
    random RGB images passed through `process_image`.

    Args:
        num_prompts: Number of prompts to generate.
        input_len: Number of random token ids drawn for each text prompt.
        output_len: Expected output length recorded for every prompt.
        tokenizer: Object providing `vocab_size` and `decode`.
        num_images: Number of images attached to each prompt.
        image_width: Width of each generated image in pixels.
        image_height: Height of each generated image in pixels.
        seed: Seed for the NumPy random generator.

    Returns:
        tuple: (prompts, expected_output_lens)
            - prompts: List of OpenAI chat format messages with text and images
            - expected_output_lens: List of expected output lengths
    """
    from PIL import Image

    from vllm.benchmarks.datasets import process_image

    rng = np.random.default_rng(seed)

    # The vocabulary size does not change between prompts; read it once.
    vocab_size = tokenizer.vocab_size

    prompts = []
    for _ in range(num_prompts):
        # Draw order (text tokens first, then images) is kept stable so a
        # given seed always yields the same prompts.
        prompt_token_ids = rng.integers(0, vocab_size, size=input_len).tolist()
        text_prompt = tokenizer.decode(prompt_token_ids)

        mm_items = []
        for _ in range(num_images):
            # Generate random RGB image
            random_pixels = rng.integers(
                0, 256, (image_height, image_width, 3), dtype=np.uint8
            )
            image = Image.fromarray(random_pixels)
            # Process to OpenAI format
            mm_items.append(process_image(image))

        # Create chat format: text + images
        content = [{"type": "text", "text": text_prompt}, *mm_items]
        prompts.append([{"role": "user", "content": content}])

    expected_output_lens = [output_len] * num_prompts

    return prompts, expected_output_lens
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark_multimodal_processor(
    args: argparse.Namespace,
) -> dict[str, Any]:
    """
    Run the multimodal processor benchmark.

    Builds an `LLM` from the engine arguments in `args`, sends randomly
    generated multimodal chat prompts through `LLM.chat`, then aggregates
    the multimodal processor stage timings and end-to-end latencies.

    Args:
        args: Parsed CLI arguments. Reads `input_len`, `output_len`,
            `num_prompts`, `num_images`, `image_width`, `image_height`, and
            optionally `seed`, `metric_percentiles`, `debug_mm_stats` and
            `disable_tqdm`.

    Returns:
        A dictionary with the keys "completed", "failed", "mean_e2el_ms",
        "median_e2el_ms", "std_e2el_ms", "percentiles_e2el_ms" and
        "mm_processor_stats".

    Raises:
        ValueError: If the model's `max_model_len` is smaller than
            `input_len + output_len`.
    """
    from vllm import LLM, SamplingParams

    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))

    # Validate max_model_len with an explicit raise: an `assert` would be
    # stripped when Python runs with -O.
    max_model_len = llm.llm_engine.model_config.max_model_len
    if max_model_len < args.input_len + args.output_len:
        raise ValueError(
            "Please ensure that max_model_len is greater than or equal to "
            "the sum of input_len and output_len."
        )

    # Generate random multimodal prompts
    seed = getattr(args, "seed", 0)
    tokenizer = llm.get_tokenizer()
    prompts, expected_output_lens = generate_random_multimodal_prompts(
        num_prompts=args.num_prompts,
        input_len=args.input_len,
        output_len=args.output_len,
        tokenizer=tokenizer,
        num_images=args.num_images,
        image_width=args.image_width,
        image_height=args.image_height,
        seed=seed,
    )

    # Create sampling params
    sampling_params = [
        SamplingParams(
            n=1,
            temperature=0.0,  # Greedy sampling for deterministic speed benchmarks
            max_tokens=output_len,
            detokenize=True,
        )
        for output_len in expected_output_lens
    ]

    selected_percentiles = [
        float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
    ]

    freeze_gc_heap()

    # MM processor stats are enabled by the parser defaults set in
    # add_cli_args, so no explicit check is done here.
    debug = getattr(args, "debug_mm_stats", False)

    print(f"Processing {len(prompts)} requests...")
    start_time = time.perf_counter()

    outputs = llm.chat(
        prompts, sampling_params, use_tqdm=not getattr(args, "disable_tqdm", False)
    )

    end_time = time.perf_counter()
    total_time = end_time - start_time

    mm_stats_by_stage = collect_mm_processor_stats(
        llm.llm_engine,
        debug=debug,
    )

    if not any(mm_stats_by_stage.values()):
        print(
            "\n⚠️  Warning: No MM processor stats found in registry.\n"
            "   This may indicate that:\n"
            "   - No multimodal requests were processed\n"
            "   - Stats were already retrieved (registry is cleared after retrieval)\n"
        )

    mm_processor_metrics = calculate_mm_processor_metrics(
        mm_stats_by_stage, selected_percentiles
    )

    completed = sum(1 for output in outputs if output.finished)
    failed = len(outputs) - completed

    e2el_times = []
    for output in outputs:
        if not output.finished or output.metrics is None:
            continue
        latency_ms = _request_e2el_ms(output.metrics)
        if latency_ms is not None:
            e2el_times.append(latency_ms)

    # Fallback when no per-request timestamps are available: spread the
    # wall-clock time evenly over the completed requests.
    if not e2el_times and completed > 0:
        avg_time_per_request = total_time / completed
        e2el_times = [avg_time_per_request * 1000] * completed

    if e2el_times:
        mean_e2el_ms = float(np.mean(e2el_times))
        median_e2el_ms = float(np.median(e2el_times))
        std_e2el_ms = float(np.std(e2el_times))
        percentiles_e2el_ms = [
            (p, float(np.percentile(e2el_times, p))) for p in selected_percentiles
        ]
    else:
        mean_e2el_ms = 0.0
        median_e2el_ms = 0.0
        std_e2el_ms = 0.0
        percentiles_e2el_ms = [(p, 0.0) for p in selected_percentiles]

    return {
        "completed": completed,
        "failed": failed,
        "mean_e2el_ms": mean_e2el_ms,
        "median_e2el_ms": median_e2el_ms,
        "std_e2el_ms": std_e2el_ms,
        "percentiles_e2el_ms": percentiles_e2el_ms,
        "mm_processor_stats": mm_processor_metrics,
    }


def _request_e2el_ms(request_metrics: Any) -> float | None:
    """Return one request's end-to-end latency in ms, or None if unavailable.

    The end timestamp is `finished_time` when set, otherwise
    `last_token_time`; the latency is measured from `arrival_time`.
    """
    arrival_time = getattr(request_metrics, "arrival_time", None)
    if arrival_time is None:
        return None
    for attr in ("finished_time", "last_token_time"):
        end_timestamp = getattr(request_metrics, attr, None)
        if end_timestamp is not None:
            return (end_timestamp - arrival_time) * 1000
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def add_cli_args(parser: argparse.ArgumentParser) -> None:
    """Register the multimodal processor benchmark options on `parser`.

    Adds the `EngineArgs` options first, switches the parser default for
    `enable_mm_processor_stats` to True, then adds the benchmark-specific
    options for prompt generation, output and reporting.
    """
    from vllm.engine.arg_utils import EngineArgs

    EngineArgs.add_cli_args(parser)

    # This benchmark reads the MM processor stats, so collect them by default.
    parser.set_defaults(enable_mm_processor_stats=True)

    # Integer options controlling random prompt generation:
    # (flag, default, help text).
    int_options = (
        ("--num-prompts", 10, "Number of prompts to process."),
        ("--input-len", 1024, "Number of input tokens per request."),
        ("--output-len", 128, "Number of output tokens per request."),
        ("--num-images", 1, "Number of images per prompt."),
        ("--image-width", 256, "Width of generated images in pixels."),
        ("--image-height", 256, "Height of generated images in pixels."),
    )
    for flag, default_value, help_text in int_options:
        parser.add_argument(flag, type=int, default=default_value, help=help_text)

    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the benchmark results in JSON format.",
    )
    parser.add_argument(
        "--debug-mm-stats",
        action="store_true",
        help="Enable debug logging for MM processor stats collection.",
    )
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles to calculate (e.g., '50,90,99').",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Disable tqdm progress bar.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: argparse.Namespace) -> None:
    """Run the benchmark, print result tables, and optionally save JSON.

    Prints one table of per-stage MM processor timings and one table of
    end-to-end latency statistics. When `args.output_json` is set, the
    result dictionary, extended with the run configuration and a timestamp,
    is written to that path.
    """
    print("Starting multimodal processor benchmark...")
    result = benchmark_multimodal_processor(args)

    banner = "=" * 80
    print("\n" + banner)
    print("Multimodal Processor Benchmark Results")
    print(banner)

    # Both tables report the same percentiles; parse the option once.
    selected_percentiles = [
        float(p) for p in getattr(args, "metric_percentiles", "99").split(",")
    ]

    if "mm_processor_stats" in result:
        print("\nMM Processor Timing (ms):")
        stage_rows = []
        for stage, stage_metrics in result["mm_processor_stats"].items():
            stage_row = {
                "Stage": stage,
                "Mean": f"{stage_metrics['mean']:.2f}",
                "Median": f"{stage_metrics['median']:.2f}",
                "Std": f"{stage_metrics['std']:.2f}",
            }
            stage_row.update(
                (f"P{p}", f"{stage_metrics.get(f'p{p}', 0.0):.2f}")
                for p in selected_percentiles
            )
            stage_rows.append(stage_row)

        print(pd.DataFrame(stage_rows).to_string(index=False))

    if "mean_e2el_ms" in result:
        print("\nEnd-to-End Latency (ms):")

        latency_rows = [
            {"Metric": label, "Value (ms)": f"{result[key]:.2f}"}
            for label, key in (
                ("Mean", "mean_e2el_ms"),
                ("Median", "median_e2el_ms"),
                ("Std", "std_e2el_ms"),
            )
        ]

        # Percentile values are stored as (percentile, value) pairs.
        value_by_percentile = dict(result["percentiles_e2el_ms"])
        for p in selected_percentiles:
            latency_rows.append(
                {
                    "Metric": f"P{p}",
                    "Value (ms)": f"{value_by_percentile.get(p, 0.0):.2f}",
                }
            )

        print(pd.DataFrame(latency_rows).to_string(index=False))

    if not args.output_json:
        return

    result["config"] = {
        "model": args.model,
        "num_prompts": args.num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
        "num_images": args.num_images,
        "image_width": args.image_width,
        "image_height": args.image_height,
    }
    result["timestamp"] = datetime.now().isoformat()

    with open(args.output_json, "w") as f:
        json.dump(result, f, indent=2)
    print(f"\nResults saved to {args.output_json}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Standalone execution: build the parser, register the benchmark
    # options, and run the benchmark with the parsed arguments.
    cli_parser = argparse.ArgumentParser(
        description="Benchmark multimodal processor latency"
    )
    add_cli_args(cli_parser)
    main(cli_parser.parse_args())
|
||||||
@ -64,6 +64,11 @@ class ObservabilityConfig:
|
|||||||
module in the model and attach informations such as input/output shapes to
|
module in the model and attach informations such as input/output shapes to
|
||||||
nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
|
nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
|
||||||
|
|
||||||
|
enable_mm_processor_stats: bool = False
|
||||||
|
"""Enable collection of timing statistics for multimodal processor operations.
|
||||||
|
This can be useful for performance analysis and debugging. Defaults to `False`
|
||||||
|
(disabled)."""
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def collect_model_forward_time(self) -> bool:
|
def collect_model_forward_time(self) -> bool:
|
||||||
"""Whether to collect model forward time for the request."""
|
"""Whether to collect model forward time for the request."""
|
||||||
|
|||||||
@ -521,6 +521,7 @@ class EngineArgs:
|
|||||||
enable_layerwise_nvtx_tracing: bool = (
|
enable_layerwise_nvtx_tracing: bool = (
|
||||||
ObservabilityConfig.enable_layerwise_nvtx_tracing
|
ObservabilityConfig.enable_layerwise_nvtx_tracing
|
||||||
)
|
)
|
||||||
|
enable_mm_processor_stats: bool = ObservabilityConfig.enable_mm_processor_stats
|
||||||
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
|
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
|
||||||
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
|
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
|
||||||
|
|
||||||
@ -1040,6 +1041,10 @@ class EngineArgs:
|
|||||||
"--enable-layerwise-nvtx-tracing",
|
"--enable-layerwise-nvtx-tracing",
|
||||||
**observability_kwargs["enable_layerwise_nvtx_tracing"],
|
**observability_kwargs["enable_layerwise_nvtx_tracing"],
|
||||||
)
|
)
|
||||||
|
observability_group.add_argument(
|
||||||
|
"--enable-mm-processor-stats",
|
||||||
|
**observability_kwargs["enable_mm_processor_stats"],
|
||||||
|
)
|
||||||
|
|
||||||
# Scheduler arguments
|
# Scheduler arguments
|
||||||
scheduler_kwargs = get_kwargs(SchedulerConfig)
|
scheduler_kwargs = get_kwargs(SchedulerConfig)
|
||||||
@ -1682,6 +1687,7 @@ class EngineArgs:
|
|||||||
kv_cache_metrics_sample=self.kv_cache_metrics_sample,
|
kv_cache_metrics_sample=self.kv_cache_metrics_sample,
|
||||||
cudagraph_metrics=self.cudagraph_metrics,
|
cudagraph_metrics=self.cudagraph_metrics,
|
||||||
enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
|
enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
|
||||||
|
enable_mm_processor_stats=self.enable_mm_processor_stats,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Compilation config overrides
|
# Compilation config overrides
|
||||||
|
|||||||
@ -1,6 +1,9 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
|
from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
|
||||||
|
from vllm.entrypoints.cli.benchmark.multimodal_processor import (
|
||||||
|
BenchmarkMultimodalProcessorSubcommand,
|
||||||
|
)
|
||||||
from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
|
from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
|
||||||
from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
|
from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
|
||||||
from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
|
from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
|
||||||
@ -8,6 +11,7 @@ from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcomm
|
|||||||
|
|
||||||
__all__: list[str] = [
|
__all__: list[str] = [
|
||||||
"BenchmarkLatencySubcommand",
|
"BenchmarkLatencySubcommand",
|
||||||
|
"BenchmarkMultimodalProcessorSubcommand",
|
||||||
"BenchmarkServingSubcommand",
|
"BenchmarkServingSubcommand",
|
||||||
"BenchmarkStartupSubcommand",
|
"BenchmarkStartupSubcommand",
|
||||||
"BenchmarkSweepSubcommand",
|
"BenchmarkSweepSubcommand",
|
||||||
|
|||||||
22
vllm/entrypoints/cli/benchmark/multimodal_processor.py
Normal file
22
vllm/entrypoints/cli/benchmark/multimodal_processor.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from vllm.benchmarks.multimodal_processor import add_cli_args, main
|
||||||
|
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkMultimodalProcessorSubcommand(BenchmarkSubcommandBase):
    """The `multimodal-processor` subcommand for `vllm bench`."""

    name = "multimodal-processor"
    help = "Benchmark multimodal processor latency across different configurations."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the benchmark by delegating to the benchmark module's `main`."""
        main(args)

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Register the benchmark's CLI options on `parser`."""
        # Resolves to the module-level function imported from
        # vllm.benchmarks.multimodal_processor, not to this classmethod.
        add_cli_args(parser)
|
||||||
|
|
||||||
62
vllm/envs.py
62
vllm/envs.py
@ -651,9 +651,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0",
|
"NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0",
|
||||||
# If set, vllm will log stats at this interval in seconds
|
# If set, vllm will log stats at this interval in seconds
|
||||||
# If not set, vllm will log stats every 10 seconds.
|
# If not set, vllm will log stats every 10 seconds.
|
||||||
"VLLM_LOG_STATS_INTERVAL": lambda: val
|
"VLLM_LOG_STATS_INTERVAL": lambda: (
|
||||||
if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
|
val
|
||||||
else 10.0,
|
if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0
|
||||||
|
else 10.0
|
||||||
|
),
|
||||||
# Trace function calls
|
# Trace function calls
|
||||||
# If set to 1, vllm will trace function calls
|
# If set to 1, vllm will trace function calls
|
||||||
# Useful for debugging
|
# Useful for debugging
|
||||||
@ -678,28 +680,30 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
# If set, vllm will use flashinfer sampler
|
# If set, vllm will use flashinfer sampler
|
||||||
"VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
|
"VLLM_USE_FLASHINFER_SAMPLER": lambda: (
|
||||||
int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
|
bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]))
|
||||||
)
|
if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
|
||||||
if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
|
else None
|
||||||
else None,
|
),
|
||||||
# Pipeline stage partition strategy
|
# Pipeline stage partition strategy
|
||||||
"VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
|
"VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
|
||||||
# (CPU backend only) CPU key-value cache space.
|
# (CPU backend only) CPU key-value cache space.
|
||||||
# default is None and will be set as 4 GB
|
# default is None and will be set as 4 GB
|
||||||
"VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
|
"VLLM_CPU_KVCACHE_SPACE": lambda: (
|
||||||
if "VLLM_CPU_KVCACHE_SPACE" in os.environ
|
int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
|
||||||
else None,
|
if "VLLM_CPU_KVCACHE_SPACE" in os.environ
|
||||||
|
else None
|
||||||
|
),
|
||||||
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
|
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
|
||||||
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
|
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
|
||||||
"VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
|
"VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
|
||||||
# (CPU backend only) CPU cores not used by OMP threads .
|
# (CPU backend only) CPU cores not used by OMP threads .
|
||||||
# Those CPU cores will not be used by OMP threads of a rank.
|
# Those CPU cores will not be used by OMP threads of a rank.
|
||||||
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
|
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: (
|
||||||
os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
|
int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0"))
|
||||||
)
|
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
|
||||||
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ
|
else None
|
||||||
else None,
|
),
|
||||||
# (CPU backend only) whether to use SGL kernels, optimized for small batch.
|
# (CPU backend only) whether to use SGL kernels, optimized for small batch.
|
||||||
"VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
|
"VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
|
||||||
# If the env var is set, Ray Compiled Graph uses the specified
|
# If the env var is set, Ray Compiled Graph uses the specified
|
||||||
@ -843,9 +847,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# a list of plugin names to load, separated by commas.
|
# a list of plugin names to load, separated by commas.
|
||||||
# if this is not set, it means all plugins will be loaded
|
# if this is not set, it means all plugins will be loaded
|
||||||
# if this is set to an empty string, no plugins will be loaded
|
# if this is set to an empty string, no plugins will be loaded
|
||||||
"VLLM_PLUGINS": lambda: None
|
"VLLM_PLUGINS": lambda: (
|
||||||
if "VLLM_PLUGINS" not in os.environ
|
None
|
||||||
else os.environ["VLLM_PLUGINS"].split(","),
|
if "VLLM_PLUGINS" not in os.environ
|
||||||
|
else os.environ["VLLM_PLUGINS"].split(",")
|
||||||
|
),
|
||||||
# a local directory to look in for unrecognized LoRA adapters.
|
# a local directory to look in for unrecognized LoRA adapters.
|
||||||
# only works if plugins are enabled and
|
# only works if plugins are enabled and
|
||||||
# VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
|
# VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
|
||||||
@ -917,9 +923,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# and performance comparisons. Currently only affects MPLinearKernel
|
# and performance comparisons. Currently only affects MPLinearKernel
|
||||||
# selection
|
# selection
|
||||||
# (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
|
# (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
|
||||||
"VLLM_DISABLED_KERNELS": lambda: []
|
"VLLM_DISABLED_KERNELS": lambda: (
|
||||||
if "VLLM_DISABLED_KERNELS" not in os.environ
|
[]
|
||||||
else os.environ["VLLM_DISABLED_KERNELS"].split(","),
|
if "VLLM_DISABLED_KERNELS" not in os.environ
|
||||||
|
else os.environ["VLLM_DISABLED_KERNELS"].split(",")
|
||||||
|
),
|
||||||
# Disable pynccl (using torch.distributed instead)
|
# Disable pynccl (using torch.distributed instead)
|
||||||
"VLLM_DISABLE_PYNCCL": lambda: (
|
"VLLM_DISABLE_PYNCCL": lambda: (
|
||||||
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
|
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
|
||||||
@ -1155,11 +1163,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
== "1",
|
== "1",
|
||||||
# Gap between padding buckets for the forward pass. So we have
|
# Gap between padding buckets for the forward pass. So we have
|
||||||
# 8, we will run forward pass with [16, 24, 32, ...].
|
# 8, we will run forward pass with [16, 24, 32, ...].
|
||||||
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
|
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: (
|
||||||
os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]
|
int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
|
||||||
)
|
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
|
||||||
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
|
else 0
|
||||||
else 0,
|
),
|
||||||
"VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
|
"VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
|
||||||
os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)
|
os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)
|
||||||
),
|
),
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from typing import Any, cast
|
|||||||
|
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig, ObservabilityConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||||
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
||||||
@ -47,6 +47,7 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
model_config: ModelConfig,
|
model_config: ModelConfig,
|
||||||
tokenizer: TokenizerLike | None,
|
tokenizer: TokenizerLike | None,
|
||||||
|
observability_config: ObservabilityConfig | None = None,
|
||||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
mm_processor_cache: BaseMultiModalProcessorCache | None = None,
|
mm_processor_cache: BaseMultiModalProcessorCache | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -54,6 +55,7 @@ class InputPreprocessor:
|
|||||||
|
|
||||||
self.model_config = model_config
|
self.model_config = model_config
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
|
self.observability_config = observability_config
|
||||||
self.mm_registry = mm_registry
|
self.mm_registry = mm_registry
|
||||||
self.mm_processor_cache = mm_processor_cache
|
self.mm_processor_cache = mm_processor_cache
|
||||||
|
|
||||||
@ -232,6 +234,7 @@ class InputPreprocessor:
|
|||||||
if not hasattr(self, "_mm_processor"):
|
if not hasattr(self, "_mm_processor"):
|
||||||
self._mm_processor = self.mm_registry.create_processor(
|
self._mm_processor = self.mm_registry.create_processor(
|
||||||
self.model_config,
|
self.model_config,
|
||||||
|
self.observability_config,
|
||||||
tokenizer=self.tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
cache=self.mm_processor_cache,
|
cache=self.mm_processor_cache,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -1,9 +1,12 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import contextvars
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections.abc import Callable, Generator, ItemsView, Iterable, Mapping, Sequence
|
from collections.abc import Callable, Generator, ItemsView, Iterable, Mapping, Sequence
|
||||||
|
from contextlib import contextmanager
|
||||||
from dataclasses import dataclass, field, replace
|
from dataclasses import dataclass, field, replace
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
@ -22,6 +25,7 @@ import regex as re
|
|||||||
import torch
|
import torch
|
||||||
from typing_extensions import TypeVar, assert_never
|
from typing_extensions import TypeVar, assert_never
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.tokenizers import TokenizerLike
|
from vllm.tokenizers import TokenizerLike
|
||||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||||
@ -53,7 +57,7 @@ if TYPE_CHECKING:
|
|||||||
from transformers.feature_extraction_utils import BatchFeature
|
from transformers.feature_extraction_utils import BatchFeature
|
||||||
from transformers.processing_utils import ProcessorMixin
|
from transformers.processing_utils import ProcessorMixin
|
||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig, ObservabilityConfig
|
||||||
|
|
||||||
from .cache import BaseMultiModalProcessorCache
|
from .cache import BaseMultiModalProcessorCache
|
||||||
from .profiling import BaseDummyInputsBuilder
|
from .profiling import BaseDummyInputsBuilder
|
||||||
@ -63,6 +67,7 @@ else:
|
|||||||
ProcessorMixin = object
|
ProcessorMixin = object
|
||||||
|
|
||||||
ModelConfig = object
|
ModelConfig = object
|
||||||
|
ObservabilityConfig = object
|
||||||
|
|
||||||
BaseMultiModalProcessorCache = object
|
BaseMultiModalProcessorCache = object
|
||||||
|
|
||||||
@ -70,6 +75,125 @@ logger = init_logger(__name__)
|
|||||||
|
|
||||||
_S = TypeVar("_S", str, list[int])
|
_S = TypeVar("_S", str, list[int])
|
||||||
|
|
||||||
|
# Context variable to store the current request_id during preprocessing
|
||||||
|
_request_id_context: contextvars.ContextVar[str | None] = contextvars.ContextVar(
|
||||||
|
"_request_id_context", default=None
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_current_request_id() -> str | None:
|
||||||
|
"""Get the current request_id from the context, if available."""
|
||||||
|
return _request_id_context.get()
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def set_request_id(request_id: str) -> Generator[None, None, None]:
|
||||||
|
"""Context manager to set the request_id for the current context."""
|
||||||
|
token = _request_id_context.set(request_id)
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
_request_id_context.reset(token)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MultiModalProcessorTimingStats:
    """Per-request timing statistics for multimodal processor stages.

    All durations are in seconds and start at zero; stage timers add to them
    as the corresponding stage runs.
    """

    hf_processor_time: float = 0.0
    """Time spent in HuggingFace processor calls (seconds)."""

    hashing_time: float = 0.0
    """Time spent computing multimodal item hashes (seconds)."""

    cache_lookup_time: float = 0.0
    """Time spent in cache lookups and merges (seconds)."""

    prompt_update_time: float = 0.0
    """Time spent applying prompt updates and finding placeholders (seconds)."""

    total_time: float = 0.0
    """Total processing time (seconds)."""

    def to_dict(self) -> dict[str, float]:
        """Convert stats to a dictionary for JSON serialization.

        Returns:
            A new dict with one entry per field, keyed by the field name.
        """
        return {
            "hf_processor_time": self.hf_processor_time,
            "hashing_time": self.hashing_time,
            "cache_lookup_time": self.cache_lookup_time,
            "prompt_update_time": self.prompt_update_time,
            "total_time": self.total_time,
        }
|
||||||
|
|
||||||
|
|
||||||
|
def get_timing_stats_from_engine_client(
    engine_client: Any,
) -> dict[str, dict[str, float]]:
    """Get all timing stats from the context associated with the engine client.

    This is a best-effort lookup: any missing attribute along the chain, or a
    `RuntimeError` raised while resolving it, yields an empty result instead
    of propagating.

    Args:
        engine_client: The engine client that has `vllm_config` and
            `input_processor` attributes.

    Returns:
        A dictionary mapping request_id to that request's stats dict, or an
        empty dict when `enable_mm_processor_stats` is falsy or the
        multimodal processor context cannot be reached.
    """
    try:
        observability_config = engine_client.vllm_config.observability_config
        if not observability_config.enable_mm_processor_stats:
            return {}

        input_preprocessor = engine_client.input_processor.input_preprocessor
        if not hasattr(input_preprocessor, "_get_mm_processor"):
            return {}

        # NOTE(review): `_get_mm_processor` is a private accessor; it
        # presumably builds the processor lazily on first use -- confirm that
        # triggering it from a stats query is acceptable.
        mm_processor = input_preprocessor._get_mm_processor()
        if mm_processor is None or not hasattr(mm_processor, "info"):
            return {}

        return mm_processor.info.ctx.get_all_timing_stats()
    except (AttributeError, RuntimeError):
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def _timed_operation(
    ctx: "InputProcessingContext", stage_name: str
) -> Generator[None, None, None]:
    """Context manager to time an operation using the context's timing stats.

    The request_id is automatically retrieved from the context variable,
    so it doesn't need to be passed as a parameter.

    Args:
        ctx: The InputProcessingContext containing the timing stats registry.
        stage_name: Name of the stage being timed. Recognized values are
            "hf_processor", "hashing", "cache_lookup" and "prompt_update";
            the elapsed time is added to the matching `<stage_name>_time`
            field. Any other name is timed but not recorded.

    When `ctx` is None, no request_id is set, or `ctx.get_timing_stats`
    returns None, the wrapped body runs without any timing.
    """
    request_id = get_current_request_id()
    stats = (
        ctx.get_timing_stats(request_id)
        if ctx is not None and request_id is not None
        else None
    )
    if stats is None:
        yield
        return

    start_time = time.perf_counter()
    try:
        yield
    finally:
        # Record in `finally` so a stage that raises still has its time counted.
        elapsed = time.perf_counter() - start_time
        if stage_name in ("hf_processor", "hashing", "cache_lookup", "prompt_update"):
            attr = f"{stage_name}_time"
            # Accumulate: the same stage may run more than once per request.
            setattr(stats, attr, getattr(stats, attr) + elapsed)
|
||||||
|
|
||||||
|
|
||||||
PromptSeq: TypeAlias = str | list[int]
|
PromptSeq: TypeAlias = str | list[int]
|
||||||
"""A token sequence (list of token IDs) or text."""
|
"""A token sequence (list of token IDs) or text."""
|
||||||
|
|
||||||
@ -951,6 +1075,21 @@ class InputProcessingContext:
|
|||||||
tokenizer: TokenizerLike | None
|
tokenizer: TokenizerLike | None
|
||||||
"""The tokenizer used to tokenize the inputs."""
|
"""The tokenizer used to tokenize the inputs."""
|
||||||
|
|
||||||
|
observability_config: "ObservabilityConfig | None" = field(
|
||||||
|
default=None, compare=False, repr=False
|
||||||
|
)
|
||||||
|
"""Configuration for observability features."""
|
||||||
|
|
||||||
|
timing_stats_registry: dict[str, MultiModalProcessorTimingStats] = field(
|
||||||
|
default_factory=dict, compare=False, repr=False
|
||||||
|
)
|
||||||
|
"""Registry for storing timing stats keyed by request_id."""
|
||||||
|
|
||||||
|
_timing_stats_registry_lock: threading.Lock = field(
|
||||||
|
default_factory=threading.Lock, compare=False, repr=False
|
||||||
|
)
|
||||||
|
"""Lock for thread-safe access to timing_stats_registry."""
|
||||||
|
|
||||||
def get_tokenizer(self) -> TokenizerLike:
|
def get_tokenizer(self) -> TokenizerLike:
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -1159,6 +1298,71 @@ class InputProcessingContext:
|
|||||||
|
|
||||||
return self._postprocess_output(output)
|
return self._postprocess_output(output)
|
||||||
|
|
||||||
|
def get_timing_stats(
    self, request_id: str
) -> MultiModalProcessorTimingStats | None:
    """Get timing stats for a request.

    Args:
        request_id: Key under which the stats were registered.

    Returns:
        The stats object stored for `request_id`, or `None` when
        `observability_config` is unset, `enable_mm_processor_stats` is
        falsy, or nothing is registered under that id.
    """
    config = self.observability_config
    if config is None or not config.enable_mm_processor_stats:
        return None

    with self._timing_stats_registry_lock:
        return self.timing_stats_registry.get(request_id)
|
||||||
|
|
||||||
|
def create_timing_stats(self, request_id: str) -> MultiModalProcessorTimingStats:
    """Create and store timing stats in the registry for a request.

    This should be called at the start of processing for a request.
    The stats object is created immediately and stored in the registry.

    Args:
        request_id: Key under which the new stats object is registered.

    Returns:
        The newly registered stats object. When `observability_config` is
        unset or `enable_mm_processor_stats` is falsy, a fresh stats object
        is returned without being stored.

    Raises:
        ValueError: If stats are already registered for `request_id`.
    """
    config = self.observability_config
    if config is None or not config.enable_mm_processor_stats:
        return MultiModalProcessorTimingStats()

    with self._timing_stats_registry_lock:
        # NOTE(review): entries are only removed by
        # `clear_timing_stats_registry`, so processing the same request_id
        # twice between clears raises here -- confirm that is intended.
        if request_id in self.timing_stats_registry:
            raise ValueError(
                f"Timing stats already exist for request_id: {request_id}"
            )
        stats = MultiModalProcessorTimingStats()
        self.timing_stats_registry[request_id] = stats
        return stats
|
||||||
|
|
||||||
|
def clear_timing_stats_registry(self) -> int:
    """Clear all stats from the registry.

    Returns:
        The number of stats entries removed. Returns 0 without touching the
        registry when `observability_config` is unset or
        `enable_mm_processor_stats` is falsy.
    """
    config = self.observability_config
    if config is None or not config.enable_mm_processor_stats:
        return 0

    with self._timing_stats_registry_lock:
        # Count before clearing so the caller learns how many were dropped.
        count = len(self.timing_stats_registry)
        self.timing_stats_registry.clear()
        return count
|
||||||
|
|
||||||
|
def get_all_timing_stats(self) -> dict[str, dict[str, float]]:
    """Get all timing stats as a dictionary for API endpoints.

    Returns:
        A new dict mapping each registered request_id to the result of that
        entry's `to_dict()`. Empty when `observability_config` is unset or
        `enable_mm_processor_stats` is falsy.
    """
    config = self.observability_config
    if config is None or not config.enable_mm_processor_stats:
        return {}

    # Build the snapshot while holding the lock so the registry cannot be
    # resized mid-iteration by another thread using the same lock.
    with self._timing_stats_registry_lock:
        return {
            rid: stats.to_dict()
            for rid, stats in self.timing_stats_registry.items()
        }
|
||||||
|
|
||||||
|
|
||||||
class BaseProcessingInfo:
|
class BaseProcessingInfo:
|
||||||
"""Base class to provide the information necessary for data processing."""
|
"""Base class to provide the information necessary for data processing."""
|
||||||
@ -1494,11 +1698,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
Call the HF processor on the prompt text and
|
Call the HF processor on the prompt text and
|
||||||
associated multi-modal data.
|
associated multi-modal data.
|
||||||
"""
|
"""
|
||||||
return self.info.ctx.call_hf_processor(
|
with _timed_operation(self.info.ctx, "hf_processor"):
|
||||||
self.info.get_hf_processor(**mm_kwargs),
|
return self.info.ctx.call_hf_processor(
|
||||||
dict(text=prompt, **mm_data),
|
self.info.get_hf_processor(**mm_kwargs),
|
||||||
dict(**mm_kwargs, **tok_kwargs),
|
dict(text=prompt, **mm_data),
|
||||||
)
|
dict(**mm_kwargs, **tok_kwargs),
|
||||||
|
)
|
||||||
|
|
||||||
def _hf_processor_applies_updates(
|
def _hf_processor_applies_updates(
|
||||||
self,
|
self,
|
||||||
@ -1846,12 +2051,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Use overrides if provided; fallback to data-dependent hashing.
|
# Use overrides if provided; fallback to data-dependent hashing.
|
||||||
mm_hashes = self._hash_mm_items(
|
with _timed_operation(self.info.ctx, "hashing"):
|
||||||
mm_data_items,
|
mm_hashes = self._hash_mm_items(
|
||||||
hf_processor_mm_kwargs,
|
mm_data_items,
|
||||||
tokenization_kwargs,
|
hf_processor_mm_kwargs,
|
||||||
mm_uuids=mm_uuids,
|
tokenization_kwargs,
|
||||||
)
|
mm_uuids=mm_uuids,
|
||||||
|
)
|
||||||
|
|
||||||
mm_prompt_updates = self._get_mm_prompt_updates(
|
mm_prompt_updates = self._get_mm_prompt_updates(
|
||||||
mm_data_items,
|
mm_data_items,
|
||||||
@ -1892,18 +2098,20 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|
||||||
mm_hashes = self._hash_mm_items(
|
with _timed_operation(self.info.ctx, "hashing"):
|
||||||
mm_data_items,
|
mm_hashes = self._hash_mm_items(
|
||||||
hf_processor_mm_kwargs,
|
mm_data_items,
|
||||||
tokenization_kwargs,
|
hf_processor_mm_kwargs,
|
||||||
mm_uuids=mm_uuids,
|
tokenization_kwargs,
|
||||||
)
|
mm_uuids=mm_uuids,
|
||||||
|
)
|
||||||
|
|
||||||
mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
|
with _timed_operation(self.info.ctx, "cache_lookup"):
|
||||||
cache=cache,
|
mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
|
||||||
mm_data_items=mm_data_items,
|
cache=cache,
|
||||||
mm_hashes=mm_hashes,
|
mm_data_items=mm_data_items,
|
||||||
)
|
mm_hashes=mm_hashes,
|
||||||
|
)
|
||||||
|
|
||||||
# NOTE: `prompt` does not correspond to `mm_missing_data_items`,
|
# NOTE: `prompt` does not correspond to `mm_missing_data_items`,
|
||||||
# so we can't apply prompt updates until the new multimodal
|
# so we can't apply prompt updates until the new multimodal
|
||||||
@ -1933,13 +2141,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_missing_kwargs,
|
mm_missing_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs(
|
with _timed_operation(self.info.ctx, "cache_lookup"):
|
||||||
cache,
|
mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs(
|
||||||
mm_hashes=mm_hashes,
|
cache,
|
||||||
mm_is_cached=mm_is_cached,
|
mm_hashes=mm_hashes,
|
||||||
mm_missing_kwargs=mm_missing_kwargs,
|
mm_is_cached=mm_is_cached,
|
||||||
mm_missing_prompt_updates=mm_missing_prompt_updates,
|
mm_missing_kwargs=mm_missing_kwargs,
|
||||||
)
|
mm_missing_prompt_updates=mm_missing_prompt_updates,
|
||||||
|
)
|
||||||
|
|
||||||
mm_info = MultiModalProcessingInfo(
|
mm_info = MultiModalProcessingInfo(
|
||||||
kwargs=mm_kwargs,
|
kwargs=mm_kwargs,
|
||||||
@ -2121,6 +2330,15 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
3. Extract information about the placeholder tokens from the
|
3. Extract information about the placeholder tokens from the
|
||||||
processed token IDs.
|
processed token IDs.
|
||||||
"""
|
"""
|
||||||
|
request_id = get_current_request_id()
|
||||||
|
if request_id is not None:
|
||||||
|
self.info.ctx.create_timing_stats(request_id)
|
||||||
|
|
||||||
|
stats = (
|
||||||
|
self.info.ctx.get_timing_stats(request_id)
|
||||||
|
if request_id is not None
|
||||||
|
else None
|
||||||
|
)
|
||||||
mm_items = self._to_mm_items(mm_data)
|
mm_items = self._to_mm_items(mm_data)
|
||||||
|
|
||||||
if tokenization_kwargs is None:
|
if tokenization_kwargs is None:
|
||||||
@ -2139,13 +2357,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: tokenization_kwargs are not required to init processor
|
# NOTE: tokenization_kwargs are not required to init processor
|
||||||
prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
|
with _timed_operation(self.info.ctx, "prompt_update"):
|
||||||
mm_items=mm_items,
|
prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
|
||||||
prompt_ids=prompt_ids,
|
mm_items=mm_items,
|
||||||
mm_kwargs=mm_info.kwargs,
|
prompt_ids=prompt_ids,
|
||||||
mm_prompt_updates=mm_info.prompt_updates,
|
mm_kwargs=mm_info.kwargs,
|
||||||
is_update_applied=is_update_applied,
|
mm_prompt_updates=mm_info.prompt_updates,
|
||||||
)
|
is_update_applied=is_update_applied,
|
||||||
|
)
|
||||||
|
|
||||||
mm_placeholder_ranges = {
|
mm_placeholder_ranges = {
|
||||||
modality: [item.to_range() for item in placeholders]
|
modality: [item.to_range() for item in placeholders]
|
||||||
|
|||||||
@ -5,6 +5,7 @@ from dataclasses import dataclass
|
|||||||
from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
|
from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
|
||||||
|
|
||||||
from vllm.config.multimodal import BaseDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions
|
||||||
|
from vllm.config.observability import ObservabilityConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
|
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
|
||||||
|
|
||||||
@ -22,7 +23,7 @@ from .profiling import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig, ObservabilityConfig
|
||||||
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@ -148,6 +149,7 @@ class MultiModalRegistry:
|
|||||||
*,
|
*,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
profiler_limits: Mapping[str, int] | None = None,
|
profiler_limits: Mapping[str, int] | None = None,
|
||||||
|
observability_config: ObservabilityConfig | None = None,
|
||||||
) -> Mapping[str, int]:
|
) -> Mapping[str, int]:
|
||||||
"""
|
"""
|
||||||
Get the maximum number of tokens per data item from each modality based
|
Get the maximum number of tokens per data item from each modality based
|
||||||
@ -156,7 +158,11 @@ class MultiModalRegistry:
|
|||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
processor = self.create_processor(model_config, cache=cache)
|
if observability_config is None:
|
||||||
|
observability_config = ObservabilityConfig()
|
||||||
|
processor = self.create_processor(
|
||||||
|
model_config, observability_config, cache=cache
|
||||||
|
)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
seq_len = model_config.max_model_len
|
seq_len = model_config.max_model_len
|
||||||
@ -174,6 +180,7 @@ class MultiModalRegistry:
|
|||||||
model_config: "ModelConfig",
|
model_config: "ModelConfig",
|
||||||
*,
|
*,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
|
observability_config: ObservabilityConfig | None = None,
|
||||||
) -> Mapping[str, int]:
|
) -> Mapping[str, int]:
|
||||||
"""
|
"""
|
||||||
Get the maximum number of multi-modal input instances for each modality
|
Get the maximum number of multi-modal input instances for each modality
|
||||||
@ -182,7 +189,11 @@ class MultiModalRegistry:
|
|||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
processor = self.create_processor(model_config, cache=cache)
|
if observability_config is None:
|
||||||
|
observability_config = ObservabilityConfig()
|
||||||
|
processor = self.create_processor(
|
||||||
|
model_config, observability_config, cache=cache
|
||||||
|
)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
return profiler.get_mm_limits()
|
return profiler.get_mm_limits()
|
||||||
|
|
||||||
@ -231,27 +242,32 @@ class MultiModalRegistry:
|
|||||||
def _create_processing_ctx(
|
def _create_processing_ctx(
|
||||||
self,
|
self,
|
||||||
model_config: "ModelConfig",
|
model_config: "ModelConfig",
|
||||||
|
observability_config: "ObservabilityConfig | None" = None,
|
||||||
tokenizer: TokenizerLike | None = None,
|
tokenizer: TokenizerLike | None = None,
|
||||||
) -> InputProcessingContext:
|
) -> InputProcessingContext:
|
||||||
if tokenizer is None and not model_config.skip_tokenizer_init:
|
if tokenizer is None and not model_config.skip_tokenizer_init:
|
||||||
tokenizer = cached_tokenizer_from_config(model_config)
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
|
|
||||||
return InputProcessingContext(model_config, tokenizer)
|
return InputProcessingContext(
|
||||||
|
model_config, tokenizer, observability_config=observability_config
|
||||||
|
)
|
||||||
|
|
||||||
def _create_processing_info(
|
def _create_processing_info(
|
||||||
self,
|
self,
|
||||||
model_config: "ModelConfig",
|
model_config: "ModelConfig",
|
||||||
|
observability_config: "ObservabilityConfig | None" = None,
|
||||||
*,
|
*,
|
||||||
tokenizer: TokenizerLike | None = None,
|
tokenizer: TokenizerLike | None = None,
|
||||||
) -> BaseProcessingInfo:
|
) -> BaseProcessingInfo:
|
||||||
model_cls = self._get_model_cls(model_config)
|
model_cls = self._get_model_cls(model_config)
|
||||||
factories = model_cls._processor_factory
|
factories = model_cls._processor_factory
|
||||||
ctx = self._create_processing_ctx(model_config, tokenizer)
|
ctx = self._create_processing_ctx(model_config, observability_config, tokenizer)
|
||||||
return factories.info(ctx)
|
return factories.info(ctx)
|
||||||
|
|
||||||
def create_processor(
|
def create_processor(
|
||||||
self,
|
self,
|
||||||
model_config: "ModelConfig",
|
model_config: "ModelConfig",
|
||||||
|
observability_config: "ObservabilityConfig | None" = None,
|
||||||
*,
|
*,
|
||||||
tokenizer: TokenizerLike | None = None,
|
tokenizer: TokenizerLike | None = None,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
@ -265,7 +281,7 @@ class MultiModalRegistry:
|
|||||||
model_cls = self._get_model_cls(model_config)
|
model_cls = self._get_model_cls(model_config)
|
||||||
factories = model_cls._processor_factory
|
factories = model_cls._processor_factory
|
||||||
|
|
||||||
ctx = self._create_processing_ctx(model_config, tokenizer)
|
ctx = self._create_processing_ctx(model_config, observability_config, tokenizer)
|
||||||
|
|
||||||
return factories.build_processor(ctx, cache=cache)
|
return factories.build_processor(ctx, cache=cache)
|
||||||
|
|
||||||
@ -276,13 +292,18 @@ class MultiModalRegistry:
|
|||||||
mm_counts: Mapping[str, int] | None = None,
|
mm_counts: Mapping[str, int] | None = None,
|
||||||
*,
|
*,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
|
observability_config: ObservabilityConfig | None = None,
|
||||||
) -> DummyDecoderData:
|
) -> DummyDecoderData:
|
||||||
"""
|
"""
|
||||||
Create dummy data for profiling the memory usage of a model.
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
The model is identified by `model_config`.
|
The model is identified by `model_config`.
|
||||||
"""
|
"""
|
||||||
processor = self.create_processor(model_config, cache=cache)
|
if observability_config is None:
|
||||||
|
observability_config = ObservabilityConfig()
|
||||||
|
processor = self.create_processor(
|
||||||
|
model_config, observability_config, cache=cache
|
||||||
|
)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
# Extract configurable options from multimodal config.
|
# Extract configurable options from multimodal config.
|
||||||
@ -309,13 +330,18 @@ class MultiModalRegistry:
|
|||||||
mm_counts: Mapping[str, int] | None = None,
|
mm_counts: Mapping[str, int] | None = None,
|
||||||
*,
|
*,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
|
observability_config: ObservabilityConfig | None = None,
|
||||||
) -> DummyEncoderData:
|
) -> DummyEncoderData:
|
||||||
"""
|
"""
|
||||||
Create dummy data for profiling the memory usage of a model.
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
The model is identified by `model_config`.
|
The model is identified by `model_config`.
|
||||||
"""
|
"""
|
||||||
processor = self.create_processor(model_config, cache=cache)
|
if observability_config is None:
|
||||||
|
observability_config = ObservabilityConfig()
|
||||||
|
processor = self.create_processor(
|
||||||
|
model_config, observability_config, cache=cache
|
||||||
|
)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
# Extract configurable options from multimodal config.
|
# Extract configurable options from multimodal config.
|
||||||
|
|||||||
@ -15,7 +15,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
|||||||
from vllm.multimodal.cache import processor_cache_from_config
|
from vllm.multimodal.cache import processor_cache_from_config
|
||||||
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
|
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
|
||||||
from vllm.multimodal.parse import MultiModalDataParser
|
from vllm.multimodal.parse import MultiModalDataParser
|
||||||
from vllm.multimodal.processing import EncDecMultiModalProcessor
|
from vllm.multimodal.processing import EncDecMultiModalProcessor, set_request_id
|
||||||
from vllm.multimodal.utils import argsort_mm_positions
|
from vllm.multimodal.utils import argsort_mm_positions
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
@ -57,6 +57,7 @@ class InputProcessor:
|
|||||||
self.input_preprocessor = InputPreprocessor(
|
self.input_preprocessor = InputPreprocessor(
|
||||||
self.model_config,
|
self.model_config,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
|
self.vllm_config.observability_config,
|
||||||
mm_registry,
|
mm_registry,
|
||||||
mm_processor_cache=self.mm_processor_cache,
|
mm_processor_cache=self.mm_processor_cache,
|
||||||
)
|
)
|
||||||
@ -445,11 +446,13 @@ class InputProcessor:
|
|||||||
# 1. Tokenize text prompt, with LoRA request if one exists.
|
# 1. Tokenize text prompt, with LoRA request if one exists.
|
||||||
# 2. For multimodal models with a merged preprocessor, preprocess
|
# 2. For multimodal models with a merged preprocessor, preprocess
|
||||||
# multimodal data and expand prompt token ids accordingly.
|
# multimodal data and expand prompt token ids accordingly.
|
||||||
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
with set_request_id(request_id):
|
||||||
prompt,
|
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
prompt,
|
||||||
mm_uuids=mm_uuids,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
)
|
mm_uuids=mm_uuids,
|
||||||
|
)
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
current_platform.validate_request(
|
current_platform.validate_request(
|
||||||
@ -590,6 +593,7 @@ class InputProcessor:
|
|||||||
mm_registry = self.input_preprocessor.mm_registry
|
mm_registry = self.input_preprocessor.mm_registry
|
||||||
mm_processor = mm_registry.create_processor(
|
mm_processor = mm_registry.create_processor(
|
||||||
model_config,
|
model_config,
|
||||||
|
self.vllm_config.observability_config,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
)
|
)
|
||||||
assert isinstance(mm_processor, EncDecMultiModalProcessor)
|
assert isinstance(mm_processor, EncDecMultiModalProcessor)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user