[Metrics] Add test for multi-modal cache stats logging (#26588)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
Mark McLoughlin 2025-10-10 17:00:50 +01:00 committed by GitHub
parent 7b03584de8
commit e519281920
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 30 additions and 4 deletions

View File

@@ -1,10 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import pytest
import regex as re
from vllm import LLM
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.metrics import loggers as stat_loggers
from vllm.v1.metrics.reader import Counter, Metric
from ..openai.test_vision import TEST_IMAGE_ASSETS
@@ -37,12 +41,27 @@ def _get_mm_cache_stats(metrics: list[Metric]):
return mm_cache_queries, mm_cache_hits
def _get_mm_cache_log(llm: LLM, caplog_vllm: pytest.LogCaptureFixture) -> float:
    """Force a stats log line and return the MM cache hit rate it reports.

    Triggers the engine's periodic stats logging while capturing records
    from the stat-logger module, asserts exactly one record was emitted
    containing an "MM cache hit rate" figure, and returns that figure as a
    float percentage.
    """
    caplog_vllm.clear()
    # Capture only what the metrics logger module emits at INFO level.
    with caplog_vllm.at_level(logging.INFO, logger=stat_loggers.__name__):
        llm.llm_engine.do_log_stats()
    records = caplog_vllm.records
    assert len(records) == 1
    message = records[0].getMessage()
    assert "MM cache hit rate" in message
    hit_rate = re.search(r"MM cache hit rate: ([0-9.]+)%", message)
    assert hit_rate is not None
    return float(hit_rate.group(1))
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
def test_mm_cache_stats(
num_gpus_available,
image_urls,
mm_processor_cache_type,
caplog_vllm,
):
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
@@ -56,12 +75,15 @@ def test_mm_cache_stats(
llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
llm.chat(_make_messages(image_urls[1]))
assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(33.3)
# NOTE: This only resets hit rate stats in CachingMetrics
# The raw queries and hits counts remain unaffected
@@ -69,6 +91,8 @@ def test_mm_cache_stats(
llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
llm.chat(_make_messages(image_urls[1]))
assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

View File

@@ -60,7 +60,6 @@ class LoggingStatLogger(StatLoggerBase):
self._reset(time.monotonic())
self.last_scheduler_stats = SchedulerStats()
self.last_mm_cache_stats: Optional[MultiModalCacheStats] = None
# Caching metrics. This cannot be reset.
# TODO: Make the interval configurable.
@@ -115,8 +114,6 @@ class LoggingStatLogger(StatLoggerBase):
if mm_cache_stats:
self.mm_caching_metrics.observe(mm_cache_stats)
self.last_mm_cache_stats = mm_cache_stats
def log(self):
now = time.monotonic()
prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
@@ -157,7 +154,7 @@ class LoggingStatLogger(StatLoggerBase):
scheduler_stats.kv_cache_usage * 100,
self.prefix_caching_metrics.hit_rate * 100,
]
if self.last_mm_cache_stats:
if not self.mm_caching_metrics.empty:
log_parts.append("MM cache hit rate: %.1f%%")
log_args.append(self.mm_caching_metrics.hit_rate * 100)

View File

@@ -96,6 +96,11 @@ class CachingMetrics:
self.aggregated_query_hit = 0
self.query_queue.clear()
@property
def empty(self) -> bool:
    """Whether no cache requests have been observed yet."""
    # Falsy only when the aggregated request count is still zero.
    return not self.aggregated_requests
@property
def hit_rate(self) -> float:
"""Calculate the hit rate for the past N requests."""