[MISC] Add lora requests to metrics (#9477)

Co-authored-by: Kunjan Patel <kunjanp_google_com@vllm.us-central1-a.c.kunjanp-gke-dev-2.internal>
Kunjan 2024-10-18 13:50:18 -07:00 committed by GitHub
parent 3921a2f29e
commit 9bb10a7d27
3 changed files with 54 additions and 2 deletions


@@ -1,4 +1,5 @@
 import time
+from collections import Counter as collectionsCounter
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -1617,6 +1618,25 @@ class LLMEngine:
         n_requests: List[int] = []
         finished_reason_requests: List[str] = []
 
+        # Lora requests
+        running_lora_adapters = dict(
+            collectionsCounter([
+                running_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for running_request in scheduler.running
+                if running_request.lora_request
+            ]))
+        waiting_lora_adapters = dict(
+            collectionsCounter([
+                waiting_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for waiting_request in scheduler.waiting
+                if waiting_request.lora_request
+            ]))
+        max_lora_stat = "0"
+        if self.lora_config:
+            max_lora_stat = str(self.lora_config.max_loras)
+
         # NOTE: This loop assumes prefill seq_groups are before
         # decode seq_groups in scheduled_seq_groups.
         if scheduler_outputs is not None:
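The two scans above are plain collections.Counter aggregations over adapter names in each scheduler's running and waiting queues. A minimal standalone sketch of the same pattern, using hypothetical FakeLoRARequest/FakeRequest/FakeScheduler stand-ins rather than vLLM's real scheduler types:

    from collections import Counter
    from dataclasses import dataclass, field
    from typing import List, Optional

    @dataclass
    class FakeLoRARequest:
        lora_name: str

    @dataclass
    class FakeRequest:
        # Requests that use no adapter carry lora_request=None.
        lora_request: Optional[FakeLoRARequest] = None

    @dataclass
    class FakeScheduler:
        running: List[FakeRequest] = field(default_factory=list)
        waiting: List[FakeRequest] = field(default_factory=list)

    schedulers = [
        FakeScheduler(
            running=[FakeRequest(FakeLoRARequest("sql-lora")),
                     FakeRequest(FakeLoRARequest("sql-lora"))],
            waiting=[FakeRequest(FakeLoRARequest("chat-lora")),
                     FakeRequest()]),
    ]

    # Same shape as the diff: count adapter names across every scheduler's
    # running queue, skipping requests that carry no LoRA adapter.
    running_lora_adapters = dict(
        Counter(req.lora_request.lora_name
                for s in schedulers
                for req in s.running
                if req.lora_request))
    print(running_lora_adapters)  # {'sql-lora': 2}

Note that only the dict's keys reach the Stats object below (the counts are discarded by list(...keys())), so the Counter currently serves only to deduplicate adapter names.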
@@ -1738,7 +1758,9 @@ class LLMEngine:
             num_generation_tokens_requests=num_generation_tokens_requests,
             n_requests=n_requests,
             finished_reason_requests=finished_reason_requests,
-        )
+            max_lora=str(max_lora_stat),
+            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
+            running_lora_adapters=list(running_lora_adapters.keys()))
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)


@@ -34,7 +34,11 @@ class Metrics:
     See https://prometheus.github.io/client_python/multiprocess/ for more
     details on limitations.
     """
     labelname_finish_reason = "finished_reason"
+    labelname_waiting_lora_adapters = "waiting_lora_adapters"
+    labelname_running_lora_adapters = "running_lora_adapters"
+    labelname_max_lora = "max_lora"
     _gauge_cls = prometheus_client.Gauge
     _counter_cls = prometheus_client.Counter
     _histogram_cls = prometheus_client.Histogram
@@ -55,6 +59,16 @@ class Metrics:
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_info = self._gauge_cls(
+            name="vllm:lora_requests_info",
+            documentation="Running stats on lora requests.",
+            labelnames=[
+                self.labelname_running_lora_adapters,
+                self.labelname_max_lora,
+                self.labelname_waiting_lora_adapters,
+            ],
+            multiprocess_mode="livemostrecent",
+        )
         self.gauge_scheduler_swapped = self._gauge_cls(
             name="vllm:num_requests_swapped",
             documentation="Number of requests swapped to CPU.",
@@ -426,6 +440,9 @@ class PrometheusStatLogger(StatLoggerBase):
         for datum in data:
             histogram.labels(**self.labels).observe(datum)
 
+    def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
+        gauge.labels(**data).set(1)
+
     def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
@@ -442,7 +459,17 @@ class PrometheusStatLogger(StatLoggerBase):
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
+        # Include max-lora in the metric; in future this property of the
+        # lora config may be extended to be dynamic.
+        lora_info = {
+            self.metrics.labelname_running_lora_adapters:
+            ",".join(stats.running_lora_adapters),
+            self.metrics.labelname_waiting_lora_adapters:
+            ",".join(stats.waiting_lora_adapters),
+            self.metrics.labelname_max_lora:
+            stats.max_lora,
+        }
+        self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
 
         # Iteration level data
         self._log_counter(self.metrics.counter_num_preemption,
                           stats.num_preemption_iter)
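Putting the two halves together: on every log pass the gauge is re-labeled with the current adapter sets and set to 1, so a scrape yields a single vllm:lora_requests_info series whose labels carry the data. A self-contained sketch of that call path, using the default single-process registry and illustrative label values:

    from typing import Dict

    import prometheus_client

    gauge_lora_info = prometheus_client.Gauge(
        "vllm:lora_requests_info",
        "Running stats on lora requests.",
        labelnames=[
            "running_lora_adapters", "max_lora", "waiting_lora_adapters"
        ])

    def log_gauge_string(gauge: prometheus_client.Gauge,
                         data: Dict[str, str]) -> None:
        # Mirrors _log_gauge_string in the diff: the label values are
        # the payload, the sample value is a constant 1.
        gauge.labels(**data).set(1)

    log_gauge_string(gauge_lora_info, {
        "running_lora_adapters": "sql-lora,chat-lora",
        "waiting_lora_adapters": "",
        "max_lora": "8",
    })
    print(prometheus_client.generate_latest(gauge_lora_info).decode())

One caveat with this pattern: every distinct label combination starts a new time series; the livemostrecent mode at least keeps only the latest value reported by live processes when aggregating across workers.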


@@ -51,6 +51,9 @@ class Stats:
     num_generation_tokens_requests: List[int]
     n_requests: List[int]
     finished_reason_requests: List[str]
+    waiting_lora_adapters: List[str]
+    running_lora_adapters: List[str]
+    max_lora: str
 
     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
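One small but load-bearing detail in this last file: the three new fields are inserted above spec_decode_metrics rather than appended, because dataclasses require fields without defaults to precede fields with defaults. A trimmed-down, hypothetical Stats stand-in showing the constraint:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class MiniStats:
        # Non-default fields must come first; declaring them after
        # spec_decode_metrics would raise at class-definition time:
        # TypeError: non-default argument ... follows default argument
        waiting_lora_adapters: List[str]
        running_lora_adapters: List[str]
        max_lora: str
        spec_decode_metrics: Optional[str] = None

    stats = MiniStats(waiting_lora_adapters=[],
                      running_lora_adapters=["sql-lora"],
                      max_lora="8")
    print(stats.running_lora_adapters)  # ['sql-lora']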