[MISC] Add lora requests to metrics (#9477)

Co-authored-by: Kunjan Patel <kunjanp_google_com@vllm.us-central1-a.c.kunjanp-gke-dev-2.internal>

This commit is contained in:
parent 3921a2f29e
commit 9bb10a7d27
vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
 import time
+from collections import Counter as collectionsCounter
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -1617,6 +1618,25 @@ class LLMEngine:
         n_requests: List[int] = []
         finished_reason_requests: List[str] = []
 
+        # Lora requests
+        running_lora_adapters = dict(
+            collectionsCounter([
+                running_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for running_request in scheduler.running
+                if running_request.lora_request
+            ]))
+        waiting_lora_adapters = dict(
+            collectionsCounter([
+                waiting_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for waiting_request in scheduler.waiting
+                if waiting_request.lora_request
+            ]))
+        max_lora_stat = "0"
+        if self.lora_config:
+            max_lora_stat = str(self.lora_config.max_loras)
+
         # NOTE: This loop assumes prefill seq_groups are before
         # decode seq_groups in scheduled_seq_groups.
         if scheduler_outputs is not None:
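Note: the tally above is plain stdlib code and can be exercised outside the engine. A minimal sketch, with SimpleNamespace standing in for vLLM's scheduler and request objects (only the attributes the comprehension touches are modeled; the adapter names are invented):

from collections import Counter
from types import SimpleNamespace

# Stand-ins for vLLM's scheduler/request objects (hypothetical data).
lora_a = SimpleNamespace(lora_name="sql-adapter")
schedulers = [
    SimpleNamespace(
        running=[SimpleNamespace(lora_request=lora_a),
                 SimpleNamespace(lora_request=lora_a),
                 SimpleNamespace(lora_request=None)],  # no LoRA attached
        waiting=[],
    ),
]

# Same shape as the engine code: count adapter names across all
# scheduler queues, skipping requests without a lora_request.
running_lora_adapters = dict(
    Counter(req.lora_request.lora_name
            for sched in schedulers
            for req in sched.running
            if req.lora_request))
print(running_lora_adapters)  # {'sql-adapter': 2}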
@@ -1738,7 +1758,9 @@ class LLMEngine:
             num_generation_tokens_requests=num_generation_tokens_requests,
             n_requests=n_requests,
             finished_reason_requests=finished_reason_requests,
-        )
+            max_lora=str(max_lora_stat),
+            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
+            running_lora_adapters=list(running_lora_adapters.keys()))
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
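Note: only adapter names survive into Stats here; taking .keys() discards the per-adapter request counts the Counter gathered, so the metric reports which adapters are active, not how many requests each serves. A one-line illustration:

from collections import Counter

tally = dict(Counter(["sql-adapter", "sql-adapter", "chat-adapter"]))
print(list(tally.keys()))  # ['sql-adapter', 'chat-adapter'] -- counts dropped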
vllm/engine/metrics.py
@@ -34,7 +34,11 @@ class Metrics:
     See https://prometheus.github.io/client_python/multiprocess/ for more
     details on limitations.
     """
 
     labelname_finish_reason = "finished_reason"
+    labelname_waiting_lora_adapters = "waiting_lora_adapters"
+    labelname_running_lora_adapters = "running_lora_adapters"
+    labelname_max_lora = "max_lora"
     _gauge_cls = prometheus_client.Gauge
     _counter_cls = prometheus_client.Counter
     _histogram_cls = prometheus_client.Histogram
@@ -55,6 +59,16 @@ class Metrics:
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_info = self._gauge_cls(
+            name="vllm:lora_requests_info",
+            documentation="Running stats on lora requests.",
+            labelnames=[
+                self.labelname_running_lora_adapters,
+                self.labelname_max_lora,
+                self.labelname_waiting_lora_adapters,
+            ],
+            multiprocess_mode="livemostrecent",
+        )
         self.gauge_scheduler_swapped = self._gauge_cls(
             name="vllm:num_requests_swapped",
             documentation="Number of requests swapped to CPU.",
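Note: vllm:lora_requests_info follows the Prometheus info-metric idiom: the payload rides in the label values and the sample value stays pinned at 1. A standalone sketch with plain prometheus_client (metric and label values here are illustrative; multiprocess_mode is omitted since it only matters under multiprocess collection):

import prometheus_client

# Info-style gauge: state lives in the labels, the value is always 1.
gauge = prometheus_client.Gauge(
    "demo_lora_requests_info",            # illustrative name
    "Running stats on lora requests.",
    labelnames=["running_lora_adapters", "max_lora",
                "waiting_lora_adapters"],
)
gauge.labels(running_lora_adapters="sql-adapter,chat-adapter",
             max_lora="4",
             waiting_lora_adapters="").set(1)

print(prometheus_client.generate_latest().decode())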
@@ -426,6 +440,9 @@ class PrometheusStatLogger(StatLoggerBase):
         for datum in data:
             histogram.labels(**self.labels).observe(datum)
 
+    def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
+        gauge.labels(**data).set(1)
+
     def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
@@ -442,7 +459,17 @@ class PrometheusStatLogger(StatLoggerBase):
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
+        # Including max-lora in the metric; in the future this property
+        # of the lora config may be extended to be dynamic.
+        lora_info = {
+            self.metrics.labelname_running_lora_adapters:
+            ",".join(stats.running_lora_adapters),
+            self.metrics.labelname_waiting_lora_adapters:
+            ",".join(stats.waiting_lora_adapters),
+            self.metrics.labelname_max_lora:
+            stats.max_lora,
+        }
+        self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
         # Iteration level data
         self._log_counter(self.metrics.counter_num_preemption,
                           stats.num_preemption_iter)
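Note: scraped, this produces one gauge series per label combination, with the comma-joined adapter names carried in the labels and a constant value of 1. Roughly (adapter names and max_lora invented; label order follows the declared labelnames):

# HELP vllm:lora_requests_info Running stats on lora requests.
# TYPE vllm:lora_requests_info gauge
vllm:lora_requests_info{running_lora_adapters="sql-adapter,chat-adapter",max_lora="4",waiting_lora_adapters=""} 1.0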
vllm/engine/metrics_types.py
@@ -51,6 +51,9 @@ class Stats:
     num_generation_tokens_requests: List[int]
     n_requests: List[int]
     finished_reason_requests: List[str]
+    waiting_lora_adapters: List[str]
+    running_lora_adapters: List[str]
+    max_lora: str
 
     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
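Note: since the adapter lists are flattened to comma-joined label strings, a consumer has to split them back apart. A small sketch against the sample scrape output above:

# Split a comma-joined adapter label back into a list; an empty label
# means no adapters, so guard before splitting.
label_value = "sql-adapter,chat-adapter"   # running_lora_adapters label
adapters = label_value.split(",") if label_value else []
print(adapters)  # ['sql-adapter', 'chat-adapter']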