[Misc] Add metrics for request queue time, forward time, and execute time (#9659)
parent 622b7ab955
commit 74fc2d77ae
@@ -1892,13 +1892,6 @@ class ObservabilityConfig:
                 "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                 f"installed. Original error:\n{otel_import_error_traceback}")
 
-        if ((self.collect_model_forward_time
-             or self.collect_model_execute_time)
-                and self.otlp_traces_endpoint is None):
-            raise ValueError(
-                "collect_model_forward_time or collect_model_execute_time "
-                "requires --otlp-traces-endpoint to be set.")
-
 
 @dataclass(frozen=True)
 class EngineConfig:
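Note on the removed check: collect_model_forward_time and collect_model_execute_time previously required --otlp-traces-endpoint because these timings were only surfaced through OpenTelemetry traces. With this change they are also exported as Prometheus histograms (vllm:model_forward_time_milliseconds and vllm:model_execute_time_milliseconds, added in the Metrics hunk below), which is presumably why the hard validation is dropped.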
@@ -1645,6 +1645,9 @@ class LLMEngine:
         # Request stats
         #   Latency
         time_e2e_requests: List[float] = []
+        time_in_queue_requests: List[float] = []
+        model_forward_time_requests: List[float] = []
+        model_execute_time_requests: List[float] = []
         #   Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1738,6 +1741,15 @@ class LLMEngine:
                 # Latency timings
                 time_e2e_requests.append(now -
                                          seq_group.metrics.arrival_time)
+                if seq_group.metrics.time_in_queue is not None:
+                    time_in_queue_requests.append(
+                        seq_group.metrics.time_in_queue)
+                if seq_group.metrics.model_forward_time is not None:
+                    model_forward_time_requests.append(
+                        seq_group.metrics.model_forward_time)
+                if seq_group.metrics.model_execute_time is not None:
+                    model_execute_time_requests.append(
+                        seq_group.metrics.model_execute_time * 1000)
                 # Metadata
                 num_prompt_tokens_requests.append(
                     len(seq_group.prompt_token_ids))
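The units are worth calling out: time_in_queue is appended in seconds, model_forward_time is appended as-is (the millisecond histogram it feeds suggests it is already recorded in ms), and model_execute_time is converted from seconds to milliseconds via `* 1000`. A minimal, self-contained sketch of this collection pattern; `RequestMetrics` and `collect_timings` are hypothetical stand-ins for `seq_group.metrics` and the surrounding loop:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class RequestMetrics:
    """Hypothetical stand-in for seq_group.metrics."""
    arrival_time: float
    time_in_queue: Optional[float] = None       # seconds
    model_forward_time: Optional[float] = None  # assumed already in ms
    model_execute_time: Optional[float] = None  # seconds


def collect_timings(metrics: RequestMetrics, now: float) -> dict:
    # Only requests that actually recorded a timing contribute a sample,
    # mirroring the `is not None` guards in the hunk above.
    out = {"e2e_s": now - metrics.arrival_time}
    if metrics.time_in_queue is not None:
        out["queue_s"] = metrics.time_in_queue
    if metrics.model_forward_time is not None:
        out["forward_ms"] = metrics.model_forward_time
    if metrics.model_execute_time is not None:
        out["execute_ms"] = metrics.model_execute_time * 1000  # s -> ms
    return out
```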
@@ -1795,6 +1807,9 @@ class LLMEngine:
             # Request stats
             #   Latency
             time_e2e_requests=time_e2e_requests,
+            time_in_queue_requests=time_in_queue_requests,
+            model_forward_time_requests=model_forward_time_requests,
+            model_execute_time_requests=model_execute_time_requests,
             #   Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
@@ -133,7 +133,31 @@ class Metrics:
             name="vllm:e2e_request_latency_seconds",
             documentation="Histogram of end to end request latency in seconds.",
             labelnames=labelnames,
-            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_time_in_queue_request = self._histogram_cls(
+            name="vllm:time_in_queue_requests",
+            documentation=
+            "Histogram of time the request spent in the queue in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_model_forward_time_request = self._histogram_cls(
+            name="vllm:model_forward_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model forward pass in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
+        self.histogram_model_execute_time_request = self._histogram_cls(
+            name="vllm:model_execute_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model execute function in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
             name="vllm:request_prompt_tokens",
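For context, `self._histogram_cls` wraps a Prometheus histogram (prometheus_client for the default logger). A minimal sketch of how one of the new series behaves, using a hypothetical demo metric name:

```python
from prometheus_client import Histogram

# Same buckets as vllm:time_in_queue_requests above.
queue_time_demo = Histogram(
    "vllm_time_in_queue_requests_demo",  # hypothetical demo name
    "Time the request spent in the queue in seconds.",
    buckets=[
        0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
        40.0, 50.0, 60.0
    ])

# Each observation increments every cumulative bucket whose upper bound is
# >= the value (here 0.5, 0.8, ..., 60.0 plus +Inf), which is what
# quantile queries aggregate over on the Prometheus side.
queue_time_demo.observe(0.42)
```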
@@ -299,16 +323,12 @@ class RayMetrics(Metrics):
         pass
 
 
-def build_1_2_5_buckets(max_value: int) -> List[int]:
+def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
     """
     Builds a list of buckets with increasing powers of 10 multiplied by
-    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
+    mantissa values until the value exceeds the specified maximum.
 
-    Example:
-    >>> build_1_2_5_buckets(100)
-    [1, 2, 5, 10, 20, 50, 100]
     """
-    mantissa_lst = [1, 2, 5]
     exponent = 0
     buckets: List[int] = []
     while True:
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
         exponent += 1
 
 
+def build_1_2_5_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    return build_buckets([1, 2, 5], max_value)
+
+
+def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_3_5_8_buckets(100)
+    [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
+    """
+    return build_buckets([1, 2, 3, 5, 8], max_value)
+
+
 def local_interval_elapsed(now: float, last_log: float,
                            local_interval: float) -> bool:
     elapsed_time = now - last_log
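The loop body of build_buckets falls outside these hunks, so here is a sketch of the full helper, reconstructed from the surrounding context lines and consistent with both doctests above, together with what `build_1_2_3_5_8_buckets(3000)` (used for the two new millisecond histograms) evaluates to:

```python
from typing import List


def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
    # Emit mantissa * 10**exponent for increasing exponents until a value
    # would exceed max_value, then stop.
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


# Bucket bounds used by the forward/execute time histograms (in ms):
assert build_buckets([1, 2, 3, 5, 8], 3000) == [
    1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100, 200, 300, 500, 800,
    1000, 2000, 3000
]
```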
@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase):
         # Latency
         self._log_histogram(self.metrics.histogram_e2e_time_request,
                             stats.time_e2e_requests)
+        self._log_histogram(self.metrics.histogram_time_in_queue_request,
+                            stats.time_in_queue_requests)
+        self._log_histogram(self.metrics.histogram_model_forward_time_request,
+                            stats.model_forward_time_requests)
+        self._log_histogram(self.metrics.histogram_model_execute_time_request,
+                            stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
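Since each stat is a plain list, _log_histogram presumably just observes every element, so an interval in which no request recorded a queue, forward, or execute time passes an empty list and contributes no samples to that histogram.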
@@ -46,6 +46,9 @@ class Stats:
     # Request stats (should have _requests suffix)
     #   Latency
     time_e2e_requests: List[float]
+    time_in_queue_requests: List[float]
+    model_forward_time_requests: List[float]
+    model_execute_time_requests: List[float]
     #   Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]