[Misc] Add metrics for request queue time, forward time, and execute time (#9659)

Author: 科英
Date:   2024-10-30 01:32:56 +08:00 (committed by GitHub)
parent 622b7ab955
commit 74fc2d77ae
4 changed files with 70 additions and 15 deletions


@@ -1892,13 +1892,6 @@ class ObservabilityConfig:
                 "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                 f"installed. Original error:\n{otel_import_error_traceback}")
-
-        if ((self.collect_model_forward_time
-             or self.collect_model_execute_time)
-                and self.otlp_traces_endpoint is None):
-            raise ValueError(
-                "collect_model_forward_time or collect_model_execute_time "
-                "requires --otlp-traces-endpoint to be set.")
 
 
 @dataclass(frozen=True)
 class EngineConfig:


@@ -1645,6 +1645,9 @@ class LLMEngine:
         # Request stats
         # Latency
         time_e2e_requests: List[float] = []
+        time_in_queue_requests: List[float] = []
+        model_forward_time_requests: List[float] = []
+        model_execute_time_requests: List[float] = []
         # Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1738,6 +1741,15 @@ class LLMEngine:
                 # Latency timings
                 time_e2e_requests.append(now -
                                          seq_group.metrics.arrival_time)
+                if seq_group.metrics.time_in_queue is not None:
+                    time_in_queue_requests.append(
+                        seq_group.metrics.time_in_queue)
+                if seq_group.metrics.model_forward_time is not None:
+                    model_forward_time_requests.append(
+                        seq_group.metrics.model_forward_time)
+                if seq_group.metrics.model_execute_time is not None:
+                    model_execute_time_requests.append(
+                        seq_group.metrics.model_execute_time * 1000)
                 # Metadata
                 num_prompt_tokens_requests.append(
                     len(seq_group.prompt_token_ids))
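Each optional timing in the hunk above is guarded by a None check, since queue, forward, and execute times only exist once the scheduler and worker have recorded them. Note that only model_execute_time is multiplied by 1000: it appears to be tracked in seconds and is scaled here to match the millisecond histogram, while model_forward_time is presumably already in milliseconds. A minimal standalone sketch of the same collection pattern (the metrics object and units are illustrative stand-ins for seq_group.metrics, not quoted from vLLM):

from dataclasses import dataclass
from typing import List, Optional, Tuple


@dataclass
class RequestMetrics:
    """Illustrative stand-in for seq_group.metrics; only the fields used here."""
    arrival_time: float
    time_in_queue: Optional[float] = None       # seconds (assumed)
    model_forward_time: Optional[float] = None  # milliseconds (assumed)
    model_execute_time: Optional[float] = None  # seconds, scaled to ms below


def collect_latency_stats(
    finished: List[RequestMetrics], now: float
) -> Tuple[List[float], List[float], List[float], List[float]]:
    time_e2e_requests: List[float] = []
    time_in_queue_requests: List[float] = []
    model_forward_time_requests: List[float] = []
    model_execute_time_requests: List[float] = []
    for metrics in finished:
        time_e2e_requests.append(now - metrics.arrival_time)
        # Optional timings are only recorded when they were populated.
        if metrics.time_in_queue is not None:
            time_in_queue_requests.append(metrics.time_in_queue)
        if metrics.model_forward_time is not None:
            model_forward_time_requests.append(metrics.model_forward_time)
        if metrics.model_execute_time is not None:
            # Convert seconds to milliseconds to match the ms-based histogram.
            model_execute_time_requests.append(metrics.model_execute_time * 1000)
    return (time_e2e_requests, time_in_queue_requests,
            model_forward_time_requests, model_execute_time_requests)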
@@ -1795,6 +1807,9 @@ class LLMEngine:
             # Request stats
             # Latency
             time_e2e_requests=time_e2e_requests,
+            time_in_queue_requests=time_in_queue_requests,
+            model_forward_time_requests=model_forward_time_requests,
+            model_execute_time_requests=model_execute_time_requests,
             # Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,


@@ -133,7 +133,31 @@ class Metrics:
             name="vllm:e2e_request_latency_seconds",
             documentation="Histogram of end to end request latency in seconds.",
             labelnames=labelnames,
-            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_time_in_queue_request = self._histogram_cls(
+            name="vllm:time_in_queue_requests",
+            documentation=
+            "Histogram of time the request spent in the queue in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_model_forward_time_request = self._histogram_cls(
+            name="vllm:model_forward_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model forward pass in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
+        self.histogram_model_execute_time_request = self._histogram_cls(
+            name="vllm:model_execute_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model execute function in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
             name="vllm:request_prompt_tokens",
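For context on the two build_1_2_3_5_8_buckets(3000) calls above: applying the 1-2-3-5-8 mantissa rule documented later in this file, the 3000 ms cap expands to the following millisecond bucket boundaries (worked out here from that rule, not quoted from the source):

>>> build_1_2_3_5_8_buckets(3000)
[1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100, 200, 300, 500, 800, 1000, 2000, 3000]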
@@ -299,16 +323,12 @@ class RayMetrics(Metrics):
     pass
 
 
-def build_1_2_5_buckets(max_value: int) -> List[int]:
+def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
     """
     Builds a list of buckets with increasing powers of 10 multiplied by
-    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
-    Example:
-    >>> build_1_2_5_buckets(100)
-    [1, 2, 5, 10, 20, 50, 100]
+    mantissa values until the value exceeds the specified maximum.
     """
-    mantissa_lst = [1, 2, 5]
     exponent = 0
     buckets: List[int] = []
     while True:
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
         exponent += 1
 
 
+def build_1_2_5_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    return build_buckets([1, 2, 5], max_value)
+
+
+def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_3_5_8_buckets(100)
+    [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
+    """
+    return build_buckets([1, 2, 3, 5, 8], max_value)
+
+
 def local_interval_elapsed(now: float, last_log: float,
                            local_interval: float) -> bool:
     elapsed_time = now - last_log
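The two hunks above only show the edges of the refactor: build_buckets keeps the old loop, with the mantissa list now passed in, and the two named helpers become thin wrappers. For readability, here is a self-contained sketch of the helpers as they plausibly read after this change; the loop interior is reconstructed from the surrounding context lines rather than quoted from the file:

from typing import List


def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values until the value exceeds the specified maximum.
    """
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


def build_1_2_5_buckets(max_value: int) -> List[int]:
    return build_buckets([1, 2, 5], max_value)      # e.g. [1, 2, 5, 10, ...]


def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
    return build_buckets([1, 2, 3, 5, 8], max_value)  # e.g. [1, 2, 3, 5, 8, 10, ...]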
@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase):
         # Latency
         self._log_histogram(self.metrics.histogram_e2e_time_request,
                             stats.time_e2e_requests)
+        self._log_histogram(self.metrics.histogram_time_in_queue_request,
+                            stats.time_in_queue_requests)
+        self._log_histogram(self.metrics.histogram_model_forward_time_request,
+                            stats.model_forward_time_requests)
+        self._log_histogram(self.metrics.histogram_model_execute_time_request,
+                            stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
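_log_histogram itself is not touched by this commit; it simply observes every value in the list on the labelled histogram. A rough standalone equivalent using the raw prometheus_client API, with the new queue-time metric as the example (the model_name label and sample values are illustrative, not taken from vLLM):

from typing import List

from prometheus_client import Histogram

# Labelled histogram matching the vllm:time_in_queue_requests definition above.
time_in_queue_histogram = Histogram(
    name="vllm:time_in_queue_requests",
    documentation="Histogram of time the request spent in the queue in seconds.",
    labelnames=["model_name"],
    buckets=[
        0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
        40.0, 50.0, 60.0
    ])


def log_histogram(histogram: Histogram, data: List[float],
                  model_name: str) -> None:
    # Observe one sample per finished request.
    for value in data:
        histogram.labels(model_name=model_name).observe(value)


log_histogram(time_in_queue_histogram, [0.12, 0.4, 1.7], model_name="my-model")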


@@ -46,6 +46,9 @@ class Stats:
     # Request stats (should have _requests suffix)
     # Latency
     time_e2e_requests: List[float]
+    time_in_queue_requests: List[float]
+    model_forward_time_requests: List[float]
+    model_execute_time_requests: List[float]
     # Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]