[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Mark McLoughlin 2025-02-05 00:46:54 +00:00 committed by GitHub
parent 18016a5e62
commit 233df6f5c4
7 changed files with 66 additions and 27 deletions

View File

@@ -205,6 +205,7 @@ EXPECTED_METRICS_V1 = [
     "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:request_success_total",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",
     "vllm:request_prompt_tokens_count",

View File

@@ -15,6 +15,23 @@ if TYPE_CHECKING:
     from vllm.sampling_params import SamplingParams
 
 
+class RequestFinishedReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, or abort.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+
+    def __str__(self):
+        return self.name.lower()
+
+
 @dataclass
 class EngineCoreRequest:
@@ -45,7 +62,7 @@ class EngineCoreOutput(
     request_id: str
     new_token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
@@ -56,7 +73,7 @@ class EngineCoreOutputs(
         gc=False):  # type: ignore[call-arg]
 
     #NOTE(Nick): We could consider ways to make this more compact,
-    # e.g. columnwise layout and using an int enum for finish/stop reason
+    # e.g. columnwise layout
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
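
Aside, not from the diff: because __str__ lowercases the member name, the enum values render as the familiar "stop"/"length"/"abort" strings used both as the finished_reason metric label and as the OpenAI-style finish_reason. A standalone sketch that mirrors the class added above:

# Mirrors the RequestFinishedReason added above, rather than importing vllm.
import enum

class RequestFinishedReason(enum.IntEnum):
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self):
        return self.name.lower()

assert str(RequestFinishedReason.STOP) == "stop"
assert [str(r) for r in RequestFinishedReason] == ["stop", "length", "abort"]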

View File

@@ -8,7 +8,8 @@ from vllm.logger import init_logger
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
+                            RequestFinishedReason)
 
 logger = init_logger(__name__)
@@ -18,7 +19,7 @@ class DetokenizerOutput:
     output_text: str
     token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
@@ -147,13 +148,13 @@ class IncrementalDetokenizer:
                 stop_str, truncate_to = stop
                 if truncate_to != -1:
                     self.output_text = self.output_text[:truncate_to]
-                finish_reason = "stop"  # TODO: use constant
+                finish_reason = RequestFinishedReason.STOP
                 stop_reason = stop_str
                 # TODO: handle stop_token_ids here too?
 
         # 3) Update the RequestOutput object with the new text.
-        finished = bool(finish_reason)
+        finished = finish_reason is not None
         if self.output_kind == RequestOutputKind.FINAL_ONLY \
             and not finished:
             return None

View File

@@ -161,8 +161,10 @@ class OutputProcessor:
                     engine_core_output)
 
                 # 3) Create and handle RequestOutput objects.
-                if request_output := self._make_request_output(
-                        req_state, detokenizer_output):
+                if detokenizer_output is not None:
+                    request_output = self._make_request_output(
+                        req_state, detokenizer_output)
+
                     if req_state.queue is not None:
                         # AsyncLLM: put into queue for handling by generate().
                         req_state.queue.put_nowait(request_output)
@@ -172,6 +174,8 @@ class OutputProcessor:
                     # Free completed requests.
                     if request_output.finished:
+                        assert detokenizer_output.finish_reason is not None
+
                         self.request_states.pop(req_id)
                         if not engine_core_output.finished:
                             # If req not finished in EngineCore, but Detokenizer
@@ -180,7 +184,8 @@
                         # Track per-request stats
                         iteration_stats.update_from_finished_request(
-                            request_output, req_state.stats)
+                            detokenizer_output.finish_reason, request_output,
+                            req_state.stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
@@ -191,12 +196,8 @@ class OutputProcessor:
     @staticmethod
     def _make_request_output(
         request_state: RequestState,
-        detokenizer_output: Optional[DetokenizerOutput],
-    ) -> Optional[RequestOutput]:
-
-        if detokenizer_output is None:
-            return None
-
+        detokenizer_output: DetokenizerOutput,
+    ) -> RequestOutput:
         request_output = RequestOutput.new(
             request_state.request_id,
             request_state.prompt,
@@ -207,7 +208,8 @@
         )
         if detokenizer_output.finished:
             completion_output = request_output.outputs[0]
-            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.finish_reason = str(
+                detokenizer_output.finish_reason)
             completion_output.stop_reason = detokenizer_output.stop_reason
 
         return request_output

View File

@@ -2,13 +2,14 @@
 import time
 from abc import ABC, abstractmethod
-from typing import List
+from typing import Dict, List
 
 import numpy as np
 import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.engine import RequestFinishedReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -116,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_request_success: Dict[RequestFinishedReason,
+                                           prometheus_client.Counter] = {}
+        counter_request_success_base = prometheus_client.Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + ["finished_reason"])
+        for reason in RequestFinishedReason:
+            self.counter_request_success[
+                reason] = counter_request_success_base.labels(*(labelvalues +
+                                                                [str(reason)]))
+
         self.histogram_num_prompt_tokens_request = \
             prometheus_client.Histogram(
                 name="vllm:request_prompt_tokens",
@@ -163,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase):
             iteration_stats.num_generation_tokens)
 
         for finished_request in iteration_stats.finished_requests:
+            self.counter_request_success[finished_request.finish_reason].inc()
             self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
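
For context, an illustrative, self-contained sketch of the pattern used above: one Counter family with a finished_reason label, and one cached labelled child per enum value, so the per-iteration hot path is only a dict lookup and an inc(). The model_name label and its value are made up for the example; the real labelnames/labelvalues come from the surrounding logger.

import enum
import prometheus_client

class RequestFinishedReason(enum.IntEnum):
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self):
        return self.name.lower()

# Assumed for the sketch only.
labelnames = ["model_name"]
labelvalues = ["example-model"]

counter_base = prometheus_client.Counter(
    name="vllm:request_success_total",
    documentation="Count of successfully processed requests.",
    labelnames=labelnames + ["finished_reason"])

# Pre-create one labelled child per reason; each series is then exported at
# 0.0 immediately, which keeps rate() queries well-defined from startup.
counter_request_success = {
    reason: counter_base.labels(*(labelvalues + [str(reason)]))
    for reason in RequestFinishedReason
}

counter_request_success[RequestFinishedReason.STOP].inc()
print(prometheus_client.generate_latest().decode())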

View File

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput
+    from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
 
 
 @dataclass
@@ -32,6 +32,7 @@ class RequestStateStats:
 class FinishedRequestStats:
     """Stats associated with a finished request."""
 
+    finish_reason: "RequestFinishedReason"
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
@@ -73,8 +74,11 @@ class IterationStats:
         request_state_stats.num_generation_tokens += num_new_generation_tokens
         request_state_stats.last_token_time = now
 
-    def update_from_finished_request(self, request_output: "RequestOutput",
+    def update_from_finished_request(self,
+                                     finish_reason: "RequestFinishedReason",
+                                     request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
         self.finished_requests.append(
-            FinishedRequestStats(len(request_output.prompt_token_ids),
+            FinishedRequestStats(finish_reason,
+                                 len(request_output.prompt_token_ids),
                                  request_state_stats.num_generation_tokens))

View File

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
 
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
@@ -109,7 +109,7 @@ class Request:
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[str, None]:
+    def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
@@ -149,7 +149,8 @@ class RequestStatus(enum.IntEnum):
         return status > RequestStatus.PREEMPTED
 
     @staticmethod
-    def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
+    def get_finished_reason(
+            status: "RequestStatus") -> Union[RequestFinishedReason, None]:
         return _FINISHED_REASON_MAP.get(status)
@@ -158,8 +159,8 @@
 # are longer than the model's length cap. Therefore, the stop
 # reason should also be "length" as in OpenAI API.
 _FINISHED_REASON_MAP = {
-    RequestStatus.FINISHED_STOPPED: "stop",
-    RequestStatus.FINISHED_LENGTH_CAPPED: "length",
-    RequestStatus.FINISHED_ABORTED: "abort",
-    RequestStatus.FINISHED_IGNORED: "length",
+    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
+    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
+    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
+    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
 }
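
Illustrative recap, not part of the commit (member names mirrored from the diff, with a trimmed-down stand-in for RequestStatus): terminal statuses map to a RequestFinishedReason, non-terminal statuses map to None, and FINISHED_IGNORED deliberately reports "length" to match the OpenAI API.

import enum
from typing import Dict, Optional

class RequestFinishedReason(enum.IntEnum):
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self):
        return self.name.lower()

# Trimmed-down stand-in for vllm.v1.request.RequestStatus.
class RequestStatus(enum.IntEnum):
    RUNNING = 0
    FINISHED_STOPPED = 1
    FINISHED_LENGTH_CAPPED = 2
    FINISHED_ABORTED = 3
    FINISHED_IGNORED = 4

_FINISHED_REASON_MAP: Dict[RequestStatus, RequestFinishedReason] = {
    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
}

def get_finished_reason(status: RequestStatus) -> Optional[RequestFinishedReason]:
    return _FINISHED_REASON_MAP.get(status)

assert get_finished_reason(RequestStatus.RUNNING) is None
assert str(get_finished_reason(RequestStatus.FINISHED_IGNORED)) == "length"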