mirror of https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 00:16:00 +08:00

[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>

parent 18016a5e62
commit 233df6f5c4
@@ -205,6 +205,7 @@ EXPECTED_METRICS_V1 = [
     "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:request_success_total",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",
     "vllm:request_prompt_tokens_count",
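
This first hunk extends the list of metric names the V1 metrics test expects to find in a /metrics scrape. A minimal sketch of that style of check, assuming the scraped exposition text is already in hand (`metrics_text` and `assert_expected_metrics` are illustrative names, not the real test code):

    # Illustrative only: check that every expected metric family appears in a
    # Prometheus scrape.
    EXPECTED_METRICS_V1 = [
        "vllm:prompt_tokens_total",
        "vllm:generation_tokens_total",
        "vllm:request_success_total",  # added by this commit
    ]

    def assert_expected_metrics(metrics_text: str) -> None:
        for name in EXPECTED_METRICS_V1:
            assert name in metrics_text, f"missing metric: {name}"
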
@@ -15,6 +15,23 @@ if TYPE_CHECKING:
     from vllm.sampling_params import SamplingParams
 
 
+class RequestFinishedReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, or abort.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+
+    def __str__(self):
+        return self.name.lower()
+
+
 @dataclass
 class EngineCoreRequest:
 
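
The enum's __str__ override means converting a member to a string yields the lowercase reason name, which is the form used both for the OpenAI-style finish_reason strings and for the Prometheus finished_reason label values later in this diff. A quick standalone illustration (re-declaring the enum here rather than importing vllm):

    import enum

    class RequestFinishedReason(enum.IntEnum):
        STOP = 0
        LENGTH = 1
        ABORT = 2

        def __str__(self):
            return self.name.lower()

    assert str(RequestFinishedReason.STOP) == "stop"
    assert [str(r) for r in RequestFinishedReason] == ["stop", "length", "abort"]
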
@@ -45,7 +62,7 @@ class EngineCoreOutput(
     request_id: str
     new_token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -56,7 +73,7 @@ class EngineCoreOutputs(
         gc=False):  # type: ignore[call-arg]
 
     #NOTE(Nick): We could consider ways to make this more compact,
-    # e.g. columnwise layout and using an int enum for finish/stop reason
+    # e.g. columnwise layout
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
@@ -8,7 +8,8 @@ from vllm.logger import init_logger
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
+                            RequestFinishedReason)
 
 logger = init_logger(__name__)
 
@@ -18,7 +19,7 @@ class DetokenizerOutput:
     output_text: str
     token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -147,13 +148,13 @@ class IncrementalDetokenizer:
                 stop_str, truncate_to = stop
                 if truncate_to != -1:
                     self.output_text = self.output_text[:truncate_to]
-                finish_reason = "stop"  # TODO: use constant
+                finish_reason = RequestFinishedReason.STOP
                 stop_reason = stop_str
 
         # TODO: handle stop_token_ids here too?
 
         # 3) Update the RequestOutput object with the new text.
-        finished = bool(finish_reason)
+        finished = finish_reason is not None
         if self.output_kind == RequestOutputKind.FINAL_ONLY \
             and not finished:
             return None
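
The change from bool(finish_reason) to an explicit None check is behavioural, not cosmetic: RequestFinishedReason.STOP has the integer value 0, so as an IntEnum member it is falsy, and truthiness testing would treat a request that hit a stop string as unfinished. A small sketch of the pitfall (standalone re-declaration, as above):

    import enum

    class RequestFinishedReason(enum.IntEnum):
        STOP = 0
        LENGTH = 1
        ABORT = 2

    finish_reason = RequestFinishedReason.STOP

    # IntEnum members inherit integer truthiness, so STOP (== 0) is falsy...
    assert bool(finish_reason) is False
    # ...while the explicit check still recognises the request as finished.
    assert (finish_reason is not None) is True
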
@@ -161,8 +161,10 @@ class OutputProcessor:
                     engine_core_output)
 
                 # 3) Create and handle RequestOutput objects.
-                if request_output := self._make_request_output(
-                        req_state, detokenizer_output):
+                if detokenizer_output is not None:
+                    request_output = self._make_request_output(
+                        req_state, detokenizer_output)
+
                     if req_state.queue is not None:
                         # AsyncLLM: put into queue for handling by generate().
                         req_state.queue.put_nowait(request_output)
@@ -172,6 +174,8 @@ class OutputProcessor:
 
                     # Free completed requests.
                     if request_output.finished:
+                        assert detokenizer_output.finish_reason is not None
+
                         self.request_states.pop(req_id)
                         if not engine_core_output.finished:
                             # If req not finished in EngineCore, but Detokenizer
@@ -180,7 +184,8 @@ class OutputProcessor:
 
                         # Track per-request stats
                         iteration_stats.update_from_finished_request(
-                            request_output, req_state.stats)
+                            detokenizer_output.finish_reason, request_output,
+                            req_state.stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
@@ -191,12 +196,8 @@ class OutputProcessor:
     @staticmethod
     def _make_request_output(
         request_state: RequestState,
-        detokenizer_output: Optional[DetokenizerOutput],
-    ) -> Optional[RequestOutput]:
-
-        if detokenizer_output is None:
-            return None
-
+        detokenizer_output: DetokenizerOutput,
+    ) -> RequestOutput:
         request_output = RequestOutput.new(
             request_state.request_id,
             request_state.prompt,
@@ -207,7 +208,8 @@ class OutputProcessor:
         )
         if detokenizer_output.finished:
             completion_output = request_output.outputs[0]
-            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.finish_reason = str(
+                detokenizer_output.finish_reason)
             completion_output.stop_reason = detokenizer_output.stop_reason
 
         return request_output
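
Note the asymmetry: the enum is carried as-is through EngineCoreOutput, DetokenizerOutput and the stats path, and only here, when populating the user-facing CompletionOutput, is it flattened with str() back to the familiar "stop" / "length" / "abort" strings, so API clients see no change while the metrics code below can key directly on the enum member.
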
@@ -2,13 +2,14 @@
 
 import time
 from abc import ABC, abstractmethod
-from typing import List
+from typing import Dict, List
 
 import numpy as np
 import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.engine import RequestFinishedReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -116,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_request_success: Dict[RequestFinishedReason,
+                                           prometheus_client.Counter] = {}
+        counter_request_success_base = prometheus_client.Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + ["finished_reason"])
+        for reason in RequestFinishedReason:
+            self.counter_request_success[
+                reason] = counter_request_success_base.labels(*(labelvalues +
+                                                                [str(reason)]))
+
         self.histogram_num_prompt_tokens_request = \
             prometheus_client.Histogram(
                 name="vllm:request_prompt_tokens",
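
The logger builds one labelled child of the counter per finish reason up front, so recording a finished request later is just a dictionary lookup plus .inc(), and every finished_reason series is exported (at zero) from startup rather than appearing only after the first request finishes with that reason. A self-contained sketch of the same pattern with prometheus_client; the metric name, label names and label values below are made up for illustration:

    import enum
    from typing import Dict

    import prometheus_client


    class FinishReason(enum.IntEnum):
        STOP = 0
        LENGTH = 1
        ABORT = 2

        def __str__(self) -> str:
            return self.name.lower()


    labelnames = ["model_name"]
    labelvalues = ["demo-model"]  # assumed label value, for illustration

    base = prometheus_client.Counter(
        name="demo:request_success_total",
        documentation="Count of successfully processed requests.",
        labelnames=labelnames + ["finished_reason"])

    # Pre-create one child per reason; a scrape now shows all three series at 0.
    counter_by_reason: Dict[FinishReason, prometheus_client.Counter] = {
        reason: base.labels(*(labelvalues + [str(reason)]))
        for reason in FinishReason
    }

    counter_by_reason[FinishReason.STOP].inc()  # one request finished on a stop string
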
@@ -163,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase):
                 iteration_stats.num_generation_tokens)
 
         for finished_request in iteration_stats.finished_requests:
+            self.counter_request_success[finished_request.finish_reason].inc()
             self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
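
With the per-reason children in place, the hot path in log() is the single inc() above. Continuing the sketch from the previous hunk, the exposed text can be checked with prometheus_client.generate_latest(); in PromQL, a per-reason completion rate would then look something like sum by (finished_reason) (rate(vllm:request_success_total[5m])) (query shown for illustration; the full label set depends on the deployment).

    # Continuing the sketch above: every reason appears as its own series,
    # whether or not it has been incremented yet.
    text = prometheus_client.generate_latest().decode()
    assert "demo:request_success_total" in text
    for expected_label in ('finished_reason="stop"',
                           'finished_reason="length"',
                           'finished_reason="abort"'):
        assert expected_label in text
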
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput
+    from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
 
 
 @dataclass
@@ -32,6 +32,7 @@ class RequestStateStats:
 class FinishedRequestStats:
     """Stats associated with a finished request."""
 
+    finish_reason: "RequestFinishedReason"
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
 
@@ -73,8 +74,11 @@ class IterationStats:
         request_state_stats.num_generation_tokens += num_new_generation_tokens
         request_state_stats.last_token_time = now
 
-    def update_from_finished_request(self, request_output: "RequestOutput",
+    def update_from_finished_request(self,
+                                     finish_reason: "RequestFinishedReason",
+                                     request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
         self.finished_requests.append(
-            FinishedRequestStats(len(request_output.prompt_token_ids),
+            FinishedRequestStats(finish_reason,
+                                 len(request_output.prompt_token_ids),
                                  request_state_stats.num_generation_tokens))
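
One detail worth noting: finish_reason is added to FinishedRequestStats without a default, so it must come before the defaulted token-count fields (dataclasses reject a non-default field that follows a default one), and update_from_finished_request now passes it as the first positional argument. A tiny sketch of that ordering rule, using a plain string where vLLM uses the RequestFinishedReason enum:

    from dataclasses import dataclass


    @dataclass
    class FinishedRequestStats:
        """Stats associated with a finished request."""
        finish_reason: str              # required field, so it precedes the defaults
        num_prompt_tokens: int = 0
        num_generation_tokens: int = 0


    stats = FinishedRequestStats("stop", 12, 34)
    assert stats.finish_reason == "stop"
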
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional, Union
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
@@ -109,7 +109,7 @@ class Request:
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[str, None]:
+    def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
@@ -149,7 +149,8 @@ class RequestStatus(enum.IntEnum):
         return status > RequestStatus.PREEMPTED
 
     @staticmethod
-    def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
+    def get_finished_reason(
+            status: "RequestStatus") -> Union[RequestFinishedReason, None]:
         return _FINISHED_REASON_MAP.get(status)
 
 
@@ -158,8 +159,8 @@
 # are longer than the model's length cap. Therefore, the stop
 # reason should also be "length" as in OpenAI API.
 _FINISHED_REASON_MAP = {
-    RequestStatus.FINISHED_STOPPED: "stop",
-    RequestStatus.FINISHED_LENGTH_CAPPED: "length",
-    RequestStatus.FINISHED_ABORTED: "abort",
-    RequestStatus.FINISHED_IGNORED: "length",
+    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
+    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
+    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
+    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
 }