[v1][stats][1/n] Add RequestStatsUpdate and RequestStats types (#10907)

Signed-off-by: rickyx <rickyx@anyscale.com>
Ricky Xu 2025-01-21 11:51:13 -08:00 committed by GitHub
parent 1e60f87bb3
commit 132a132100
3 changed files with 749 additions and 0 deletions
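
For orientation (not part of the diff): a minimal sketch of how the two new types compose, following the lifecycle exercised in tests/v1/test_stats.py below. The engine wiring that actually emits these updates is not included in this commit, so the caller here is hypothetical.

from vllm.sampling_params import SamplingParams
from vllm.v1.stats.common import RequestStats, RequestStatsUpdate

# Hypothetical caller; the real producers are the engine components.
stats = RequestStats(request_id="req-0")
stats.update_from(
    RequestStatsUpdate(request_id="req-0",
                       type=RequestStatsUpdate.Type.ARRIVED))
stats.update_from(
    RequestStatsUpdate(request_id="req-0",
                       type=RequestStatsUpdate.Type.INPUT_PROCESSED,
                       sampling_params=SamplingParams(n=1),
                       num_prompt_tokens=16))
# Derived metrics (queue_duration_s, e2e_latency_s, ...) become available as
# later updates (QUEUED, PREFILLING, DECODING, DETOKENIZED, FINISHED) are
# folded in; update_from validates each transition and its timestamp.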

tests/v1/test_stats.py (new file, 300 lines added)

@@ -0,0 +1,300 @@
import pytest
from vllm.sampling_params import SamplingParams
from vllm.v1.stats.common import RequestStats, RequestStatsUpdate
def make_update(
request_id: str,
update_type: RequestStatsUpdate.Type,
monotonic_ts_s: float,
**kwargs,
):
if update_type == RequestStatsUpdate.Type.INPUT_PROCESSED:
kwargs.setdefault("sampling_params", SamplingParams(n=1))
kwargs.setdefault("num_prompt_tokens", 10)
elif update_type == RequestStatsUpdate.Type.PREFILLING:
kwargs.setdefault("num_computed_tokens", 10)
kwargs.setdefault("num_cached_tokens", 10)
elif update_type == RequestStatsUpdate.Type.DETOKENIZED:
kwargs.setdefault("num_new_tokens", 10)
elif update_type == RequestStatsUpdate.Type.FINISHED:
kwargs.setdefault("finish_reason", "test_reason")
return RequestStatsUpdate(
request_id=request_id,
type=update_type,
monotonic_ts_s=monotonic_ts_s,
**kwargs,
)
def test_invalid_request_update():
request_id = "test_request"
update_specific_required_fields = {
RequestStatsUpdate.Type.INPUT_PROCESSED: [
"sampling_params",
"num_prompt_tokens",
],
RequestStatsUpdate.Type.PREFILLING: [
"num_computed_tokens",
"num_cached_tokens",
],
RequestStatsUpdate.Type.DETOKENIZED: ["num_new_tokens"],
RequestStatsUpdate.Type.FINISHED: ["finish_reason"],
}
# Missing a required field should raise a ValueError.
for update_type in RequestStatsUpdate.Type:
required_fields = update_specific_required_fields.get(update_type, [])
# Try to miss one of the required fields.
kwargs = {field: object() for field in required_fields}
for field in required_fields:
copy_kwargs = kwargs.copy()
copy_kwargs.pop(field)
with pytest.raises(ValueError):
RequestStatsUpdate(
request_id=request_id,
type=update_type,
**copy_kwargs,
)
def test_invalid_request_update_transition():
# Test invalid transition type.
for src in RequestStatsUpdate.Type:
for dst in RequestStatsUpdate.Type:
if dst not in RequestStatsUpdate._VALID_TRANSITIONS[src]:
with pytest.raises(AssertionError):
RequestStatsUpdate.check_valid_update(
make_update(
update_type=dst,
request_id="test_request",
monotonic_ts_s=1,
),
last_update_type=src,
last_updated_ts_s=0,
)
else:
RequestStatsUpdate.check_valid_update(
make_update(
request_id="test_request",
update_type=dst,
monotonic_ts_s=1,
),
last_update_type=src,
last_updated_ts_s=0,
)
# Test invalid timestamp.
with pytest.raises(AssertionError):
RequestStatsUpdate.check_valid_update(
make_update(
request_id="test_request",
update_type=RequestStatsUpdate.Type.ARRIVED,
monotonic_ts_s=1,
),
last_update_type=None,
last_updated_ts_s=2,
)
def test_lifecycle_updates():
request_id = "test_request"
stats = RequestStats(request_id=request_id)
# Test the below scenario:
arrived_ts = 0
input_processed_ts = 1
queued_ts = 2
prefilling_ts = 3
decoded_ts = 5
detokenized_ts = 6
decoded_2_ts = 7
detokenized_2_ts = 8
preempted_ts = 9
resumed_ts = 10
decoded_3_ts = 11
detokenized_3_ts = 12
finished_ts = 13
# Test ARRIVED
arrived_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.ARRIVED,
monotonic_ts_s=arrived_ts,
)
stats.update_from(arrived_update)
assert stats.arrival_ts_s == arrived_ts
assert stats.last_updated_ts_s == arrived_ts
# Test INPUT_PROCESSED
sampling_params = SamplingParams(n=1)
input_processed_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.INPUT_PROCESSED,
monotonic_ts_s=input_processed_ts,
sampling_params=sampling_params,
num_prompt_tokens=6,
)
stats.update_from(input_processed_update)
assert stats.input_processor_end_ts_s == input_processed_ts
assert stats.last_updated_ts_s == input_processed_ts
assert stats.num_prompt_tokens == 6
assert stats.sampling_params == sampling_params
assert stats.first_token_ts_s is None
assert stats.prefill_ts_s is None
# Test QUEUED
queued_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.QUEUED,
monotonic_ts_s=queued_ts,
)
stats.update_from(queued_update)
assert stats.queued_ts_s == queued_ts
assert stats.last_updated_ts_s == queued_ts
# Test PREFILLING
prefilling_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.PREFILLING,
monotonic_ts_s=prefilling_ts,
num_computed_tokens=3,
num_cached_tokens=1,
)
stats.update_from(prefilling_update)
assert stats.prefill_ts_s == prefilling_ts
assert stats.num_computed_tokens == 3
assert stats.num_cached_tokens == 1
assert stats.queue_duration_s == prefilling_ts - queued_ts
# Test DECODING
decoded_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.DECODING,
monotonic_ts_s=decoded_ts,
)
stats.update_from(decoded_update)
assert stats.last_updated_ts_s == decoded_ts
# Test DETOKENIZED
detokenized_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.DETOKENIZED,
monotonic_ts_s=detokenized_ts,
num_new_tokens=1,
)
stats.update_from(detokenized_update)
assert stats.last_updated_ts_s == detokenized_ts
assert stats.num_output_tokens == 1
# Since arrival
assert stats.first_token_latency_s == detokenized_ts - arrived_ts
# Since first scheduled
assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
# Test that another DECODING/DETOKENIZED pair yields the correct
# inter-token latency.
decoded_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.DECODING,
monotonic_ts_s=decoded_2_ts,
)
stats.update_from(decoded_update)
detokenized_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.DETOKENIZED,
monotonic_ts_s=detokenized_2_ts,
num_new_tokens=1,
)
stats.update_from(detokenized_update)
assert stats.output_token_latency_s_lst == [
detokenized_2_ts - detokenized_ts,
]
assert stats.num_output_tokens == 2
# Test PREEMPTED
preempted_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.PREEMPTED,
monotonic_ts_s=preempted_ts,
)
stats.update_from(preempted_update)
assert stats.last_updated_ts_s == preempted_ts
assert stats.preempted_ts_s_lst == [preempted_ts]
# States should be reset
assert stats.num_computed_tokens == 0
assert stats.num_cached_tokens == 0
# These states should not be reset
assert stats.num_output_tokens == 2
assert stats.output_token_latency_s_lst == [
detokenized_2_ts - detokenized_ts,
]
assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
assert stats.num_prompt_tokens == 6
assert stats.prefill_start_ts_s_lst == [prefilling_ts]
# Test resumed
resumed_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.PREFILLING,
monotonic_ts_s=resumed_ts,
num_computed_tokens=6,
num_cached_tokens=2,
)
stats.update_from(resumed_update)
# prefill timestamp should not be updated since it's a resumed prefill
assert stats.prefill_ts_s == prefilling_ts
assert stats.num_computed_tokens == 6
assert stats.num_cached_tokens == 2
assert stats.prefill_start_ts_s_lst == [
prefilling_ts,
resumed_ts,
]
assert stats.last_updated_ts_s == resumed_ts
# Test that another DECODING/DETOKENIZED pair after preemption does not
# change the first token timestamp.
decoded_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.DECODING,
monotonic_ts_s=decoded_3_ts,
)
detokenized_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.DETOKENIZED,
monotonic_ts_s=detokenized_3_ts,
num_new_tokens=1,
)
stats.update_from(decoded_update)
stats.update_from(detokenized_update)
assert stats.first_token_ts_s == detokenized_ts
assert stats.num_output_tokens == 3
assert stats.output_token_latency_s_lst == [
detokenized_2_ts - detokenized_ts,
detokenized_3_ts - detokenized_2_ts,
]
# Test FINISHED
finished_update = RequestStatsUpdate(
request_id=request_id,
type=RequestStatsUpdate.Type.FINISHED,
monotonic_ts_s=finished_ts,
finish_reason="test_reason",
)
stats.update_from(finished_update)
assert stats.last_updated_ts_s == finished_ts
assert stats.e2e_latency_s == finished_ts - arrived_ts
assert stats.inference_latency_s == finished_ts - prefilling_ts
assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
assert stats.decode_latency_s == finished_ts - detokenized_ts
assert stats.first_token_latency_s == detokenized_ts - arrived_ts
assert stats.queue_duration_s == prefilling_ts - queued_ts
assert stats.is_finished
assert stats.finish_reason == "test_reason"
# TODO(rickyx): Add model forward/execute time.
assert stats.model_forward_duration_s == 0.0
assert stats.model_execute_duration_s == 0.0
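
For reference (not part of the diff), the derived latencies asserted at the end of test_lifecycle_updates follow directly from the timeline constants defined at the top of that test:

# arrived_ts = 0, queued_ts = 2, prefilling_ts = 3,
# detokenized_ts = 6 (first token), finished_ts = 13
#
# first_token_latency_s = detokenized_ts - arrived_ts           =  6 - 0 =  6
# prefill_latency_s     = detokenized_ts - prefilling_ts        =  6 - 3 =  3
# queue_duration_s      = prefilling_ts - queued_ts             =  3 - 2 =  1
# inference_latency_s   = finished_ts - prefilling_ts           = 13 - 3 = 10
# e2e_latency_s         = finished_ts - arrived_ts              = 13 - 0 = 13
# decode_latency_s      = e2e_latency_s - first_token_latency_s = 13 - 6 =  7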


vllm/v1/stats/common.py (new file, 449 lines added)

@@ -0,0 +1,449 @@
import time
from dataclasses import dataclass
from dataclasses import field as dataclass_field
from enum import IntEnum
from typing import ClassVar, Dict, List, Optional, Set
import msgspec
from msgspec import field as msgspec_field
from vllm.sampling_params import SamplingParams
class RequestStatsUpdate(msgspec.Struct,
array_like=True,
omit_defaults=True,
gc=False):
"""
An update to the request stats.
This represents a stats update at a specific timestamp with metadata
associated with the update.
NOTE: since updates are generated by multiple components of the engine
(e.g. input processor, scheduler, engine core, etc.), each update is
recorded with a monotonic timestamp, which is only used to compute
intervals between events. An explicit wall-clock timestamp should be used
wherever an absolute time is needed.
WARNING: this assumes all stats are generated on a single machine, since
monotonic clocks are not comparable across machines. If multiple machines
are involved, all stats updates should be generated on one machine, or a
different timing scheme should be used.
"""
class Type(IntEnum):
"""See `RequestStats` for the lifecycle of a request."""
# Request arrived at the engine frontend.
ARRIVED = 0
# Input processed by the input processor.
INPUT_PROCESSED = 1
# Queued on the engine core.
QUEUED = 2
# Scheduled running prefill by the scheduler.
# A request could be running a new prefill on the prompt tokens or
# a resumed prefill on the original prefill tokens + generated output
# tokens before preemption.
PREFILLING = 3
# Preempted by the scheduler.
PREEMPTED = 4
# Output token is generated by the engine core.
DECODING = 5
# Token detokenized by the detokenizer.
# We will record the timestamp for each output token, as well as the
# finish reason.
DETOKENIZED = 6
# Request finishes (or aborts).
FINISHED = 7
"""
Valid state updates:
ARRIVED
INPUT_PROCESSED QUEUED PREFILLING
- DECODING
|
|
DETOKENIZED
PREEMPTED
FINISHED (All could go to FINISHED)
"""
_VALID_TRANSITIONS: ClassVar[Dict[Type, Set[Type]]] = {
Type.ARRIVED: {
Type.INPUT_PROCESSED,
Type.FINISHED,
},
Type.INPUT_PROCESSED: {
Type.QUEUED,
Type.FINISHED,
},
Type.QUEUED: {
Type.PREFILLING,
Type.FINISHED,
},
Type.PREFILLING: {
Type.DECODING,
Type.PREEMPTED,
Type.FINISHED,
},
Type.DECODING: {
Type.DETOKENIZED,
Type.FINISHED,
},
Type.DETOKENIZED: {
Type.DECODING,
Type.PREEMPTED,
Type.FINISHED,
},
Type.PREEMPTED: {Type.PREFILLING, Type.FINISHED},
Type.FINISHED: set(),
}
request_id: str
type: Type
# Timestamp when the update is recorded. This is used to record time
# intervals between events rather than wall clock time.
monotonic_ts_s: float = msgspec_field(
default_factory=lambda: time.monotonic())
############################################################
# Metadata associated with the update.
############################################################
# For input_processed. Metadata needed for stats logging.
num_prompt_tokens: Optional[int] = None
sampling_params: Optional[SamplingParams] = None
# For running.
# Number of tokens computed when scheduled to run.
num_computed_tokens: Optional[int] = None
# Number of cached tokens when scheduled to run.
num_cached_tokens: Optional[int] = None
# For detokenized.
# The number of new output tokens generated.
num_new_tokens: Optional[int] = None
# For both detokenized and finished.
# Finish reason.
finish_reason: Optional[str] = None
# Non-optional fields for each update type.
_REQUIRED_FIELDS: ClassVar[Dict[Type, List[str]]] = {
Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"],
Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"],
Type.DETOKENIZED: ["num_new_tokens"],
Type.FINISHED: ["finish_reason"],
}
def __post_init__(self):
required_fields = self._REQUIRED_FIELDS.get(self.type, [])
for field in required_fields:
if getattr(self, field) is None:
raise ValueError(
f"Field {field} is required for update type {self.type}.")
@staticmethod
def check_valid_update(
update: "RequestStatsUpdate",
last_update_type: Optional[Type],
last_updated_ts_s: Optional[float],
):
if last_update_type is None:
assert update.type == RequestStatsUpdate.Type.ARRIVED
else:
valid_cur_update_types = RequestStatsUpdate._VALID_TRANSITIONS[
last_update_type]
assert update.type in valid_cur_update_types, (
f"Invalid update type: {update.type} for last_update_type: "
f"{last_update_type}.")
if last_updated_ts_s is not None:
assert update.monotonic_ts_s >= last_updated_ts_s, (
"Update timestamp must be monotonically increasing, but "
f"last_updated_ts_s={last_updated_ts_s} and "
f"update.monotonic_ts_s={update.monotonic_ts_s}.")
@dataclass
class RequestStats:
"""Stats associated with a request (`Request`)."""
############################################################
# Metadata
############################################################
request_id: str
sampling_params: Optional[SamplingParams] = None
num_prompt_tokens: Optional[int] = None
############################################################
# Metrics and Stats
############################################################
# Timestamp when the request was last updated.
last_updated_ts_s: Optional[float] = None
# Last update stats type.
last_update_type: Optional[RequestStatsUpdate.Type] = None
# Timestamp when the request arrived at the llm engine.
arrival_ts_s: Optional[float] = None
# Number of tokens cached. When part of the request prefix is cached,
# this will be set.
num_cached_tokens: int = 0
# Number of tokens computed.
num_computed_tokens: int = 0
# The timestamp when the request started waiting in the queue.
queued_ts_s: Optional[float] = None
# When the input processor is completed.
input_processor_end_ts_s: Optional[float] = None
# A sorted list of timestamps when the request was scheduled to prefill.
# This could be when:
# 1. the request is newly scheduled, so it's a new prefill.
# 2. the request was preempted and resumed. It is equivalent to running
# a prefill of the original prefill tokens + generated output tokens
# before preemption.
prefill_start_ts_s_lst: List[float] = dataclass_field(default_factory=list)
# A list of timestamps when a token is decoded by the engine core.
decoding_ts_s_lst: List[float] = dataclass_field(default_factory=list)
# A sorted list of timestamps for each output token.
output_token_ts_s_lst: List[float] = dataclass_field(default_factory=list)
# First token's timestamp.
first_token_ts_s: Optional[float] = None
# TODO(rickyx): we need model runner to surface these.
model_forward_duration_s: float = 0.0
# Includes model forward, blocking/sync across workers, CPU-GPU sync
# time, and sampling time.
model_execute_duration_s: float = 0.0
# A sorted list of timestamps when the request was preempted at the
# scheduler.
# TODO(rickyx): right now, we don't actually have a good high-level
# metric to measure the impact of preemption other than observation of
# large P99 TPOT. Ideally we could quantify the impact of preemption by
# measuring the number of tokens re-computed due to preemption.
preempted_ts_s_lst: List[float] = dataclass_field(default_factory=list)
# Timestamp when the request was finished at the engine core.
finished_ts_s: Optional[float] = None
# Finish reason.
finish_reason: Optional[str] = None
############################################################
# Derived properties.
############################################################
@property
def prefill_ts_s(self) -> Optional[float]:
"""The timestamp when the request started prefilling.
Since a request could be preempted in decoding and later resumed
to prefill the decoded tokens, we use the first prefill start timestamp.
"""
return (self.prefill_start_ts_s_lst[0]
if self.prefill_start_ts_s_lst else None)
@property
def e2e_latency_s(self) -> Optional[float]:
if self.finished_ts_s is None or self.arrival_ts_s is None:
return None
assert self.finished_ts_s >= self.arrival_ts_s
return self.finished_ts_s - self.arrival_ts_s
@property
def queue_duration_s(self) -> Optional[float]:
"""How long the request was waiting to run."""
if self.queued_ts_s is None or self.prefill_ts_s is None:
# Either not queued or not running yet.
return None
assert self.queued_ts_s <= self.prefill_ts_s
return self.prefill_ts_s - self.queued_ts_s
@property
def inference_latency_s(self) -> Optional[float]:
"""How long the request was running inference
(prefill and decode)."""
if self.finished_ts_s is None or self.prefill_ts_s is None:
return None
assert self.finished_ts_s >= self.prefill_ts_s
return self.finished_ts_s - self.prefill_ts_s
@property
def first_token_latency_s(self) -> Optional[float]:
if self.first_token_ts_s is None or self.arrival_ts_s is None:
return None
assert self.first_token_ts_s >= self.arrival_ts_s
return self.first_token_ts_s - self.arrival_ts_s
@property
def prefill_latency_s(self) -> Optional[float]:
if self.first_token_ts_s is None or self.prefill_ts_s is None:
return None
assert self.first_token_ts_s >= self.prefill_ts_s
return self.first_token_ts_s - self.prefill_ts_s
@property
def decode_latency_s(self) -> Optional[float]:
if self.e2e_latency_s is None or self.first_token_latency_s is None:
return None
assert self.e2e_latency_s >= self.first_token_latency_s
return self.e2e_latency_s - self.first_token_latency_s
@property
def output_token_latency_s_lst(self) -> List[float]:
if len(self.output_token_ts_s_lst) == 0:
return []
latency_s_lst = []
for i in range(1, len(self.output_token_ts_s_lst)):
assert (self.output_token_ts_s_lst[i] >=
self.output_token_ts_s_lst[i - 1])
latency_s = (self.output_token_ts_s_lst[i] -
self.output_token_ts_s_lst[i - 1])
latency_s_lst.append(latency_s)
return latency_s_lst
@property
def num_output_tokens(self) -> int:
return len(self.output_token_ts_s_lst)
@property
def is_finished(self) -> bool:
return self.finished_ts_s is not None
def update_from(self, update: "RequestStatsUpdate"):
RequestStatsUpdate.check_valid_update(update, self.last_update_type,
self.last_updated_ts_s)
ts = update.monotonic_ts_s
self.last_updated_ts_s = ts
self.last_update_type = update.type
if update.type == RequestStatsUpdate.Type.ARRIVED:
self.arrival_ts_s = ts
elif update.type == RequestStatsUpdate.Type.INPUT_PROCESSED:
self.input_processor_end_ts_s = ts
self.sampling_params = update.sampling_params
self.num_prompt_tokens = update.num_prompt_tokens
elif update.type == RequestStatsUpdate.Type.QUEUED:
self.queued_ts_s = ts
elif update.type == RequestStatsUpdate.Type.PREFILLING:
self.prefill_start_ts_s_lst.append(ts)
self.num_cached_tokens = update.num_cached_tokens
self.num_computed_tokens = update.num_computed_tokens
elif update.type == RequestStatsUpdate.Type.PREEMPTED:
self._reset_for_preemption(ts)
elif update.type == RequestStatsUpdate.Type.DECODING:
self.decoding_ts_s_lst.append(ts)
elif update.type == RequestStatsUpdate.Type.DETOKENIZED:
self._record_detokenized_output(
ts,
update.num_new_tokens,
)
elif update.type == RequestStatsUpdate.Type.FINISHED:
self.finished_ts_s = ts
self.finish_reason = update.finish_reason
else:
raise ValueError(f"Unknown update type: {update.type}")
def _record_detokenized_output(
self,
ts_s: float,
num_new_tokens: int,
):
# Update if first output token is generated.
if len(self.output_token_ts_s_lst) == 0:
self.first_token_ts_s = ts_s
assert (
self.prefill_ts_s is not None
), "Request must be running before generating output tokens."
# num_new_tokens output tokens were generated at this timestamp.
self.output_token_ts_s_lst.extend([ts_s] * num_new_tokens)
def _reset_for_preemption(self, ts_s: float):
self.preempted_ts_s_lst.append(ts_s)
# Reset the computed tokens since it might restart the prefill.
self.num_computed_tokens = 0
# Cached token count might also change when resumed.
self.num_cached_tokens = 0
# These stats don't change since they were recorded before the request
# started running.
# - arrival_ts_s
# - input_processor_end_ts_s
# - sampling_params
# - num_prompt_tokens
# - first_token_ts_s
#
# These stats are accumulated over preemptions:
# - output_token_ts_s_lst
# - prefill_start_ts_s_lst (after preemption, it will prefill the
# original prefill tokens and any output tokens generated before
# preemption.)
@dataclass
class KVCacheStats:
# KV Cache Usage in %
gpu_cache_usage_sys: float = 0.0
gpu_prefix_cache_hit_rate: float = 0.0
@dataclass
class SchedulerStats:
"""Stats associated with the scheduler."""
# Number of requests currently running.
num_running_reqs: int = 0
# Number of requests currently waiting.
num_waiting_reqs: int = 0
kv_cache_stats: KVCacheStats = dataclass_field(
default_factory=KVCacheStats)
@dataclass
class EngineCoreProcessStats:
"""Stats associated with the engine core process."""
# Number of requests currently in the input queue. None if the engine core
# is not running in multiprocess mode.
input_queue_size: Optional[int] = None
# Number of outputs currently in the output queue. None if the engine core
# is not running in multiprocess mode.
output_queue_size: Optional[int] = None
class EngineCoreStatsSnapshot(msgspec.Struct,
array_like=True,
omit_defaults=True,
gc=False):
"""
A snapshot of the EngineCore's current stats over a period of time.
"""
# Snapshot of the scheduler stats.
scheduler_stats: SchedulerStats = msgspec_field(
default_factory=SchedulerStats)
# Per request stats updates.
requests_stats_updates: List[RequestStatsUpdate] = msgspec_field(
default_factory=list)
# Engine core's queue stats.
engine_core_process_stats: EngineCoreProcessStats = msgspec_field(
default_factory=EngineCoreProcessStats)
# TODO(rickyx): Add other components' stats,
# e.g. model runner/worker, etc.
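
These stats types are msgspec Structs with array_like=True and gc=False, presumably so snapshots can be serialized cheaply across the engine-core process boundary. A minimal round-trip sketch (the actual transport is not part of this commit, and msgpack is an assumed choice of encoding):

import msgspec

from vllm.v1.stats.common import EngineCoreStatsSnapshot, RequestStatsUpdate

snapshot = EngineCoreStatsSnapshot(requests_stats_updates=[
    RequestStatsUpdate(request_id="req-0",
                       type=RequestStatsUpdate.Type.ARRIVED),
])
# array_like=True Structs encode as compact msgpack arrays (no field names).
buf = msgspec.msgpack.encode(snapshot)
decoded = msgspec.msgpack.decode(buf, type=EngineCoreStatsSnapshot)
assert decoded.requests_stats_updates[0].request_id == "req-0"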