[V1][Metrics] Add API for accessing in-memory Prometheus metrics (#17010)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Mark McLoughlin 2025-05-27 10:37:06 +01:00 committed by GitHub
parent 4318c0559d
commit 06a0338015
10 changed files with 543 additions and 28 deletions
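The new API is LLM.get_metrics(), which returns typed snapshots of the engine's in-memory Prometheus metrics. A minimal usage sketch, condensed from the example and tests added in this commit (stats logging must be enabled):

from vllm import LLM
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector

llm = LLM(model="facebook/opt-125m", disable_log_stats=False)
llm.generate(["Hello, my name is"])

for metric in llm.get_metrics():
    if isinstance(metric, (Counter, Gauge)):
        print(f"{metric.name} = {metric.value}")
    elif isinstance(metric, Vector):
        print(f"{metric.name} = {metric.values}")
    elif isinstance(metric, Histogram):
        print(f"{metric.name}: count={metric.count}, sum={metric.sum}")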

View File

@@ -222,6 +222,7 @@ steps:
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
+   - pytest -v -s v1/test_metrics_reader.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e

View File

@@ -6,6 +6,7 @@ import os
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Vector


def load_prompts(dataset_path, num_prompts):
@@ -105,30 +106,33 @@ def main():
        print(f"generated text: {output.outputs[0].text}")
        print("-" * 50)

-    if not hasattr(outputs, "metrics") or outputs.metrics is None:
+    try:
+        metrics = llm.get_metrics()
+    except AssertionError:
+        print("Metrics are not supported in the V0 engine.")
        return

-    # calculate the average number of accepted tokens per forward pass, +1 is
-    # to account for the token from the target model that's always going to be
-    # accepted
-    acceptance_counts = [0] * (args.num_spec_tokens + 1)
-    for output in outputs:
-        for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
-            acceptance_counts[step] += count
+    num_drafts = num_accepted = 0
+    acceptance_counts = [0] * args.num_spec_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
+            assert isinstance(metric, Counter)
+            num_accepted += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                acceptance_counts[pos] += metric.values[pos]

    print("-" * 50)
-    print(
-        f"mean acceptance length (including bonus tokens): \
-        {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}"
-    )
+    print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}")
    print("-" * 50)

    # print acceptance at each token position
    for i in range(len(acceptance_counts)):
-        print(
-            f"acceptance at token {i}:"
-            f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}"
-        )
+        print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}")


if __name__ == "__main__":
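For a concrete feel for the new formula (illustrative numbers, not measured output): with num_drafts = 200 and num_accepted = 320, the mean acceptance length is 1 + 320 / 200 = 2.6, i.e. on average each draft contributes 1.6 accepted draft tokens plus the one token the target model always emits.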

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create an LLM.
    llm = LLM(model="facebook/opt-125m", disable_log_stats=False)

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)

    # Dump all metrics
    for metric in llm.get_metrics():
        if isinstance(metric, Gauge):
            print(f"{metric.name} (gauge) = {metric.value}")
        elif isinstance(metric, Counter):
            print(f"{metric.name} (counter) = {metric.value}")
        elif isinstance(metric, Vector):
            print(f"{metric.name} (vector) = {metric.values}")
        elif isinstance(metric, Histogram):
            print(f"{metric.name} (histogram)")
            print(f"    sum = {metric.sum}")
            print(f"    count = {metric.count}")
            for bucket_le, value in metric.buckets.items():
                print(f"    {bucket_le} = {value}")


if __name__ == "__main__":
    main()
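To read a single value from the snapshot, filter by metric name; a small sketch reusing the imports above (the name is one asserted on in the tests later in this commit):

gen_tokens = next(m for m in llm.get_metrics()
                  if m.name == "vllm:generation_tokens")
assert isinstance(gen_tokens, Counter)
print(f"generated tokens so far: {gen_tokens.value}")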

View File

@@ -6,6 +6,7 @@ from typing import Optional
import pytest

from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector

MODEL = "facebook/opt-125m"
DTYPE = "half"

@@ -97,3 +98,67 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
        raise AssertionError(
            f"{len(completion_counts)} unique completions; expected"
            f" {n}. Repeats: {repeats}")


def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
    max_tokens = 100
    # Use spec decoding to test num_accepted_tokens_per_pos
    speculative_config = {
        "method": "ngram",
        "prompt_lookup_max": 5,
        "prompt_lookup_min": 3,
        "num_speculative_tokens": 5,
    }
    monkeypatch.setenv("VLLM_USE_V1", "1")
    with vllm_runner(
            MODEL,
            speculative_config=speculative_config,
            disable_log_stats=False,
    ) as vllm_model:
        model: LLM = vllm_model.model
        sampling_params = SamplingParams(temperature=0.0,
                                         max_tokens=max_tokens)
        outputs = model.generate(example_prompts, sampling_params)

        n_prompts = len(example_prompts)
        assert len(outputs) == n_prompts
        total_tokens = 0
        for out in outputs:
            assert len(out.outputs) == 1
            total_tokens += len(out.outputs[0].token_ids)
        assert total_tokens == max_tokens * n_prompts

        metrics = model.get_metrics()

        def find_metric(name) -> list[Metric]:
            found = []
            for metric in metrics:
                if metric.name == name:
                    found.append(metric)
            return found

        num_requests_running = find_metric("vllm:num_requests_running")
        assert len(num_requests_running) == 1
        assert isinstance(num_requests_running[0], Gauge)
        assert num_requests_running[0].value == 0.0

        generation_tokens = find_metric("vllm:generation_tokens")
        assert len(generation_tokens) == 1
        assert isinstance(generation_tokens[0], Counter)
        assert generation_tokens[0].value == total_tokens

        request_generation_tokens = find_metric(
            "vllm:request_generation_tokens")
        assert len(request_generation_tokens) == 1
        assert isinstance(request_generation_tokens[0], Histogram)
        assert "+Inf" in request_generation_tokens[0].buckets
        assert request_generation_tokens[0].buckets["+Inf"] == n_prompts
        assert request_generation_tokens[0].count == n_prompts
        assert request_generation_tokens[0].sum == total_tokens

        num_accepted_tokens_per_pos = find_metric(
            "vllm:spec_decode_num_accepted_tokens_per_pos")
        assert len(num_accepted_tokens_per_pos) == 1
        assert isinstance(num_accepted_tokens_per_pos[0], Vector)
        assert len(num_accepted_tokens_per_pos[0].values) == 5

View File

@@ -0,0 +1,112 @@
# SPDX-License-Identifier: Apache-2.0

import prometheus_client
import pytest

from vllm.v1.metrics.reader import (Counter, Gauge, Histogram, Vector,
                                    get_metrics_snapshot)


@pytest.fixture(autouse=True)
def test_registry(monkeypatch):
    # Use a custom registry for tests
    test_registry = prometheus_client.CollectorRegistry(auto_describe=True)
    monkeypatch.setattr("vllm.v1.metrics.reader.REGISTRY", test_registry)
    return test_registry


@pytest.mark.parametrize("num_engines", [1, 4])
def test_gauge_metric(test_registry, num_engines):
    g = prometheus_client.Gauge("vllm:test_gauge",
                                "Test gauge metric",
                                labelnames=["model", "engine_index"],
                                registry=test_registry)
    for i in range(num_engines):
        g.labels(model="foo", engine_index=str(i)).set(98.5)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Gauge)
        assert m.name == "vllm:test_gauge"
        assert m.value == 98.5
        assert m.labels["model"] == "foo"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_counter_metric(test_registry, num_engines):
    c = prometheus_client.Counter("vllm:test_counter",
                                  "Test counter metric",
                                  labelnames=["model", "engine_index"],
                                  registry=test_registry)
    for i in range(num_engines):
        c.labels(model="bar", engine_index=str(i)).inc(19)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Counter)
        assert m.name == "vllm:test_counter"
        assert m.value == 19
        assert m.labels["model"] == "bar"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_histogram_metric(test_registry, num_engines):
    h = prometheus_client.Histogram("vllm:test_histogram",
                                    "Test histogram metric",
                                    labelnames=["model", "engine_index"],
                                    buckets=[10, 20, 30, 40, 50],
                                    registry=test_registry)
    for i in range(num_engines):
        hist = h.labels(model="blaa", engine_index=str(i))
        hist.observe(42)
        hist.observe(21)
        hist.observe(7)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Histogram)
        assert m.name == "vllm:test_histogram"
        assert m.count == 3
        assert m.sum == 70
        assert m.buckets["10.0"] == 1
        assert m.buckets["20.0"] == 1
        assert m.buckets["30.0"] == 2
        assert m.buckets["40.0"] == 2
        assert m.buckets["50.0"] == 3
        assert m.labels["model"] == "blaa"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_vector_metric(test_registry, num_engines):
    c = prometheus_client.Counter(
        "vllm:spec_decode_num_accepted_tokens_per_pos",
        "Vector-like counter metric",
        labelnames=["position", "model", "engine_index"],
        registry=test_registry)
    for i in range(num_engines):
        c.labels(position="0", model="llama", engine_index=str(i)).inc(10)
        c.labels(position="1", model="llama", engine_index=str(i)).inc(5)
        c.labels(position="2", model="llama", engine_index=str(i)).inc(1)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Vector)
        assert m.name == "vllm:spec_decode_num_accepted_tokens_per_pos"
        assert m.values == [10, 5, 1]
        assert m.labels["model"] == "llama"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])
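As these tests show, get_metrics_snapshot() simply walks a prometheus_client registry (the module-level REGISTRY unless patched, as the fixture above does) and keeps only vllm:-prefixed metrics. A hedged sketch of calling it directly, outside the LLM API:

from vllm.v1.metrics.reader import get_metrics_snapshot

# Reads whatever vllm:* metrics the running engine has registered
# in prometheus_client's default REGISTRY.
for metric in get_metrics_snapshot():
    print(metric.name, metric.labels)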

View File

@@ -4,7 +4,8 @@ import itertools
import warnings
from collections.abc import Sequence
from contextlib import contextmanager
-from typing import Any, Callable, ClassVar, Optional, Union, cast, overload
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union,
+                    cast, overload)

import cloudpickle
import torch.nn as nn

@@ -47,6 +48,9 @@ from vllm.usage.usage_lib import UsageContext
from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
                        is_list_of)

+if TYPE_CHECKING:
+    from vllm.v1.metrics.reader import Metric
+
logger = init_logger(__name__)

_R = TypeVar("_R", default=Any)
@@ -1294,6 +1298,20 @@ class LLM:
        """
        self.llm_engine.wake_up(tags)

+    def get_metrics(self) -> list["Metric"]:
+        """Return a snapshot of aggregated metrics from Prometheus.
+
+        Returns:
+            A list of ``Metric`` instances capturing the current state
+            of all aggregated metrics from Prometheus.
+
+        Note:
+            This method is only available with the V1 LLM engine.
+        """
+        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+        assert isinstance(self.llm_engine, V1LLMEngine)
+        return self.llm_engine.get_metrics()
+
    # LEGACY
    def _convert_v1_inputs(
        self,

View File

@@ -27,7 +27,10 @@ from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.abstract import Executor
-from vllm.v1.metrics.loggers import StatLoggerFactory
+from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase,
+                                     StatLoggerFactory)
+from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
+from vllm.v1.metrics.stats import IterationStats

logger = init_logger(__name__)

@@ -64,6 +67,11 @@ class LLMEngine:
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config

+        self.log_stats = log_stats
+        self.stat_logger: Optional[StatLoggerBase] = None
+        if self.log_stats:
+            self.stat_logger = PrometheusStatLogger(vllm_config)
+
        # important: init dp group before init the engine_core
        # In the decoupled engine case this is handled in EngineCoreProc.
        parallel_config = vllm_config.parallel_config

@@ -86,7 +94,7 @@ class LLMEngine:
        # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
        self.output_processor = OutputProcessor(self.tokenizer,
-                                                log_stats=False)
+                                                log_stats=self.log_stats)

        # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
        self.engine_core = EngineCoreClient.make_client(
@@ -94,7 +102,7 @@ class LLMEngine:
            asyncio_mode=False,
            vllm_config=vllm_config,
            executor_class=executor_class,
-            log_stats=False,  # FIXME: implement
+            log_stats=self.log_stats,
        )

        if not multiprocess_mode:

@@ -223,12 +231,21 @@ class LLMEngine:
        outputs = self.engine_core.get_output()

        # 2) Process EngineCoreOutputs.
+        iteration_stats = IterationStats() if self.log_stats else None
        processed_outputs = self.output_processor.process_outputs(
-            outputs.outputs)
+            outputs.outputs,
+            engine_core_timestamp=outputs.timestamp,
+            iteration_stats=iteration_stats)

        # 3) Abort any reqs that finished due to stop strings.
        self.engine_core.abort_requests(processed_outputs.reqs_to_abort)

+        # 4) Record stats
+        if self.stat_logger is not None:
+            assert outputs.scheduler_stats is not None
+            self.stat_logger.record(scheduler_stats=outputs.scheduler_stats,
+                                    iteration_stats=iteration_stats)
+
        return processed_outputs.request_outputs

    def get_vllm_config(self):

@@ -260,6 +277,10 @@ class LLMEngine:
    def is_sleeping(self) -> bool:
        return self.engine_core.is_sleeping()

+    def get_metrics(self) -> list[Metric]:
+        assert self.log_stats, "Stat logging disabled"
+        return get_metrics_snapshot()
+
    def get_tokenizer_group(self) -> TokenizerGroup:
        if self.tokenizer is None:
            raise ValueError("Unable to get tokenizer because "
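The snapshot path depends on stats logging being enabled: the PrometheusStatLogger is only constructed when log_stats is true, and get_metrics() asserts on it. A small sketch of the failure mode when stats are disabled:

from vllm import LLM

llm = LLM(model="facebook/opt-125m", disable_log_stats=True)
llm.get_metrics()  # raises AssertionError("Stat logging disabled")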

View File

@@ -200,24 +200,24 @@ class PrometheusStatLogger(StatLoggerBase):
        # Counters
        #
        self.counter_num_preempted_reqs = self._counter_cls(
-            name="vllm:num_preemptions_total",
+            name="vllm:num_preemptions",
            documentation="Cumulative number of preemption from the engine.",
            labelnames=labelnames).labels(*labelvalues)

        self.counter_prompt_tokens = self._counter_cls(
-            name="vllm:prompt_tokens_total",
+            name="vllm:prompt_tokens",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames).labels(*labelvalues)

        self.counter_generation_tokens = self._counter_cls(
-            name="vllm:generation_tokens_total",
+            name="vllm:generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames).labels(*labelvalues)

        self.counter_request_success: dict[FinishReason,
                                           prometheus_client.Counter] = {}
        counter_request_success_base = self._counter_cls(
-            name="vllm:request_success_total",
+            name="vllm:request_success",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + ["finished_reason"])
        for reason in FinishReason:
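These renames drop the "_total" suffix from the declared counter names: prometheus_client handles that suffix itself for counters (the exported sample is always "<name>_total"), and the reader added in this commit matches counter samples by appending "_total" to the collected metric name. A small standalone illustration of that prometheus_client behavior (assumed standard library behavior, not code from this commit):

import prometheus_client

registry = prometheus_client.CollectorRegistry()
c = prometheus_client.Counter("vllm:prompt_tokens", "Prefill tokens",
                              registry=registry)
c.inc(3)
for metric in registry.collect():
    print(metric.name)                       # vllm:prompt_tokens
    print([s.name for s in metric.samples])  # typically includes vllm:prompt_tokens_total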

vllm/v1/metrics/reader.py (new file, 245 lines)
View File

@@ -0,0 +1,245 @@
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass
from typing import Optional

from prometheus_client import REGISTRY
from prometheus_client import Metric as PromMetric
from prometheus_client.samples import Sample


@dataclass
class Metric:
    """A base class for prometheus metrics.

    Each metric may be associated with key=value labels, and
    in some cases a single vLLM instance may have multiple
    metrics with the same name but different sets of labels.
    """
    name: str
    labels: dict[str, str]


@dataclass
class Counter(Metric):
    """A monotonically increasing integer counter."""
    value: int


@dataclass
class Vector(Metric):
    """An ordered array of integer counters.

    This type - which doesn't exist in Prometheus - models one very
    specific metric, vllm:spec_decode_num_accepted_tokens_per_pos.
    """
    values: list[int]


@dataclass
class Gauge(Metric):
    """A numerical value that can go up or down."""
    value: float


@dataclass
class Histogram(Metric):
    """Observations recorded in configurable buckets.

    Buckets are represented by a dictionary. The key is
    the upper limit of the bucket, and the value is the
    observed count in that bucket. A '+Inf' key always
    exists.

    The count property is the total count across all
    buckets, identical to the count of the '+Inf' bucket.

    The sum property is the total sum of all observed
    values.
    """
    count: int
    sum: float
    buckets: dict[str, int]
def get_metrics_snapshot() -> list[Metric]:
    """An API for accessing in-memory Prometheus metrics.

    Example:
        >>> for metric in llm.get_metrics():
        ...     if isinstance(metric, Counter):
        ...         print(f"{metric} = {metric.value}")
        ...     elif isinstance(metric, Gauge):
        ...         print(f"{metric} = {metric.value}")
        ...     elif isinstance(metric, Histogram):
        ...         print(f"{metric}")
        ...         print(f"    sum = {metric.sum}")
        ...         print(f"    count = {metric.count}")
        ...         for bucket_le, value in metric.buckets.items():
        ...             print(f"    {bucket_le} = {value}")
    """
    collected: list[Metric] = []
    for metric in REGISTRY.collect():
        if not metric.name.startswith("vllm:"):
            continue
        if metric.type == "gauge":
            samples = _get_samples(metric)
            for s in samples:
                collected.append(
                    Gauge(name=metric.name, labels=s.labels, value=s.value))
        elif metric.type == "counter":
            samples = _get_samples(metric, "_total")
            if metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
                #
                # Ugly vllm:spec_decode_num_accepted_tokens_per_pos special
                # case.
                #
                # This metric is a vector of counters - for each spec
                # decoding token position, we observe the number of
                # accepted tokens using a Counter labeled with 'position'.
                # We convert these into a vector of integer values.
                #
                for labels, values in _digest_num_accepted_by_pos_samples(
                        samples):
                    collected.append(
                        Vector(name=metric.name, labels=labels, values=values))
            else:
                for s in samples:
                    collected.append(
                        Counter(name=metric.name,
                                labels=s.labels,
                                value=int(s.value)))
        elif metric.type == "histogram":
            #
            # A histogram has a number of '_bucket' samples where
            # the 'le' label represents the upper limit of the bucket.
            # We convert these bucketized values into a dict of values
            # indexed by the value of the 'le' label. The 'le=+Inf'
            # label is a special case, catching all values observed.
            #
            bucket_samples = _get_samples(metric, "_bucket")
            count_samples = _get_samples(metric, "_count")
            sum_samples = _get_samples(metric, "_sum")
            for labels, buckets, count_value, sum_value in _digest_histogram(
                    bucket_samples, count_samples, sum_samples):
                collected.append(
                    Histogram(name=metric.name,
                              labels=labels,
                              buckets=buckets,
                              count=count_value,
                              sum=sum_value))
        else:
            raise AssertionError(f"Unknown metric type {metric.type}")

    return collected
def _get_samples(metric: PromMetric,
                 suffix: Optional[str] = None) -> list[Sample]:
    name = (metric.name + suffix) if suffix is not None else metric.name
    return [s for s in metric.samples if s.name == name]


def _strip_label(labels: dict[str, str], key_to_remove: str) -> dict[str, str]:
    labels_copy = labels.copy()
    labels_copy.pop(key_to_remove)
    return labels_copy
def _digest_histogram(
    bucket_samples: list[Sample], count_samples: list[Sample],
    sum_samples: list[Sample]
) -> list[tuple[dict[str, str], dict[str, int], int, float]]:
    #
    # In the case of DP, we have an indigestible
    # per-bucket-per-engine count as a list of labelled
    # samples, along with count and sum samples
    #
    # bucket_samples (in):
    #   labels = {bucket: 100, idx: 0}, value = 2
    #   labels = {bucket: 200, idx: 0}, value = 4
    #   labels = {bucket: Inf, idx: 0}, value = 10
    #   labels = {bucket: 100, idx: 1}, value = 1
    #   labels = {bucket: 200, idx: 1}, value = 5
    #   labels = {bucket: Inf, idx: 1}, value = 7
    # count_samples (in):
    #   labels = {idx: 0}, value = 10
    #   labels = {idx: 1}, value = 7
    # sum_samples (in):
    #   labels = {idx: 0}, value = 2000
    #   labels = {idx: 1}, value = 1200
    #
    # output: [
    #   {idx: 0}, {"100": 2, "200": 4, "Inf": 10}, 10, 2000
    #   {idx: 1}, {"100": 1, "200": 5, "Inf": 7}, 7, 1200
    # ]
    buckets_by_labels: dict[frozenset[tuple[str, str]], dict[str, int]] = {}
    for s in bucket_samples:
        bucket = s.labels["le"]
        labels_key = frozenset(_strip_label(s.labels, "le").items())
        if labels_key not in buckets_by_labels:
            buckets_by_labels[labels_key] = {}
        buckets_by_labels[labels_key][bucket] = int(s.value)

    counts_by_labels: dict[frozenset[tuple[str, str]], int] = {}
    for s in count_samples:
        labels_key = frozenset(s.labels.items())
        counts_by_labels[labels_key] = int(s.value)

    sums_by_labels: dict[frozenset[tuple[str, str]], float] = {}
    for s in sum_samples:
        labels_key = frozenset(s.labels.items())
        sums_by_labels[labels_key] = s.value

    assert set(buckets_by_labels.keys()) == set(
        counts_by_labels.keys()) == set(sums_by_labels.keys())

    output = []
    label_keys = list(buckets_by_labels.keys())
    for k in label_keys:
        labels = dict(k)
        output.append((labels, buckets_by_labels[k], counts_by_labels[k],
                       sums_by_labels[k]))
    return output
def _digest_num_accepted_by_pos_samples(
        samples: list[Sample]) -> list[tuple[dict[str, str], list[int]]]:
    #
    # In the case of DP, we have an indigestible
    # per-position-per-engine count as a list of
    # labelled samples
    #
    # samples (in):
    #   labels = {pos: 0, idx: 0}, value = 10
    #   labels = {pos: 1, idx: 0}, value = 7
    #   labels = {pos: 2, idx: 0}, value = 2
    #   labels = {pos: 0, idx: 1}, value = 5
    #   labels = {pos: 1, idx: 1}, value = 3
    #   labels = {pos: 2, idx: 1}, value = 1
    #
    # output: [
    #   {idx: 0}, [10, 7, 2]
    #   {idx: 1}, [5, 3, 1]
    # ]
    #
    max_pos = 0
    values_by_labels: dict[frozenset[tuple[str, str]], dict[int, int]] = {}
    for s in samples:
        position = int(s.labels["position"])
        max_pos = max(max_pos, position)
        labels_key = frozenset(_strip_label(s.labels, "position").items())
        if labels_key not in values_by_labels:
            values_by_labels[labels_key] = {}
        values_by_labels[labels_key][position] = int(s.value)

    output = []
    for labels_key, values_by_position in values_by_labels.items():
        labels = dict(labels_key)
        values = [0] * (max_pos + 1)
        for pos, val in values_by_position.items():
            values[pos] = val
        output.append((labels, values))
    return output

View File

@@ -134,17 +134,17 @@ class SpecDecodingProm:
        self.counter_spec_decode_num_drafts = \
            self._counter_cls(
-                name="vllm:spec_decode_num_drafts_total",
+                name="vllm:spec_decode_num_drafts",
                documentation="Number of spec decoding drafts.",
                labelnames=labelnames).labels(*labelvalues)
        self.counter_spec_decode_num_draft_tokens = \
            self._counter_cls(
-                name="vllm:spec_decode_num_draft_tokens_total",
+                name="vllm:spec_decode_num_draft_tokens",
                documentation="Number of draft tokens.",
                labelnames=labelnames,).labels(*labelvalues)
        self.counter_spec_decode_num_accepted_tokens = \
            self._counter_cls(
-                name="vllm:spec_decode_num_accepted_tokens_total",
+                name="vllm:spec_decode_num_accepted_tokens",
                documentation="Number of accepted tokens.",
                labelnames=labelnames).labels(*labelvalues)