[Docs] Generate full list of metrics in user docs (#30388)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent aacf0abf8b
commit 2dcbac9077
@@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request

### v1 Metrics

In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:

In v1, an extensive set of metrics is exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:

- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
- `vllm:request_params_n` (Histogram) - Histogram of the request parameter `n`.
- `vllm:request_params_max_tokens` (Histogram) - Histogram of the `max_tokens` parameter in requests.
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.
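As a concrete illustration of consuming this endpoint, here is a minimal sketch using only the Python standard library. The server address is an assumption, and the parser is deliberately naive (it ignores labels and keeps only the last sample per metric name); a real deployment would point a Prometheus scraper at the endpoint instead.

```python
# Minimal sketch (not vLLM code): scrape the Prometheus endpoint and read a
# few of the gauges listed above. Assumes a server on localhost:8000.
from urllib.request import urlopen


def scrape_metrics(url: str = "http://localhost:8000/metrics") -> dict[str, float]:
    """Parse the Prometheus text exposition into {metric_name: last_value}."""
    samples: dict[str, float] = {}
    with urlopen(url) as resp:
        for line in resp.read().decode("utf-8").splitlines():
            # Skip HELP/TYPE comments and blank lines.
            if not line or line.startswith("#"):
                continue
            name_with_labels, _, value = line.rpartition(" ")
            # Strip any {label="..."} suffix from the series name.
            name = name_with_labels.split("{", 1)[0]
            try:
                # Labelled series overwrite each other; fine for a sketch.
                samples[name] = float(value)
            except ValueError:
                continue
    return samples


if __name__ == "__main__":
    metrics = scrape_metrics()
    print("running:", metrics.get("vllm:num_requests_running"))
    print("waiting:", metrics.get("vllm:num_requests_waiting"))
    print("kv cache usage:", metrics.get("vllm:kv_cache_usage_perc"))
```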
docs/mkdocs/hooks/generate_metrics.py (new file, 149 additions)
@@ -0,0 +1,149 @@

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import logging
from pathlib import Path
from typing import Literal

logger = logging.getLogger("mkdocs")

ROOT_DIR = Path(__file__).parent.parent.parent.parent
DOCS_DIR = ROOT_DIR / "docs"
GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"

# Files to scan for metric definitions - each will generate a separate table
METRIC_SOURCE_FILES = [
    {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
    {
        "path": "vllm/v1/spec_decode/metrics.py",
        "output": "spec_decode.md",
    },
    {
        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
        "output": "nixl_connector.md",
    },
]


class MetricExtractor(ast.NodeVisitor):
    """AST visitor to extract metric definitions."""

    def __init__(self):
        self.metrics: list[dict[str, str]] = []

    def visit_Call(self, node: ast.Call) -> None:
        """Visit function calls to find metric class instantiations."""
        metric_type = self._get_metric_type(node)
        if metric_type:
            name = self._extract_kwarg(node, "name")
            documentation = self._extract_kwarg(node, "documentation")

            if name:
                self.metrics.append(
                    {
                        "name": name,
                        "type": metric_type,
                        "documentation": documentation or "",
                    }
                )

        self.generic_visit(node)

    def _get_metric_type(self, node: ast.Call) -> str | None:
        """Determine if this call creates a metric and return its type."""
        metric_type_map = {
            "_gauge_cls": "gauge",
            "_counter_cls": "counter",
            "_histogram_cls": "histogram",
        }
        if isinstance(node.func, ast.Attribute):
            return metric_type_map.get(node.func.attr)
        return None

    def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
        """Extract a keyword argument value from a function call."""
        for keyword in node.keywords:
            if keyword.arg == key:
                return self._get_string_value(keyword.value)
        return None

    def _get_string_value(self, node: ast.AST) -> str | None:
        """Extract string value from an AST node."""
        if isinstance(node, ast.Constant):
            return str(node.value) if node.value is not None else None
        return None


def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
    """Parse a Python file and extract all metric definitions."""
    try:
        with open(filepath, encoding="utf-8") as f:
            source = f.read()

        tree = ast.parse(source, filename=str(filepath))
        extractor = MetricExtractor()
        extractor.visit(tree)
        return extractor.metrics
    except Exception as e:
        raise RuntimeError(f"Failed to parse {filepath}: {e}") from e


def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
    """Generate a markdown table from extracted metrics."""
    if not metrics:
        return "No metrics found.\n"

    # Sort by type, then by name
    metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))

    lines = []
    lines.append("| Metric Name | Type | Description |")
    lines.append("|-------------|------|-------------|")

    for metric in metrics_sorted:
        name = metric["name"]
        metric_type = metric["type"].capitalize()
        doc = metric["documentation"].replace("\n", " ").strip()
        lines.append(f"| `{name}` | {metric_type} | {doc} |")

    return "\n".join(lines) + "\n"


def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    """Generate metrics documentation tables from source files."""
    logger.info("Generating metrics documentation")

    # Create generated directory if it doesn't exist
    GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)

    total_metrics = 0
    for source_config in METRIC_SOURCE_FILES:
        source_path = source_config["path"]
        output_file = source_config["output"]

        filepath = ROOT_DIR / source_path
        if not filepath.exists():
            raise FileNotFoundError(f"Metrics source file not found: {filepath}")

        logger.debug("Extracting metrics from: %s", source_path)
        metrics = extract_metrics_from_file(filepath)
        logger.debug("Found %d metrics in %s", len(metrics), source_path)

        # Generate and write the markdown table for this source
        table_content = generate_markdown_table(metrics)
        output_path = GENERATED_METRICS_DIR / output_file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(table_content)

        total_metrics += len(metrics)
        logger.info(
            "Generated metrics table: %s (%d metrics)",
            output_path.relative_to(ROOT_DIR),
            len(metrics),
        )

    logger.info(
        "Total metrics generated: %d across %d files",
        total_metrics,
        len(METRIC_SOURCE_FILES),
    )
```
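To sanity-check the extractor above, the following hypothetical snippet feeds `MetricExtractor` an inline piece of source text instead of a real vLLM file. The sample string is invented for the demo, but mirrors the `self._gauge_cls(name=..., documentation=...)` call shape the visitor matches on; it assumes `MetricExtractor` is importable (or pasted alongside).

```python
import ast

# Hypothetical sample source: mimics how a metric is instantiated via a
# `_gauge_cls` attribute call with `name` and `documentation` kwargs.
sample_source = '''
gauge = self._gauge_cls(
    name="vllm:num_requests_running",
    documentation="Number of requests currently running.",
)
'''

extractor = MetricExtractor()
extractor.visit(ast.parse(sample_source))
print(extractor.metrics)
# -> [{'name': 'vllm:num_requests_running', 'type': 'gauge',
#      'documentation': 'Number of requests currently running.'}]
```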
@@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:

The following metrics are exposed:

??? code

    ```python
    --8<-- "vllm/engine/metrics.py:metrics-definitions"
    ```

## General Metrics

--8<-- "docs/generated/metrics/general.md"

## Speculative Decoding Metrics

--8<-- "docs/generated/metrics/spec_decode.md"

## NIXL KV Connector Metrics

--8<-- "docs/generated/metrics/nixl_connector.md"

## Deprecation Policy

Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
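For illustration only, a generated include such as `general.md` renders a table of this shape; these two sample rows are taken from the metric list earlier on this page, and the real file is produced by the hook at build time, sorted by type and then name:

| Metric Name | Type | Description |
|-------------|------|-------------|
| `vllm:prefix_cache_queries` | Counter | Number of prefix cache queries. |
| `vllm:num_requests_running` | Gauge | Number of requests currently running. |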
@@ -51,6 +51,7 @@ hooks:
  - docs/mkdocs/hooks/remove_announcement.py
  - docs/mkdocs/hooks/generate_examples.py
  - docs/mkdocs/hooks/generate_argparse.py
  - docs/mkdocs/hooks/generate_metrics.py
  - docs/mkdocs/hooks/url_schemes.py

plugins:
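As context for this last hunk: `hooks:` is standard MkDocs configuration, where each entry names a Python module whose MkDocs lifecycle functions, such as the `on_startup(command, dirty)` event defined in the new script above, are invoked during the build.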