mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-24 18:05:01 +08:00
Signed-off-by: Fangping Shi <fangping_shi@apple.com> Co-authored-by: Fangping Shi <fangping_shi@apple.com>
393 lines
15 KiB
YAML
393 lines
15 KiB
YAML
# Dashboard resource header: resource kind, identifying metadata, and the
# human-readable dashboard title.
# NOTE(review): no apiVersion is declared; if this is meant to be applied as a
# Kubernetes custom resource, confirm whether one is required.
kind: PersesDashboard
metadata:
  name: query-statistics
  # Quoted so that YAML loaders with implicit timestamp typing keep these as
  # plain strings. They look like zero-value placeholders - presumably the
  # server fills in real values; confirm.
  createdAt: '0001-01-01T00:00:00Z'
  updatedAt: '0001-01-01T00:00:00Z'
  version: 0
  project: ""
spec:
  display:
    name: Query Statistics_New
|
||
|
||
# Dashboard variables. Each one is a list populated from the values of a
# Prometheus label, restricted by the given series matchers.
variables:
  # NS: namespace, taken from `up` series whose service name contains "vllm".
  - kind: ListVariable
    spec:
      name: NS
      display:
        name: Namespace
      allowMultiple: false
      defaultValue: llm-d
      plugin:
        kind: PrometheusLabelValuesVariable
        spec:
          datasource:
            kind: PrometheusDatasource
            name: accelerators-thanos-querier-datasource
          labelName: namespace
          matchers:
            - 'up{service=~".*vllm.*"}'

  # SVC: service, narrowed to the namespace currently selected in $NS.
  - kind: ListVariable
    spec:
      name: SVC
      display:
        name: Service
      allowMultiple: false
      defaultValue: vllm-qwen2-0-5b-sim
      plugin:
        kind: PrometheusLabelValuesVariable
        spec:
          datasource:
            kind: PrometheusDatasource
            name: accelerators-thanos-querier-datasource
          labelName: service
          matchers:
            - 'up{namespace="$NS",service=~".*vllm.*"}'

  # MODEL: model_name values found on vllm:request_success_total for the
  # selected namespace/service. Multi-select, defaulting to "all".
  - kind: ListVariable
    spec:
      name: MODEL
      display:
        name: 'Model (real vLLM)'
      allowAllValue: true
      allowMultiple: true
      defaultValue:
        - '$__all'
      plugin:
        kind: PrometheusLabelValuesVariable
        spec:
          datasource:
            kind: PrometheusDatasource
            name: accelerators-thanos-querier-datasource
          labelName: model_name
          matchers:
            - 'vllm:request_success_total{namespace="$NS",service="$SVC"}'
|
||
|
||
panels:
|
||
|
||
# --- Core (works on Simulator & Real) ---
|
||
# Stat panel: running requests summed over every series that matches the
# selected namespace and service.
core_running_now:
  kind: Panel
  spec:
    display:
      name: 'Running Requests (now)'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # `or vector(0)` yields 0 when the selector matches no series.
              query: >-
                sum(vllm:num_requests_running{namespace="$NS",service="$SVC"})
                or vector(0)
              minStep: '15s'
|
||
|
||
# Stat panel: waiting (queued) requests summed over every series that matches
# the selected namespace and service.
core_waiting_now:
  kind: Panel
  spec:
    display:
      name: 'Waiting Requests (now)'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # `or vector(0)` yields 0 when the selector matches no series.
              query: >-
                sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"})
                or vector(0)
              minStep: '15s'
|
||
|
||
# Stat panel: KV cache usage averaged over matching series, shown as the raw
# metric value (the title labels it as a 0-1 fraction).
core_kv_usage_now:
  kind: Panel
  spec:
    display:
      name: 'KV Cache Usage (0–1)'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # `or vector(0)` yields 0 when the selector matches no series.
              query: >-
                avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"})
                or vector(0)
              minStep: '15s'
|
||
|
||
# Time-series panel: running requests over time, one line per service.
core_running_ts:
  kind: Panel
  spec:
    display:
      name: 'Running Over Time'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Falls back to a single unlabeled 0 series when nothing matches.
              query: >-
                sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"})
                or vector(0)
              minStep: '15s'
|
||
|
||
# Time-series panel: waiting requests over time, one line per service.
core_waiting_ts:
  kind: Panel
  spec:
    display:
      name: 'Waiting Over Time'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Falls back to a single unlabeled 0 series when nothing matches.
              query: >-
                sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"})
                or vector(0)
              minStep: '15s'
|
||
|
||
# Stat panel: number of scrape targets for the selected namespace/service
# whose `up` value is currently 1.
core_targets_up:
  kind: Panel
  spec:
    display:
      name: 'Scrape Targets Up'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # count() of an empty set returns nothing, so `or vector(0)`
              # makes the panel read 0 when no target is up.
              query: >-
                count(up{namespace="$NS",service="$SVC"} == 1)
                or vector(0)
              minStep: '15s'
|
||
|
||
# --- KV Cache as Percent (works on Simulator & Real) ---
|
||
# Stat panel: average KV cache usage, scaled by 100 for display as a percent.
core_kv_usage_pct_now:
  kind: Panel
  spec:
    display:
      name: 'KV Cache Usage (%) – now'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # The x100 scaling is done in the query itself; format.unit is
              # deliberately left unset to avoid schema conflicts.
              query: >-
                (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100)
                or vector(0)
              minStep: '15s'
|
||
|
||
# Time-series panel: average KV cache usage (x100) over time, per service.
core_kv_usage_pct_ts:
  kind: Panel
  spec:
    display:
      name: 'KV Cache Usage (%) – over time'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Falls back to a single unlabeled 0 series when nothing matches.
              query: >-
                (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100)
                or vector(0)
              minStep: '15s'
|
||
|
||
# --- Per-Pod breakdowns (works on Simulator & Real) ---
|
||
# Time-series panel: running requests over time, one line per pod.
per_pod_running_ts:
  kind: Panel
  spec:
    display:
      name: 'Running by Pod'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Falls back to a single unlabeled 0 series when nothing matches.
              query: >-
                sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"})
                or vector(0)
              minStep: '15s'
|
||
|
||
# Time-series panel: waiting requests over time, one line per pod.
per_pod_waiting_ts:
  kind: Panel
  spec:
    display:
      name: 'Waiting by Pod'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Falls back to a single unlabeled 0 series when nothing matches.
              query: >-
                sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"})
                or vector(0)
              minStep: '15s'
|
||
|
||
# Time-series panel: average KV cache usage (x100) over time, grouped by pod.
per_pod_kv_pct_ts:
  kind: Panel
  spec:
    display:
      name: 'KV Cache (%) by Pod'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # A per-pod breakdown requires the exporter to put a `pod` label
              # on this metric (the simulator is said to); without it there is
              # nothing to split on and no per-pod lines appear.
              query: >-
                (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100)
                or vector(0)
              minStep: '15s'
|
||
|
||
# --- Real vLLM only (zeros on simulator) ---
|
||
# Time-series panel: successful requests per second, one line per model.
# Filtered by the selected namespace, service and model(s).
real_req_rate_ts:
  kind: Panel
  spec:
    display:
      name: 'Request Rate (real vLLM)'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # rate() needs at least two samples inside its window. With
              # $__interval the window can shrink to the 15s minStep, which
              # may be no larger than the scrape interval and then returns
              # nothing; $__rate_interval is sized for rate()/increase().
              # `or vector(0)` yields 0 when no series match.
              query: >-
                sum by (model_name)
                (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval]))
                or vector(0)
              minStep: '15s'
|
||
|
||
# Stat panel: median (p50) end-to-end request latency per model, computed from
# the latency histogram buckets.
real_p50:
  kind: Panel
  spec:
    display:
      name: 'p50 Latency (real vLLM)'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # rate() needs at least two samples inside its window. With
              # $__interval the window can shrink to the 15s minStep, which
              # may be no larger than the scrape interval and then returns
              # nothing; $__rate_interval is sized for rate()/increase().
              # `or vector(0)` yields 0 when no series match.
              query: >-
                histogram_quantile(0.50,
                sum by (le, model_name)
                (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval])))
                or vector(0)
              minStep: '15s'
|
||
|
||
# Stat panel: 90th-percentile end-to-end request latency per model, computed
# from the latency histogram buckets.
real_p90:
  kind: Panel
  spec:
    display:
      name: 'p90 Latency (real vLLM)'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # rate() needs at least two samples inside its window. With
              # $__interval the window can shrink to the 15s minStep, which
              # may be no larger than the scrape interval and then returns
              # nothing; $__rate_interval is sized for rate()/increase().
              # `or vector(0)` yields 0 when no series match.
              query: >-
                histogram_quantile(0.90,
                sum by (le, model_name)
                (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval])))
                or vector(0)
              minStep: '15s'
|
||
|
||
# Stat panel: 99th-percentile end-to-end request latency per model, computed
# from the latency histogram buckets.
real_p99:
  kind: Panel
  spec:
    display:
      name: 'p99 Latency (real vLLM)'
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # rate() needs at least two samples inside its window. With
              # $__interval the window can shrink to the 15s minStep, which
              # may be no larger than the scrape interval and then returns
              # nothing; $__rate_interval is sized for rate()/increase().
              # `or vector(0)` yields 0 when no series match.
              query: >-
                histogram_quantile(0.99,
                sum by (le, model_name)
                (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval])))
                or vector(0)
              minStep: '15s'
|
||
|
||
# Time-series panel: prompt (input) tokens processed per second, per model.
real_input_tokens_ts:
  kind: Panel
  spec:
    display:
      name: 'Input Tokens / sec (real vLLM)'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # rate() needs at least two samples inside its window. With
              # $__interval the window can shrink to the 15s minStep, which
              # may be no larger than the scrape interval and then returns
              # nothing; $__rate_interval is sized for rate()/increase().
              # `or vector(0)` yields 0 when no series match.
              query: >-
                sum by (model_name)
                (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval]))
                or vector(0)
              minStep: '15s'
|
||
|
||
# Time-series panel: generated (output) tokens per second, per model.
real_output_tokens_ts:
  kind: Panel
  spec:
    display:
      name: 'Output Tokens / sec (real vLLM)'
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
        visual:
          display: line
          lineWidth: 1
          areaOpacity: 0.3
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # rate() needs at least two samples inside its window. With
              # $__interval the window can shrink to the 15s minStep, which
              # may be no larger than the scrape interval and then returns
              # nothing; $__rate_interval is sized for rate()/increase().
              # `or vector(0)` yields 0 when no series match.
              query: >-
                sum by (model_name)
                (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__rate_interval]))
                or vector(0)
              minStep: '15s'
|
||
|
||
# Grid layouts. Each item places one panel (referenced by JSON pointer into
# spec.panels) at the given x/y position with the given width/height.
layouts:
  # Core stats row followed by the running/waiting time series.
  - kind: Grid
    spec:
      display:
        title: 'Core (Sim & Real)'
      items:
        - x: 0
          y: 0
          width: 6
          height: 3
          content:
            $ref: '#/spec/panels/core_running_now'
        - x: 6
          y: 0
          width: 6
          height: 3
          content:
            $ref: '#/spec/panels/core_waiting_now'
        - x: 12
          y: 0
          width: 6
          height: 3
          content:
            $ref: '#/spec/panels/core_kv_usage_now'
        - x: 18
          y: 0
          width: 6
          height: 3
          content:
            $ref: '#/spec/panels/core_targets_up'
        - x: 0
          y: 3
          width: 12
          height: 6
          content:
            $ref: '#/spec/panels/core_running_ts'
        - x: 12
          y: 3
          width: 12
          height: 6
          content:
            $ref: '#/spec/panels/core_waiting_ts'

  # KV cache usage as a percentage: current value plus history.
  - kind: Grid
    spec:
      display:
        title: 'KV Cache (%)'
      items:
        - x: 0
          y: 9
          width: 6
          height: 3
          content:
            $ref: '#/spec/panels/core_kv_usage_pct_now'
        - x: 6
          y: 9
          width: 18
          height: 6
          content:
            $ref: '#/spec/panels/core_kv_usage_pct_ts'

  # Per-pod time series.
  - kind: Grid
    spec:
      display:
        title: 'Per-Pod breakdowns'
      items:
        - x: 0
          y: 15
          width: 12
          height: 6
          content:
            $ref: '#/spec/panels/per_pod_running_ts'
        - x: 12
          y: 15
          width: 12
          height: 6
          content:
            $ref: '#/spec/panels/per_pod_waiting_ts'
        - x: 0
          y: 21
          width: 24
          height: 6
          content:
            $ref: '#/spec/panels/per_pod_kv_pct_ts'

  # Request rate, latency percentiles and token throughput.
  - kind: Grid
    spec:
      display:
        title: 'Real vLLM only (shows 0 on simulator)'
      items:
        - x: 0
          y: 27
          width: 12
          height: 6
          content:
            $ref: '#/spec/panels/real_req_rate_ts'
        - x: 12
          y: 27
          width: 4
          height: 3
          content:
            $ref: '#/spec/panels/real_p50'
        - x: 16
          y: 27
          width: 4
          height: 3
          content:
            $ref: '#/spec/panels/real_p90'
        - x: 20
          y: 27
          width: 4
          height: 3
          content:
            $ref: '#/spec/panels/real_p99'
        - x: 0
          y: 33
          width: 12
          height: 6
          content:
            $ref: '#/spec/panels/real_input_tokens_ts'
        - x: 12
          y: 33
          width: 12
          height: 6
          content:
            $ref: '#/spec/panels/real_output_tokens_ts'
|
||
|