mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-24 15:25:01 +08:00
Signed-off-by: Fangping Shi <fangping_shi@apple.com> Co-authored-by: Fangping Shi <fangping_shi@apple.com>
765 lines
23 KiB
YAML
765 lines
23 KiB
YAML
# Perses dashboard resource: resource kind plus identifying metadata.
kind: PersesDashboard
metadata:
  # Resource name of this dashboard.
  name: performance-statistics
  # NOTE(review): the zero-valued timestamps and version look like
  # placeholders managed by the server - confirm before relying on them.
  createdAt: 0001-01-01T00:00:00Z
  updatedAt: 0001-01-01T00:00:00Z
  version: 0
  # Deliberately empty in this file.
  project: ''
|
|
# Dashboard body; variables, panels and layouts are nested under this key.
spec:
  display:
    # Title shown for the dashboard.
    name: Performance Statistics
|
|
|
|
# Dashboard variables. The panel queries reference this one as
# $Deployment_id; its choices are the values of the model_name label.
variables:
  - kind: ListVariable
    spec:
      display:
        name: Deployment_ID
        hidden: false
      name: Deployment_id
      allowAllValue: true
      allowMultiple: true
      # Start with the "all" value selected.
      defaultValue:
        - "$__all"
      sort: alphabetical-asc
      plugin:
        kind: PrometheusLabelValuesVariable
        spec:
          datasource:
            kind: PrometheusDatasource
            name: accelerators-thanos-querier-datasource
          labelName: model_name
          matchers:
            # Any one vllm metric that always carries model_name
            - 'vllm:generation_tokens_total{}'
|
|
|
|
# Panel definitions keyed by a quoted numeric id; the layouts section places
# them on the grid through '#/spec/panels/<id>' references.
panels:
|
# Time series of mean end-to-end request latency, one line per model.
"1":
  kind: Panel
  spec:
    display:
      name: E2E Latency over Time
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Mean latency per model: rate of the histogram _sum divided by
              # rate of its _count, restricted to the selected deployments.
              query: >
                sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
                /
                sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
              seriesNameFormat: "{{model_name}}"
|
|
|
# Single-value tile: mean end-to-end latency over the dashboard time range.
"2":
  kind: Panel
  spec:
    display:
      name: E2E Latency (Avg)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Total latency accumulated over $__range divided by the number
              # of requests observed over the same range, per model.
              query: >
                (sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
                /
                (sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
|
|
# Single-value tile: median (0.50 quantile) end-to-end latency per model.
"3":
  kind: Panel
  spec:
    display:
      name: E2E Latency (P50)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the latency histogram buckets; the
              # `le` label is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.50,
                  sum by (le, model_name) (
                    rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Single-value tile: 0.90 quantile of end-to-end latency per model.
"4":
  kind: Panel
  spec:
    display:
      name: E2E Latency (P90)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the latency histogram buckets; the
              # `le` label is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.90,
                  sum by (le, model_name) (
                    rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Single-value tile: 0.99 quantile of end-to-end latency per model.
"5":
  kind: Panel
  spec:
    display:
      name: E2E Latency (P99)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the latency histogram buckets; the
              # `le` label is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.99,
                  sum by (le, model_name) (
                    rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Time series of mean time-to-first-token, one line per model.
"6":
  kind: Panel
  spec:
    display:
      name: TTFT over Time
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Mean TTFT per model: rate of the histogram _sum divided by
              # rate of its _count, restricted to the selected deployments.
              query: >
                sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
                /
                sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
              seriesNameFormat: "{{model_name}}"
|
|
|
# Single-value tile: mean time-to-first-token over the dashboard time range.
"7":
  kind: Panel
  spec:
    display:
      name: TTFT (Avg)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Total TTFT accumulated over $__range divided by the number of
              # observations over the same range, per model.
              query: >
                (sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
                /
                (sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
|
|
# Single-value tile: median (0.50 quantile) time-to-first-token per model.
"8":
  kind: Panel
  spec:
    display:
      name: TTFT (P50)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the TTFT histogram buckets; the `le`
              # label is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.50,
                  sum by (le, model_name) (
                    rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Single-value tile: 0.90 quantile of time-to-first-token per model.
"9":
  kind: Panel
  spec:
    display:
      name: TTFT (P90)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the TTFT histogram buckets; the `le`
              # label is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.90,
                  sum by (le, model_name) (
                    rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Single-value tile: 0.99 quantile of time-to-first-token per model.
"10":
  kind: Panel
  spec:
    display:
      name: TTFT (P99)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the TTFT histogram buckets; the `le`
              # label is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.99,
                  sum by (le, model_name) (
                    rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Time series of time-per-output-token: the mean plus the p50/p90/p99
# quantiles, each drawn as its own series per model.
"11":
  kind: Panel
  spec:
    display:
      name: ITL (Time per Output Token) over Time
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
    queries:
      # Mean: rate of the histogram _sum divided by rate of its _count.
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              query: >
                sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
                /
                sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
              seriesNameFormat: "{{model_name}}"
      # Median, estimated from the histogram buckets.
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              query: >
                histogram_quantile(
                  0.50,
                  sum by (le, model_name) (
                    rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
              seriesNameFormat: "{{model_name}} p50"
      # 0.90 quantile, estimated from the histogram buckets.
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              query: >
                histogram_quantile(
                  0.90,
                  sum by (le, model_name) (
                    rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
              seriesNameFormat: "{{model_name}} p90"
      # 0.99 quantile, estimated from the histogram buckets.
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              query: >
                histogram_quantile(
                  0.99,
                  sum by (le, model_name) (
                    rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
              seriesNameFormat: "{{model_name}} p99"
|
|
|
# Single-value tile: mean time-per-output-token over the dashboard time range.
"12":
  kind: Panel
  spec:
    display:
      name: ITL (Avg)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Total per-token time accumulated over $__range divided by the
              # number of observations over the same range, per model.
              query: >
                (sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
                /
                (sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
|
|
# Single-value tile: median (0.50 quantile) time-per-output-token per model.
"13":
  kind: Panel
  spec:
    display:
      name: ITL (P50)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the histogram buckets; the `le` label
              # is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.50,
                  sum by (le, model_name) (
                    rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Single-value tile: 0.90 quantile of time-per-output-token per model.
"14":
  kind: Panel
  spec:
    display:
      name: ITL (P90)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the histogram buckets; the `le` label
              # is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.90,
                  sum by (le, model_name) (
                    rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Single-value tile: 0.99 quantile of time-per-output-token per model.
"15":
  kind: Panel
  spec:
    display:
      name: ITL (P99)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Quantile estimated from the histogram buckets; the `le` label
              # is kept in the grouping as histogram_quantile needs it.
              query: >
                histogram_quantile(
                  0.99,
                  sum by (le, model_name) (
                    rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                  )
                )
|
|
|
# Token throughput over time: generated and prompt tokens per second for each
# model, plus the overall tokens per second processed by engine iterations.
"16":
  kind: Panel
  spec:
    display:
      name: TPS (Tokens/sec) over Time
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
    queries:
      # Generated (output) tokens per second, per model.
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              query: >
                sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
              seriesNameFormat: '{{model_name}} generation'
      # Prompt (input) tokens per second, per model.
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              query: >
                sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
              seriesNameFormat: '{{model_name}} prompt'
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # overall iteration tokens/sec if exposed
              # The _count series of this metric counts observations (engine
              # iterations), so its rate is iterations/sec; the _sum series
              # accumulates the observed token counts, so its rate is the
              # tokens/sec this panel is titled for.
              # Wrapped in sum() because the series name below is a constant,
              # and filtered like the two sibling queries so the line follows
              # the Deployment_id selection.
              query: >
                sum(rate(vllm:iteration_tokens_total_sum{model_name=~"$Deployment_id"}[$__interval]))
              seriesNameFormat: 'iteration overall'
|
|
|
# Single-value tile: average KV cache usage, as a percentage, across the
# series that match the selected deployments.
"17":
  kind: Panel
  spec:
    display:
      name: KV Cache Usage (avg %)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
              # Filtered by $Deployment_id like the latency/throughput panels,
              # so the tile follows the dashboard selector instead of always
              # averaging over every deployment.
              # NOTE(review): assumes this gauge carries a model_name label
              # like the other vllm metrics used here - confirm.
              query: >
                100 * avg(vllm:kv_cache_usage_perc{model_name=~"$Deployment_id"})
|
|
|
# Time series of currently running requests, one line per pod.
"18":
  kind: Panel
  spec:
    display:
      name: Running Requests by Pod
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Filtered by $Deployment_id like the latency/throughput panels,
              # so only pods serving the selected deployments are drawn.
              # NOTE(review): assumes this gauge carries a model_name label
              # like the other vllm metrics used here - confirm.
              query: >
                sum by (pod) (vllm:num_requests_running{model_name=~"$Deployment_id"})
              seriesNameFormat: '{{pod}}'
|
|
|
# Time series of requests waiting to be scheduled, one line per pod.
"19":
  kind: Panel
  spec:
    display:
      name: Waiting Requests by Pod
    plugin:
      kind: TimeSeriesChart
      spec:
        legend:
          mode: table
          position: bottom
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Filtered by $Deployment_id like the latency/throughput panels,
              # so only pods serving the selected deployments are drawn.
              # NOTE(review): assumes this gauge carries a model_name label
              # like the other vllm metrics used here - confirm.
              query: >
                sum by (pod) (vllm:num_requests_waiting{model_name=~"$Deployment_id"})
              seriesNameFormat: '{{pod}}'
|
|
|
# Single-value tile: total running requests across the selected deployments.
"20":
  kind: Panel
  spec:
    display:
      name: Running Requests (sum)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Filtered by $Deployment_id like the latency/throughput panels,
              # so the total follows the dashboard selector. Quoted because
              # the expression now contains braces and double quotes.
              # NOTE(review): assumes this gauge carries a model_name label
              # like the other vllm metrics used here - confirm.
              query: 'sum(vllm:num_requests_running{model_name=~"$Deployment_id"})'
|
|
|
# Single-value tile: total waiting requests across the selected deployments.
"21":
  kind: Panel
  spec:
    display:
      name: Waiting Requests (sum)
    plugin:
      kind: StatChart
      spec:
        calculation: last-number
    queries:
      - kind: TimeSeriesQuery
        spec:
          plugin:
            kind: PrometheusTimeSeriesQuery
            spec:
              datasource:
                kind: PrometheusDatasource
                name: accelerators-thanos-querier-datasource
              # Filtered by $Deployment_id like the latency/throughput panels,
              # so the total follows the dashboard selector. Quoted because
              # the expression now contains braces and double quotes.
              # NOTE(review): assumes this gauge carries a model_name label
              # like the other vllm metrics used here - confirm.
              query: 'sum(vllm:num_requests_waiting{model_name=~"$Deployment_id"})'
|
|
|
|
# Grid sections, in display order. Each item gives a position (x, y), a size
# (width, height) and a $ref to one of the panels defined under spec.panels.
layouts:
|
|
# "Overview" section: three 6-wide, 3-high tiles side by side at y=0.
- kind: Grid
  spec:
    display:
      title: Overview
    items:
      - x: 0
        y: 0
        width: 6
        height: 3
        content:
          $ref: "#/spec/panels/17"  # KV cache %
      - x: 6
        y: 0
        width: 6
        height: 3
        content:
          $ref: "#/spec/panels/20"  # running sum
      - x: 12
        y: 0
        width: 6
        height: 3
        content:
          $ref: "#/spec/panels/21"  # waiting sum
|
|
|
|
# "E2E Latency" section: a 10-wide chart on the left (panel 1) with a 2x2
# block of 7-wide tiles (panels 2-5) to its right.
- kind: Grid
  spec:
    display:
      title: E2E Latency
    items:
      - x: 0
        y: 1
        width: 10
        height: 6
        content:
          $ref: "#/spec/panels/1"
      - x: 10
        y: 1
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/2"
      - x: 17
        y: 1
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/3"
      - x: 10
        y: 4
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/4"
      - x: 17
        y: 4
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/5"
|
|
|
|
# "TTFT" section: a 10-wide chart on the left (panel 6) with a 2x2 block of
# 7-wide tiles (panels 7-10) to its right.
- kind: Grid
  spec:
    display:
      title: TTFT
    items:
      - x: 0
        y: 8
        width: 10
        height: 6
        content:
          $ref: "#/spec/panels/6"
      - x: 10
        y: 8
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/7"
      - x: 17
        y: 8
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/8"
      - x: 10
        y: 11
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/9"
      - x: 17
        y: 11
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/10"
|
|
|
|
# "ITL" section: a 10-wide chart on the left (panel 11) with a 2x2 block of
# 7-wide tiles (panels 12-15) to its right.
- kind: Grid
  spec:
    display:
      title: ITL (Time per Output Token)
    items:
      - x: 0
        y: 15
        width: 10
        height: 6
        content:
          $ref: "#/spec/panels/11"
      - x: 10
        y: 15
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/12"
      - x: 17
        y: 15
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/13"
      - x: 10
        y: 18
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/14"
      - x: 17
        y: 18
        width: 7
        height: 3
        content:
          $ref: "#/spec/panels/15"
|
|
|
|
# "TPS" section: a single 14-wide chart (panel 16).
- kind: Grid
  spec:
    display:
      title: TPS (Prompt / Generation / Iteration)
    items:
      - x: 0
        y: 22
        width: 14
        height: 6
        content:
          $ref: "#/spec/panels/16"
|
|
|
|
# "Per-Pod Request State" section: two 12-wide charts side by side
# (panel 18 on the left, panel 19 on the right).
- kind: Grid
  spec:
    display:
      title: Per-Pod Request State
    items:
      - x: 0
        y: 28
        width: 12
        height: 6
        content:
          $ref: "#/spec/panels/18"
      - x: 12
        y: 28
        width: 12
        height: 6
        content:
          $ref: "#/spec/panels/19"
|
|
|