add benchmark shell

Signed-off-by: clark <panf2333@gmail.com>
This commit is contained in:
clark 2025-01-16 19:43:07 +08:00
parent 8fa9df7987
commit 16ed827378
6 changed files with 311 additions and 12 deletions

View File

@ -0,0 +1,207 @@
#!/bin/bash
# Benchmark chunked prefill against disaggregated prefill (HTTP proxy and ZMQ
# connector front ends) and plot the results.
# Requirement: 2x GPUs.
# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8/10/12, 100 requests
# Resource: 2x GPU
# Approaches:
# 1. Chunked prefill: 2 vLLM instances (one per GPU) behind a round-robin proxy
# 2. Disaggregated prefill (HTTP proxy): 1 prefilling instance and 1 decoding instance
# 3. Disaggregated prefill (ZMQ connector): 1 prefilling instance and 1 decoding
#    instance, fronted by vllm.entrypoints.disagg_connector
# Prefilling instance: max_output_token=1
# Decoding instance: force the input tokens to be the same across requests to bypass prefilling
# -e: abort on the first failing command; -x: trace every command as it runs.
set -ex
#######################################
# Force-kill everything that may still hold a GPU or one of the benchmark
# ports, so the next launch starts from a clean state.
# Targets: processes named pt_main_thread or python3, plus whatever is bound
# to ports 7010, 7011, 8000, 8100 and 8200.
# NOTE: this sends SIGKILL to every python3 process pgrep finds, not only to
# the servers started by this script.
# Arguments: none
# Returns:   0 (cleanup is best-effort)
#######################################
kill_gpu_processes() {
  local port
  # A target can exit between the lookup and the kill, which makes xargs
  # return non-zero. That must not abort the whole benchmark under `set -e`,
  # so each cleanup pipeline is explicitly allowed to fail.
  pgrep pt_main_thread | xargs -r kill -9 || true
  pgrep python3 | xargs -r kill -9 || true
  for port in 7010 7011 8000 8100 8200; do
    lsof -t -i:"${port}" | xargs -r kill -9 || true
  done
  # Give the killed processes a moment to release their resources.
  sleep 1
}
#######################################
# Block until an HTTP request to localhost:<port>/v1/completions succeeds.
# Arguments:
#   $1 - port to poll
#   $2 - optional timeout in seconds (default: 1200)
# Returns:
#   0 once curl succeeds; 1 if the timeout expires first (for example
#   because the server crashed or never came up).
#######################################
wait_for_server() {
  local port=$1
  local timeout_s=${2:-1200}
  # The port is handed to the inner shell as a positional argument instead of
  # being spliced into the script text, so it is never re-parsed as code.
  if timeout "${timeout_s}" bash -c '
    until curl -s "localhost:$1/v1/completions" > /dev/null; do
      sleep 1
    done' _ "${port}"; then
    return 0
  fi
  return 1
}
#######################################
# Start the chunked-prefill baseline: two vLLM OpenAI API servers with
# --enable-chunked-prefill (GPU 0 on port 8100, GPU 1 on port 8200), then the
# round-robin proxy script once both servers answer.
# NOTE(review): the proxy presumably listens on port 8000, which is the port
# benchmark() targets - confirm in ../round_robin_proxy.py.
# Arguments: none
# Returns:   non-zero if either server does not come up in time
#######################################
launch_chunked_prefill() {
  # Kept local so the settings do not leak into the other launchers.
  local model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  local gpu_memory_utilization=0.6
  local max_model_len=10000

  VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "${model}" \
    --port 8100 \
    --max-model-len "${max_model_len}" \
    --enable-chunked-prefill \
    --gpu-memory-utilization "${gpu_memory_utilization}" &
  VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "${model}" \
    --port 8200 \
    --max-model-len "${max_model_len}" \
    --enable-chunked-prefill \
    --gpu-memory-utilization "${gpu_memory_utilization}" &

  # Both backends must be reachable before the proxy is started.
  wait_for_server 8100
  wait_for_server 8200
  python3 ../round_robin_proxy.py &
  sleep 1
}
#######################################
# Start disaggregated prefill behind the HTTP proxy: a KV-producer vLLM
# server (GPU 0, port 8100) and a KV-consumer vLLM server (GPU 1, port 8200),
# both using the PyNcclConnector, then the proxy script once both answer.
# NOTE(review): the proxy presumably listens on port 8000, which is the port
# benchmark() targets - confirm in ../disagg_prefill_proxy_server.py.
# Arguments: none
# Returns:   non-zero if either server does not come up in time
#######################################
launch_disagg_prefill_http() {
  # Same named settings as the other launchers, kept local to this function.
  local model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  local gpu_memory_utilization=0.6
  local max_model_len=10000

  # Prefill side (KV producer).
  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "${model}" \
    --port 8100 \
    --max-model-len "${max_model_len}" \
    --gpu-memory-utilization "${gpu_memory_utilization}" \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  # Decode side (KV consumer).
  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "${model}" \
    --port 8200 \
    --max-model-len "${max_model_len}" \
    --gpu-memory-utilization "${gpu_memory_utilization}" \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  # Both backends must be reachable before the proxy is started.
  wait_for_server 8100
  wait_for_server 8200
  python3 ../disagg_prefill_proxy_server.py &
  sleep 1
}
#######################################
# Start disaggregated prefill behind the ZMQ connector: a KV-producer vLLM
# server (GPU 0, HTTP port 8100, ZMQ port 7010), a KV-consumer vLLM server
# (GPU 1, HTTP port 8200, ZMQ port 7011), and the disagg_connector front end
# on port 8000 pointed at both ZMQ ports. Returns once all three answer.
# Arguments: none
# Returns:   non-zero if any of the three servers does not come up in time
#######################################
launch_disagg_prefill_zmq() {
  # Kept local so the settings do not leak into the other launchers.
  local model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  local gpu_memory_utilization=0.6
  local max_model_len=10000

  # Prefill side (KV producer).
  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "${model}" \
    --port 8100 \
    --zmq-server-port 7010 \
    --max-model-len "${max_model_len}" \
    --gpu-memory-utilization "${gpu_memory_utilization}" \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  # Decode side (KV consumer).
  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "${model}" \
    --port 8200 \
    --zmq-server-port 7011 \
    --max-model-len "${max_model_len}" \
    --gpu-memory-utilization "${gpu_memory_utilization}" \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  # Front end that the benchmark client talks to.
  python3 \
    -m vllm.entrypoints.disagg_connector \
    --port 8000 \
    --prefill-addr 127.0.0.1:7010 \
    --decode-addr 127.0.0.1:7011 &

  wait_for_server 8100
  wait_for_server 8200
  wait_for_server 8000
  sleep 1
}
#######################################
# Run one benchmark_serving.py pass against the server on port 8000 and save
# the result as ./results/<tag>-qps-<qps>.json.
# Arguments:
#   $1 - request rate (QPS)
#   $2 - output length in tokens per request
#   $3 - tag used as the result file name prefix
# Returns:   the exit status of the trailing sleep; under `set -e` a failing
#            benchmark run aborts the script before that.
#######################################
benchmark() {
  # All settings are local so the caller's variables (notably its own `qps`
  # loop variable) are never overwritten.
  local qps=$1
  local output_len=$2
  local tag=$3
  local results_folder="./results"
  local model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  local dataset_name="sonnet"
  local dataset_path="../../sonnet_4x.txt"
  local num_prompts=100
  local prefix_len=50
  local input_len=1024

  python3 ../../benchmark_serving.py \
    --backend vllm \
    --model "${model}" \
    --dataset-name "${dataset_name}" \
    --dataset-path "${dataset_path}" \
    --sonnet-input-len "${input_len}" \
    --sonnet-output-len "${output_len}" \
    --sonnet-prefix-len "${prefix_len}" \
    --num-prompts "${num_prompts}" \
    --port 8000 \
    --save-result \
    --result-dir "${results_folder}" \
    --result-filename "${tag}-qps-${qps}.json" \
    --request-rate "${qps}"
  # Short pause between consecutive runs.
  sleep 2
}
#######################################
# Entry point: install missing tools, build the 4x sonnet dataset, run the
# QPS sweep for each serving approach, then plot the results.
# Globals:   VLLM_HOST_IP (exported)
# Arguments: none used
# Returns:   non-zero (via `set -e`) as soon as any step fails
#######################################
main() {
  local default_output_len=6
  local approach qps tool host_ip

  # Install the command-line tools this script relies on when they are missing.
  if ! command -v wget > /dev/null || ! command -v curl > /dev/null; then
    apt-get update
    apt-get install -y wget curl
  fi
  for tool in jq socat lsof; do
    command -v "${tool}" > /dev/null || apt-get -y install "${tool}"
  done
  pip install quart httpx matplotlib aiohttp datasets

  # Work from the directory two levels above this script, where sonnet.txt
  # is expected to live.
  cd "$(dirname "$0")"
  cd ../..

  # Create sonnet_4x.txt (sonnet.txt repeated four times after one leading
  # blank line) so that long input prompts can be sampled from it.
  echo "" > sonnet_4x.txt
  for _ in {1..4}; do
    cat sonnet.txt >> sonnet_4x.txt
  done

  cd disagg_benchmarks/zmq

  # Start from an empty results tree with one directory per plot group.
  rm -rf results
  mkdir -p results/http_zmq_chunk results/http_zmq

  # Assign first, export second, so the assignment's status is not masked
  # by `export`.
  host_ip=$(hostname -I | awk '{print $1}')
  export VLLM_HOST_IP="${host_ip}"

  # Each approach has a launcher named launch_<approach>; the approach name
  # is also the tag used for its result files.
  for approach in chunked_prefill disagg_prefill_http disagg_prefill_zmq; do
    "launch_${approach}"
    for qps in 2 4 6 8 10 12; do
      benchmark "${qps}" "${default_output_len}" "${approach}"
    done
    kill_gpu_processes
  done

  python3 visualize_benchmark_results_zmq_http.py
}
# Script entry point: run main with the arguments given on the command line.
main "$@"

View File

@ -11,7 +11,7 @@ import aiohttp
# 3. python test_request.py # 3. python test_request.py
async def test_connect_completions(session): async def test_connect_completions(session):
try: try:
base_url = "http://localhost:8001/v1/connect/completions" base_url = "http://localhost:8001/v1/completions"
body = { body = {
"temperature": 0.5, "temperature": 0.5,
"top_p": 0.9, "top_p": 0.9,

View File

@ -0,0 +1,72 @@
import json
import matplotlib.pyplot as plt
import pandas as pd
if __name__ == "__main__":
    # Approaches in the order they are drawn (which fixes the line colors).
    approaches = [
        'disagg_prefill_http', 'disagg_prefill_zmq', 'chunked_prefill'
    ]
    metrics = [
        'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
        'median_itl_ms', 'p99_itl_ms'
    ]

    # Load one result file per (approach, QPS) pair and tag every record with
    # the approach name and request rate it belongs to.
    records = []
    for name in approaches:
        for qps in [2, 4, 6, 8, 10, 12]:
            with open(f"results/{name}-qps-{qps}.json") as f:
                record = json.load(f)
            record['name'] = name
            record['qps'] = qps
            records.append(record)

    df = pd.DataFrame.from_dict(records)
    per_approach = {name: df[df['name'] == name] for name in approaches}

    plt.style.use('bmh')
    plt.rcParams['font.size'] = 20

    def _draw(labels, key):
        """Plot metric `key` against QPS for each approach in `labels`."""
        figure, axes = plt.subplots(figsize=(11, 7))
        for label in labels:
            frame = per_approach[label]
            plt.plot(frame['qps'],
                     frame[key],
                     label=label,
                     marker='o',
                     linewidth=4)
        axes.legend()
        axes.set_xlabel('QPS')
        axes.set_ylabel(key)
        axes.set_ylim(bottom=0)
        return figure

    for key in metrics:
        # All three approaches on one chart.
        fig = _draw(approaches, key)
        fig.savefig(f'results/http_zmq_chunk/{key}.png')
        plt.close(fig)

        # Only the two disaggregated-prefill variants (HTTP vs. ZMQ).
        fig1 = _draw(approaches[:2], key)
        fig1.savefig(f'results/http_zmq/{key}.png')
        plt.close(fig1)

View File

@ -11,25 +11,27 @@ from contextlib import asynccontextmanager
from typing import AsyncGenerator from typing import AsyncGenerator
import uvicorn import uvicorn
import uvloop
import zmq import zmq
import zmq.asyncio import zmq.asyncio
from fastapi import FastAPI, Request from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
# default prefill and decode addr # default prefill and decode addr
time_out = 3 time_out = 180
fastapi_port = 8001 fastapi_port = 8000
prefill_addr = "ipc://localhost:7010" prefill_addr = "ipc://localhost:7010"
socket_prefill_num = 20 socket_prefill_num = 100
decode_addr = "ipc://localhost:7020" decode_addr = "ipc://localhost:7020"
socket_decode_num = 20 socket_decode_num = 100
context_type_json = "application/json" context_type_json = "application/json"
context_type_error = "error" context_type_error = "error"
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
logger = init_logger('vllm.entrypoints.connect') logger = init_logger('vllm.entrypoints.disagg_connector')
@asynccontextmanager @asynccontextmanager
@ -146,7 +148,7 @@ async def decode(route: str, header: dict, original_request_data: dict):
media_type="text/event-stream") media_type="text/event-stream")
@app.post('/v1/connect/completions') @app.post('/v1/completions')
async def chat_completions(request: Request): async def chat_completions(request: Request):
try: try:
# Add the X-Request-Id header to the raw headers list # Add the X-Request-Id header to the raw headers list
@ -210,5 +212,25 @@ async def run_disagg_connector(args, **uvicorn_kwargs) -> None:
if __name__ == "__main__": if __name__ == "__main__":
# url = 'tcp://127.0.0.1:5555' # NOTE(simon):
uvicorn.run(app, host="0.0.0.0", port=fastapi_port) # This section should be in sync with vllm/scripts.py for CLI entrypoints.
parser = FlexibleArgumentParser(
description="vLLM disagg zmq server.")
parser.add_argument("--port",
type=int,
default=8000,
help="The fastapi server port")
parser.add_argument("--prefill-addr",
type=str,
required=True,
help="The prefill address IP:PORT")
parser.add_argument("--decode-addr",
type=str,
required=True,
help="The decode address IP:PORT")
args = parser.parse_args()
uvloop.run(run_disagg_connector(args))
# uvicorn.run(app, host="0.0.0.0", port=fastapi_port)

View File

@ -91,7 +91,7 @@ async def serve_zmq(arg, zmq_server_port: int, app: FastAPI) -> None:
try: try:
tasks = [ tasks = [
asyncio.create_task(worker_routine(workers_addr, app, context, i)) asyncio.create_task(worker_routine(workers_addr, app, context, i))
for i in range(20) for i in range(100)
] ]
logger.info("zmq tasks: %s", tasks) logger.info("zmq tasks: %s", tasks)
# thread safety proxy create socket in the background: # thread safety proxy create socket in the background:

View File

@ -94,8 +94,6 @@ async def worker_routine(worker_addr: str, app: FastAPI,
json.dumps(generator.model_dump()).encode('utf-8')]) json.dumps(generator.model_dump()).encode('utf-8')])
else: else:
async for chunk in generator: async for chunk in generator:
logger.info("worker-%d Sending response chunk: [ %s ]",
i, chunk)
await socket.send_multipart([identity, await socket.send_multipart([identity,
b"text/event-stream", b"text/event-stream",
chunk.encode('utf-8')]) chunk.encode('utf-8')])