From 16ed8273783cbc2bc1a0164c7e890cf20d2742fd Mon Sep 17 00:00:00 2001
From: clark
Date: Thu, 16 Jan 2025 19:43:07 +0800
Subject: [PATCH] add benchmark shell

Signed-off-by: clark
---
 .../disagg_performance_benchmark_zmq_http.sh       | 207 ++++++++++++++++++
 benchmarks/disagg_benchmarks/zmq/test_request.py   |   2 +-
 .../zmq/visualize_benchmark_results_zmq_http.py    |  72 ++++++
 vllm/entrypoints/disagg_connector.py               |  38 +++-
 vllm/entrypoints/launcher.py                       |   2 +-
 vllm/entrypoints/openai/connect_worker.py          |   2 -
 6 files changed, 311 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh
 create mode 100644 benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py

diff --git a/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh b/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh
new file mode 100644
index 0000000000000..11a2f6c2622f4
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh
@@ -0,0 +1,207 @@
+#!/bin/bash
+
+# Requirement: 2x GPUs.
+
+
+# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
+# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8/10/12, 100 requests
+# Resource: 2x GPU
+# Approaches:
+# 1. Chunked prefill: 2 vllm instances (1 GPU each), load-balanced by a round-robin proxy
+# 2. Disaggregated prefill via HTTP proxy: 1 prefill instance and 1 decode instance
+#    Prefill instance: max_output_token=1
+#    Decode instance: receives the same input tokens, so prefill is skipped via the transferred KV cache
+# 3. Disaggregated prefill via ZMQ: the same 1P1D setup, fronted by the zmq-based disagg connector
+
+set -ex
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pgrep pt_main_thread | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
+  for port in 7010 7011 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
+  sleep 1
+}
+
+wait_for_server() {
+  # wait for the vllm server on the given port to start;
+  # return 1 if it does not come up before the timeout
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+launch_chunked_prefill() {
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+  gpu_memory_utilization=0.6
+  max_model_len=10000
+  VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8100 \
+    --max-model-len $max_model_len \
+    --enable-chunked-prefill \
+    --gpu-memory-utilization $gpu_memory_utilization &
+  VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8200 \
+    --max-model-len $max_model_len \
+    --enable-chunked-prefill \
+    --gpu-memory-utilization $gpu_memory_utilization &
+  wait_for_server 8100
+  wait_for_server 8200
+  python3 ../round_robin_proxy.py &
+  sleep 1
+}
+
+launch_disagg_prefill_http() {
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+  # disagg prefill: rank 0 produces KV caches, rank 1 consumes them
+  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8100 \
+    --max-model-len 10000 \
+    --gpu-memory-utilization 0.6 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
+    --port 8200 \
+    --max-model-len 10000 \
+    --gpu-memory-utilization 0.6 \
+    --kv-transfer-config \
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+
+  wait_for_server 8100
+  wait_for_server 8200
+  python3 ../disagg_prefill_proxy_server.py &
+  sleep 1
+}
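For reference, the readiness probe that wait_for_server implements with curl can be reproduced from Python when debugging a launch interactively. A minimal sketch, assuming the requests library is available; the helper name and the timeout default are illustrative, not part of the patch:

    import time

    import requests

    def wait_for_server(port: int, timeout_s: float = 1200.0) -> bool:
        """Poll /v1/completions until the server answers or the timeout expires."""
        deadline = time.time() + timeout_s
        while time.time() < deadline:
            try:
                # Any HTTP response (even a 405 for GET on a POST-only route)
                # proves the server socket is up.
                requests.get(f"http://localhost:{port}/v1/completions", timeout=1)
                return True
            except requests.exceptions.RequestException:
                time.sleep(1)
        return False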
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + python3 ../disagg_prefill_proxy_server.py & + sleep 1 +} + + + +launch_disagg_prefill_zmq() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + gpu_memory_utilization=0.6 + max_model_len=10000 + # disagg prefill + VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --zmq-server-port 7010 \ + --max-model-len $max_model_len \ + --gpu-memory-utilization $gpu_memory_utilization \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --zmq-server-port 7011 \ + --max-model-len $max_model_len \ + --gpu-memory-utilization $gpu_memory_utilization \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + python3 \ + -m vllm.entrypoints.disagg_connector \ + --port 8000 \ + --prefill-addr 127.0.0.1:7010 \ + --decode-addr 127.0.0.1:7011 & + + wait_for_server 8100 + wait_for_server 8200 + wait_for_server 8000 + sleep 1 +} + + +benchmark() { + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../../sonnet_4x.txt" + num_prompts=100 + qps=$1 + prefix_len=50 + input_len=1024 + output_len=$2 + tag=$3 + + python3 ../../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" + + sleep 2 +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + (which lsof) || (apt-get -y install lsof) + pip install quart httpx matplotlib aiohttp datasets + cd "$(dirname "$0")" + cd ../.. 
+
+
+main() {
+
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get -y install jq)
+  (which socat) || (apt-get -y install socat)
+  (which lsof) || (apt-get -y install lsof)
+  pip install quart httpx matplotlib aiohttp datasets
+  cd "$(dirname "$0")"
+  cd ../..
+  # create sonnet_4x.txt so there are enough tokens to sample the 1024-token inputs
+  echo "" > sonnet_4x.txt
+  for _ in {1..4}
+  do
+    cat sonnet.txt >> sonnet_4x.txt
+  done
+  cd disagg_benchmarks/zmq
+
+  rm -rf results
+  mkdir results
+  mkdir results/http_zmq_chunk
+  mkdir results/http_zmq
+
+  default_output_len=6
+
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+  launch_chunked_prefill
+  for qps in 2 4 6 8 10 12; do
+    benchmark $qps $default_output_len chunked_prefill
+  done
+  kill_gpu_processes
+
+  launch_disagg_prefill_http
+  for qps in 2 4 6 8 10 12; do
+    benchmark $qps $default_output_len disagg_prefill_http
+  done
+  kill_gpu_processes
+
+  launch_disagg_prefill_zmq
+  for qps in 2 4 6 8 10 12; do
+    benchmark $qps $default_output_len disagg_prefill_zmq
+  done
+  kill_gpu_processes
+
+  python3 visualize_benchmark_results_zmq_http.py
+
+}
+
+
+main "$@"
diff --git a/benchmarks/disagg_benchmarks/zmq/test_request.py b/benchmarks/disagg_benchmarks/zmq/test_request.py
index aac4cf8310712..9a3e82f1ccf45 100644
--- a/benchmarks/disagg_benchmarks/zmq/test_request.py
+++ b/benchmarks/disagg_benchmarks/zmq/test_request.py
@@ -11,7 +11,7 @@ import aiohttp
 # 3. python test_request.py
 async def test_connect_completions(session):
     try:
-        base_url = "http://localhost:8001/v1/connect/completions"
+        base_url = "http://localhost:8001/v1/completions"
         body = {
             "temperature": 0.5,
             "top_p": 0.9,
diff --git a/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py b/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py
new file mode 100644
index 0000000000000..82a9a6184920e
--- /dev/null
+++ b/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py
@@ -0,0 +1,72 @@
+import json
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+if __name__ == "__main__":
+    data = []
+    for name in ['disagg_prefill_http', 'disagg_prefill_zmq', 'chunked_prefill']:
+        for qps in [2, 4, 6, 8, 10, 12]:
+            with open(f"results/{name}-qps-{qps}.json") as f:
+                x = json.load(f)
+                x['name'] = name
+                x['qps'] = qps
+                data.append(x)
+
+    df = pd.DataFrame.from_dict(data)
+    dis_http_df = df[df['name'] == 'disagg_prefill_http']
+    dis_zmq_df = df[df['name'] == 'disagg_prefill_zmq']
+    chu_df = df[df['name'] == 'chunked_prefill']
+
+    plt.style.use('bmh')
+    plt.rcParams['font.size'] = 20
+
+    for key in [
+            'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
+            'median_itl_ms', 'p99_itl_ms'
+    ]:
+        # all three approaches on one plot
+        fig, ax = plt.subplots(figsize=(11, 7))
+        plt.plot(dis_http_df['qps'],
+                 dis_http_df[key],
+                 label='disagg_prefill_http',
+                 marker='o',
+                 linewidth=4)
+        plt.plot(dis_zmq_df['qps'],
+                 dis_zmq_df[key],
+                 label='disagg_prefill_zmq',
+                 marker='o',
+                 linewidth=4)
+        plt.plot(chu_df['qps'],
+                 chu_df[key],
+                 label='chunked_prefill',
+                 marker='o',
+                 linewidth=4)
+        ax.legend()
+
+        ax.set_xlabel('QPS')
+        ax.set_ylabel(key)
+        ax.set_ylim(bottom=0)
+        fig.savefig(f'results/http_zmq_chunk/{key}.png')
+        plt.close(fig)
+
+        # the two disaggregated variants only
+        fig1, ax1 = plt.subplots(figsize=(11, 7))
+        plt.plot(dis_http_df['qps'],
+                 dis_http_df[key],
+                 label='disagg_prefill_http',
+                 marker='o',
+                 linewidth=4)
+        plt.plot(dis_zmq_df['qps'],
+                 dis_zmq_df[key],
+                 label='disagg_prefill_zmq',
+                 marker='o',
+                 linewidth=4)
+        ax1.legend()
+
+        ax1.set_xlabel('QPS')
+        ax1.set_ylabel(key)
+        ax1.set_ylim(bottom=0)
+        fig1.savefig(f'results/http_zmq/{key}.png')
+        plt.close(fig1)
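The plots can be supplemented with a quick numeric summary. A sketch, assuming the results/ directory was populated by the benchmark script above; the zmq/http ratio column is an illustrative derived metric, not something the patch computes:

    import json

    import pandas as pd

    rows = []
    for name in ["disagg_prefill_http", "disagg_prefill_zmq"]:
        for qps in [2, 4, 6, 8, 10, 12]:
            with open(f"results/{name}-qps-{qps}.json") as f:
                r = json.load(f)
            rows.append({"name": name, "qps": qps, "mean_ttft_ms": r["mean_ttft_ms"]})

    # One row per QPS, one column per approach, plus the relative mean TTFT.
    df = pd.DataFrame(rows).pivot(index="qps", columns="name", values="mean_ttft_ms")
    df["zmq/http"] = df["disagg_prefill_zmq"] / df["disagg_prefill_http"]
    print(df)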
diff --git a/vllm/entrypoints/disagg_connector.py b/vllm/entrypoints/disagg_connector.py
index 843aa5ae9e649..5dea31f1c67a1 100644
--- a/vllm/entrypoints/disagg_connector.py
+++ b/vllm/entrypoints/disagg_connector.py
@@ -11,25 +11,27 @@
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator

 import uvicorn
+import uvloop
 import zmq
 import zmq.asyncio
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse

 from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser

 # default prefill and decode addr
-time_out = 3
-fastapi_port = 8001
+time_out = 180
+fastapi_port = 8000
 prefill_addr = "ipc://localhost:7010"
-socket_prefill_num = 20
+socket_prefill_num = 100
 decode_addr = "ipc://localhost:7020"
-socket_decode_num = 20
+socket_decode_num = 100
 context_type_json = "application/json"
 context_type_error = "error"

 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
-logger = init_logger('vllm.entrypoints.connect')
+logger = init_logger('vllm.entrypoints.disagg_connector')


 @asynccontextmanager
@@ -146,7 +148,7 @@ async def decode(route: str, header: dict, original_request_data: dict):
                                  media_type="text/event-stream")


-@app.post('/v1/connect/completions')
+@app.post('/v1/completions')
 async def chat_completions(request: Request):
     try:
         # Add the X-Request-Id header to the raw headers list
@@ -210,5 +212,25 @@ async def run_disagg_connector(args, **uvicorn_kwargs) -> None:


 if __name__ == "__main__":
-    # url = 'tcp://127.0.0.1:5555'
-    uvicorn.run(app, host="0.0.0.0", port=fastapi_port)
+    # NOTE(simon):
+    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
+    parser = FlexibleArgumentParser(
+        description="vLLM disaggregated prefill ZMQ connector server.")
+    parser.add_argument("--port",
+                        type=int,
+                        default=8000,
+                        help="The FastAPI server port")
+    parser.add_argument("--prefill-addr",
+                        type=str,
+                        required=True,
+                        help="The prefill instance ZMQ address (IP:PORT)")
+    parser.add_argument("--decode-addr",
+                        type=str,
+                        required=True,
+                        help="The decode instance ZMQ address (IP:PORT)")
+
+    args = parser.parse_args()
+
+    uvloop.run(run_disagg_connector(args))
+
+    # uvicorn.run(app, host="0.0.0.0", port=fastapi_port)
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index d96ad42cbbff8..869dbae466410 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -91,7 +91,7 @@ async def serve_zmq(arg, zmq_server_port: int, app: FastAPI) -> None:
     try:
         tasks = [
             asyncio.create_task(worker_routine(workers_addr, app, context, i))
-            for i in range(20)
+            for i in range(100)
         ]
         logger.info("zmq tasks: %s", tasks)
         # thread safety proxy create socket in the background:
diff --git a/vllm/entrypoints/openai/connect_worker.py b/vllm/entrypoints/openai/connect_worker.py
index adefac87c0921..c9c42df8fb613 100644
--- a/vllm/entrypoints/openai/connect_worker.py
+++ b/vllm/entrypoints/openai/connect_worker.py
@@ -94,8 +94,6 @@ async def worker_routine(worker_addr: str, app: FastAPI,
                 json.dumps(generator.model_dump()).encode('utf-8')])
         else:
             async for chunk in generator:
-                logger.info("worker-%d Sending response chunk: [ %s ]",
-                            i, chunk)
                 await socket.send_multipart([identity,
                                              b"text/event-stream",
                                              chunk.encode('utf-8')])
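For context on the launcher change: serve_zmq fans requests out to a pool of worker_routine tasks (now 100 of them) behind a ZMQ proxy. A self-contained sketch of that fan-out pattern using pyzmq's asyncio API; the socket types, addresses, echo payload, and pool size of 4 are illustrative simplifications, not the patch's actual wiring:

    import asyncio

    import zmq
    import zmq.asyncio

    async def worker_routine(ctx: zmq.asyncio.Context, addr: str, i: int) -> None:
        # One of N concurrent workers; the real one runs the request through
        # the FastAPI app and streams response chunks back instead of echoing.
        sock = ctx.socket(zmq.REP)
        sock.connect(addr)
        while True:
            req = await sock.recv()
            await sock.send(b"worker-%d echoed: %s" % (i, req))

    async def pump(src: zmq.asyncio.Socket, dst: zmq.asyncio.Socket) -> None:
        # Forward whole multipart messages so routing envelopes stay intact
        # (an asyncio-friendly stand-in for the blocking zmq.proxy).
        while True:
            await dst.send_multipart(await src.recv_multipart())

    async def main() -> None:
        ctx = zmq.asyncio.Context()
        frontend = ctx.socket(zmq.ROUTER)  # the connector dials in here
        frontend.bind("tcp://127.0.0.1:7010")
        backend = ctx.socket(zmq.DEALER)   # workers attach here in-process
        backend.bind("inproc://workers")
        for i in range(4):
            asyncio.create_task(worker_routine(ctx, "inproc://workers", i))
        await asyncio.gather(pump(frontend, backend), pump(backend, frontend))

    asyncio.run(main())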