From 298298f97d82c7d84d2fba8193ac54ca3e23ac9d Mon Sep 17 00:00:00 2001 From: clark Date: Mon, 20 Jan 2025 13:42:47 +0800 Subject: [PATCH] remove invalid zmq benchmark code Signed-off-by: clark --- .../disagg_performance_benchmark_zmq_http.sh | 207 ------------------ .../disagg_benchmarks/zmq/test_request.py | 109 --------- .../visualize_benchmark_results_zmq_http.py | 72 ------ 3 files changed, 388 deletions(-) delete mode 100644 benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh delete mode 100644 benchmarks/disagg_benchmarks/zmq/test_request.py delete mode 100644 benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py diff --git a/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh b/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh deleted file mode 100644 index 11a2f6c2622f4..0000000000000 --- a/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh +++ /dev/null @@ -1,207 +0,0 @@ -#!/bin/bash - -# Requirement: 2x GPUs. - - -# Model: meta-llama/Meta-Llama-3.1-8B-Instruct -# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests -# Resource: 2x GPU -# Approaches: -# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 -# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance -# Prefilling instance: max_output_token=1 -# Decoding instance: force the input tokens be the same across requests to bypass prefilling - -set -ex - -kill_gpu_processes() { - # kill all processes on GPU. - pgrep pt_main_thread | xargs -r kill -9 - pgrep python3 | xargs -r kill -9 - for port in 7010 7011 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done - sleep 1 -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - local port=$1 - timeout 1200 bash -c " - until curl -s localhost:${port}/v1/completions > /dev/null; do - sleep 1 - done" && return 0 || return 1 -} - -launch_chunked_prefill() { - model="meta-llama/Meta-Llama-3.1-8B-Instruct" - gpu_memory_utilization=0.6 - max_model_len=10000 - VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8100 \ - --max-model-len $max_model_len \ - --enable-chunked-prefill \ - --gpu-memory-utilization $gpu_memory_utilization & - VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8200 \ - --max-model-len $max_model_len \ - --enable-chunked-prefill \ - --gpu-memory-utilization $gpu_memory_utilization & - wait_for_server 8100 - wait_for_server 8200 - python3 ../round_robin_proxy.py & - sleep 1 -} - -launch_disagg_prefill_http() { - model="meta-llama/Meta-Llama-3.1-8B-Instruct" - # disagg prefill - VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8100 \ - --max-model-len 10000 \ - --gpu-memory-utilization 0.6 \ - --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - - VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8200 \ - --max-model-len 10000 \ - --gpu-memory-utilization 0.6 \ - --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - - wait_for_server 8100 - wait_for_server 8200 - python3 ../disagg_prefill_proxy_server.py & - sleep 1 -} - - - -launch_disagg_prefill_zmq() { - model="meta-llama/Meta-Llama-3.1-8B-Instruct" - gpu_memory_utilization=0.6 - max_model_len=10000 - # disagg prefill - VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8100 \ - --zmq-server-port 7010 \ - --max-model-len $max_model_len \ - --gpu-memory-utilization $gpu_memory_utilization \ - --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - - VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8200 \ - --zmq-server-port 7011 \ - --max-model-len $max_model_len \ - --gpu-memory-utilization $gpu_memory_utilization \ - --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - - python3 \ - -m vllm.entrypoints.disagg_connector \ - --port 8000 \ - --prefill-addr 127.0.0.1:7010 \ - --decode-addr 127.0.0.1:7011 & - - wait_for_server 8100 - wait_for_server 8200 - wait_for_server 8000 - sleep 1 -} - - -benchmark() { - results_folder="./results" - model="meta-llama/Meta-Llama-3.1-8B-Instruct" - dataset_name="sonnet" - dataset_path="../../sonnet_4x.txt" - num_prompts=100 - qps=$1 - prefix_len=50 - input_len=1024 - output_len=$2 - tag=$3 - - python3 ../../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename "$tag"-qps-"$qps".json \ - --request-rate "$qps" - - sleep 2 -} - - -main() { - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get -y install jq) - (which socat) || (apt-get -y install socat) - (which lsof) || (apt-get -y install lsof) - pip install quart httpx matplotlib aiohttp datasets - cd "$(dirname "$0")" - cd ../.. - # create sonnet-4x.txt so that we can sample 2048 tokens for input - echo "" > sonnet_4x.txt - for _ in {1..4} - do - cat sonnet.txt >> sonnet_4x.txt - done - cd disagg_benchmarks/zmq - - rm -rf results - mkdir results - mkdir results/http_zmq_chunk - mkdir results/http_zmq - - default_output_len=6 - - export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') - - launch_chunked_prefill - for qps in 2 4 6 8 10 12; do - benchmark $qps $default_output_len chunked_prefill - done - kill_gpu_processes - - - launch_disagg_prefill_http - for qps in 2 4 6 8 10 12; do - benchmark $qps $default_output_len disagg_prefill_http - done - kill_gpu_processes - - launch_disagg_prefill_zmq - for qps in 2 4 6 8 10 12; do - benchmark $qps $default_output_len disagg_prefill_zmq - done - kill_gpu_processes - - python3 visualize_benchmark_results_zmq_http.py - -} - - -main "$@" diff --git a/benchmarks/disagg_benchmarks/zmq/test_request.py b/benchmarks/disagg_benchmarks/zmq/test_request.py deleted file mode 100644 index 9a3e82f1ccf45..0000000000000 --- a/benchmarks/disagg_benchmarks/zmq/test_request.py +++ /dev/null @@ -1,109 +0,0 @@ -import asyncio -import json - -import aiohttp - - -# test connect completions we assume prefill and decode are on the same node -# 1. node:vllm serve facebook/opt-125m --port 7001 --zmq-server-port 7010 \ -# --chat-template ~/vllm/examples/template_chatglm2.jinja -# 2. vllm connect --prefill-addr 127.0.0.1:7010 --decode-addr 127.0.0.1:7010 -# 3. python test_request.py -async def test_connect_completions(session): - try: - base_url = "http://localhost:8001/v1/completions" - body = { - "temperature": 0.5, - "top_p": 0.9, - "max_tokens": 150, - "frequency_penalty": 1.3, - "presence_penalty": 0.2, - "repetition_penalty": 1.2, - "model": "facebook/opt-125m", - "prompt": "Can you introduce vllm?", - # "stream": False, - "stream": True, - "stream_options": { - "include_usage": True - } - } - print(f"Sending request to {base_url}, body {body}") - async with session.post(base_url, json=body) as response: - - print(response.status) - print(response.headers) - responseText = "" - if response.status == 200: - transfer_encoding = response.headers.get('Transfer-Encoding') - content_type = response.headers.get('Content-Type') - print(f"Transfer-Encoding: {transfer_encoding}") - if transfer_encoding == 'chunked': - async for chunk in response.content.iter_chunked(1024): - try: - decoded_chunk = chunk.decode('utf-8') - # print(f"Decoded chunk: {decoded_chunk!r}") - responseText += decoded_chunk - except UnicodeDecodeError: - print(f"Error decoding chunk: {chunk!r}") - elif 'application/json' in content_type: - responseText = await response.json() - print(f"response {responseText!r}") - else: - # Print the headers and JSON response - print("Unexpected Transfer-Encoding: {} {} {}".format( - transfer_encoding, response.headers, await - response.json())) - else: - print(f"Request failed with status code {response.status}") - print(f"Response : {await response.json()}") - print(f"baseurl {base_url}") - print(f"response data {extract_data(responseText)}") - except aiohttp.ClientError as e: - print(f"Error: {e}") - - -def is_json(data): - try: - json.loads(data) - return True - except ValueError: - return False - - -def extract_data(responseText): - reply = "" - if responseText == "": - return reply - if is_json(responseText): - return responseText - - for data in responseText.split("\n\n"): - if data.startswith('data: '): - content = data[6:] - if content == "[DONE]": - print("DONE") - break - try: - json_data = json.loads(content) - choices = json_data["choices"] - if len(choices) > 0: - content = choices[0]["text"] - reply += content - except json.JSONDecodeError: - print(f"Error: Invalid data format: {data}") - return reply - else: - print(f"Error: Invalid data format: {data}") - - return reply - - -async def main(): - async with aiohttp.ClientSession() as session: - tasks = [] - for _ in range(2): - tasks.append(test_connect_completions(session)) - await asyncio.gather(*tasks) - - -asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py b/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py deleted file mode 100644 index 155a5d5f88986..0000000000000 --- a/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py +++ /dev/null @@ -1,72 +0,0 @@ -import json - -import matplotlib.pyplot as plt -import pandas as pd - -if __name__ == "__main__": - data = [] - for name in [ - 'disagg_prefill_http', 'disagg_prefill_zmq', 'chunked_prefill' - ]: - for qps in [2, 4, 6, 8, 10, 12]: - with open(f"results/{name}-qps-{qps}.json") as f: - x = json.load(f) - x['name'] = name - x['qps'] = qps - data.append(x) - - df = pd.DataFrame.from_dict(data) - dis_http_df = df[df['name'] == 'disagg_prefill_http'] - dis_zmq_df = df[df['name'] == 'disagg_prefill_zmq'] - chu_df = df[df['name'] == 'chunked_prefill'] - - plt.style.use('bmh') - plt.rcParams['font.size'] = 20 - - for key in [ - 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', - 'median_itl_ms', 'p99_itl_ms' - ]: - - fig, ax = plt.subplots(figsize=(11, 7)) - plt.plot(dis_http_df['qps'], - dis_http_df[key], - label='disagg_prefill_http', - marker='o', - linewidth=4) - plt.plot(dis_zmq_df['qps'], - dis_zmq_df[key], - label='disagg_prefill_zmq', - marker='o', - linewidth=4) - plt.plot(chu_df['qps'], - chu_df[key], - label='chunked_prefill', - marker='o', - linewidth=4) - ax.legend() - - ax.set_xlabel('QPS') - ax.set_ylabel(key) - ax.set_ylim(bottom=0) - fig.savefig(f'results/http_zmq_chunk/{key}.png') - plt.close(fig) - - fig1, ax1 = plt.subplots(figsize=(11, 7)) - plt.plot(dis_http_df['qps'], - dis_http_df[key], - label='disagg_prefill_http', - marker='o', - linewidth=4) - plt.plot(dis_zmq_df['qps'], - dis_zmq_df[key], - label='disagg_prefill_zmq', - marker='o', - linewidth=4) - ax1.legend() - - ax1.set_xlabel('QPS') - ax1.set_ylabel(key) - ax1.set_ylim(bottom=0) - fig1.savefig(f'results/http_zmq/{key}.png') - plt.close(fig1)