mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 08:07:03 +08:00
remove invalid zmq benchmark code
Signed-off-by: clark <panf2333@gmail.com>
This commit is contained in:
parent
6c8fae82dd
commit
298298f97d
@ -1,207 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Requirement: 2x GPUs.
|
||||
|
||||
|
||||
# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
|
||||
# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
|
||||
# Resource: 2x GPU
|
||||
# Approaches:
|
||||
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
|
||||
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
|
||||
# Prefilling instance: max_output_token=1
|
||||
# Decoding instance: force the input tokens be the same across requests to bypass prefilling
|
||||
|
||||
set -ex
|
||||
|
||||
kill_gpu_processes() {
|
||||
# kill all processes on GPU.
|
||||
pgrep pt_main_thread | xargs -r kill -9
|
||||
pgrep python3 | xargs -r kill -9
|
||||
for port in 7010 7011 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
|
||||
sleep 1
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
local port=$1
|
||||
timeout 1200 bash -c "
|
||||
until curl -s localhost:${port}/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done" && return 0 || return 1
|
||||
}
|
||||
|
||||
launch_chunked_prefill() {
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
gpu_memory_utilization=0.6
|
||||
max_model_len=10000
|
||||
VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=0 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
--model $model \
|
||||
--port 8100 \
|
||||
--max-model-len $max_model_len \
|
||||
--enable-chunked-prefill \
|
||||
--gpu-memory-utilization $gpu_memory_utilization &
|
||||
VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=1 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
--model $model \
|
||||
--port 8200 \
|
||||
--max-model-len $max_model_len \
|
||||
--enable-chunked-prefill \
|
||||
--gpu-memory-utilization $gpu_memory_utilization &
|
||||
wait_for_server 8100
|
||||
wait_for_server 8200
|
||||
python3 ../round_robin_proxy.py &
|
||||
sleep 1
|
||||
}
|
||||
|
||||
launch_disagg_prefill_http() {
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
# disagg prefill
|
||||
VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
--model $model \
|
||||
--port 8100 \
|
||||
--max-model-len 10000 \
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
--model $model \
|
||||
--port 8200 \
|
||||
--max-model-len 10000 \
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
wait_for_server 8100
|
||||
wait_for_server 8200
|
||||
python3 ../disagg_prefill_proxy_server.py &
|
||||
sleep 1
|
||||
}
|
||||
|
||||
|
||||
|
||||
launch_disagg_prefill_zmq() {
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
gpu_memory_utilization=0.6
|
||||
max_model_len=10000
|
||||
# disagg prefill
|
||||
VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
--model $model \
|
||||
--port 8100 \
|
||||
--zmq-server-port 7010 \
|
||||
--max-model-len $max_model_len \
|
||||
--gpu-memory-utilization $gpu_memory_utilization \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
--model $model \
|
||||
--port 8200 \
|
||||
--zmq-server-port 7011 \
|
||||
--max-model-len $max_model_len \
|
||||
--gpu-memory-utilization $gpu_memory_utilization \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
python3 \
|
||||
-m vllm.entrypoints.disagg_connector \
|
||||
--port 8000 \
|
||||
--prefill-addr 127.0.0.1:7010 \
|
||||
--decode-addr 127.0.0.1:7011 &
|
||||
|
||||
wait_for_server 8100
|
||||
wait_for_server 8200
|
||||
wait_for_server 8000
|
||||
sleep 1
|
||||
}
|
||||
|
||||
|
||||
benchmark() {
|
||||
results_folder="./results"
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
dataset_name="sonnet"
|
||||
dataset_path="../../sonnet_4x.txt"
|
||||
num_prompts=100
|
||||
qps=$1
|
||||
prefix_len=50
|
||||
input_len=1024
|
||||
output_len=$2
|
||||
tag=$3
|
||||
|
||||
python3 ../../benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--sonnet-input-len $input_len \
|
||||
--sonnet-output-len "$output_len" \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--num-prompts $num_prompts \
|
||||
--port 8000 \
|
||||
--save-result \
|
||||
--result-dir $results_folder \
|
||||
--result-filename "$tag"-qps-"$qps".json \
|
||||
--request-rate "$qps"
|
||||
|
||||
sleep 2
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get -y install jq)
|
||||
(which socat) || (apt-get -y install socat)
|
||||
(which lsof) || (apt-get -y install lsof)
|
||||
pip install quart httpx matplotlib aiohttp datasets
|
||||
cd "$(dirname "$0")"
|
||||
cd ../..
|
||||
# create sonnet-4x.txt so that we can sample 2048 tokens for input
|
||||
echo "" > sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat sonnet.txt >> sonnet_4x.txt
|
||||
done
|
||||
cd disagg_benchmarks/zmq
|
||||
|
||||
rm -rf results
|
||||
mkdir results
|
||||
mkdir results/http_zmq_chunk
|
||||
mkdir results/http_zmq
|
||||
|
||||
default_output_len=6
|
||||
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
|
||||
launch_chunked_prefill
|
||||
for qps in 2 4 6 8 10 12; do
|
||||
benchmark $qps $default_output_len chunked_prefill
|
||||
done
|
||||
kill_gpu_processes
|
||||
|
||||
|
||||
launch_disagg_prefill_http
|
||||
for qps in 2 4 6 8 10 12; do
|
||||
benchmark $qps $default_output_len disagg_prefill_http
|
||||
done
|
||||
kill_gpu_processes
|
||||
|
||||
launch_disagg_prefill_zmq
|
||||
for qps in 2 4 6 8 10 12; do
|
||||
benchmark $qps $default_output_len disagg_prefill_zmq
|
||||
done
|
||||
kill_gpu_processes
|
||||
|
||||
python3 visualize_benchmark_results_zmq_http.py
|
||||
|
||||
}
|
||||
|
||||
|
||||
main "$@"
|
||||
@ -1,109 +0,0 @@
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import aiohttp
|
||||
|
||||
|
||||
# test connect completions we assume prefill and decode are on the same node
|
||||
# 1. node:vllm serve facebook/opt-125m --port 7001 --zmq-server-port 7010 \
|
||||
# --chat-template ~/vllm/examples/template_chatglm2.jinja
|
||||
# 2. vllm connect --prefill-addr 127.0.0.1:7010 --decode-addr 127.0.0.1:7010
|
||||
# 3. python test_request.py
|
||||
async def test_connect_completions(session):
|
||||
try:
|
||||
base_url = "http://localhost:8001/v1/completions"
|
||||
body = {
|
||||
"temperature": 0.5,
|
||||
"top_p": 0.9,
|
||||
"max_tokens": 150,
|
||||
"frequency_penalty": 1.3,
|
||||
"presence_penalty": 0.2,
|
||||
"repetition_penalty": 1.2,
|
||||
"model": "facebook/opt-125m",
|
||||
"prompt": "Can you introduce vllm?",
|
||||
# "stream": False,
|
||||
"stream": True,
|
||||
"stream_options": {
|
||||
"include_usage": True
|
||||
}
|
||||
}
|
||||
print(f"Sending request to {base_url}, body {body}")
|
||||
async with session.post(base_url, json=body) as response:
|
||||
|
||||
print(response.status)
|
||||
print(response.headers)
|
||||
responseText = ""
|
||||
if response.status == 200:
|
||||
transfer_encoding = response.headers.get('Transfer-Encoding')
|
||||
content_type = response.headers.get('Content-Type')
|
||||
print(f"Transfer-Encoding: {transfer_encoding}")
|
||||
if transfer_encoding == 'chunked':
|
||||
async for chunk in response.content.iter_chunked(1024):
|
||||
try:
|
||||
decoded_chunk = chunk.decode('utf-8')
|
||||
# print(f"Decoded chunk: {decoded_chunk!r}")
|
||||
responseText += decoded_chunk
|
||||
except UnicodeDecodeError:
|
||||
print(f"Error decoding chunk: {chunk!r}")
|
||||
elif 'application/json' in content_type:
|
||||
responseText = await response.json()
|
||||
print(f"response {responseText!r}")
|
||||
else:
|
||||
# Print the headers and JSON response
|
||||
print("Unexpected Transfer-Encoding: {} {} {}".format(
|
||||
transfer_encoding, response.headers, await
|
||||
response.json()))
|
||||
else:
|
||||
print(f"Request failed with status code {response.status}")
|
||||
print(f"Response : {await response.json()}")
|
||||
print(f"baseurl {base_url}")
|
||||
print(f"response data {extract_data(responseText)}")
|
||||
except aiohttp.ClientError as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
|
||||
def is_json(data):
|
||||
try:
|
||||
json.loads(data)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def extract_data(responseText):
|
||||
reply = ""
|
||||
if responseText == "":
|
||||
return reply
|
||||
if is_json(responseText):
|
||||
return responseText
|
||||
|
||||
for data in responseText.split("\n\n"):
|
||||
if data.startswith('data: '):
|
||||
content = data[6:]
|
||||
if content == "[DONE]":
|
||||
print("DONE")
|
||||
break
|
||||
try:
|
||||
json_data = json.loads(content)
|
||||
choices = json_data["choices"]
|
||||
if len(choices) > 0:
|
||||
content = choices[0]["text"]
|
||||
reply += content
|
||||
except json.JSONDecodeError:
|
||||
print(f"Error: Invalid data format: {data}")
|
||||
return reply
|
||||
else:
|
||||
print(f"Error: Invalid data format: {data}")
|
||||
|
||||
return reply
|
||||
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tasks = []
|
||||
for _ in range(2):
|
||||
tasks.append(test_connect_completions(session))
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
@ -1,72 +0,0 @@
|
||||
import json
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
if __name__ == "__main__":
|
||||
data = []
|
||||
for name in [
|
||||
'disagg_prefill_http', 'disagg_prefill_zmq', 'chunked_prefill'
|
||||
]:
|
||||
for qps in [2, 4, 6, 8, 10, 12]:
|
||||
with open(f"results/{name}-qps-{qps}.json") as f:
|
||||
x = json.load(f)
|
||||
x['name'] = name
|
||||
x['qps'] = qps
|
||||
data.append(x)
|
||||
|
||||
df = pd.DataFrame.from_dict(data)
|
||||
dis_http_df = df[df['name'] == 'disagg_prefill_http']
|
||||
dis_zmq_df = df[df['name'] == 'disagg_prefill_zmq']
|
||||
chu_df = df[df['name'] == 'chunked_prefill']
|
||||
|
||||
plt.style.use('bmh')
|
||||
plt.rcParams['font.size'] = 20
|
||||
|
||||
for key in [
|
||||
'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
|
||||
'median_itl_ms', 'p99_itl_ms'
|
||||
]:
|
||||
|
||||
fig, ax = plt.subplots(figsize=(11, 7))
|
||||
plt.plot(dis_http_df['qps'],
|
||||
dis_http_df[key],
|
||||
label='disagg_prefill_http',
|
||||
marker='o',
|
||||
linewidth=4)
|
||||
plt.plot(dis_zmq_df['qps'],
|
||||
dis_zmq_df[key],
|
||||
label='disagg_prefill_zmq',
|
||||
marker='o',
|
||||
linewidth=4)
|
||||
plt.plot(chu_df['qps'],
|
||||
chu_df[key],
|
||||
label='chunked_prefill',
|
||||
marker='o',
|
||||
linewidth=4)
|
||||
ax.legend()
|
||||
|
||||
ax.set_xlabel('QPS')
|
||||
ax.set_ylabel(key)
|
||||
ax.set_ylim(bottom=0)
|
||||
fig.savefig(f'results/http_zmq_chunk/{key}.png')
|
||||
plt.close(fig)
|
||||
|
||||
fig1, ax1 = plt.subplots(figsize=(11, 7))
|
||||
plt.plot(dis_http_df['qps'],
|
||||
dis_http_df[key],
|
||||
label='disagg_prefill_http',
|
||||
marker='o',
|
||||
linewidth=4)
|
||||
plt.plot(dis_zmq_df['qps'],
|
||||
dis_zmq_df[key],
|
||||
label='disagg_prefill_zmq',
|
||||
marker='o',
|
||||
linewidth=4)
|
||||
ax1.legend()
|
||||
|
||||
ax1.set_xlabel('QPS')
|
||||
ax1.set_ylabel(key)
|
||||
ax1.set_ylim(bottom=0)
|
||||
fig1.savefig(f'results/http_zmq/{key}.png')
|
||||
plt.close(fig1)
|
||||
Loading…
x
Reference in New Issue
Block a user