mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-14 16:01:21 +08:00
Serving Benchmark Refactoring (#2433)
This commit is contained in:
parent
563836496a
commit
a4211a4dc3
@ -6,15 +6,16 @@ set -o pipefail
|
|||||||
# cd into parent directory of this file
|
# cd into parent directory of this file
|
||||||
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
||||||
|
|
||||||
(wget && curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
|
|
||||||
# run benchmarks and upload the result to buildkite
|
# run python-based benchmarks and upload the result to buildkite
|
||||||
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
|
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
|
||||||
bench_latency_exit_code=$?
|
bench_latency_exit_code=$?
|
||||||
|
|
||||||
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
|
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
|
||||||
bench_throughput_exit_code=$?
|
bench_throughput_exit_code=$?
|
||||||
|
|
||||||
|
# run server-based benchmarks and upload the result to buildkite
|
||||||
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
|
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
|
||||||
server_pid=$!
|
server_pid=$!
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
@ -22,11 +23,14 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
|
|||||||
# wait for server to start, timeout after 600 seconds
|
# wait for server to start, timeout after 600 seconds
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
python3 benchmarks/benchmark_serving.py \
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai \
|
||||||
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
--num-prompts 20 \
|
--num-prompts 20 \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
--tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt
|
--tokenizer meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--save-result \
|
||||||
|
2>&1 | tee benchmark_serving.txt
|
||||||
bench_serving_exit_code=$?
|
bench_serving_exit_code=$?
|
||||||
kill $server_pid
|
kill $server_pid
|
||||||
|
|
||||||
@ -44,7 +48,7 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
|
|||||||
echo "### Serving Benchmarks" >> benchmark_results.md
|
echo "### Serving Benchmarks" >> benchmark_results.md
|
||||||
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
|
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
|
||||||
echo "" >> benchmark_results.md
|
echo "" >> benchmark_results.md
|
||||||
tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines
|
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
|
||||||
|
|
||||||
# upload the results to buildkite
|
# upload the results to buildkite
|
||||||
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
||||||
@ -61,3 +65,5 @@ fi
|
|||||||
if [ $bench_serving_exit_code -ne 0 ]; then
|
if [ $bench_serving_exit_code -ne 0 ]; then
|
||||||
exit $bench_serving_exit_code
|
exit $bench_serving_exit_code
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
/workspace/buildkite-agent artifact upload openai-*.json
|
||||||
|
|||||||
284
benchmarks/backend_request_func.py
Normal file
284
benchmarks/backend_request_func.py
Normal file
@ -0,0 +1,284 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
|
||||||
|
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RequestFuncInput:
    """Everything a backend request function needs to issue one request."""

    # Prompt text placed in the request payload.
    prompt: str
    # Full URL the request is POSTed to.
    api_url: str
    # Caller-supplied prompt length; copied onto the output record as-is.
    prompt_len: int
    # Generation budget forwarded to the backend (max_tokens/max_new_tokens).
    output_len: int
    # Model identifier (sent in the payload by the OpenAI-style backend).
    model: str
    # Number of candidate sequences requested from the backend.
    best_of: int = 1
    # Whether beam search is requested (several backends assert it is off).
    use_beam_search: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RequestFuncOutput:
    """Result record filled in by a backend request function."""

    # Text returned by the backend for this request.
    generated_text: str = ""
    # True only when the request completed with a usable response.
    success: bool = False
    # Wall-clock seconds from sending the request to the end of the response.
    latency: float = 0
    # Wall-clock seconds until the first response chunk (0 when not measured).
    ttft: float = 0
    # Prompt length copied from the corresponding RequestFuncInput.
    prompt_len: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to a TGI ``generate_stream`` endpoint.

    Args:
        request_func_input: Prompt, URL and sampling settings for the request.
        pbar: Optional progress bar, advanced by one when the request ends.

    Returns:
        A RequestFuncOutput. ``ttft`` is the time to the first streamed chunk,
        ``latency`` the time until the stream ended, and ``generated_text`` is
        read from the ``"generated_text"`` field of the last chunk. ``success``
        is False for non-200 responses, empty streams and connection errors.

    Raises:
        AssertionError: If the URL does not end with ``generate_stream`` or
            beam search is requested.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    # Only the final chunk is parsed; earlier chunks are used
                    # solely to timestamp the first token.
                    data = None
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    if data is None:
                        # 200 with an empty body: nothing to parse.
                        output.success = False
                    else:
                        body = data.decode("utf-8")
                        # Remove the SSE "data:" prefix as a prefix;
                        # str.lstrip("data:") would strip a character set.
                        if body.startswith("data:"):
                            body = body[len("data:"):]
                        output.generated_text = json.loads(
                            body)["generated_text"]
                        output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

        if pbar:
            pbar.update(1)
        return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_vllm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to a vLLM ``generate`` endpoint.

    Args:
        request_func_input: Prompt, URL and sampling settings for the request.
        pbar: Optional progress bar, advanced by one when the request ends.

    Returns:
        A RequestFuncOutput. ``ttft`` is the time to the first streamed chunk,
        ``latency`` the time until the stream ended, and ``generated_text`` is
        the first entry of the last chunk's ``"text"`` list with the leading
        ``len(prompt)`` characters removed. ``success`` is False for non-200
        responses, empty streams and connection errors.

    Raises:
        AssertionError: If the URL does not end with ``generate``.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "prompt": request_func_input.prompt,
            "n": 1,
            "best_of": request_func_input.best_of,
            "use_beam_search": request_func_input.use_beam_search,
            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    # Only the final chunk is parsed; earlier chunks are used
                    # solely to timestamp the first token.
                    data = None
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    if data is None:
                        # 200 with an empty body: nothing to parse.
                        output.success = False
                    else:
                        # When streaming, '\0' is appended to the end of the
                        # response.
                        body = data.decode("utf-8").strip("\0")
                        full_text = json.loads(body)["text"][0]
                        # Drop the leading len(prompt) characters so only the
                        # completion remains.
                        output.generated_text = full_text[
                            len(request_func_input.prompt):]
                        output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

        if pbar:
            pbar.update(1)
        return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to a TensorRT-LLM ``generate_stream`` endpoint.

    Args:
        request_func_input: Prompt, URL and sampling settings for the request.
        pbar: Optional progress bar, advanced by one when the request ends.

    Returns:
        A RequestFuncOutput. ``ttft`` is the time to the first streamed chunk,
        ``latency`` the time until the stream ended, and ``generated_text`` is
        read from the ``"text_output"`` field of the last chunk. ``success`` is
        False for non-200 responses, empty streams and connection errors.

    Raises:
        AssertionError: If the URL does not end with ``generate_stream``, beam
            search is requested, or ``best_of`` is not 1.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as resp:
                if resp.status == 200:
                    # Only the final chunk is parsed; earlier chunks are used
                    # solely to timestamp the first token.
                    data = None
                    async for data in resp.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    if data is None:
                        # 200 with an empty body: nothing to parse.
                        output.success = False
                    else:
                        body = data.decode("utf-8")
                        # Remove the SSE "data:" prefix as a prefix;
                        # str.lstrip("data:") would strip a character set.
                        if body.startswith("data:"):
                            body = body[len("data:"):]
                        output.generated_text = json.loads(body)["text_output"]
                        output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

        if pbar:
            pbar.update(1)
        return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one non-streaming request to a DeepSpeed-MII endpoint.

    The whole JSON reply is awaited, so only end-to-end latency is measured;
    ``ttft`` is left at 0 as a placeholder. ``success`` is False for non-200
    responses and for connection errors.
    """
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        assert not request_func_input.use_beam_search

        request_body = {
            "prompts": request_func_input.prompt,
            "max_new_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "do_sample": True,
            # deepspeed-mii does not accept 0.0 temperature.
            "temperature": 0.01,
            "top_p": 1.0,
        }

        # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use
        # 0 as placeholder.
        # https://github.com/microsoft/DeepSpeed-MII/pull/311
        result = RequestFuncOutput(
            prompt_len=request_func_input.prompt_len,
            ttft=0,
        )

        started = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=request_body) as response:
                if response.status != 200:
                    result.success = False
                else:
                    reply = await response.json()
                    result.latency = time.perf_counter() - started
                    result.generated_text = reply[0]["generated_text"]
                    result.success = True
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            result.success = False

        if pbar:
            pbar.update(1)
        return result
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to an OpenAI-style ``v1/completions`` endpoint.

    Args:
        request_func_input: Prompt, URL, model and sampling settings.
        pbar: Optional progress bar, advanced by one when the request ends.

    Returns:
        A RequestFuncOutput. ``ttft`` is the time to the first streamed line,
        ``generated_text`` is the concatenation of ``choices[0].text`` over all
        events, and ``latency`` is the time to the ``[DONE]`` sentinel (or to
        the end of the stream when no sentinel arrives). ``success`` is False
        for non-200 responses and for connection errors.

    Raises:
        AssertionError: If the URL does not end with ``v1/completions`` or
            beam search is requested.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("v1/completions")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        # The key is read from the environment; when OPENAI_API_KEY is unset
        # the header carries the literal text "None".
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0
        # Stays None until the "[DONE]" sentinel is seen.
        latency = None
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk in response.content:
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        chunk = chunk.strip()
                        if not chunk:
                            continue

                        chunk = chunk.decode("utf-8")
                        # Remove the SSE "data:" prefix as a prefix;
                        # str.lstrip("data: ") would strip a character set.
                        if chunk.startswith("data:"):
                            chunk = chunk[len("data:"):]
                        chunk = chunk.strip()
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            body = json.loads(chunk)
                            generated_text += body["choices"][0]["text"]

                    if latency is None:
                        # Stream ended without "[DONE]"; fall back to the time
                        # the stream closed instead of raising on an unbound
                        # local.
                        latency = time.perf_counter() - st

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

        if pbar:
            pbar.update(1)
        return output
|
||||||
|
|
||||||
|
|
||||||
|
# Registry mapping each backend name to the coroutine that issues one request
# against that backend.
ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_vllm,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "tensorrt-llm": async_request_trt_llm,
}
|
||||||
@ -20,16 +20,36 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
from typing import AsyncGenerator, List, Tuple
|
from typing import AsyncGenerator, List, Tuple
|
||||||
|
|
||||||
import aiohttp
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
|
||||||
# (prompt len, output len, latency)
|
from backend_request_func import (
|
||||||
REQUEST_LATENCY: List[Tuple[int, int, float]] = []
|
ASYNC_REQUEST_FUNCS,
|
||||||
|
RequestFuncInput,
|
||||||
|
RequestFuncOutput,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BenchmarkMetrics:
    """Aggregate results of one serving benchmark run."""

    # Number of requests that finished successfully.
    completed: int
    # Summed prompt lengths of the successful requests.
    total_input: int
    # Summed generated-token counts of the successful requests.
    total_output: int
    # Successful requests per second of benchmark wall time.
    request_throughput: float
    # Input tokens per second of benchmark wall time.
    input_throughput: float
    # Output tokens per second of benchmark wall time.
    output_throughput: float
    # Time-to-first-token statistics, in milliseconds.
    mean_ttft_ms: float
    median_ttft_ms: float
    p99_ttft_ms: float
    # Per-output-token latency statistics, in milliseconds.
    mean_tpot_ms: float
    median_tpot_ms: float
    p99_tpot_ms: float
|
||||||
|
|
||||||
|
|
||||||
def sample_requests(
|
def sample_requests(
|
||||||
@ -46,6 +66,11 @@ def sample_requests(
|
|||||||
dataset = [(data["conversations"][0]["value"],
|
dataset = [(data["conversations"][0]["value"],
|
||||||
data["conversations"][1]["value"]) for data in dataset]
|
data["conversations"][1]["value"]) for data in dataset]
|
||||||
|
|
||||||
|
# some of these will be filtered out, so sample more than we need
|
||||||
|
sampled_indices = random.sample(range(len(dataset)),
|
||||||
|
int(num_requests * 1.2))
|
||||||
|
dataset = [dataset[i] for i in sampled_indices]
|
||||||
|
|
||||||
# Tokenize the prompts and completions.
|
# Tokenize the prompts and completions.
|
||||||
prompts = [prompt for prompt, _ in dataset]
|
prompts = [prompt for prompt, _ in dataset]
|
||||||
prompt_token_ids = tokenizer(prompts).input_ids
|
prompt_token_ids = tokenizer(prompts).input_ids
|
||||||
@ -92,80 +117,125 @@ async def get_request(
|
|||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
async def send_request(backend: str, model: str, api_url: str, prompt: str,
|
def calculate_metrics(
    input_requests: List[Tuple[str, int, int]],
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
) -> BenchmarkMetrics:
    """Aggregate per-request outputs into benchmark-wide metrics.

    Only outputs whose ``success`` flag is set contribute. Output length is
    measured by re-tokenizing ``generated_text`` with ``tokenizer``.

    Args:
        input_requests: ``(prompt, prompt_len, output_len)`` tuples, in the
            same order as ``outputs``.
        outputs: One result per request.
        dur_s: Wall-clock duration of the whole benchmark, in seconds.
        tokenizer: Tokenizer used to count generated tokens.

    Returns:
        A BenchmarkMetrics instance. Latency statistics are in milliseconds
        and are reported as 0.0 when there are no samples to aggregate.
    """
    total_output = 0
    total_input = 0
    completed = 0
    per_token_latencies = []
    ttfts = []
    for request, output in zip(input_requests, outputs):
        if not output.success:
            continue
        output_len = len(tokenizer.encode(output.generated_text))
        total_output += output_len
        total_input += request[1]
        # A successful request may still tokenize to zero tokens; skip its
        # per-token sample instead of dividing by zero.
        if output_len > 0:
            per_token_latencies.append(output.latency / output_len)
        ttfts.append(output.ttft)
        completed += 1

    def _stat_ms(stat, samples):
        # NumPy reductions over an empty sequence warn or raise; report 0.0
        # so a run where every request failed still yields a result.
        return stat(samples) * 1000 if samples else 0.0

    def _p99(samples):
        return np.percentile(samples, 99)

    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=total_output,
        request_throughput=completed / dur_s,
        input_throughput=total_input / dur_s,
        output_throughput=total_output / dur_s,
        mean_ttft_ms=_stat_ms(np.mean, ttfts),
        median_ttft_ms=_stat_ms(np.median, ttfts),
        p99_ttft_ms=_stat_ms(_p99, ttfts),
        mean_tpot_ms=_stat_ms(np.mean, per_token_latencies),
        median_tpot_ms=_stat_ms(np.median, per_token_latencies),
        p99_tpot_ms=_stat_ms(_p99, per_token_latencies),
    )

    return metrics
|
||||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
||||||
while True:
|
|
||||||
async with session.post(api_url, headers=headers,
|
|
||||||
json=pload) as response:
|
|
||||||
chunks = []
|
|
||||||
async for chunk, _ in response.content.iter_chunks():
|
|
||||||
chunks.append(chunk)
|
|
||||||
output = b"".join(chunks).decode("utf-8")
|
|
||||||
output = json.loads(output)
|
|
||||||
|
|
||||||
# Re-send the request if it failed.
|
|
||||||
if "error" not in output:
|
|
||||||
break
|
|
||||||
|
|
||||||
request_end_time = time.perf_counter()
|
|
||||||
request_latency = request_end_time - request_start_time
|
|
||||||
REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
|
|
||||||
pbar.update(1)
|
|
||||||
|
|
||||||
|
|
||||||
async def benchmark(
    backend: str,
    api_url: str,
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    best_of: int,
    use_beam_search: bool,
    request_rate: float,
    disable_tqdm: bool,
) -> dict:
    """Run the serving benchmark against one backend and report metrics.

    One task is created per request yielded by ``get_request``; all tasks are
    awaited together, then the results are aggregated, printed and returned.

    Args:
        backend: Key into ``ASYNC_REQUEST_FUNCS`` selecting the request
            function.
        api_url: Full URL every request is sent to.
        model_id: Model name forwarded in each request.
        tokenizer: Tokenizer used to count generated tokens.
        input_requests: ``(prompt, prompt_len, output_len)`` tuples.
        best_of: Forwarded to each request.
        use_beam_search: Forwarded to each request.
        request_rate: Passed to ``get_request`` to pace request submission.
        disable_tqdm: When True, no progress bar is created.

    Returns:
        A dict with the benchmark duration and all aggregated metrics.

    Raises:
        ValueError: If ``backend`` is not a known backend.
    """
    try:
        request_func = ASYNC_REQUEST_FUNCS[backend]
    except KeyError:
        raise ValueError(f"Unknown backend: {backend}") from None

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    print(f"Traffic request rate: {request_rate}")

    benchmark_start_time = time.perf_counter()
    tasks = []
    async for request in get_request(input_requests, request_rate):
        prompt, prompt_len, output_len = request
        request_func_input = RequestFuncInput(
            model=model_id,
            prompt=prompt,
            api_url=api_url,
            prompt_len=prompt_len,
            output_len=output_len,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
        tasks.append(
            asyncio.create_task(
                request_func(request_func_input=request_func_input,
                             pbar=pbar)))
    outputs = await asyncio.gather(*tasks)

    if pbar is not None:
        pbar.close()

    benchmark_duration = time.perf_counter() - benchmark_start_time

    metrics = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
        dur_s=benchmark_duration,
        tokenizer=tokenizer,
    )

    print(f"Successful requests: {metrics.completed}")
    # ".2f" (two decimals); the former "2f" was a minimum width of 2 with the
    # default six decimals.
    print(f"Benchmark duration: {benchmark_duration:.2f} s")
    print(f"Total input tokens: {metrics.total_input}")
    print(f"Total generated tokens: {metrics.total_output}")
    print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
    print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
    print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
    print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
    print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
    print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
    print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
    print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
    print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")

    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        # Misspelled legacy key, kept so existing consumers keep working.
        "request_inthroughput": metrics.request_throughput,
        "input_throughput": metrics.input_throughput,
        "output_throughput": metrics.output_throughput,
        "mean_ttft_ms": metrics.mean_ttft_ms,
        "median_ttft_ms": metrics.median_ttft_ms,
        "p99_ttft_ms": metrics.p99_ttft_ms,
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
        "p99_tpot_ms": metrics.p99_tpot_ms
    }
    return result
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
@ -173,77 +243,145 @@ def main(args: argparse.Namespace):
|
|||||||
random.seed(args.seed)
|
random.seed(args.seed)
|
||||||
np.random.seed(args.seed)
|
np.random.seed(args.seed)
|
||||||
|
|
||||||
api_url = f"{args.protocol}://{args.host}:{args.port}{args.endpoint}"
|
backend = args.backend
|
||||||
tokenizer = get_tokenizer(args.tokenizer,
|
model_id = args.model
|
||||||
|
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
|
||||||
|
|
||||||
|
if args.base_url is not None:
|
||||||
|
api_url = f"{args.base_url}{args.endpoint}"
|
||||||
|
else:
|
||||||
|
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
|
||||||
|
|
||||||
|
tokenizer = get_tokenizer(tokenizer_id,
|
||||||
trust_remote_code=args.trust_remote_code)
|
trust_remote_code=args.trust_remote_code)
|
||||||
input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
|
input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
|
||||||
|
|
||||||
benchmark_start_time = time.perf_counter()
|
benchmark_result = asyncio.run(
|
||||||
asyncio.run(
|
benchmark(
|
||||||
benchmark(args.backend, args.model, api_url, input_requests,
|
backend=backend,
|
||||||
args.best_of, args.use_beam_search, args.request_rate))
|
api_url=api_url,
|
||||||
benchmark_end_time = time.perf_counter()
|
model_id=model_id,
|
||||||
benchmark_time = benchmark_end_time - benchmark_start_time
|
tokenizer=tokenizer,
|
||||||
print(f"Total time: {benchmark_time:.2f} s")
|
input_requests=input_requests,
|
||||||
print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
|
best_of=args.best_of,
|
||||||
|
use_beam_search=args.use_beam_search,
|
||||||
|
request_rate=args.request_rate,
|
||||||
|
disable_tqdm=args.disable_tqdm,
|
||||||
|
))
|
||||||
|
|
||||||
# Compute the latency statistics.
|
# Save config and results to json
|
||||||
avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
|
if args.save_result:
|
||||||
print(f"Average latency: {avg_latency:.2f} s")
|
result_json = {}
|
||||||
avg_per_token_latency = np.mean([
|
|
||||||
latency / (prompt_len + output_len)
|
# Setup
|
||||||
for prompt_len, output_len, latency in REQUEST_LATENCY
|
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
])
|
result_json["date"] = current_dt
|
||||||
print(f"Average latency per token: {avg_per_token_latency:.2f} s")
|
result_json["backend"] = backend
|
||||||
avg_per_output_token_latency = np.mean(
|
result_json["version"] = args.version
|
||||||
[latency / output_len for _, output_len, latency in REQUEST_LATENCY])
|
result_json["model_id"] = model_id
|
||||||
print("Average latency per output token: "
|
result_json["tokenizer_id"] = tokenizer_id
|
||||||
f"{avg_per_output_token_latency:.2f} s")
|
result_json["best_of"] = args.best_of
|
||||||
|
result_json["use_beam_search"] = args.use_beam_search
|
||||||
|
result_json["num_prompts"] = args.num_prompts
|
||||||
|
|
||||||
|
# Traffic
|
||||||
|
result_json["request_rate"] = (
|
||||||
|
args.request_rate if args.request_rate < float("inf") else "inf")
|
||||||
|
|
||||||
|
# Merge with benchmark result
|
||||||
|
result_json = {**result_json, **benchmark_result}
|
||||||
|
|
||||||
|
# Save to file
|
||||||
|
base_model_id = model_id.split("/")[-1]
|
||||||
|
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||||
|
with open(file_name, "w") as outfile:
|
||||||
|
json.dump(result_json, outfile)
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: declare the command-line options, then run the benchmark.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
    parser.add_argument(
        "--version",
        type=str,
        default="N/A",
        help="Version of the serving backend/engine.",
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default=None,
        help="Server or API base url if not using http host and port.",
    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--endpoint",
        type=str,
        default="/generate",
        help="API endpoint.",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="Path to the dataset.")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Name of the model.",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        help=
        "Name or path of the tokenizer, if not using the default model tokenizer.",
    )
    parser.add_argument(
        "--best-of",
        type=int,
        default=1,
        help="Generates `best_of` sequences per prompt and "
        "returns the best one.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use Poisson process to synthesize "
        "the request arrival times.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--save-result",
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )

    args = parser.parse_args()
    main(args)
|
||||||
|
|||||||
@ -6,7 +6,7 @@ TOKENS=$2
|
|||||||
|
|
||||||
docker run --gpus all --shm-size 1g -p $PORT:80 \
|
docker run --gpus all --shm-size 1g -p $PORT:80 \
|
||||||
-v $PWD/data:/data \
|
-v $PWD/data:/data \
|
||||||
ghcr.io/huggingface/text-generation-inference:0.8 \
|
ghcr.io/huggingface/text-generation-inference:1.4.0 \
|
||||||
--model-id $MODEL \
|
--model-id $MODEL \
|
||||||
--sharded false \
|
--sharded false \
|
||||||
--max-input-length 1024 \
|
--max-input-length 1024 \
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user