diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
index 332142ba5d17..6d494f64f14f 100644
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators, with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -34,6 +34,7 @@ Runtime environment variables:
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
 
 ### Latency test
diff --git a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
index 8bb16bd3cf37..b9437ac5ca99 100644
--- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
@@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
@@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
@@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index 9447ceffd7e2..99a5a5e334f8 100644
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -15,6 +15,8 @@ check_gpus() {
     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   elif command -v amd-smi; then
     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
   fi
 
   if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
+
+  declare -g arch_suffix=''
+
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
   fi
   echo "GPU type is $gpu_type"
 }
@@ -138,6 +146,10 @@ kill_gpu_processes() {
     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
       sleep 1
     done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
   fi
 
   # remove vllm config file
@@ -451,6 +463,7 @@ main() {
     ARCH='-cpu'
   else
     check_gpus
+    ARCH="$arch_suffix"
   fi
   check_hf_token
 
diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
new file mode 100644
index 000000000000..296380f72a66
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
new file mode 100644
index 000000000000..8c6b34bd9fa3
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -0,0 +1,82 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
"PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json new file mode 100644 index 000000000000..3127bf2f6bce --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json @@ -0,0 +1,61 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + 
"tensor_parallel_size": 2, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + } +]