diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh
new file mode 100755
index 0000000000000..209d9c4341cdd
--- /dev/null
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Prune Docker images and volumes when the filesystem that holds Docker's
+# root directory is more than $threshold percent full; otherwise do nothing.
+
+set -euo pipefail
+
+docker_root=$(docker info -f '{{.DockerRootDir}}')
+if [ -z "$docker_root" ]; then
+  echo "Failed to determine Docker root directory."
+  exit 1
+fi
+echo "Docker root directory: $docker_root"
+# Check disk usage of the filesystem where Docker's root directory is located.
+# -P forces the POSIX one-line-per-filesystem layout so column 5 is always the
+# usage percentage, even when the device name is long.
+disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+# Define the threshold
+threshold=70
+if [ "$disk_usage" -gt "$threshold" ]; then
+  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+  # Remove dangling images (those that are not tagged and not used by any container)
+  docker image prune -f
+  # Remove unused volumes / force the system prune for old images as well.
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
+  echo "Docker images and volumes cleanup completed."
+else
+  echo "Disk usage is below $threshold%. No cleanup needed."
+fi
diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env
new file mode 100644
index 0000000000000..4417586473474
--- /dev/null
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@@ -0,0 +1,14 @@
+# Environment config
+TEST_NAME=llama8b
+CONTAINER_NAME=vllm-tpu
+
+# vllm config
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
+TENSOR_PARALLEL_SIZE=1
+MAX_MODEL_LEN=2048
+DOWNLOAD_DIR=/mnt/disks/persist
+EXPECTED_THROUGHPUT=8.0
+INPUT_LEN=1800
+OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh
new file mode 100755
index 0000000000000..6705da03e3d76
--- /dev/null
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#
+# Build the TPU benchmark image, run run_bm.sh inside a container, copy the
+# logs back to the host and fail if throughput is below EXPECTED_THROUGHPUT.
+#
+# Usage: docker_run_bm.sh <env-file>   (e.g. config_v6e_1.env)
+# HF_TOKEN must be set. `set -e` is not enabled in this script, so failures
+# that must stop the run are checked explicitly.
+
+if [ ! -f "$1" ]; then
+  echo "Error: The env file '$1' does not exist."
+  exit 1 # Exit the script with a non-zero status to indicate an error
+fi
+
+ENV_FILE=$1
+
+# For testing on local vm, use `set -a` to export all variables
+source /etc/environment
+source "$ENV_FILE"
+
+# Force-remove benchmark containers; a missing container is not an error.
+remove_docker_container() {
+  docker rm -f tpu-test || true;
+  docker rm -f vllm-tpu || true;
+  docker rm -f "$CONTAINER_NAME" || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build docker image.
+# TODO: build the image outside the script and share the image with other
+# tpu test if building time is too long.
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg max_jobs=16 \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg GIT_REPO_CHECK=0 \
+  --tag vllm/vllm-tpu-bm \
+  --progress plain -f docker/Dockerfile.tpu .
+
+LOG_ROOT=$(mktemp -d) || { echo "Error: failed to create a temporary log directory."; exit 1; }
+# `set -e` is not enabled in this script, so the mktemp failure is handled above.
+echo "Results will be stored in: $LOG_ROOT"
+
+if [ -z "$HF_TOKEN" ]; then
+  echo "Error: HF_TOKEN is not set or is empty."
+  exit 1
+fi
+
+# Make sure mounted disk or dir exists
+if [ ! -d "$DOWNLOAD_DIR" ]; then
+  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
+  exit 1
+fi
+
+echo "Run model $MODEL"
+echo
+
+echo "starting docker...$CONTAINER_NAME"
+echo
+docker run \
+  -v "$DOWNLOAD_DIR:$DOWNLOAD_DIR" \
+  --env-file "$ENV_FILE" \
+  -e HF_TOKEN="$HF_TOKEN" \
+  -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
+  -e MODEL="$MODEL" \
+  -e WORKSPACE=/workspace \
+  --name "$CONTAINER_NAME" \
+  -d \
+  --privileged \
+  --network host \
+  -v /dev/shm:/dev/shm \
+  vllm/vllm-tpu-bm tail -f /dev/null
+
+echo "run script..."
+echo
+# run_bm.sh is added next to this script under .buildkite/scripts/tpu/.
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
+
+echo "copy result back..."
+VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
+BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
+docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
+docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
+
+if [ "$BUILDKITE" = "true" ]; then
+  echo "Running inside Buildkite"
+  buildkite-agent artifact upload "$VLLM_LOG"
+  buildkite-agent artifact upload "$BM_LOG"
+else
+  echo "Not running inside Buildkite"
+fi
+
+#
+# compare the throughput with EXPECTED_THROUGHPUT
+# and assert meeting the expectation
+#
+if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
+  echo "Failed to get the throughput"
+  exit 1
+fi
+
+if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
+  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
+  exit 1
+fi
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
new file mode 100755
index 0000000000000..877669cd956ac
--- /dev/null
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+#
+# Start a vLLM server, wait for it to come up, run the sonnet serving
+# benchmark against it and print the measured request throughput.
+#
+# Reads WORKSPACE, TARGET_COMMIT, MODEL, MAX_NUM_SEQS, MAX_NUM_BATCHED_TOKENS,
+# TENSOR_PARALLEL_SIZE, DOWNLOAD_DIR, MAX_MODEL_LEN, INPUT_LEN and OUTPUT_LEN
+# from the environment; `set -u` aborts the script if one of them is unset.
+
+set -euo pipefail
+
+VLLM_LOG="$WORKSPACE/vllm_log.txt"
+BM_LOG="$WORKSPACE/bm_log.txt"
+
+if [ -n "$TARGET_COMMIT" ]; then
+  head_hash=$(git rev-parse HEAD)
+  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
+    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
+    exit 1
+  fi
+fi
+
+echo "model: $MODEL"
+echo
+
+#
+# create a log folder (-p: do not abort under `set -e` if it already exists)
+#
+mkdir -p "$WORKSPACE/log"
+
+# TODO: Move to image building.
+pip install pandas
+pip install datasets
+
+#
+# create sonnet_4x
+#
+echo "Create sonnet_4x.txt"
+echo "" > benchmarks/sonnet_4x.txt
+for _ in {1..4}; do
+  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
+done
+
+#
+# start vllm service in backend
+#
+echo "lanching vllm..."
+echo "logging to $VLLM_LOG"
+echo
+
+VLLM_USE_V1=1 vllm serve "$MODEL" \
+  --seed 42 \
+  --disable-log-requests \
+  --max-num-seqs "$MAX_NUM_SEQS" \
+  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
+  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
+  --no-enable-prefix-caching \
+  --download_dir "$DOWNLOAD_DIR" \
+  --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
+
+
+echo "wait for 20 minutes.."
+echo
+# Poll the server log every 10 seconds for up to 120 attempts (20 minutes)
+# instead of an unconditional "sleep 1200".
+for _ in {1..120}; do
+  # TODO: detect other type of errors.
+  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
+    echo "Detected RuntimeError, exiting."
+    exit 1
+  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
+    echo "Application started"
+    break
+  else
+    echo "wait for 10 seconds..."
+    sleep 10
+  fi
+done
+
+# The loop above also ends when the wait window is exhausted; fail here rather
+# than benchmarking against a server that never reported startup.
+if ! grep -Fq "Application startup complete" "$VLLM_LOG"; then
+  echo "Error: vllm server did not start within 20 minutes."
+  exit 1
+fi
+
+#
+# run test
+#
+echo "run benchmark test..."
+echo "logging to $BM_LOG"
+echo
+python benchmarks/benchmark_serving.py \
+  --backend vllm \
+  --model "$MODEL" \
+  --dataset-name sonnet \
+  --dataset-path benchmarks/sonnet_4x.txt \
+  --sonnet-input-len "$INPUT_LEN" \
+  --sonnet-output-len "$OUTPUT_LEN" \
+  --ignore-eos > "$BM_LOG"
+
+echo "completed..."
+echo
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput: $throughput"
+echo