diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 2d375d7e9d87..eb82da3a883e 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -13,91 +13,140 @@ remove_docker_container
 
 # For HF_TOKEN.
 source /etc/environment
-# Run a simple end-to-end example.
+
 docker run --privileged --net host --shm-size=16G -it \
     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest pytest-asyncio tpu-info \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && export VLLM_XLA_CACHE_PATH= \
-    && export VLLM_USE_V1=1 \
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo HARDWARE \
-    && tpu-info \
-    && { \
-        echo TEST_0: Running test_perf.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
-        echo TEST_0_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_1: Running test_compilation.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
-        echo TEST_1_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_2: Running test_basic.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
-        echo TEST_2_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        echo TEST_3_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_4: Running test_quantization_accuracy.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
-        echo TEST_4_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_5: Running examples/offline_inference/tpu.py; \
-        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
-        echo TEST_5_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_6: Running test_tpu_model_runner.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
-        echo TEST_6_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_7: Running test_sampler.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
-        echo TEST_7_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_8: Running test_topk_topp_sampler.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
-        echo TEST_8_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_9: Running test_multimodal.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
-        echo TEST_9_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_10: Running test_pallas.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
-        echo TEST_10_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_11: Running test_struct_output_generate.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
-        echo TEST_11_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_12: Running test_moe_pallas.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
-        echo TEST_12_EXIT_CODE: \$?; \
-    } & \
-    # Disable the TPU LoRA tests until the feature is activated
-    # & { \
-    #     echo TEST_13: Running test_moe_pallas.py; \
-    #     python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
-    #     echo TEST_13_EXIT_CODE: \$?; \
-    # } & \
-    wait \
-    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
-"
+    vllm-tpu /bin/bash -c '
+set -e  # Exit immediately if a command exits with a non-zero status.
+set -u  # Treat unset variables as an error.
+echo "--- Starting script inside Docker container ---"
+
+# Create results directory
+RESULTS_DIR=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $RESULTS_DIR"
+
+# Install dependencies
+echo "--- Installing Python dependencies ---"
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
+  && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
+  && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
+echo "--- Python dependencies installed ---"
+export VLLM_USE_V1=1
+export VLLM_XLA_CHECK_RECOMPILATION=1
+export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"
+
+echo "--- Hardware Information ---"
+tpu-info
+echo "--- Starting Tests ---"
+set +e
+overall_script_exit_code=0
+
+# --- Test Definitions ---
+# If a test fails, this function will print logs and will not cause the main script to exit.
+run_test() {
+  local test_num=$1
+  local test_name=$2
+  local test_command=$3
+  local log_file="$RESULTS_DIR/test_${test_num}.log"
+  local actual_exit_code
+
+  echo "--- TEST_$test_num: Running $test_name ---"
+
+  # Execute the test command.
+  eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
+  actual_exit_code=$?
+
+  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code"  # This goes to main log
+  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file"  # Also to per-test log
+
+  if [ "$actual_exit_code" -ne 0 ]; then
+    echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
+    echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
+    if [ -f "$log_file" ]; then
+      cat "$log_file" >&2
+    else
+      echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
+    fi
+    echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
+    return "$actual_exit_code"  # Return the failure code
+  else
+    echo "TEST_$test_num ($test_name) PASSED."
+    return 0  # Return success
+  fi
+}
+
+# Helper function to call run_test and update the overall script exit code
+run_and_track_test() {
+  local test_num_arg="$1"
+  local test_name_arg="$2"
+  local test_command_arg="$3"
+
+  # Run the test
+  run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
+  local test_specific_exit_code=$?
+
+  # If the test failed, set the overall script exit code to 1
+  if [ "$test_specific_exit_code" -ne 0 ]; then
+    # No need for extra echo here, run_test already logged the failure.
+    overall_script_exit_code=1
+  fi
+}
+
+# --- Actual Test Execution ---
+run_and_track_test 0 "test_perf.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
+run_and_track_test 1 "test_compilation.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
+run_and_track_test 2 "test_basic.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
+run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
+run_and_track_test 4 "test_quantization_accuracy.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
+run_and_track_test 5 "examples/offline_inference/tpu.py" \
+  "python3 /workspace/vllm/examples/offline_inference/tpu.py"
+run_and_track_test 6 "test_tpu_model_runner.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
+run_and_track_test 7 "test_sampler.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
+run_and_track_test 8 "test_topk_topp_sampler.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
+run_and_track_test 9 "test_multimodal.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
+run_and_track_test 10 "test_pallas.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
+run_and_track_test 11 "test_struct_output_generate.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
+run_and_track_test 12 "test_moe_pallas.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+
+# Disable the TPU LoRA tests until the feature is activated
+# run_and_track_test 13 "test_lora (directory)" \
+#   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/"
+
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+  echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+  echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
+
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+  echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+  exit "$DOCKER_RUN_EXIT_CODE"
+else
+  echo "Docker run command completed successfully."
+  exit 0
+fi
 
 # TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \