# SPDX-License-Identifier: Apache-2.0
"""Pytest plumbing for the lm-eval correctness tests.

Adds the ``--config-list-file`` and ``--tp-size`` command-line options and
parametrizes every test that requests ``config_filename`` with one model
config path per entry of the list file.
"""
from pathlib import Path

import pytest


def pytest_addoption(parser):
    """Register the command-line options consumed by the lm-eval tests."""
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)")
    parser.addoption("--tp-size",
                     action="store",
                     default="1",
                     help="Tensor parallel size to use for evaluation")


def _resolve_config_list_file(config):
    """Return the absolute path given via ``--config-list-file``.

    Raises:
        pytest.UsageError: if the option was not supplied.
    """
    rel_path = config.getoption("--config-list-file")
    if not rel_path:
        # Fail with an actionable message instead of the TypeError that
        # Path(None) would raise.
        raise pytest.UsageError(
            "--config-list-file is required: pass the path to a file "
            "listing model config YAMLs (one per line)")
    return Path(rel_path).resolve()


def _read_config_paths(config_list_file):
    """Return the config paths named in *config_list_file*.

    Entries are taken relative to the directory holding the list file.
    Blank lines and lines whose first non-blank character is ``#`` are
    skipped.
    """
    config_dir = config_list_file.parent
    with open(config_list_file, encoding="utf-8") as f:
        # Strip before filtering so indented comments are recognised too.
        entries = (line.strip() for line in f)
        return [
            config_dir / entry for entry in entries
            if entry and not entry.startswith("#")
        ]


@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Absolute path of the file passed via ``--config-list-file``."""
    # Resolved straight from the option: this conftest defines no
    # ``config_dir`` fixture, so depending on one would break the fixture.
    return _resolve_config_list_file(pytestconfig)


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Value of ``--tp-size`` as given on the command line (default "1")."""
    return pytestconfig.getoption("--tp-size")


def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` with one path per listed config."""
    if "config_filename" not in metafunc.fixturenames:
        return
    configs = _read_config_paths(_resolve_config_list_file(metafunc.config))
    # Use the file names as test ids so reports show which model ran.
    metafunc.parametrize("config_filename",
                         configs,
                         ids=[path.name for path in configs])
#!/bin/bash
#
# Runs test_lm_eval_correctness.py once for every model config named in a
# list file, reports pass/fail per model, and exits non-zero if any run
# failed.

# Print a short description of the script and its options.
usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} -c <config-list> [-t <tp-size>]"
    echo
    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
    echo "  -t    - tensor parallel size (default: 1)"
    echo
}

SUCCESS=0
CONFIG=""
# Default so an omitted -t does not export an empty LM_EVAL_TP_SIZE.
TP_SIZE=1

while getopts "c:t:" OPT; do
    case ${OPT} in
        c )
            CONFIG="$OPTARG"
            ;;
        t )
            TP_SIZE="$OPTARG"
            ;;
        \? )
            usage
            exit 1
            ;;
    esac
done

# Without a readable list the loop below would run nothing and the script
# would exit 0, so fail loudly instead.
if [[ -z "$CONFIG" || ! -r "$CONFIG" ]]; then
    echo "error: -c must name a readable config list file" >&2
    usage
    exit 1
fi

# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

    # The test module reads its inputs from these environment variables.
    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    # Keep going after a failure so every model is reported.
    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
Configs are found in configs/$MODEL.yaml -* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml -* export LM_EVAL_TP_SIZE=4 -* pytest -s test_lm_eval_correctness.py +pytest -s -v test_lm_eval_correctness.py \ + --config-list-file=configs/models-small.txt \ + --tp-size=1 """ -import os -from pathlib import Path - import lm_eval -import numpy -import pytest +import numpy as np import yaml RTOL = 0.08 -TEST_DATA_FILE = os.environ.get( - "LM_EVAL_TEST_DATA_FILE", - ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") - -TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) -def launch_lm_eval(eval_config): +def launch_lm_eval(eval_config, tp_size): trust_remote_code = eval_config.get('trust_remote_code', False) - model_args = f"pretrained={eval_config['model_name']}," \ - f"tensor_parallel_size={TP_SIZE}," \ + f"tensor_parallel_size={tp_size}," \ + f"enforce_eager=true," \ f"add_bos_token=true," \ f"trust_remote_code={trust_remote_code}" - results = lm_eval.simple_evaluate( model="vllm", model_args=model_args, @@ -39,22 +29,14 @@ def launch_lm_eval(eval_config): num_fewshot=eval_config["num_fewshot"], limit=eval_config["limit"], batch_size="auto") - return results -def test_lm_eval_correctness(): - eval_config = yaml.safe_load( - Path(TEST_DATA_FILE).read_text(encoding="utf-8")) +def test_lm_eval_correctness_param(config_filename, tp_size): + eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) - if eval_config[ - "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501 - pytest.skip("FBGEMM is currently failing on main.") + results = launch_lm_eval(eval_config, tp_size) - # Launch eval requests. - results = launch_lm_eval(eval_config) - - # Confirm scores match ground truth. 
success = True for task in eval_config["tasks"]: for metric in task["metrics"]: @@ -62,8 +44,7 @@ def test_lm_eval_correctness(): measured_value = results["results"][task["name"]][metric["name"]] print(f'{task["name"]} | {metric["name"]}: ' f'ground_truth={ground_truth} | measured={measured_value}') - success = success and numpy.isclose( + success = success and np.isclose( ground_truth, measured_value, rtol=RTOL) - # Assert at the end, print all scores even on failure for debugging. assert success diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b3005b1b4b062..01d04759f5362 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -408,7 +408,7 @@ steps: - vllm/model_executor/layers/quantization commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-small.txt -t 1 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - label: OpenAI API correctness source_file_dependencies: @@ -713,4 +713,4 @@ steps: - vllm/model_executor/layers/quantization commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-large.txt -t 4 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4