Replace lm-eval bash script with pytest and use enforce_eager for faster CI (#17717)

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-06-24 10:07:13 +08:00 · 2025-05-06 21:00:10 -04:00 · 2025-05-06 21:00:10 -04:00 · 950b71186f
commit 950b71186f
parent e50a1f1a9c
4 changed files with 52 additions and 91 deletions
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--config-list-file",
+        action="store",
+        help="Path to the file listing model config YAMLs (one per line)")
+    parser.addoption("--tp-size",
+                     action="store",
+                     default="1",
+                     help="Tensor parallel size to use for evaluation")
+
+
+@pytest.fixture(scope="session")
+def config_list_file(pytestconfig, config_dir):
+    rel_path = pytestconfig.getoption("--config-list-file")
+    return config_dir / rel_path
+
+
+@pytest.fixture(scope="session")
+def tp_size(pytestconfig):
+    return pytestconfig.getoption("--tp-size")
+
+
+def pytest_generate_tests(metafunc):
+    if "config_filename" in metafunc.fixturenames:
+        rel_path = metafunc.config.getoption("--config-list-file")
+        config_list_file = Path(rel_path).resolve()
+        config_dir = config_list_file.parent
+        with open(config_list_file, encoding="utf-8") as f:
+            configs = [
+                config_dir / line.strip() for line in f
+                if line.strip() and not line.startswith("#")
+            ]
+        metafunc.parametrize("config_filename", configs)
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -1,59 +0,0 @@
-#!/bin/bash
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using vllm and compares to "
-    echo "precomputed baseline (measured by HF transformers.)"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
-    echo "  -t    - tensor parallel size"
-    echo
-}
-
-SUCCESS=0
-
-while getopts "c:t:" OPT; do
-  case ${OPT} in
-    c ) 
-        CONFIG="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
-
-for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
-do
-    LOCAL_SUCCESS=0
-    
-    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
-
-    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
-    export LM_EVAL_TP_SIZE=$TP_SIZE
-    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
-
-    if [[ $LOCAL_SUCCESS == 0 ]]; then
-        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
-    else
-        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
-    fi
-
-    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
-
-done
-
-if [ "${SUCCESS}" -eq "0" ]; then
-    exit 0
-else
-    exit 1
-fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -3,35 +3,25 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

-* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
-* export LM_EVAL_TP_SIZE=4 
-* pytest -s test_lm_eval_correctness.py
+pytest -s -v test_lm_eval_correctness.py \
+    --config-list-file=configs/models-small.txt \
+    --tp-size=1
 """

-import os
-from pathlib import Path
-
 import lm_eval
-import numpy
-import pytest
+import numpy as np
 import yaml

 RTOL = 0.08
-TEST_DATA_FILE = os.environ.get(
-    "LM_EVAL_TEST_DATA_FILE",
-    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
-
-TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


-def launch_lm_eval(eval_config):
+def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get('trust_remote_code', False)
-
    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"tensor_parallel_size={tp_size}," \
+                 f"enforce_eager=true," \
                 f"add_bos_token=true," \
                 f"trust_remote_code={trust_remote_code}"
-
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
@ -39,22 +29,14 @@ def launch_lm_eval(eval_config):
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto")
-
    return results


-def test_lm_eval_correctness():
-    eval_config = yaml.safe_load(
-        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+def test_lm_eval_correctness_param(config_filename, tp_size):
+    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

-    if eval_config[
-            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
-        pytest.skip("FBGEMM is currently failing on main.")
+    results = launch_lm_eval(eval_config, tp_size)

-    # Launch eval requests.
-    results = launch_lm_eval(eval_config)
-
-    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
@ -62,8 +44,7 @@ def test_lm_eval_correctness():
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and numpy.isclose(
+            success = success and np.isclose(
                ground_truth, measured_value, rtol=RTOL)

-    # Assert at the end, print all scores even on failure for debugging.
    assert success
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -408,7 +408,7 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

 - label: OpenAI API correctness
  source_file_dependencies:
@ -713,4 +713,4 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4