mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-03 17:27:05 +08:00
Replace lm-eval bash script with pytest and use enforce_eager for faster CI (#17717)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
e50a1f1a9c
commit
950b71186f
39
.buildkite/lm-eval-harness/conftest.py
Normal file
39
.buildkite/lm-eval-harness/conftest.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_addoption(parser):
    """Register the command-line options used by the lm-eval tests.

    Two options are added, in this order:

    * ``--config-list-file``: stored as given, no default.
    * ``--tp-size``: stored as given, default ``"1"`` (a string).
    """
    option_specs = (
        ("--config-list-file", {
            "action": "store",
            "help":
            "Path to the file listing model config YAMLs (one per line)",
        }),
        ("--tp-size", {
            "action": "store",
            "default": "1",
            "help": "Tensor parallel size to use for evaluation",
        }),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Return the absolute path of the file passed via ``--config-list-file``.

    The option value is resolved with ``Path(...).resolve()``, which is the
    same resolution ``pytest_generate_tests`` applies to this option, so both
    agree on which file is meant.

    The previous version requested a ``config_dir`` fixture that this
    conftest does not define, so requesting this fixture failed at setup.

    Raises:
        pytest.UsageError: if ``--config-list-file`` was not supplied.
    """
    rel_path = pytestconfig.getoption("--config-list-file")
    if rel_path is None:
        # The option has no default; fail with an actionable message rather
        # than a TypeError from building a path out of None.
        raise pytest.UsageError(
            "--config-list-file is required (path to the file listing "
            "model config YAMLs)")
    return Path(rel_path).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Expose the value of the ``--tp-size`` command-line option to tests."""
    requested_tp_size = pytestconfig.getoption("--tp-size")
    return requested_tp_size
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from the ``--config-list-file`` file.

    Only tests that request the ``config_filename`` fixture are affected.
    The list file is read as UTF-8, one entry per line. Each entry is
    interpreted relative to the directory containing the list file. Blank
    lines and lines whose first non-whitespace character is ``#`` are
    ignored.

    Raises:
        pytest.UsageError: if a test needs ``config_filename`` but
            ``--config-list-file`` was not supplied.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    rel_path = metafunc.config.getoption("--config-list-file")
    if rel_path is None:
        # The option has no default; without this check Path(None) would
        # raise an opaque TypeError during collection.
        raise pytest.UsageError(
            "--config-list-file is required to parametrize "
            "'config_filename'")

    config_list_file = Path(rel_path).resolve()
    config_dir = config_list_file.parent
    with open(config_list_file, encoding="utf-8") as f:
        # Strip once, then filter on the stripped text so that indented
        # comment lines are skipped too (previously they became paths).
        entries = (line.strip() for line in f)
        configs = [
            config_dir / entry for entry in entries
            if entry and not entry.startswith("#")
        ]
    metafunc.parametrize("config_filename", configs)
|
||||||
@ -1,59 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Print the help text for this script to stdout.
usage() {
    # Plain `echo` emits the blank line; the former `echo``` ran an empty
    # command substitution for the same output.
    echo
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -c - path to the test data config (e.g. configs/small-models.txt)"
    echo " -t - tensor parallel size"
    echo
}
|
|
||||||
|
|
||||||
# Sum of the pytest exit codes over all model configs; 0 means all passed.
SUCCESS=0

while getopts "c:t:" OPT; do
    case ${OPT} in
        c )
            CONFIG="$OPTARG"
            ;;
        t )
            TP_SIZE="$OPTARG"
            ;;
        \? )
            usage
            exit 1
            ;;
    esac
done

# Without a readable config list the loop below would run zero models and the
# script would exit 0, silently reporting success. Fail loudly instead.
if [[ -z "${CONFIG:-}" || ! -r "${CONFIG}" ]]; then
    echo "error: -c must name a readable config list file" >&2
    usage
    exit 1
fi

# Default to 1, matching the fallback test_lm_eval_correctness.py applies
# when LM_EVAL_TP_SIZE is unset.
TP_SIZE="${TP_SIZE:-1}"

# Parse list of configs (one model config file name per line).
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

    # The test module reads its inputs from these environment variables.
    export LM_EVAL_TEST_DATA_FILE="$PWD/configs/${MODEL_CONFIG}"
    export LM_EVAL_TP_SIZE="$TP_SIZE"
    # Record the exit code but keep going so every model gets evaluated.
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

# Collapse the accumulated status into a plain pass/fail exit code.
if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
|
|
||||||
@ -3,35 +3,25 @@
|
|||||||
LM eval harness on model to compare vs HF baseline computed offline.
|
LM eval harness on model to compare vs HF baseline computed offline.
|
||||||
Configs are found in configs/$MODEL.yaml
|
Configs are found in configs/$MODEL.yaml
|
||||||
|
|
||||||
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
pytest -s -v test_lm_eval_correctness.py \
|
||||||
* export LM_EVAL_TP_SIZE=4
|
--config-list-file=configs/models-small.txt \
|
||||||
* pytest -s test_lm_eval_correctness.py
|
--tp-size=1
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import numpy
|
import numpy as np
|
||||||
import pytest
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
RTOL = 0.08
|
RTOL = 0.08
|
||||||
TEST_DATA_FILE = os.environ.get(
|
|
||||||
"LM_EVAL_TEST_DATA_FILE",
|
|
||||||
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
|
|
||||||
|
|
||||||
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
|
|
||||||
|
|
||||||
|
|
||||||
def launch_lm_eval(eval_config):
|
def launch_lm_eval(eval_config, tp_size):
|
||||||
trust_remote_code = eval_config.get('trust_remote_code', False)
|
trust_remote_code = eval_config.get('trust_remote_code', False)
|
||||||
|
|
||||||
model_args = f"pretrained={eval_config['model_name']}," \
|
model_args = f"pretrained={eval_config['model_name']}," \
|
||||||
f"tensor_parallel_size={TP_SIZE}," \
|
f"tensor_parallel_size={tp_size}," \
|
||||||
|
f"enforce_eager=true," \
|
||||||
f"add_bos_token=true," \
|
f"add_bos_token=true," \
|
||||||
f"trust_remote_code={trust_remote_code}"
|
f"trust_remote_code={trust_remote_code}"
|
||||||
|
|
||||||
results = lm_eval.simple_evaluate(
|
results = lm_eval.simple_evaluate(
|
||||||
model="vllm",
|
model="vllm",
|
||||||
model_args=model_args,
|
model_args=model_args,
|
||||||
@ -39,22 +29,14 @@ def launch_lm_eval(eval_config):
|
|||||||
num_fewshot=eval_config["num_fewshot"],
|
num_fewshot=eval_config["num_fewshot"],
|
||||||
limit=eval_config["limit"],
|
limit=eval_config["limit"],
|
||||||
batch_size="auto")
|
batch_size="auto")
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def test_lm_eval_correctness():
|
def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||||
eval_config = yaml.safe_load(
|
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
|
||||||
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
|
|
||||||
|
|
||||||
if eval_config[
|
results = launch_lm_eval(eval_config, tp_size)
|
||||||
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
|
|
||||||
pytest.skip("FBGEMM is currently failing on main.")
|
|
||||||
|
|
||||||
# Launch eval requests.
|
|
||||||
results = launch_lm_eval(eval_config)
|
|
||||||
|
|
||||||
# Confirm scores match ground truth.
|
|
||||||
success = True
|
success = True
|
||||||
for task in eval_config["tasks"]:
|
for task in eval_config["tasks"]:
|
||||||
for metric in task["metrics"]:
|
for metric in task["metrics"]:
|
||||||
@ -62,8 +44,7 @@ def test_lm_eval_correctness():
|
|||||||
measured_value = results["results"][task["name"]][metric["name"]]
|
measured_value = results["results"][task["name"]][metric["name"]]
|
||||||
print(f'{task["name"]} | {metric["name"]}: '
|
print(f'{task["name"]} | {metric["name"]}: '
|
||||||
f'ground_truth={ground_truth} | measured={measured_value}')
|
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||||
success = success and numpy.isclose(
|
success = success and np.isclose(
|
||||||
ground_truth, measured_value, rtol=RTOL)
|
ground_truth, measured_value, rtol=RTOL)
|
||||||
|
|
||||||
# Assert at the end, print all scores even on failure for debugging.
|
|
||||||
assert success
|
assert success
|
||||||
|
|||||||
@ -408,7 +408,7 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: OpenAI API correctness
|
- label: OpenAI API correctness
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -713,4 +713,4 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user