From 1a4f35e2eaa3ebdecb8ef9ff8302b01e289305c9 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 10 Jul 2025 22:27:32 +0900
Subject: [PATCH] Normalize lm-eval command between baseline and correctness test (#18560)

Signed-off-by: mgoin
---
 .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh | 2 +-
 .buildkite/lm-eval-harness/test_lm_eval_correctness.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 65be3c5d93b20..b98d42aa7b822 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 930adfaf3e192..ceea01166b7f4 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -18,12 +18,14 @@ RTOL = 0.08
 
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
+    max_model_len = eval_config.get("max_model_len", 4096)
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code}"
+        f"trust_remote_code={trust_remote_code},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
         model="vllm",