diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 65be3c5d93b20..b98d42aa7b822 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do done lm_eval --model vllm \ - --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ --batch_size "$BATCH_SIZE" diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 930adfaf3e192..ceea01166b7f4 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -18,12 +18,14 @@ RTOL = 0.08 def launch_lm_eval(eval_config, tp_size): trust_remote_code = eval_config.get("trust_remote_code", False) + max_model_len = eval_config.get("max_model_len", 4096) model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," f"enforce_eager=true," f"add_bos_token=true," - f"trust_remote_code={trust_remote_code}" + f"trust_remote_code={trust_remote_code}," + f"max_model_len={max_model_len}" ) results = lm_eval.simple_evaluate( model="vllm",