From d201807339697c6c8206ae08d2cdccfc25cb1ce1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 24 Dec 2025 21:39:13 +0800 Subject: [PATCH] [Chore] Bump `lm-eval` version (#31264) Signed-off-by: DarkLight1337 --- .../lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh | 2 +- .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh | 2 +- .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh | 2 +- .../lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- docs/features/quantization/fp8.md | 2 +- docs/features/quantization/int4.md | 2 +- docs/features/quantization/int8.md | 2 +- docs/features/quantization/quark.md | 2 +- requirements/nightly_torch_test.txt | 2 +- requirements/rocm-test.txt | 2 +- requirements/test.in | 3 +-- requirements/test.txt | 2 +- 14 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh index c8db951381b0b..0745da8dc418d 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on chartqa for vllm. # # Make sure you have lm-eval-harness installed: -# pip install lm-eval==0.4.9 +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index 897f84d1e360d..5c17a06245bcf 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 792f355c47a51..1b617ff17c41c 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh index d85a1721db9a5..12336d7f85bc9 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index cbb2527a4ff0a..6959f81eab373 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index f022fa3672eeb..eafc82b98439b 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index d4a6176b236f1..f17ef89a5cbf9 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio Install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` Load and run the model in `vllm`: diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index 9752039097d63..049a7ceed079b 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -18,7 +18,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 701ca6378cb16..8af3e24c7357c 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -23,7 +23,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index c54d7d2251999..bbab97740ff19 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -20,7 +20,7 @@ for more installation details. Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 7b2c665448a3b..a5f6ac00d1c89 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test +lm-eval[api]>=0.4.9.2 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test transformers==4.57.3 tokenizers==0.22.0 diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 3f0fd235fba50..e4a3dd379d272 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -58,7 +58,7 @@ schemathesis==3.39.15 # OpenAI schema test # Evaluation and benchmarking -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d +lm-eval[api]>=0.4.9.2 jiwer==4.0.0 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test diff --git a/requirements/test.in b/requirements/test.in index 55452ce83f232..b3fd733fb1bc0 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -34,8 +34,7 @@ num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -# TODO: Use lm-eval[api]==0.4.10 once released -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test +lm-eval[api]>=0.4.9.2 # required for model evaluation test mteb[bm25s]>=2, <3 # required for mteb test transformers==4.57.3 tokenizers==0.22.0 diff --git a/requirements/test.txt b/requirements/test.txt index ea2093e4347fe..4012c2d3b212b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -441,7 +441,7 @@ lightning-utilities==0.14.3 # torchmetrics llvmlite==0.44.0 # via numba -lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d +lm-eval==0.4.9.2 # via -r requirements/test.in lxml==5.3.0 # via