From 5be1bed79058ddc1016f2639c52dfb5b597bf39c Mon Sep 17 00:00:00 2001
From: Huamin Li <3ericli@gmail.com>
Date: Thu, 30 Oct 2025 00:50:56 -0700
Subject: [PATCH] [CI/Build] Add eval config for Qwen3-235B-A22B-Instruct-2507-FP8 (#27113)

Signed-off-by: Huamin Li <3ericli@gmail.com>
---
 .../configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml       | 14 ++++++++++++++
 .buildkite/lm-eval-harness/configs/models-large-h100.txt |  1 -
 .../configs/models-large-hopper.txt                      |  1 +
 .buildkite/lm-eval-harness/test_lm_eval_correctness.py   | 14 +++++++++++---
 .buildkite/test-pipeline.yaml                            | 13 +++++++++++++
 5 files changed, 39 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
 delete mode 100644 .buildkite/lm-eval-harness/configs/models-large-h100.txt
 create mode 100644 .buildkite/lm-eval-harness/configs/models-large-hopper.txt

diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
new file mode 100644
index 000000000000..514c15d6098e
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+  - name: "mmlu_pro"
+    metrics:
+      - name: "exact_match,custom-extract"
+        value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
deleted file mode 100644
index 4fb0b84bc4d8..000000000000
--- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt
+++ /dev/null
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
new file mode 100644
index 000000000000..5552391d9eab
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index f10de82b1d8e..3627b760eddc 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size):
     max_model_len = eval_config.get("max_model_len", 4096)
     batch_size = eval_config.get("batch_size", "auto")
     backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
@@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size):
         limit=eval_config["limit"],
         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
         # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
         batch_size=batch_size,
     )
     return results
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d556073cd104..339e3aab6c03 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1186,6 +1186,19 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
   gpu: h200
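
For reference, below is a minimal sketch of how the new config keys flow into lm-eval through the patched launch_lm_eval(). It is an approximation, not the CI code itself: it assumes lm_eval (lm-eval-harness) and PyYAML are installed, that it runs from .buildkite/lm-eval-harness, and that tp_size=4 as passed by the Buildkite step; it omits the model_args entries not shown in the hunk and the test's final comparison of each measured metric against the expected 0.82 value.

import lm_eval  # lm-eval-harness
import yaml

# Load the eval config added by this patch (path assumed relative to
# .buildkite/lm-eval-harness).
with open("configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml") as f:
    eval_config = yaml.safe_load(f)

tp_size = 4  # matches --tp-size=4 in the Buildkite step
backend = eval_config.get("backend", "vllm")

# New keys fall back to the previous hard-coded defaults, so existing
# configs that do not set them keep their old behavior.
enforce_eager = eval_config.get("enforce_eager", "true")
kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
model_args = (
    f"pretrained={eval_config['model_name']},"
    f"tensor_parallel_size={tp_size},"
    f"enforce_eager={enforce_eager},"
    f"kv_cache_dtype={kv_cache_dtype},"
    f"max_model_len={eval_config.get('max_model_len', 4096)}"
)

results = lm_eval.simple_evaluate(
    model=backend,
    model_args=model_args,
    tasks=[t["name"] for t in eval_config["tasks"]],
    num_fewshot=eval_config["num_fewshot"],
    limit=eval_config["limit"],
    # Chat template is applied only for vllm-vlm unless the config opts in,
    # as this config does with apply_chat_template: true.
    apply_chat_template=eval_config.get("apply_chat_template", backend == "vllm-vlm"),
    fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
    # Decoding/early-stop controls, e.g. the until=<|ENDANSWER|> stop string.
    gen_kwargs=eval_config.get("gen_kwargs"),
    batch_size=eval_config.get("batch_size", "auto"),
)

With gen_kwargs forwarded, the YAML's temperature=0,top_p=1,top_k=0 makes decoding greedy, and max_gen_toks=5632 bounds each generation, which keeps the 3500-sample MMLU-Pro run tractable on 4 GPUs.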