mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 01:05:01 +08:00
[CI/Build]Add eval config for Qwen3-235B-A22B-Instruct-2507-FP8 (#27113)
Signed-off-by: Huamin Li <3ericli@gmail.com>
This commit is contained in:
parent
31b55ffc62
commit
5be1bed790
@ -0,0 +1,14 @@
|
||||
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
|
||||
tasks:
|
||||
- name: "mmlu_pro"
|
||||
metrics:
|
||||
- name: "exact_match,custom-extract"
|
||||
value: 0.82
|
||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||
num_fewshot: 5
|
||||
enforce_eager: false # we use false to speed up the eval process
|
||||
kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
|
||||
max_model_len: 40960
|
||||
apply_chat_template: true
|
||||
fewshot_as_multiturn: true
|
||||
gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
|
||||
@ -1 +0,0 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||
@ -0,0 +1 @@
|
||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
||||
@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
max_model_len = eval_config.get("max_model_len", 4096)
|
||||
batch_size = eval_config.get("batch_size", "auto")
|
||||
backend = eval_config.get("backend", "vllm")
|
||||
enforce_eager = eval_config.get("enforce_eager", "true")
|
||||
kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
|
||||
model_args = (
|
||||
f"pretrained={eval_config['model_name']},"
|
||||
f"tensor_parallel_size={tp_size},"
|
||||
f"enforce_eager=true,"
|
||||
f"enforce_eager={enforce_eager},"
|
||||
f"kv_cache_dtype={kv_cache_dtype},"
|
||||
f"add_bos_token=true,"
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len},"
|
||||
@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm.
|
||||
apply_chat_template=backend == "vllm-vlm",
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
return results
|
||||
|
||||
@ -1186,6 +1186,19 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
gpu: h200
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user