mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 10:46:08 +08:00)

[CI/Build] Add eval config for Qwen3-235B-A22B-Instruct-2507-FP8 (#27113)

Signed-off-by: Huamin Li <3ericli@gmail.com>

This commit is contained in:
parent 31b55ffc62
commit 5be1bed790
@@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
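For orientation, a minimal sketch (not part of the commit) of how the keys above feed the lm-eval model_args string, assuming PyYAML and the defaults from test_lm_eval_correctness.py shown further down; tp_size=4 matches the pipeline invocation, and the config path follows the configs/ layout referenced by the pipeline.

import yaml

# Load the eval config added above (path assumed relative to
# .buildkite/lm-eval-harness, the pipeline's working_dir).
with open("configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml") as f:
    eval_config = yaml.safe_load(f)

tp_size = 4  # matches --tp-size=4 in the pipeline command
model_args = (
    f"pretrained={eval_config['model_name']},"
    f"tensor_parallel_size={tp_size},"
    f"enforce_eager={eval_config.get('enforce_eager', 'true')},"
    f"kv_cache_dtype={eval_config.get('kv_cache_dtype', 'auto')},"
    f"max_model_len={eval_config.get('max_model_len', 4096)}"
)
print(model_args)
# pretrained=Qwen/Qwen3-235B-A22B-Instruct-2507-FP8,tensor_parallel_size=4,enforce_eager=False,kv_cache_dtype=fp8,max_model_len=40960

Note that YAML's false renders as "False" in the f-string, which lm-eval's argument parser accepts as a boolean.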
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size):
     max_model_len = eval_config.get("max_model_len", 4096)
     batch_size = eval_config.get("batch_size", "auto")
     backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
@@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size):
         limit=eval_config["limit"],
         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
         # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
         batch_size=batch_size,
     )
     return results
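Taken together, the new keyword arguments land in lm-eval's entry point. A hedged sketch of the surrounding call, assuming the harness drives lm_eval.simple_evaluate (the parameter names below exist in lm-eval's public API; the tasks/num_fewshot wiring is inferred from the config schema above, not shown in this hunk):

import lm_eval

results = lm_eval.simple_evaluate(
    model=backend,  # "vllm", or "vllm-vlm" for multimodal runs
    model_args=model_args,
    tasks=[t["name"] for t in eval_config["tasks"]],
    num_fewshot=eval_config["num_fewshot"],
    limit=eval_config["limit"],
    apply_chat_template=eval_config.get("apply_chat_template", backend == "vllm-vlm"),
    fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
    gen_kwargs=eval_config.get("gen_kwargs"),
    batch_size=batch_size,
)

With the new config, apply_chat_template and fewshot_as_multiturn both resolve to true, and gen_kwargs caps generation at 5632 tokens and stops at the <|ENDANSWER|> marker.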
@@ -1186,6 +1186,19 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
   gpu: h200
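The pass/fail criterion itself is not part of this hunk. As a rough sketch, the harness compares each measured metric against the expected "value" from the YAML within a relative tolerance; the RTOL constant below is an illustrative assumption, not the one from test_lm_eval_correctness.py:

import numpy as np

RTOL = 0.08  # assumed relative tolerance, for illustration only

def metric_ok(measured: float, expected: float) -> bool:
    # Pass when the measured score is within RTOL of the expected score.
    return bool(np.isclose(expected, measured, rtol=RTOL))

# e.g. mmlu_pro, "exact_match,custom-extract", expected 0.82
assert metric_ok(measured=0.83, expected=0.82)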