mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-23 17:51:21 +08:00
[CI/Build][AMD] Add Llama4 Maverick FP8 to AMD CI (#28695)
Signed-off-by: zhewenli <zhewenli@meta.com>
This commit is contained in:
parent
4470ee2f90
commit
bcf43ab1f3
@ -8,3 +8,4 @@ tasks:
|
|||||||
value: 0.80
|
value: 0.80
|
||||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||||
num_fewshot: 5
|
num_fewshot: 5
|
||||||
|
rtol: 0.05
|
||||||
|
|||||||
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||||
@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
|
|||||||
--tp-size=1
|
--tp-size=1
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
RTOL = 0.08
|
DEFAULT_RTOL = 0.08
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def scoped_env_vars(new_env: dict[str, str]):
|
||||||
|
if not new_env:
|
||||||
|
# Fast path: nothing to do
|
||||||
|
yield
|
||||||
|
return
|
||||||
|
|
||||||
|
old_values = {}
|
||||||
|
new_keys = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
for key, value in new_env.items():
|
||||||
|
if key in os.environ:
|
||||||
|
old_values[key] = os.environ[key]
|
||||||
|
else:
|
||||||
|
new_keys.append(key)
|
||||||
|
os.environ[key] = str(value)
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
# Restore / clean up
|
||||||
|
for key, value in old_values.items():
|
||||||
|
os.environ[key] = value
|
||||||
|
for key in new_keys:
|
||||||
|
os.environ.pop(key, None)
|
||||||
|
|
||||||
|
|
||||||
def launch_lm_eval(eval_config, tp_size):
|
def launch_lm_eval(eval_config, tp_size):
|
||||||
@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
|
|||||||
f"trust_remote_code={trust_remote_code},"
|
f"trust_remote_code={trust_remote_code},"
|
||||||
f"max_model_len={max_model_len},"
|
f"max_model_len={max_model_len},"
|
||||||
)
|
)
|
||||||
results = lm_eval.simple_evaluate(
|
|
||||||
model=backend,
|
env_vars = eval_config.get("env_vars", None)
|
||||||
model_args=model_args,
|
with scoped_env_vars(env_vars):
|
||||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
results = lm_eval.simple_evaluate(
|
||||||
num_fewshot=eval_config["num_fewshot"],
|
model=backend,
|
||||||
limit=eval_config["limit"],
|
model_args=model_args,
|
||||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||||
# text models. however, this is regressing measured strict-match for
|
num_fewshot=eval_config["num_fewshot"],
|
||||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
limit=eval_config["limit"],
|
||||||
apply_chat_template=eval_config.get(
|
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
||||||
"apply_chat_template", backend == "vllm-vlm"
|
# text models. however, this is regressing measured strict-match for
|
||||||
),
|
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
apply_chat_template=eval_config.get(
|
||||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
"apply_chat_template", backend == "vllm-vlm"
|
||||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
),
|
||||||
batch_size=batch_size,
|
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||||
)
|
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||||
|
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||||
|
batch_size=batch_size,
|
||||||
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
|||||||
|
|
||||||
results = launch_lm_eval(eval_config, tp_size)
|
results = launch_lm_eval(eval_config, tp_size)
|
||||||
|
|
||||||
|
rtol = eval_config.get("rtol", DEFAULT_RTOL)
|
||||||
|
|
||||||
success = True
|
success = True
|
||||||
for task in eval_config["tasks"]:
|
for task in eval_config["tasks"]:
|
||||||
for metric in task["metrics"]:
|
for metric in task["metrics"]:
|
||||||
@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
|||||||
measured_value = results["results"][task["name"]][metric["name"]]
|
measured_value = results["results"][task["name"]][metric["name"]]
|
||||||
print(
|
print(
|
||||||
f"{task['name']} | {metric['name']}: "
|
f"{task['name']} | {metric['name']}: "
|
||||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
f"ground_truth={ground_truth:.3f} | "
|
||||||
|
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||||
)
|
)
|
||||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||||
|
|
||||||
assert success
|
assert success
|
||||||
|
|||||||
@ -718,17 +718,6 @@ steps:
|
|||||||
- uv pip install --system conch-triton-kernels
|
- uv pip install --system conch-triton-kernels
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: LM Eval Small Models # 15min
|
|
||||||
timeout_in_minutes: 20
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
agent_pool: mi325_1
|
|
||||||
# grade: Blocking
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
|
||||||
|
|
||||||
- label: OpenAI API correctness # 10min
|
- label: OpenAI API correctness # 10min
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@ -974,19 +963,6 @@ steps:
|
|||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
agent_pool: mi325_1
|
|
||||||
# grade: Blocking
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/multimodal/
|
|
||||||
- vllm/inputs/
|
|
||||||
- vllm/v1/core/
|
|
||||||
commands:
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||||
timeout_in_minutes: 120
|
timeout_in_minutes: 120
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -1162,21 +1138,6 @@ steps:
|
|||||||
# Run all e2e fusion tests
|
# Run all e2e fusion tests
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||||
|
|
||||||
- label: ROCm GPT-OSS Eval
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
agent_pool: mi325_1
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
optional: true # run on nightlies
|
|
||||||
source_file_dependencies:
|
|
||||||
- tests/evals/gpt_oss
|
|
||||||
- vllm/model_executor/models/gpt_oss.py
|
|
||||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
commands:
|
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
|
||||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
|
||||||
|
|
||||||
- label: Blackwell Quantized MoE Test
|
- label: Blackwell Quantized MoE Test
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
@ -1194,16 +1155,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
|
||||||
timeout_in_minutes: 120
|
|
||||||
gpu: b200
|
|
||||||
optional: true # run on nightlies
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
@ -1380,7 +1331,7 @@ steps:
|
|||||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||||
|
|
||||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||||
# doesn't support LoRA yet
|
# doesn't support LoRA yet
|
||||||
#- pytest -v -s -x lora/test_gptoss_tp.py
|
#- pytest -v -s -x lora/test_gptoss_tp.py
|
||||||
|
|
||||||
@ -1446,37 +1397,6 @@ steps:
|
|||||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest -v -s -x lora/test_mixtral.py
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|
||||||
- label: LM Eval Large Models # optional
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
agent_pool: mi325_4
|
|
||||||
# grade: Blocking
|
|
||||||
gpu: a100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
|
||||||
|
|
||||||
##### H100 test #####
|
|
||||||
- label: LM Eval Large Models (H100) # optional
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
agent_pool: mi325_4
|
|
||||||
# grade: Blocking
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
|
||||||
|
|
||||||
##### H200 test #####
|
##### H200 test #####
|
||||||
- label: Distributed Tests (H200) # optional
|
- label: Distributed Tests (H200) # optional
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -1508,20 +1428,94 @@ steps:
|
|||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### RL Integration Tests #####
|
##### E2E Eval Tests #####
|
||||||
- label: Prime-RL Integration Test # 15min
|
- label: LM Eval Small Models (1 Card) # 15min
|
||||||
mirror_hardwares: [amdexperimental]
|
timeout_in_minutes: 20
|
||||||
agent_pool: mi325_2
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_1
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
timeout_in_minutes: 30
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- csrc/
|
||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
|
- label: Blackwell LM Eval Small Models
|
||||||
|
timeout_in_minutes: 120
|
||||||
|
gpu: b200
|
||||||
|
optional: true # run on nightlies
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||||
|
|
||||||
|
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||||
|
timeout_in_minutes: 70
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_1
|
||||||
|
# grade: Blocking
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/multimodal/
|
||||||
|
- vllm/inputs/
|
||||||
|
- vllm/v1/core/
|
||||||
|
commands:
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||||
|
|
||||||
|
- label: LM Eval Large Models (4 Card)
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_4
|
||||||
|
# grade: Blocking
|
||||||
|
gpu: a100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
|
- label: LM Eval Large Models (H100) # optional
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_4
|
||||||
|
# grade: Blocking
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||||
|
|
||||||
|
- label: ROCm LM Eval Large Models (8 Card)
|
||||||
|
mirror_hardwares: [amdproduction]
|
||||||
|
agent_pool: mi325_8
|
||||||
|
num_gpus: 8
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
commands:
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
|
||||||
|
|
||||||
|
- label: ROCm GPT-OSS Eval
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
agent_pool: mi325_1
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
optional: true # run on nightlies
|
||||||
|
source_file_dependencies:
|
||||||
|
- tests/evals/gpt_oss
|
||||||
|
- vllm/model_executor/models/gpt_oss.py
|
||||||
|
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||||
|
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@ -1554,4 +1548,19 @@ steps:
|
|||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
|
##### RL Integration Tests #####
|
||||||
|
- label: Prime-RL Integration Test # 15min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
agent_pool: mi325_2
|
||||||
|
# grade: Blocking
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
optional: true
|
||||||
|
num_gpus: 2
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user