[CI/Build][AMD] Add Llama4 Maverick FP8 to AMD CI (#28695)

Signed-off-by: zhewenli <zhewenli@meta.com>
2026-05-23 18:44:28 +08:00 · 2025-12-04 16:07:20 -08:00 · 2025-12-04 16:07:20 -08:00 · bcf43ab1f3
commit bcf43ab1f3
parent 4470ee2f90
4 changed files with 159 additions and 113 deletions
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@ -8,3 +8,4 @@ tasks:
    value: 0.80
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
 rtol: 0.05
--- a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@ -0,0 +1 @@
 Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
    --tp-size=1
 """
 import os
 from contextlib import contextmanager
 import lm_eval
 import numpy as np
 import yaml
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
@contextmanager
 def scoped_env_vars(new_env: dict[str, str]):
    """Temporarily apply environment variables for the duration of a ``with`` body.

    Each ``key``/``value`` pair in ``new_env`` is written to ``os.environ``
    (values are coerced with ``str()``). When the ``with`` block exits,
    whether normally or via an exception, variables that existed beforehand
    are set back to their previous values and variables that were newly
    introduced are removed.

    Args:
        new_env: Mapping of variable names to values. Any falsy value
            (for example ``None`` or an empty dict) makes this a no-op.

    Yields:
        None. The context manager is used only for its side effect on
        ``os.environ``.
    """
    if not new_env:
        # Fast path: nothing to do
        yield
        return
    # Previous values of variables that were already set before entry.
    old_values = {}
    # Names of variables that did not exist before entry.
    new_keys = []
    try:
        # The loop is inside the ``try`` so that a failure part-way through
        # still rolls back whatever was already written.
        for key, value in new_env.items():
            if key in os.environ:
                old_values[key] = os.environ[key]
            else:
                new_keys.append(key)
            os.environ[key] = str(value)
        yield
    finally:
        # Restore / clean up
        for key, value in old_values.items():
            os.environ[key] = value
        for key in new_keys:
            # ``pop`` with a default tolerates the body having already
            # removed the variable itself.
            os.environ.pop(key, None)
 def launch_lm_eval(eval_config, tp_size):
@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"
    )
-    results = lm_eval.simple_evaluate(
+
-        model=backend,
+    env_vars = eval_config.get("env_vars", None)
-        model_args=model_args,
+    with scoped_env_vars(env_vars):
-        tasks=[task["name"] for task in eval_config["tasks"]],
+        results = lm_eval.simple_evaluate(
-        num_fewshot=eval_config["num_fewshot"],
+            model=backend,
-        limit=eval_config["limit"],
+            model_args=model_args,
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+            tasks=[task["name"] for task in eval_config["tasks"]],
-        # text models. however, this is regressing measured strict-match for
+            num_fewshot=eval_config["num_fewshot"],
-        # existing text models in CI, so only apply it for mm, or explicitly set
+            limit=eval_config["limit"],
-        apply_chat_template=eval_config.get(
+            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
-            "apply_chat_template", backend == "vllm-vlm"
+            # text models. however, this is regressing measured strict-match for
-        ),
+            # existing text models in CI, so only apply it for mm, or explicitly set
-        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+            apply_chat_template=eval_config.get(
-        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+                "apply_chat_template", backend == "vllm-vlm"
-        gen_kwargs=eval_config.get("gen_kwargs"),
+            ),
-        batch_size=batch_size,
+            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-    )
+            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
            gen_kwargs=eval_config.get("gen_kwargs"),
            batch_size=batch_size,
        )
    return results
@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
    results = launch_lm_eval(eval_config, tp_size)
    rtol = eval_config.get("rtol", DEFAULT_RTOL)
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
+                f"ground_truth={ground_truth:.3f} | "
                f"measured={measured_value:.3f} | rtol={rtol}"
            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
    assert success
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@ -718,17 +718,6 @@ steps:
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 - label: LM Eval Small Models # 15min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 - label: OpenAI API correctness # 10min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
@ -974,19 +963,6 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 - label: Multi-Modal Accuracy Eval (Small Models) # 10min
  timeout_in_minutes: 70
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 - label: Multi-Modal Models Test (Extended) 1 # 60min
  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
@ -1162,21 +1138,6 @@ steps:
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
 - label: ROCm GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  agent_pool: mi325_1
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
 - label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@ -1194,16 +1155,6 @@ steps:
  commands:
    - pytest -s -v tests/quantization/test_blackwell_moe.py
 - label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -1380,7 +1331,7 @@ steps:
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
-    # Disabled for now because MXFP4 backend on non-cuda platform 
+    # Disabled for now because MXFP4 backend on non-cuda platform
    # doesn't support LoRA yet
    #- pytest -v -s -x lora/test_gptoss_tp.py
@ -1446,37 +1397,6 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py
 - label: LM Eval Large Models # optional
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 ##### H100 test #####
 - label: LM Eval Large Models (H100) # optional
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
  mirror_hardwares: [amdexperimental]
@ -1508,20 +1428,94 @@ steps:
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
    - pytest -v -s tests/v1/distributed/test_dbo.py
-##### RL Integration Tests #####
+##### E2E Eval Tests #####
- label: Prime-RL Integration Test # 15min
+- label: LM Eval Small Models (1 Card) # 15min
-  mirror_hardwares: [amdexperimental]
+  timeout_in_minutes: 20
-  agent_pool: mi325_2
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  timeout_in_minutes: 30
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
-  - vllm/
+  - csrc/
-  - .buildkite/scripts/run-prime-rl-test.sh
+  - vllm/model_executor/layers/quantization
  commands:
-    - bash .buildkite/scripts/run-prime-rl-test.sh
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 - label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
 - label: Multi-Modal Accuracy Eval (Small Models) # 10min
  timeout_in_minutes: 70
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 - label: LM Eval Large Models (4 Card)
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 - label: LM Eval Large Models (H100) # optional
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
 - label: ROCm LM Eval Large Models (8 Card)
  mirror_hardwares: [amdproduction]
  agent_pool: mi325_8
  num_gpus: 8
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
 - label: ROCm GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  agent_pool: mi325_1
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
 - label: DeepSeek V2-Lite Accuracy
  mirror_hardwares: [amdexperimental, amdproduction]
@ -1554,4 +1548,19 @@ steps:
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  timeout_in_minutes: 30
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
		`@ -0,0 +1 @@`
							`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml`