diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml index 46f1a9fbf6ff..6c0b5540cbb6 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -8,3 +8,4 @@ tasks: value: 0.80 limit: 250 # will run on 250 * 14 subjects = 3500 samples num_fewshot: 5 +rtol: 0.05 diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt new file mode 100644 index 000000000000..4fb0b84bc4d8 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 3627b760eddc..f94d681197d2 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \ --tp-size=1 """ +import os +from contextlib import contextmanager + import lm_eval import numpy as np import yaml -RTOL = 0.08 +DEFAULT_RTOL = 0.08 + + +@contextmanager +def scoped_env_vars(new_env: dict[str, str]): + if not new_env: + # Fast path: nothing to do + yield + return + + old_values = {} + new_keys = [] + + try: + for key, value in new_env.items(): + if key in os.environ: + old_values[key] = os.environ[key] + else: + new_keys.append(key) + os.environ[key] = str(value) + yield + finally: + # Restore / clean up + for key, value in old_values.items(): + os.environ[key] = value + for key in new_keys: + os.environ.pop(key, None) def launch_lm_eval(eval_config, tp_size): @@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size): f"trust_remote_code={trust_remote_code}," f"max_model_len={max_model_len}," ) - results = lm_eval.simple_evaluate( - model=backend, - model_args=model_args, - tasks=[task["name"] for task in eval_config["tasks"]], - num_fewshot=eval_config["num_fewshot"], - limit=eval_config["limit"], - # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help - # text models. however, this is regressing measured strict-match for - # existing text models in CI, so only apply it for mm, or explicitly set - apply_chat_template=eval_config.get( - "apply_chat_template", backend == "vllm-vlm" - ), - fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), - # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) - gen_kwargs=eval_config.get("gen_kwargs"), - batch_size=batch_size, - ) + + env_vars = eval_config.get("env_vars", None) + with scoped_env_vars(env_vars): + results = lm_eval.simple_evaluate( + model=backend, + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text models. however, this is regressing measured strict-match for + # existing text models in CI, so only apply it for mm, or explicitly set + apply_chat_template=eval_config.get( + "apply_chat_template", backend == "vllm-vlm" + ), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), + # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) 
+ gen_kwargs=eval_config.get("gen_kwargs"), + batch_size=batch_size, + ) return results @@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): results = launch_lm_eval(eval_config, tp_size) + rtol = eval_config.get("rtol", DEFAULT_RTOL) + success = True for task in eval_config["tasks"]: for metric in task["metrics"]: @@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size): measured_value = results["results"][task["name"]][metric["name"]] print( f"{task['name']} | {metric['name']}: " - f"ground_truth={ground_truth} | measured={measured_value}" + f"ground_truth={ground_truth:.3f} | " + f"measured={measured_value:.3f} | rtol={rtol}" ) - success = success and np.isclose(ground_truth, measured_value, rtol=RTOL) + success = success and np.isclose(ground_truth, measured_value, rtol=rtol) assert success diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 022b6ea236d5..6950ad774edd 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -718,17 +718,6 @@ steps: - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - - label: OpenAI API correctness # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] @@ -974,19 +963,6 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work -- label: Multi-Modal Accuracy Eval (Small Models) # 10min - timeout_in_minutes: 70 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] @@ -1162,21 +1138,6 @@ steps: # Run all e2e fusion tests - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -- label: ROCm GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - agent_pool: mi325_1 - mirror_hardwares: [amdexperimental, amdproduction] - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1194,16 +1155,6 @@ steps: commands: - pytest -s -v tests/quantization/test_blackwell_moe.py -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1380,7 +1331,7 @@ steps: - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - # Disabled for now because MXFP4 backend on non-cuda platform + # Disabled for now because MXFP4 backend on non-cuda platform # doesn't support LoRA yet #- pytest -v -s -x lora/test_gptoss_tp.py @@ -1446,37 +1397,6 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py -- label: LM Eval Large Models # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: a100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### H100 test ##### -- label: LM Eval Large Models (H100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - 
##### H200 test ##### - label: Distributed Tests (H200) # optional mirror_hardwares: [amdexperimental] @@ -1508,20 +1428,94 @@ steps: - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/v1/distributed/test_dbo.py -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min - mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 +##### E2E Eval Tests ##### +- label: LM Eval Small Models (1 Card) # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 # grade: Blocking - timeout_in_minutes: 30 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh + - csrc/ + - vllm/model_executor/layers/quantization commands: - - bash .buildkite/scripts/run-prime-rl-test.sh + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + +- label: Multi-Modal Accuracy Eval (Small Models) # 10min + timeout_in_minutes: 70 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: LM Eval Large Models (4 Card) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (H100) # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +- label: ROCm LM Eval Large Models (8 Card) + mirror_hardwares: [amdproduction] + agent_pool: mi325_8 + num_gpus: 8 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 + +- label: ROCm GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + agent_pool: mi325_1 + mirror_hardwares: [amdexperimental, amdproduction] + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + 
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - label: DeepSeek V2-Lite Accuracy mirror_hardwares: [amdexperimental, amdproduction] @@ -1554,4 +1548,19 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh
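
A note on the two config keys this patch introduces for test_lm_eval_correctness.py: rtol overrides DEFAULT_RTOL (0.08) in the np.isclose comparison, as done above for Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml, and env_vars is an optional mapping that scoped_env_vars() exports for the duration of the lm_eval.simple_evaluate call and restores afterwards, so per-model settings do not leak into later parametrized runs. No config in this patch uses env_vars yet; below is a minimal sketch of a config exercising both keys. The model_name key, the metric layout, and the chosen environment variable are illustrative assumptions modeled on the existing lm-eval-harness configs, not values taken from this diff.

    # illustrative-model.yaml, a sketch only, not part of this patch
    model_name: "example-org/example-model"   # hypothetical model identifier
    tasks:
    - name: "gsm8k"
      metrics:
      - name: "exact_match,strict-match"      # assumed metric key for gsm8k
        value: 0.80
    limit: 250
    num_fewshot: 5
    rtol: 0.05        # per-config tolerance; falls back to DEFAULT_RTOL = 0.08 when omitted
    env_vars:         # optional; values are passed through str() before being set in os.environ
      VLLM_ROCM_USE_AITER: "1"   # example toggle only; any key/value pairs are accepted

Because scoped_env_vars() restores prior values and pops keys it created inside a finally block, a failure inside simple_evaluate still leaves os.environ exactly as it was before the test case ran.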