[CI] GPT-OSS GPQA eval test for Blackwell (#24920)

Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-07-23 15:27:35 +08:00 · 2025-09-16 21:13:21 -04:00 · 2025-09-16 21:13:21 -04:00 · 493b10f8bf
commit 493b10f8bf
parent d119fc8614
4 changed files with 136 additions and 0 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -821,6 +821,20 @@ steps:
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 # CI step that runs the GPT-OSS GPQA correctness test
 # (tests/evals/gpt_oss/test_gpqa_correctness.py) with `gpu: b200`.
 - label: GPT-OSS Eval (Blackwell)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
  # NOTE(review): presumably the step is selected when any of these paths
  # change - confirm against the pipeline generator.
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    # Install a pinned version of the gpt-oss package with its `eval` extra.
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    # Evaluate openai/gpt-oss-20b with tensor parallel size 2 against an
    # expected GPQA metric of 0.58.
    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
 #####  1 GPU test  #####
 #####  multi gpus test  #####
--- a/tests/evals/gpt_oss/__init__.py
+++ b/tests/evals/gpt_oss/__init__.py
@ -0,0 +1,2 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
--- a/tests/evals/gpt_oss/conftest.py
+++ b/tests/evals/gpt_oss/conftest.py
@ -0,0 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Pytest configuration for GPT-OSS evaluation tests.
 """
 def pytest_addoption(parser):
    """Register the command line options used by the GPT-OSS eval tests.

    Adds ``--model``, ``--metric`` (parsed as ``float``) and
    ``--server-args`` (defaults to an empty string) to ``parser``.
    """
    # (flag, keyword arguments) pairs, registered in the order listed.
    option_specs = (
        ("--model", {
            "action": "store",
            "help": "Model name to evaluate",
        }),
        ("--metric", {
            "action": "store",
            "type": float,
            "help": "Expected metric threshold",
        }),
        ("--server-args", {
            "action": "store",
            "default": "",
            "help": "Additional server arguments",
        }),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
--- a/tests/evals/gpt_oss/test_gpqa_correctness.py
+++ b/tests/evals/gpt_oss/test_gpqa_correctness.py
@ -0,0 +1,102 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 GPQA evaluation using vLLM server and GPT-OSS evaluation package.
 Usage:
 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py \
    --model openai/gpt-oss-20b \
    --metric 0.58 \
    --server-args "--tensor-parallel-size 2"
 """
 import subprocess
 import sys
 import regex as re
 from tests.utils import RemoteOpenAIServer
 # Slack subtracted from the expected metric: the GPQA test passes when the
 # measured score is at least (expected metric - TOL).
 TOL = 0.05  # Absolute tolerance for accuracy comparison
 def run_gpqa_eval(model_name: str, base_url: str) -> float:
    """Run the GPQA evaluation from the gpt-oss package and return its score.

    The evaluation is executed as ``python -m gpt_oss.evals`` in a child
    process and the score is parsed from the child's standard output.

    Args:
        model_name: Model identifier passed to the evaluator via ``--model``.
        base_url: Server base URL passed to the evaluator via ``--base-url``.

    Returns:
        The first ``'metric': <number>`` value found in the evaluator output.

    Raises:
        RuntimeError: If the evaluation times out (30 minutes) or the child
            process exits with a non-zero status.
        ValueError: If no metric can be parsed from the evaluator output.
    """
    import os

    cmd = [
        sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
        model_name, "--reasoning-effort", "low", "--base-url", base_url
    ]

    # Keep the parent environment (PATH, HOME, cache and proxy settings) and
    # override only the API key with a placeholder value. Passing a dict with
    # just the key would strip every other variable from the child process.
    child_env = {**os.environ, "OPENAI_API_KEY": "dummy"}

    # Only the subprocess call itself can time out, so keep the try minimal.
    try:
        result = subprocess.run(
            cmd,
            text=True,
            capture_output=True,
            timeout=1800,  # 30 minute timeout
            env=child_env)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("Evaluation timed out") from e

    print("Evaluation process output:\n", result.stdout)

    # subprocess.run() does not raise on failure unless check=True, so the
    # exit status is inspected explicitly to surface stderr to the caller.
    if result.returncode != 0:
        raise RuntimeError(
            f"Evaluation failed with exit code {result.returncode}:\n"
            f"stdout: {result.stdout}\nstderr: {result.stderr}")

    # Parse the output to extract the score
    match = re.search(r"'metric':\s*([\d.]+)", result.stdout)
    if match is None:
        raise ValueError(
            f"Could not parse score from evaluation output:\n{result.stdout}")
    return float(match.group(1))
 def test_gpqa_correctness(request):
    """Check that the served model reaches the expected GPQA score.

    Reads ``--model``, ``--metric`` and ``--server-args`` from the pytest
    command line, launches a ``RemoteOpenAIServer`` for the model, runs
    ``run_gpqa_eval`` against it and asserts that the measured metric is at
    least ``--metric`` minus ``TOL``.

    Raises:
        ValueError: If ``--model`` or ``--metric`` was not supplied.
        AssertionError: If the measured metric is below the threshold.
    """
    import shlex

    # Get command line arguments
    model_name = request.config.getoption("--model")
    expected_metric = request.config.getoption("--metric")
    server_args_str = request.config.getoption("--server-args")

    # Fail fast on missing options instead of discovering them only after
    # the server has been launched and the evaluation has run.
    if not model_name:
        raise ValueError("--model is required for the GPQA correctness test")
    if expected_metric is None:
        raise ValueError("--metric is required for the GPQA correctness test")

    # Parse server arguments with shell-style quoting so that a quoted value
    # containing spaces stays a single argument.
    server_args = shlex.split(server_args_str) if server_args_str else []

    # Add standard server arguments
    server_args.extend([
        "--max-model-len",
        "32768",
        "--trust-remote-code",
    ])

    print(f"Starting GPQA evaluation for model: {model_name}")
    print(f"Expected metric threshold: {expected_metric}")
    print(f"Server args: {' '.join(server_args)}")

    # Launch server and run evaluation
    with RemoteOpenAIServer(model_name, server_args,
                            max_wait_seconds=1800) as remote_server:
        base_url = remote_server.url_for("v1")
        print(f"Server started at: {base_url}")

        measured_metric = run_gpqa_eval(model_name, base_url)

        print(f"GPQA Results for {model_name}:")
        print(f"  Measured metric: {measured_metric:.4f}")
        print(f"  Expected metric: {expected_metric:.4f}")
        print(f"  Tolerance: {TOL:.4f}")

        # Verify metric is within tolerance
        assert measured_metric >= expected_metric - TOL, (
            f"GPQA metric too low: {measured_metric:.4f} < "
            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}")

        print(f"✅ GPQA test passed for {model_name}")
		`@ -0,0 +1,2 @@`
							`# SPDX-License-Identifier: Apache-2.0`
							`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`