mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-07 13:15:42 +08:00
[CI] GPT-OSS GPQA eval test for Blackwell (#24920)
Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
parent
d119fc8614
commit
493b10f8bf
@ -821,6 +821,20 @@ steps:
|
|||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
|
|
||||||
|
- label: GPT-OSS Eval (Blackwell)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
# optional: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- tests/evals/gpt_oss
|
||||||
|
- vllm/model_executor/models/gpt_oss.py
|
||||||
|
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
|
|||||||
2
tests/evals/gpt_oss/__init__.py
Normal file
2
tests/evals/gpt_oss/__init__.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
18
tests/evals/gpt_oss/conftest.py
Normal file
18
tests/evals/gpt_oss/conftest.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""
|
||||||
|
Pytest configuration for GPT-OSS evaluation tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_addoption(parser):
    """Register the command line options used by the GPT-OSS eval tests.

    Options added (in this order):
        --model:       name of the model to evaluate.
        --metric:      expected metric threshold, parsed as a float.
        --server-args: extra arguments for the server, defaults to "".
    """
    # Each entry is (flag, keyword arguments forwarded to parser.addoption).
    option_specs = (
        ("--model", {
            "action": "store",
            "help": "Model name to evaluate",
        }),
        ("--metric", {
            "action": "store",
            "type": float,
            "help": "Expected metric threshold",
        }),
        ("--server-args", {
            "action": "store",
            "default": "",
            "help": "Additional server arguments",
        }),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
|
||||||
102
tests/evals/gpt_oss/test_gpqa_correctness.py
Normal file
102
tests/evals/gpt_oss/test_gpqa_correctness.py
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""
|
||||||
|
GPQA evaluation using vLLM server and GPT-OSS evaluation package.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py \
|
||||||
|
--model openai/gpt-oss-20b \
|
||||||
|
--metric 0.58 \
|
||||||
|
--server-args "--tensor-parallel-size 2"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
from tests.utils import RemoteOpenAIServer
|
||||||
|
|
||||||
|
TOL = 0.05 # Absolute tolerance for accuracy comparison
|
||||||
|
|
||||||
|
|
||||||
|
def run_gpqa_eval(model_name: str, base_url: str) -> float:
    """Run the GPQA evaluation from the gpt-oss package and return its score.

    Launches ``python -m gpt_oss.evals --eval gpqa`` in a subprocess, pointed
    at ``base_url``, and extracts the ``'metric': <number>`` entry from the
    process's standard output.

    Args:
        model_name: Model identifier forwarded to the evaluator via
            ``--model``.
        base_url: Endpoint forwarded to the evaluator via ``--base-url``.

    Returns:
        The metric value parsed from the evaluator output.

    Raises:
        RuntimeError: If the evaluator does not finish within 30 minutes or
            exits with a non-zero status.
        ValueError: If the evaluator succeeds but no metric can be found in
            its output.
    """
    import os

    # Build the command to run the evaluation
    cmd = [
        sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
        model_name, "--reasoning-effort", "low", "--base-url", base_url
    ]

    # Extend the parent environment instead of replacing it: passing a bare
    # dict would drop PATH, HOME, proxy/cache settings, etc. for the child.
    # Only the API key is overridden, with a placeholder value.
    env = {**os.environ, "OPENAI_API_KEY": "dummy"}

    # Keep the try body limited to the call that can actually raise.
    try:
        result = subprocess.run(
            cmd,
            text=True,
            capture_output=True,
            timeout=1800,  # 30 minute timeout
            env=env,
            # Without check=True a failing evaluator never raises
            # CalledProcessError and the handler below would be dead code.
            check=True)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("Evaluation timed out") from e
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            f"Evaluation failed with exit code {e.returncode}:\n"
            f"stdout: {e.stdout}\nstderr: {e.stderr}") from e

    print("Evaluation process output:\n", result.stdout)

    # Parse the output to extract the score
    match = re.search(r"'metric':\s*([\d.]+)", result.stdout)
    if match is None:
        raise ValueError(
            f"Could not parse score from evaluation output:\n{result.stdout}")
    return float(match.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def test_gpqa_correctness(request):
    """Test GPQA correctness for GPT-OSS model.

    Reads ``--model``, ``--metric`` and ``--server-args`` from the pytest
    command line, starts a server through ``RemoteOpenAIServer``, runs the
    GPQA evaluation against it and asserts that the measured metric is not
    more than ``TOL`` below the expected one.

    Args:
        request: The pytest ``request`` fixture, used to read the command
            line options.

    Raises:
        ValueError: If ``--model`` or ``--metric`` was not supplied.
        AssertionError: If the measured metric is below
            ``expected_metric - TOL``.
    """
    import shlex

    # Get command line arguments
    model_name = request.config.getoption("--model")
    expected_metric = request.config.getoption("--metric")
    server_args_str = request.config.getoption("--server-args")

    # Fail fast on missing options: otherwise the problem only shows up once
    # the server is being launched or when the result is formatted.
    if not model_name:
        raise ValueError("--model is required for the GPQA evaluation test")
    if expected_metric is None:
        raise ValueError("--metric is required for the GPQA evaluation test")

    # Parse server arguments with shell-style quoting rules so that a quoted
    # value containing spaces stays a single argument.
    server_args = shlex.split(server_args_str) if server_args_str else []

    # Add standard server arguments
    server_args.extend([
        "--max-model-len",
        "32768",
        "--trust-remote-code",
    ])

    print(f"Starting GPQA evaluation for model: {model_name}")
    print(f"Expected metric threshold: {expected_metric}")
    print(f"Server args: {' '.join(server_args)}")

    # Launch server and run evaluation
    with RemoteOpenAIServer(model_name, server_args,
                            max_wait_seconds=1800) as remote_server:
        base_url = remote_server.url_for("v1")
        print(f"Server started at: {base_url}")

        measured_metric = run_gpqa_eval(model_name, base_url)

        print(f"GPQA Results for {model_name}:")
        print(f"  Measured metric: {measured_metric:.4f}")
        print(f"  Expected metric: {expected_metric:.4f}")
        print(f"  Tolerance: {TOL:.4f}")

        # Verify metric is within tolerance (only a lower bound is enforced)
        assert measured_metric >= expected_metric - TOL, (
            f"GPQA metric too low: {measured_metric:.4f} < "
            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}")

        print(f"✅ GPQA test passed for {model_name}")
|
||||||
Loading…
x
Reference in New Issue
Block a user