diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b3d10f75ab50..c131192c56fc 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -845,7 +845,7 @@ steps: - vllm/v1/attention/backends/flashinfer.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 diff --git a/tests/evals/gpt_oss/test_gpqa_correctness.py b/tests/evals/gpt_oss/test_gpqa_correctness.py index 4cc4041a60ce..07c04f00cd0d 100644 --- a/tests/evals/gpt_oss/test_gpqa_correctness.py +++ b/tests/evals/gpt_oss/test_gpqa_correctness.py @@ -26,7 +26,8 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float: # Build the command to run the evaluation cmd = [ sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model", - model_name, "--reasoning-effort", "low", "--base-url", base_url + model_name, "--reasoning-effort", "low", "--base-url", base_url, + "--n-threads", "200" ] try: @@ -72,8 +73,6 @@ def test_gpqa_correctness(request): # Add standard server arguments server_args.extend([ - "--max-model-len", - "32768", "--trust-remote-code", ])