[CI] Generalize gsm8k test args and add Qwen3-Next MTP B200 test (#30723)

Signed-off-by: mgoin <mgoin64@gmail.com>
Michael Goin 2025-12-16 14:28:34 -05:00 committed by GitHub
parent 66c3537e5d
commit 10ee1c64cf
14 changed files with 78 additions and 57 deletions

View File

@@ -654,7 +654,7 @@ steps:
   - vllm/model_executor/layers/quantization
   autorun_on_main: true
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: OpenAI API correctness # 22min
   timeout_in_minutes: 30
@@ -1064,7 +1064,7 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
 ##### 1 GPU test #####
 ##### multi gpus test #####

View File

@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
 ### Run tests with pytest (like buildkite)
 
 ```bash
-pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt
 ```
 
 ### Run standalone evaluation script
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
 accuracy_threshold: 0.54  # Minimum expected accuracy
 num_questions: 1319  # Number of questions (default: full test set)
 num_fewshot: 5  # Few-shot examples from train set
-max_model_len: 4096  # Model context length
+server_args: "--max-model-len 4096 --tensor-parallel-size 2"  # Server arguments
+env:  # Environment variables (optional)
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
 ```
 
+The `server_args` field accepts any arguments that can be passed to `vllm serve`.
+The `env` field accepts a dictionary of environment variables to set for the server process.
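
For readers wiring up new configs, here is a minimal sketch of how these two fields are consumed, mirroring the `shlex.split`-based parsing in the `test_gsm8k_correctness.py` diff further down; the config path is a hypothetical placeholder:

```python
# Sketch: how server_args/env from a config YAML reach the server.
# "configs/example.yaml" is a hypothetical path for illustration.
import shlex

import yaml

with open("configs/example.yaml") as f:
    config = yaml.safe_load(f)

# shlex keeps quoted values (e.g. a JSON --speculative-config) as one token
server_args = shlex.split(config.get("server_args", ""))
server_args.append("--trust-remote-code")  # the test always appends this

env_dict = config.get("env", None)  # forwarded to the server process env
print(server_args, env_dict)
```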

View File

@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 accuracy_threshold: 0.72
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 accuracy_threshold: 0.74
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
 accuracy_threshold: 0.31
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 accuracy_threshold: 0.45
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 accuracy_threshold: 0.60
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
 accuracy_threshold: 0.375
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
 accuracy_threshold: 0.89
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -0,0 +1,12 @@
+model_name: "nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
+accuracy_threshold: 0.75
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
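
A note on this new config: YAML's `>-` folded scalar joins the indented lines into a single space-separated string, and `shlex` later strips the single quotes while keeping the `--speculative-config` JSON as one argument. A quick sketch to sanity-check that round trip, assuming the file is read from the configs directory:

```python
# Sketch: confirm the folded scalar and quoting survive parsing.
import json
import shlex

import yaml

with open("Qwen3-Next-80B-A3B-NVFP4-EP2.yaml") as f:
    cfg = yaml.safe_load(f)

args = shlex.split(cfg["server_args"])  # quotes removed, JSON kept whole
spec = args[args.index("--speculative-config") + 1]
print(json.loads(spec))  # {'method': 'qwen3_next_mtp', 'num_speculative_tokens': 1}
```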

View File

@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-CT.yaml
 DeepSeek-V2-Lite-Instruct-FP8.yaml
 Qwen3-30B-A3B-NVFP4.yaml
+Qwen3-Next-80B-A3B-NVFP4-EP2.yaml

View File

@@ -11,14 +11,12 @@ def pytest_addoption(parser):
         default="configs/models-small.txt",
         help="File containing list of config files to test",
     )
-    parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size")
 
 
 def pytest_generate_tests(metafunc):
     """Generate test parameters from config files."""
     if "config_filename" in metafunc.fixturenames:
         config_list_file = metafunc.config.getoption("--config-list-file")
-        tp_size = metafunc.config.getoption("--tp-size")
 
         # Handle both relative and absolute paths
         config_list_path = Path(config_list_file)
@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
     # Generate test parameters
     if config_files:
         metafunc.parametrize(
-            ["config_filename", "tp_size"],
-            [(config_file, int(tp_size)) for config_file in config_files],
-            ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files],
+            "config_filename",
+            config_files,
+            ids=[config_file.stem for config_file in config_files],
         )
     else:
         print("No config files found, test will be skipped")

View File

@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
 Replacement for lm-eval-harness with better performance and control.
 
 Usage:
-    pytest -s -v test_gsm8k_correctness.py \
-        --config-list-file=configs/models-small.txt \
-        --tp-size=1
+    pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+        --config-list-file=configs/models-small.txt
 """
 
+import shlex
+
 import yaml
 
 from tests.utils import RemoteOpenAIServer
 
 from .gsm8k_eval import evaluate_gsm8k
 
-RTOL = 0.08  # Relative tolerance for accuracy comparison
+TOL = 0.08  # Absolute tolerance for accuracy comparison
 
 
-def launch_gsm8k_eval(eval_config, server_url, tp_size):
-    """Launch GSM8K evaluation using our isolated script."""
+def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
+    """Run GSM8K evaluation using our isolated script."""
     # Extract host and port from server URL
     if "://" in server_url:
         server_url = server_url.split("://")[1]
 
     host_port = server_url.split("/")[0]  # Remove path if present
     if ":" in host_port:
-        host, port = host_port.split(":")
-        port = int(port)
+        host, p = host_port.split(":")
+        port = int(p)
     else:
         host = host_port
         port = 8000
@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
     return results
 
 
-def test_gsm8k_correctness_param(config_filename, tp_size):
+def test_gsm8k_correctness(config_filename):
     """Test GSM8K correctness for a given model configuration."""
     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
 
-    # Server arguments
-    server_args = [
-        "--max-model-len",
-        str(eval_config.get("max_model_len", 4096)),
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--tensor-parallel-size",
-        str(tp_size),
-    ]
+    # Parse server arguments from config (use shlex to handle quoted strings)
+    server_args_str = eval_config.get("server_args", "")
+    server_args = shlex.split(server_args_str) if server_args_str else []
+
+    # Add standard server arguments
+    server_args.extend(
+        [
+            "--trust-remote-code",
+        ]
+    )
 
     env_dict = eval_config.get("env", None)
 
     print(f"Starting GSM8K evaluation for model: {eval_config['model_name']}")
     print(f"Expected metric threshold: {eval_config['accuracy_threshold']}")
     print(f"Number of questions: {eval_config['num_questions']}")
     print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
+    print(f"Server args: {' '.join(server_args)}")
 
     # Launch server and run evaluation
     with RemoteOpenAIServer(
-        eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480
+        eval_config["model_name"],
+        server_args,
+        env_dict=env_dict,
+        max_wait_seconds=600,
     ) as remote_server:
         server_url = remote_server.url_for("v1")
         print(f"Server started at: {server_url}")
 
-        results = launch_gsm8k_eval(eval_config, server_url, tp_size)
+        results = run_gsm8k_eval(eval_config, server_url)
 
         # Check accuracy against threshold
-        measured_accuracy = results["accuracy"]
-        expected_accuracy = eval_config["accuracy_threshold"]
+        measured_metric = results["accuracy"]
+        expected_metric = eval_config["accuracy_threshold"]
 
         print(f"GSM8K Results for {eval_config['model_name']}:")
-        print(f"  Accuracy: {measured_accuracy:.3f}")
-        print(f"  Expected: {expected_accuracy:.3f}")
+        print(f"  Measured metric: {measured_metric:.4f}")
+        print(f"  Expected metric: {expected_metric:.4f}")
+        print(f"  Tolerance: {TOL:.4f}")
         print(f"  Questions: {results['num_questions']}")
         print(f"  Invalid rate: {results['invalid_rate']:.3f}")
         print(f"  Latency: {results['latency']:.1f}s")
        print(f"  QPS: {results['questions_per_second']:.1f}")
 
-        # Verify accuracy is within tolerance
-        assert measured_accuracy >= expected_accuracy - RTOL, (
-            f"Accuracy too low: {measured_accuracy:.3f} < "
-            f"{expected_accuracy:.3f} - {RTOL:.3f}"
+        # Verify metric is within tolerance
+        assert measured_metric >= expected_metric - TOL, (
+            f"GSM8K metric too low: {measured_metric:.4f} < "
+            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
         )
 
         print(f"✅ GSM8K test passed for {eval_config['model_name']}")

View File

@@ -626,17 +626,11 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
-            # If no modular kernel is provided, use cutlass_moe_fp4 for TP case
-            # only (no EP).
             from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 
-            assert layer.expert_map is None, (
-                "Expert Parallelism / expert_map "
-                "is currently not supported for "
-                "CompressedTensorsW4A4Nvfp4MoEMethod."
-            )
             assert self.moe_quant_config is not None
 
             # Cutlass moe takes in activations in BF16/Half precision
             # and fp4 quantized weights loaded from the checkpoint
             return cutlass_moe_fp4(
                 a=x,
                 w1_fp4=layer.w13_weight,
@@ -644,6 +638,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 quant_config=self.moe_quant_config,
+                expert_map=layer.expert_map,
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 # TODO(bnell): derive these from arguments
                 m=x.shape[0],
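
Context for the `expert_map` pass-through: in vLLM's fused-MoE layers, `expert_map` maps global expert ids to local ids, with `-1` marking experts not resident on the current rank (treated here as an assumption about the convention); forwarding it lets `cutlass_moe_fp4` run under expert parallelism instead of asserting it away. An illustrative construction:

```python
# Illustrative sketch of the assumed expert_map convention:
# global expert id -> local id, -1 where the expert is not on this rank.
import torch


def make_expert_map(num_experts: int, ep_size: int, ep_rank: int) -> torch.Tensor:
    per_rank = num_experts // ep_size
    expert_map = torch.full((num_experts,), -1, dtype=torch.int32)
    start = ep_rank * per_rank
    expert_map[start : start + per_rank] = torch.arange(per_rank, dtype=torch.int32)
    return expert_map


# e.g. 8 experts on 2 ranks; rank 1 hosts global experts 4..7 as local 0..3
print(make_expert_map(8, 2, 1))  # tensor([-1, -1, -1, -1, 0, 1, 2, 3], dtype=torch.int32)
```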