[CI] Generalize gsm8k test args and add Qwen3-Next MTP B200 test (#30723)

Signed-off-by: mgoin <mgoin64@gmail.com>
Michael Goin 2025-12-16 14:28:34 -05:00 committed by GitHub
parent 66c3537e5d
commit 10ee1c64cf
14 changed files with 78 additions and 57 deletions

View File

@@ -654,7 +654,7 @@ steps:
   - vllm/model_executor/layers/quantization
   autorun_on_main: true
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: OpenAI API correctness # 22min
   timeout_in_minutes: 30
@@ -1064,7 +1064,7 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
 ##### 1 GPU test #####
 ##### multi gpus test #####

View File

@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
 ### Run tests with pytest (like buildkite)
 
 ```bash
-pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt
 ```
 
 ### Run standalone evaluation script
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
 accuracy_threshold: 0.54  # Minimum expected accuracy
 num_questions: 1319  # Number of questions (default: full test set)
 num_fewshot: 5  # Few-shot examples from train set
-max_model_len: 4096  # Model context length
+server_args: "--max-model-len 4096 --tensor-parallel-size 2"  # Server arguments
+env:  # Environment variables (optional)
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
 ```
 
+The `server_args` field accepts any arguments that can be passed to `vllm serve`.
+The `env` field accepts a dictionary of environment variables to set for the server process.
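
For readers wiring up new configs, here is a minimal sketch of how these two fields are consumed, mirroring the `shlex.split`-based parsing in the `test_gsm8k_correctness.py` diff further down; the config path is a hypothetical placeholder:

```python
# Sketch: how server_args/env from a config YAML reach the server.
# "configs/example.yaml" is a hypothetical path for illustration.
import shlex

import yaml

with open("configs/example.yaml") as f:
    config = yaml.safe_load(f)

# shlex keeps quoted values (e.g. a JSON --speculative-config) as one token
server_args = shlex.split(config.get("server_args", ""))
server_args.append("--trust-remote-code")  # the test always appends this

env_dict = config.get("env", None)  # forwarded to the server process env
print(server_args, env_dict)
```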

View File

@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 accuracy_threshold: 0.72
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 accuracy_threshold: 0.74
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
 accuracy_threshold: 0.31
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 accuracy_threshold: 0.45
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 accuracy_threshold: 0.60
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
 accuracy_threshold: 0.375
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
 accuracy_threshold: 0.89
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"

View File

@@ -0,0 +1,12 @@
+model_name: "nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
+accuracy_threshold: 0.75
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
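
A note on this new config: YAML's `>-` folded scalar joins the indented lines into a single space-separated string, and `shlex` later strips the single quotes while keeping the `--speculative-config` JSON as one argument. A quick sketch to sanity-check that round trip, assuming the file is read from the configs directory:

```python
# Sketch: confirm the folded scalar and quoting survive parsing.
import json
import shlex

import yaml

with open("Qwen3-Next-80B-A3B-NVFP4-EP2.yaml") as f:
    cfg = yaml.safe_load(f)

args = shlex.split(cfg["server_args"])  # quotes removed, JSON kept whole
spec = args[args.index("--speculative-config") + 1]
print(json.loads(spec))  # {'method': 'qwen3_next_mtp', 'num_speculative_tokens': 1}
```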

View File

@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-CT.yaml
 DeepSeek-V2-Lite-Instruct-FP8.yaml
 Qwen3-30B-A3B-NVFP4.yaml
+Qwen3-Next-80B-A3B-NVFP4-EP2.yaml

View File

@@ -11,14 +11,12 @@ def pytest_addoption(parser):
         default="configs/models-small.txt",
         help="File containing list of config files to test",
     )
-    parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size")
 
 
 def pytest_generate_tests(metafunc):
     """Generate test parameters from config files."""
     if "config_filename" in metafunc.fixturenames:
         config_list_file = metafunc.config.getoption("--config-list-file")
-        tp_size = metafunc.config.getoption("--tp-size")
 
         # Handle both relative and absolute paths
         config_list_path = Path(config_list_file)
@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
     # Generate test parameters
     if config_files:
         metafunc.parametrize(
-            ["config_filename", "tp_size"],
-            [(config_file, int(tp_size)) for config_file in config_files],
-            ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files],
+            "config_filename",
+            config_files,
+            ids=[config_file.stem for config_file in config_files],
         )
     else:
         print("No config files found, test will be skipped")

View File

@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
 Replacement for lm-eval-harness with better performance and control.
 
 Usage:
-    pytest -s -v test_gsm8k_correctness.py \
-        --config-list-file=configs/models-small.txt \
-        --tp-size=1
+    pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+        --config-list-file=configs/models-small.txt
 """
 
+import shlex
+
 import yaml
 
 from tests.utils import RemoteOpenAIServer
 
 from .gsm8k_eval import evaluate_gsm8k
 
-RTOL = 0.08  # Relative tolerance for accuracy comparison
+TOL = 0.08  # Absolute tolerance for accuracy comparison
 
 
-def launch_gsm8k_eval(eval_config, server_url, tp_size):
-    """Launch GSM8K evaluation using our isolated script."""
+def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
+    """Run GSM8K evaluation using our isolated script."""
     # Extract host and port from server URL
     if "://" in server_url:
         server_url = server_url.split("://")[1]
 
     host_port = server_url.split("/")[0]  # Remove path if present
     if ":" in host_port:
-        host, port = host_port.split(":")
-        port = int(port)
+        host, p = host_port.split(":")
+        port = int(p)
     else:
         host = host_port
         port = 8000
@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
     return results
 
 
-def test_gsm8k_correctness_param(config_filename, tp_size):
+def test_gsm8k_correctness(config_filename):
     """Test GSM8K correctness for a given model configuration."""
     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
 
-    # Server arguments
-    server_args = [
-        "--max-model-len",
-        str(eval_config.get("max_model_len", 4096)),
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--tensor-parallel-size",
-        str(tp_size),
-    ]
+    # Parse server arguments from config (use shlex to handle quoted strings)
+    server_args_str = eval_config.get("server_args", "")
+    server_args = shlex.split(server_args_str) if server_args_str else []
+
+    # Add standard server arguments
+    server_args.extend(
+        [
+            "--trust-remote-code",
+        ]
+    )
 
     env_dict = eval_config.get("env", None)
 
     print(f"Starting GSM8K evaluation for model: {eval_config['model_name']}")
     print(f"Expected metric threshold: {eval_config['accuracy_threshold']}")
     print(f"Number of questions: {eval_config['num_questions']}")
     print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
+    print(f"Server args: {' '.join(server_args)}")
 
     # Launch server and run evaluation
     with RemoteOpenAIServer(
-        eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480
+        eval_config["model_name"],
+        server_args,
+        env_dict=env_dict,
+        max_wait_seconds=600,
     ) as remote_server:
         server_url = remote_server.url_for("v1")
         print(f"Server started at: {server_url}")
 
-        results = launch_gsm8k_eval(eval_config, server_url, tp_size)
+        results = run_gsm8k_eval(eval_config, server_url)
 
         # Check accuracy against threshold
-        measured_accuracy = results["accuracy"]
-        expected_accuracy = eval_config["accuracy_threshold"]
+        measured_metric = results["accuracy"]
+        expected_metric = eval_config["accuracy_threshold"]
 
         print(f"GSM8K Results for {eval_config['model_name']}:")
-        print(f"  Accuracy: {measured_accuracy:.3f}")
-        print(f"  Expected: {expected_accuracy:.3f}")
+        print(f"  Measured metric: {measured_metric:.4f}")
+        print(f"  Expected metric: {expected_metric:.4f}")
+        print(f"  Tolerance: {TOL:.4f}")
         print(f"  Questions: {results['num_questions']}")
         print(f"  Invalid rate: {results['invalid_rate']:.3f}")
         print(f"  Latency: {results['latency']:.1f}s")
        print(f"  QPS: {results['questions_per_second']:.1f}")
 
-        # Verify accuracy is within tolerance
-        assert measured_accuracy >= expected_accuracy - RTOL, (
-            f"Accuracy too low: {measured_accuracy:.3f} < "
-            f"{expected_accuracy:.3f} - {RTOL:.3f}"
+        # Verify metric is within tolerance
+        assert measured_metric >= expected_metric - TOL, (
+            f"GSM8K metric too low: {measured_metric:.4f} < "
+            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
         )
 
         print(f"✅ GSM8K test passed for {eval_config['model_name']}")

View File

@@ -626,17 +626,11 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
-            # If no modular kernel is provided, use cutlass_moe_fp4 for TP case
-            # only (no EP).
             from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 
-            assert layer.expert_map is None, (
-                "Expert Parallelism / expert_map "
-                "is currently not supported for "
-                "CompressedTensorsW4A4Nvfp4MoEMethod."
-            )
             assert self.moe_quant_config is not None
 
             # Cutlass moe takes in activations in BF16/Half precision
             # and fp4 quantized weights loaded from the checkpoint
             return cutlass_moe_fp4(
                 a=x,
                 w1_fp4=layer.w13_weight,
@@ -644,6 +638,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 quant_config=self.moe_quant_config,
+                expert_map=layer.expert_map,
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 # TODO(bnell): derive these from arguments
                 m=x.shape[0],
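
Context for the `expert_map` pass-through: in vLLM's fused-MoE layers, `expert_map` maps global expert ids to local ids, with `-1` marking experts not resident on the current rank (treated here as an assumption about the convention); forwarding it lets `cutlass_moe_fp4` run under expert parallelism instead of asserting it away. An illustrative construction:

```python
# Illustrative sketch of the assumed expert_map convention:
# global expert id -> local id, -1 where the expert is not on this rank.
import torch


def make_expert_map(num_experts: int, ep_size: int, ep_rank: int) -> torch.Tensor:
    per_rank = num_experts // ep_size
    expert_map = torch.full((num_experts,), -1, dtype=torch.int32)
    start = ep_rank * per_rank
    expert_map[start : start + per_rank] = torch.arange(per_rank, dtype=torch.int32)
    return expert_map


# e.g. 8 experts on 2 ranks; rank 1 hosts global experts 4..7 as local 0..3
print(make_expert_map(8, 2, 1))  # tensor([-1, -1, -1, -1, 0, 1, 2, 3], dtype=torch.int32)
```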