[Misc] add w8a8 asym models (#11075)

2025-12-15 04:35:01 +08:00 · 2024-12-23 13:33:20 -05:00 · 2024-12-23 13:33:20 -05:00 · 8cef6e02dc
commit 8cef6e02dc
parent b866cdbd05
1 changed file with 10 additions and 6 deletions
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -79,12 +79,12 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
        assert output
-@pytest.mark.parametrize(
+@pytest.mark.parametrize("model_path", [
-    "model_path",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    [
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "neuralmagic/Llama-3.2-1B-quantized.w8a8"
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        # TODO static & asymmetric
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
-    ])
+])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
 def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
@@ -92,6 +92,10 @@ def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
                                          max_tokens, num_logprobs):
    dtype = "bfloat16"
    # skip language translation prompt for the static per tensor asym model
    if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym":  # noqa: E501
        example_prompts = example_prompts[0:-1]
    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)