[CI Sprint] Quantization CI Cleanup (#24130)
Signed-off-by: Alex Yun <alexyun04@gmail.com>
parent 184b12fdc6
commit f6aa122698
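Note: the hunks below all apply the same two CI speedups to the quantization tests: pass `enforce_eager=True` so each engine skips CUDA graph capture at startup, and shrink `max_tokens` so the greedy smoke generations finish sooner. A minimal standalone sketch of the resulting invocation pattern, using the public in-process `LLM` entry point rather than the test fixtures (the model name here is an illustrative stand-in):

    from vllm import LLM, SamplingParams

    # enforce_eager=True skips CUDA graph capture, trading some decode
    # throughput for much faster engine startup -- the right trade for CI.
    llm = LLM(model="facebook/opt-125m", enforce_eager=True)

    # Greedy decoding with a small token budget, mirroring
    # generate_greedy(..., max_tokens=4) in the tests.
    params = SamplingParams(temperature=0.0, max_tokens=4)
    outputs = llm.generate(["Hello my name is"], params)
    print(outputs[0].outputs[0].text)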
@@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
         example_prompts, max_tokens, num_logprobs
     )

-    with vllm_runner(model_path, dtype=dtype) as vllm_model:
+    with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs
         )
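Note: `generate_greedy_logprobs` in the hunk above returns per-token logprobs that the test compares against a HuggingFace baseline. A rough standalone equivalent with public vLLM APIs (prompt is illustrative; model name taken from the parametrize list above):

    from vllm import LLM, SamplingParams

    llm = LLM(model="neuralmagic/Llama-3.2-1B-quantized.w8a8", enforce_eager=True)

    # Request the top-10 logprobs per generated token, matching num_logprobs=10.
    params = SamplingParams(temperature=0.0, max_tokens=4, logprobs=10)
    out = llm.generate(["Hello my name is"], params)
    for step in out[0].outputs[0].logprobs:
        print(step)  # mapping of token id -> Logprob for each decode step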
@@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
     # Test loading a quantized checkpoint
     compare_two_settings(
         "neuralmagic/Qwen2-1.5B-Instruct-FP8",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
@@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
     # Test GPTQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
@@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
     # Test AWQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-AWQ",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
     # Test wNa16
     compare_two_settings(
         "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
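Note: the four `compare_two_settings` hunks make the same swap: the baseline argument list goes from empty to `["--enforce_eager"]`, so both the offload and non-offload servers run eagerly. The real helper launches two servers with the given CLI args and checks their completions agree; a simplified in-process sketch of that comparison idea (not vLLM's actual helper; `cpu_offload_gb` mirrors the `--cpu-offload-gb` flag):

    from vllm import LLM, SamplingParams

    def greedy_texts(**engine_kwargs):
        # Build an engine with the given overrides and run one greedy completion.
        llm = LLM(
            model="nm-testing/tinyllama-oneshot-w4a16-channel-v2",
            enforce_eager=True,  # the flag this commit adds to both settings
            **engine_kwargs,
        )
        params = SamplingParams(temperature=0.0, max_tokens=4)
        return [o.outputs[0].text for o in llm.generate(["Hello my name is"], params)]

    # Offloading 1 GiB of weights to CPU should not change greedy outputs.
    assert greedy_texts() == greedy_texts(cpu_offload_gb=1)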
@@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
 )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [4])
 def test_model_experts_int8_startup(
     hf_runner,
     vllm_runner,
@@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_transformers_version(on_fail="skip")

-    with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
+    with vllm_runner(
+        model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
+    ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
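Note: `experts_int8` quantizes MoE expert weights to int8 at load time, which is why this test only checks that startup and a short greedy generation succeed. A standalone sketch of the same engine flags (model name taken from the MODELS list in the hunk header):

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="ai21labs/Jamba-tiny-random",  # from the MODELS list above
        dtype="bfloat16",
        enforce_eager=True,
        quantization="experts_int8",
    )
    out = llm.generate(["Hello my name is"], SamplingParams(temperature=0.0, max_tokens=4))
    print(out[0].outputs[0].text)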
@@ -45,10 +45,10 @@ def test_model_load_and_run(
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

-    with vllm_runner(model_id) as llm:
+    with vllm_runner(model_id, enforce_eager=True) as llm:
         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])

@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(

     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
+    with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:

         def check_model(model):
             attn = model.model.layers[0].self_attn.attn
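Note: as the comment in the hunk says, `LLM.apply_model` pickles a function and runs it inside the worker that owns the model, hence the `VLLM_ALLOW_INSECURE_SERIALIZATION` escape hatch. A sketch of that inspection pattern (model name is illustrative; the attribute path comes from the hunk):

    import os

    # Required because apply_model ships a pickled function to the workers.
    os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"

    from vllm import LLM

    llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8", enforce_eager=True)

    def check_model(model):
        # Same attribute path the test inspects: layer 0's attention module.
        attn = model.model.layers[0].self_attn.attn
        print(type(attn))

    llm.apply_model(check_model)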
@@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run(

         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])

@@ -142,7 +142,10 @@ def test_load_fp16_model(
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

     with vllm_runner(
-        "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
+        "facebook/opt-125m",
+        quantization="fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
     ) as llm:

         def check_model(model):
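Note: this test loads an unquantized fp16 checkpoint and quantizes it to FP8 on the fly, optionally with an FP8 KV cache. A minimal standalone version of the engine configuration from the hunk (prompt is illustrative):

    from vllm import LLM, SamplingParams

    # Online FP8 quantization of an fp16 checkpoint, plus an FP8 KV cache.
    llm = LLM(
        model="facebook/opt-125m",
        quantization="fp8",
        enforce_eager=True,
        kv_cache_dtype="fp8",
    )
    out = llm.generate(["Hello my name is"], SamplingParams(temperature=0.0, max_tokens=4))
    print(out[0].outputs[0].text)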
@@ -26,7 +26,7 @@ DTYPE = ["bfloat16"]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", DTYPE)
 def test_ipex_quant(vllm_runner, model, dtype):
-    with vllm_runner(model, dtype=dtype) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
         print(output)
@@ -49,4 +49,4 @@ def test_lm_head(

     vllm_model.apply_model(check_model)

-    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
+    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])
@@ -88,6 +88,6 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
         llm.apply_model(check_model)

         # Run a simple generation test to ensure the model works
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
         print(f"ModelOpt FP8 output: {output}")
@@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
             "facebook/opt-125m",
             dtype=dtype,
             quantization="ptpc_fp8",
+            enforce_eager=True,
             kv_cache_dtype=kv_cache_dtype,
         )
     except AssertionError as e:
@@ -65,5 +66,5 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:

         llm.apply_model(check_model)

-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import (
     get_quantization_config,
     register_quantization_config,
 )
-from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig,
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,  # noqa: E501
 )

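Note: the import reshuffle above only moves the `# noqa: E501` comment onto the line it silences. For context, these are the registry hooks the custom-quantization test exercises; a minimal sketch of the lookup side (the skeleton in the trailing comment is hypothetical and elides the abstract methods a real config must implement):

    from vllm.model_executor.layers.quantization import (
        get_quantization_config,
        register_quantization_config,
    )

    # get_quantization_config maps a registry name to its QuantizationConfig class.
    fp8_config_cls = get_quantization_config("fp8")
    print(fp8_config_cls)

    # register_quantization_config is the decorator the test uses to add a
    # "custom_quant" entry to the same registry, e.g.:
    #
    #     @register_quantization_config("custom_quant")
    #     class CustomQuantConfig(QuantizationConfig):
    #         ...  # implement the QuantizationConfig interface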
@@ -142,5 +142,5 @@ def test_custom_quant(vllm_runner, model, monkeypatch):

     llm.apply_model(check_model)

-    output = llm.generate_greedy("Hello my name is", max_tokens=20)
+    output = llm.generate_greedy("Hello my name is", max_tokens=1)
     assert output
@@ -392,7 +392,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
         assert not has_int4_preshuffled_tensor

         assert weight_attrs == [False, 1, 0, True]
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)

         assert output