From 47e9038d2386d31b8493ac995094bdc1aec710ce Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Fri, 28 Mar 2025 10:29:32 -0600
Subject: [PATCH] Fix cpu offload testing for gptq/awq/ct (#15648)

Signed-off-by: mgoin
---
 tests/quantization/test_cpu_offload.py | 12 +++++++---
 tests/utils.py                         | 33 ++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index 79afcc916f2bb..a7d6518514c72 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -33,7 +33,9 @@ def test_cpu_offload_fp8():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_gptq():
+def test_cpu_offload_gptq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test GPTQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                          ["--cpu-offload-gb", "1"],
@@ -47,7 +49,9 @@ def test_cpu_offload_gptq():
 
 @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                     reason="awq_marlin is not supported on this GPU type.")
-def test_cpu_offload_awq():
+def test_cpu_offload_awq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test AWQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
                          ["--cpu-offload-gb", "1"],
@@ -61,7 +65,9 @@ def test_cpu_offload_awq():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors():
+def test_cpu_offload_compressed_tensors(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test wNa16
     compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
                          ["--cpu-offload-gb", "1"],
diff --git a/tests/utils.py b/tests/utils.py
index a827b6d4b9bfe..8915453ebd0a3 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -317,6 +317,37 @@ def _test_completion_close(
     return results
 
 
+def _test_chat(
+    client: openai.OpenAI,
+    model: str,
+    prompt: str,
+):
+    results = []
+
+    messages = [{
+        "role": "user",
+        "content": [{
+            "type": "text",
+            "text": prompt
+        }]
+    }]
+
+    # test with text prompt
+    chat_response = client.chat.completions.create(model=model,
+                                                   messages=messages,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+    results.append({
+        "test": "chat",
+        "text": chat_response.choices[0].message.content,
+        "finish_reason": chat_response.choices[0].finish_reason,
+        "usage": chat_response.usage,
+    })
+
+    return results
+
+
 def _test_embeddings(
     client: openai.OpenAI,
     model: str,
@@ -512,6 +543,8 @@
                 results += _test_completion(client, model, prompt, token_ids)
             elif method == "generate_close":
                 results += _test_completion_close(client, model, prompt)
+            elif method == "generate_chat":
+                results += _test_chat(client, model, prompt)
             elif method == "generate_with_image":
                 results += _test_image_text(
                     client, model,
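
Usage sketch (not part of the commit): one way the new "generate_chat" path
could be exercised from a quantization test. This assumes compare_two_settings
accepts a `method` keyword that reaches the dispatch added in the second
tests/utils.py hunk; the test name is hypothetical and the model is reused
from the GPTQ test above.

from tests.utils import compare_two_settings


def test_cpu_offload_gptq_chat(monkeypatch):  # hypothetical test name
    # Mirror the patch: force real weights, since this quant method is
    # sensitive to dummy weights.
    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
    # Assumption: `method` is forwarded to the "generate_chat" branch
    # added to the dispatch in tests/utils.py.
    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                         ["--cpu-offload-gb", "1"],
                         method="generate_chat")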