From 47e9038d2386d31b8493ac995094bdc1aec710ce Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Fri, 28 Mar 2025 10:29:32 -0600
Subject: [PATCH] Fix cpu offload testing for gptq/awq/ct (#15648)

Signed-off-by: mgoin
---
 tests/quantization/test_cpu_offload.py | 12 +++++++---
 tests/utils.py                         | 33 ++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index 79afcc916f2bb..a7d6518514c72 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -33,7 +33,9 @@ def test_cpu_offload_fp8():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_gptq():
+def test_cpu_offload_gptq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test GPTQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                          ["--cpu-offload-gb", "1"],
@@ -47,7 +49,9 @@ def test_cpu_offload_gptq():
 
 @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                     reason="awq_marlin is not supported on this GPU type.")
-def test_cpu_offload_awq():
+def test_cpu_offload_awq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test AWQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
                          ["--cpu-offload-gb", "1"],
@@ -61,7 +65,9 @@ def test_cpu_offload_awq():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors():
+def test_cpu_offload_compressed_tensors(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test wNa16
     compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
                          ["--cpu-offload-gb", "1"],
diff --git a/tests/utils.py b/tests/utils.py
index a827b6d4b9bfe..8915453ebd0a3 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -317,6 +317,37 @@ def _test_completion_close(
     return results
 
 
+def _test_chat(
+    client: openai.OpenAI,
+    model: str,
+    prompt: str,
+):
+    results = []
+
+    messages = [{
+        "role": "user",
+        "content": [{
+            "type": "text",
+            "text": prompt
+        }]
+    }]
+
+    # test with text prompt
+    chat_response = client.chat.completions.create(model=model,
+                                                   messages=messages,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+    results.append({
+        "test": "chat",
+        "text": chat_response.choices[0].message.content,
+        "finish_reason": chat_response.choices[0].finish_reason,
+        "usage": chat_response.usage,
+    })
+
+    return results
+
+
 def _test_embeddings(
     client: openai.OpenAI,
     model: str,
@@ -512,6 +543,8 @@
                 results += _test_completion(client, model, prompt, token_ids)
             elif method == "generate_close":
                 results += _test_completion_close(client, model, prompt)
+            elif method == "generate_chat":
+                results += _test_chat(client, model, prompt)
             elif method == "generate_with_image":
                 results += _test_image_text(
                     client, model,
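
Usage sketch (not part of the commit): one way the new "generate_chat" path
could be exercised from a quantization test. This assumes compare_two_settings
accepts a `method` keyword that reaches the dispatch added in the second
tests/utils.py hunk; the test name is hypothetical and the model is reused
from the GPTQ test above.

from tests.utils import compare_two_settings


def test_cpu_offload_gptq_chat(monkeypatch):  # hypothetical test name
    # Mirror the patch: force real weights, since this quant method is
    # sensitive to dummy weights.
    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
    # Assumption: `method` is forwarded to the "generate_chat" branch
    # added to the dispatch in tests/utils.py.
    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                         ["--cpu-offload-gb", "1"],
                         method="generate_chat")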