Fix cpu offload testing for gptq/awq/ct (#15648)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent 432cf22a6a
commit 47e9038d23
@@ -33,7 +33,9 @@ def test_cpu_offload_fp8():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_gptq():
+def test_cpu_offload_gptq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test GPTQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                          ["--cpu-offload-gb", "1"],
@@ -47,7 +49,9 @@ def test_cpu_offload_gptq():
 
 @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                     reason="awq_marlin is not supported on this GPU type.")
-def test_cpu_offload_awq():
+def test_cpu_offload_awq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test AWQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
                          ["--cpu-offload-gb", "1"],
@@ -61,7 +65,9 @@ def test_cpu_offload_awq():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors():
+def test_cpu_offload_compressed_tensors(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test wNa16
     compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
                          ["--cpu-offload-gb", "1"],
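For context on the pattern used above: pytest's monkeypatch fixture sets the environment variable only for the duration of the test and restores the previous value on teardown, so forcing real (non-dummy) weight loading here cannot leak into other tests. A minimal sketch of that scoping follows; read_load_format is a hypothetical stand-in used purely for illustration, not vLLM's actual loader.

import os


def read_load_format() -> str:
    # Hypothetical helper: fall back to dummy weights unless the override is set.
    return os.environ.get("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy")


def test_force_real_weights(monkeypatch):
    # monkeypatch.setenv scopes the variable to this test and undoes it afterwards.
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
    assert read_load_format() == "auto"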
@@ -317,6 +317,37 @@ def _test_completion_close(
     return results
 
 
+def _test_chat(
+    client: openai.OpenAI,
+    model: str,
+    prompt: str,
+):
+    results = []
+
+    messages = [{
+        "role": "user",
+        "content": [{
+            "type": "text",
+            "text": prompt
+        }]
+    }]
+
+    # test with text prompt
+    chat_response = client.chat.completions.create(model=model,
+                                                   messages=messages,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+    results.append({
+        "test": "completion_close",
+        "text": chat_response.choices[0].message.content,
+        "finish_reason": chat_response.choices[0].finish_reason,
+        "usage": chat_response.usage,
+    })
+
+    return results
+
+
 def _test_embeddings(
     client: openai.OpenAI,
     model: str,
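The new _test_chat helper exercises the server's OpenAI-compatible chat completions route instead of the plain completions route. A standalone sketch of an equivalent request follows; the base URL, API key, and model name are placeholders for whatever server the comparison harness has just launched.

import openai

# Placeholder endpoint: a locally running vLLM OpenAI-compatible server.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_response = client.chat.completions.create(
    model="Qwen/Qwen2-1.5B-Instruct",  # placeholder model name
    messages=[{
        "role": "user",
        "content": [{"type": "text", "text": "Say hello"}],
    }],
    max_tokens=5,
    temperature=0.0,
)
print(chat_response.choices[0].message.content)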
@@ -512,6 +543,8 @@ def compare_all_settings(model: str,
                 results += _test_completion(client, model, prompt, token_ids)
             elif method == "generate_close":
                 results += _test_completion_close(client, model, prompt)
+            elif method == "generate_chat":
+                results += _test_chat(client, model, prompt)
             elif method == "generate_with_image":
                 results += _test_image_text(
                     client, model,
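With this branch in place, a test selects the chat route by requesting the "generate_chat" method. Assuming the comparison helpers forward a method keyword down to this dispatch (that plumbing is not shown in the hunk), an invocation might look roughly like the following; the model name and server args are placeholders.

# Hypothetical call: assumes a `method` keyword reaches the dispatch above.
compare_two_settings("Qwen/Qwen2-1.5B-Instruct", [],
                     ["--cpu-offload-gb", "1"],
                     method="generate_chat")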