mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-27 04:27:53 +08:00
[CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
b3dda72c23
commit
01c977e96d
@ -26,7 +26,7 @@ MODELS = [
|
|||||||
)
|
)
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
def test_auto_round(vllm_runner, model):
|
def test_auto_round(vllm_runner, model):
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
|
||||||
assert output
|
assert output
|
||||||
print(f"{output[0][1]}")
|
print(f"{output[0][1]}")
|
||||||
|
|||||||
@ -66,13 +66,6 @@ def enable_pickle(monkeypatch):
|
|||||||
2560,
|
2560,
|
||||||
True,
|
True,
|
||||||
),
|
),
|
||||||
(
|
|
||||||
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
|
|
||||||
"channel",
|
|
||||||
QuantizationType.INT,
|
|
||||||
2560,
|
|
||||||
True,
|
|
||||||
),
|
|
||||||
(
|
(
|
||||||
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
|
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
|
||||||
"tensor",
|
"tensor",
|
||||||
@ -138,7 +131,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
|
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@ -146,12 +139,9 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
|
|||||||
"model_path",
|
"model_path",
|
||||||
[
|
[
|
||||||
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
|
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
|
||||||
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
|
|
||||||
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
|
|
||||||
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("max_tokens", [32])
|
@pytest.mark.parametrize("max_tokens", [8])
|
||||||
@pytest.mark.parametrize("num_logprobs", [10])
|
@pytest.mark.parametrize("num_logprobs", [10])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"use_aiter", [True, False] if current_platform.is_rocm() else [False]
|
"use_aiter", [True, False] if current_platform.is_rocm() else [False]
|
||||||
@ -211,7 +201,7 @@ def test_compressed_tensors_w8a8_logprobs(
|
|||||||
def test_compressed_tensors_no_enforce_eager(vllm_runner):
|
def test_compressed_tensors_no_enforce_eager(vllm_runner):
|
||||||
model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
|
model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
|
||||||
with vllm_runner(model_path) as llm:
|
with vllm_runner(model_path) as llm:
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@ -219,15 +209,10 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
|
|||||||
"model_args",
|
"model_args",
|
||||||
[
|
[
|
||||||
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
|
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
|
||||||
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
|
|
||||||
(
|
(
|
||||||
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
|
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
|
||||||
"channel",
|
"channel",
|
||||||
),
|
),
|
||||||
(
|
|
||||||
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
|
|
||||||
"channel",
|
|
||||||
),
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -253,7 +238,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
|
|||||||
# this will enable VLLM_ROCM_USE_AITER_LINEAR
|
# this will enable VLLM_ROCM_USE_AITER_LINEAR
|
||||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||||
|
|
||||||
with vllm_runner(model_path, dtype=torch.float16) as llm:
|
with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -268,7 +253,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
|
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@ -283,38 +268,6 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
|
|||||||
True,
|
True,
|
||||||
False,
|
False,
|
||||||
),
|
),
|
||||||
(
|
|
||||||
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
|
|
||||||
"group",
|
|
||||||
128,
|
|
||||||
8,
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
|
|
||||||
"channel",
|
|
||||||
None,
|
|
||||||
4,
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
|
|
||||||
"group",
|
|
||||||
128,
|
|
||||||
8,
|
|
||||||
False,
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
|
|
||||||
"channel",
|
|
||||||
None,
|
|
||||||
8,
|
|
||||||
False,
|
|
||||||
False,
|
|
||||||
),
|
|
||||||
(
|
(
|
||||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
|
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
|
||||||
"group",
|
"group",
|
||||||
@ -330,7 +283,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
|
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
|
||||||
model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
|
model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -348,7 +301,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@ -357,7 +310,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
|
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
|
||||||
model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
|
model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
|
||||||
with vllm_runner(model_path) as llm:
|
with vllm_runner(model_path, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -370,13 +323,13 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
def test_compressed_tensors_fp8(vllm_runner):
|
def test_compressed_tensors_fp8(vllm_runner):
|
||||||
model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
||||||
with vllm_runner(model_path) as llm:
|
with vllm_runner(model_path, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -399,7 +352,7 @@ def test_compressed_tensors_fp8(vllm_runner):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@ -412,8 +365,8 @@ def test_compressed_tensors_fp8(vllm_runner):
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_kv_cache(vllm_runner):
|
def test_compressed_tensors_kv_cache(vllm_runner):
|
||||||
model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
|
model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
|
||||||
with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
|
with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
|
||||||
output = llm.generate_greedy("Hello world!", max_tokens=20)
|
output = llm.generate_greedy("Hello world!", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@ -465,7 +418,7 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="d
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
|
def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
|
||||||
model, weight_strategy, input_strategy = args_2of4
|
model, weight_strategy, input_strategy = args_2of4
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -476,7 +429,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -512,7 +465,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
|
def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
|
||||||
model, weight_strategy, input_strategy = args_2of4
|
model, weight_strategy, input_strategy = args_2of4
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -528,7 +481,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -564,7 +517,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
|
def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
|
||||||
model, weight_strategy, input_strategy = args_2of4
|
model, weight_strategy, input_strategy = args_2of4
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -580,7 +533,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -611,7 +564,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
|
def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
|
||||||
model, weight_strategy, input_strategy = args_2of4
|
model, weight_strategy, input_strategy = args_2of4
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -622,7 +575,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -637,7 +590,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
|
def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
|
||||||
model = args_2of4
|
model = args_2of4
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -656,7 +609,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -670,7 +623,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
|
|||||||
)
|
)
|
||||||
def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
|
def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
|
||||||
model = args_2of4
|
model = args_2of4
|
||||||
with vllm_runner(model) as llm:
|
with vllm_runner(model, enforce_eager=True) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -689,7 +642,7 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -723,7 +676,7 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
|
|||||||
assert qkv_proj.scheme.group_size == 16
|
assert qkv_proj.scheme.group_size == 16
|
||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -758,7 +711,7 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
|
|||||||
assert proj.scheme.group_size == 128
|
assert proj.scheme.group_size == 128
|
||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
print(output)
|
print(output)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -792,7 +745,7 @@ def test_compressed_tensors_transforms_perplexity(
|
|||||||
|
|
||||||
def test_compressed_tensors_fp8_block_enabled(vllm_runner):
|
def test_compressed_tensors_fp8_block_enabled(vllm_runner):
|
||||||
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
|
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
|
||||||
with vllm_runner(model_path) as llm:
|
with vllm_runner(model_path, enforce_eager=True) as llm:
|
||||||
fp8_dtype = current_platform.fp8_dtype()
|
fp8_dtype = current_platform.fp8_dtype()
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
@ -816,5 +769,5 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|||||||
@ -16,13 +16,6 @@ from ..utils import compare_two_settings
|
|||||||
reason="fp8 is not supported on this GPU type.",
|
reason="fp8 is not supported on this GPU type.",
|
||||||
)
|
)
|
||||||
def test_cpu_offload_fp8():
|
def test_cpu_offload_fp8():
|
||||||
# Test quantization of an unquantized checkpoint
|
|
||||||
compare_two_settings(
|
|
||||||
"meta-llama/Llama-3.2-1B-Instruct",
|
|
||||||
["--quantization", "fp8"],
|
|
||||||
["--quantization", "fp8", "--cpu-offload-gb", "1"],
|
|
||||||
max_wait_seconds=480,
|
|
||||||
)
|
|
||||||
# Test loading a quantized checkpoint
|
# Test loading a quantized checkpoint
|
||||||
compare_two_settings(
|
compare_two_settings(
|
||||||
"neuralmagic/Qwen2-1.5B-Instruct-FP8",
|
"neuralmagic/Qwen2-1.5B-Instruct-FP8",
|
||||||
@ -46,13 +39,6 @@ def test_cpu_offload_gptq(monkeypatch):
|
|||||||
["--cpu-offload-gb", "1"],
|
["--cpu-offload-gb", "1"],
|
||||||
max_wait_seconds=480,
|
max_wait_seconds=480,
|
||||||
)
|
)
|
||||||
# Test GPTQ
|
|
||||||
compare_two_settings(
|
|
||||||
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
|
|
||||||
["--quantization", "gptq"],
|
|
||||||
["--quantization", "gptq", "--cpu-offload-gb", "1"],
|
|
||||||
max_wait_seconds=480,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
@ -69,13 +55,6 @@ def test_cpu_offload_awq(monkeypatch):
|
|||||||
["--cpu-offload-gb", "1"],
|
["--cpu-offload-gb", "1"],
|
||||||
max_wait_seconds=480,
|
max_wait_seconds=480,
|
||||||
)
|
)
|
||||||
# Test AWQ
|
|
||||||
compare_two_settings(
|
|
||||||
"Qwen/Qwen2-1.5B-Instruct-AWQ",
|
|
||||||
["--quantization", "awq"],
|
|
||||||
["--quantization", "awq", "--cpu-offload-gb", "1"],
|
|
||||||
max_wait_seconds=480,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
@ -92,17 +71,3 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
|
|||||||
["--cpu-offload-gb", "1"],
|
["--cpu-offload-gb", "1"],
|
||||||
max_wait_seconds=480,
|
max_wait_seconds=480,
|
||||||
)
|
)
|
||||||
# Test w4a16_marlin24
|
|
||||||
compare_two_settings(
|
|
||||||
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
|
|
||||||
[],
|
|
||||||
["--cpu-offload-gb", "1"],
|
|
||||||
max_wait_seconds=480,
|
|
||||||
)
|
|
||||||
# Test w8a8
|
|
||||||
compare_two_settings(
|
|
||||||
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
|
|
||||||
[],
|
|
||||||
["--cpu-offload-gb", "1"],
|
|
||||||
max_wait_seconds=480,
|
|
||||||
)
|
|
||||||
|
|||||||
@ -18,7 +18,6 @@ from vllm.platforms import current_platform
|
|||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
|
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
|
||||||
"nm-testing/Phi-3-mini-128k-instruct-FP8",
|
|
||||||
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
|
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -49,8 +48,6 @@ def test_model_load_and_run(
|
|||||||
|
|
||||||
|
|
||||||
KV_CACHE_MODELS = [
|
KV_CACHE_MODELS = [
|
||||||
# Deprecated AutoFP8 format using .kv_scale
|
|
||||||
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
|
|
||||||
# AutoFP8 format using separate .k_scale and .v_scale
|
# AutoFP8 format using separate .k_scale and .v_scale
|
||||||
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
|
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
|
||||||
]
|
]
|
||||||
|
|||||||
@ -40,7 +40,9 @@ def test_gptq_with_dynamic(
|
|||||||
GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
|
GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
|
||||||
)
|
)
|
||||||
|
|
||||||
with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm:
|
with vllm_runner(
|
||||||
|
model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
|
||||||
|
) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
for name, submodule in model.named_modules():
|
for name, submodule in model.named_modules():
|
||||||
|
|||||||
@ -31,7 +31,9 @@ def test_lm_head(
|
|||||||
) -> None:
|
) -> None:
|
||||||
# `LLM.apply_model` requires pickling a function.
|
# `LLM.apply_model` requires pickling a function.
|
||||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model:
|
with vllm_runner(
|
||||||
|
model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
|
||||||
|
) as vllm_model:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
lm_head_layer = model.lm_head
|
lm_head_layer = model.lm_head
|
||||||
|
|||||||
@ -56,7 +56,10 @@ def enable_pickle(monkeypatch):
|
|||||||
def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
|
def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
|
||||||
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
|
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model_path, kv_cache_dtype=kv_cache_dtype, tensor_parallel_size=tp
|
model_path,
|
||||||
|
enforce_eager=True,
|
||||||
|
kv_cache_dtype=kv_cache_dtype,
|
||||||
|
tensor_parallel_size=tp,
|
||||||
) as llm:
|
) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
@ -74,14 +77,14 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("tp", [1])
|
@pytest.mark.parametrize("tp", [1])
|
||||||
def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
|
def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
|
||||||
model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
|
model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
|
||||||
with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
|
with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -98,14 +101,14 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("tp", [1])
|
@pytest.mark.parametrize("tp", [1])
|
||||||
def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
|
def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
|
||||||
model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
|
model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
|
||||||
with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
|
with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
@ -117,7 +120,7 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
|
|||||||
|
|
||||||
llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,6 @@ import pytest
|
|||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"microsoft/Phi-3-mini-4k-instruct", # dense model
|
|
||||||
"ai21labs/Jamba-tiny-dev", # MoE model
|
"ai21labs/Jamba-tiny-dev", # MoE model
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -30,5 +29,7 @@ def test_model_rtn_startup(
|
|||||||
dtype: str,
|
dtype: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
) -> None:
|
) -> None:
|
||||||
with vllm_runner(model, dtype=dtype, quantization="rtn") as vllm_model:
|
with vllm_runner(
|
||||||
|
model, enforce_eager=True, dtype=dtype, quantization="rtn"
|
||||||
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|||||||
@ -19,7 +19,7 @@ def test_pre_quantized_model(vllm_runner):
|
|||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@ -39,8 +39,9 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_loca
|
|||||||
quantization="torchao",
|
quantization="torchao",
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
pt_load_map_location=pt_load_map_location,
|
pt_load_map_location=pt_load_map_location,
|
||||||
|
enforce_eager=True,
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -54,8 +55,9 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
|
|||||||
quantization="torchao",
|
quantization="torchao",
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
pt_load_map_location="cuda:0",
|
pt_load_map_location="cuda:0",
|
||||||
|
enforce_eager=True,
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -69,8 +71,9 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
|
|||||||
quantization="torchao",
|
quantization="torchao",
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
pt_load_map_location="cuda:0",
|
pt_load_map_location="cuda:0",
|
||||||
|
enforce_eager=True,
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -90,7 +93,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
|
|||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
pt_load_map_location="cuda:0",
|
pt_load_map_location="cuda:0",
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -122,8 +125,9 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner):
|
|||||||
pt_load_map_location="cuda:0",
|
pt_load_map_location="cuda:0",
|
||||||
quantization="torchao",
|
quantization="torchao",
|
||||||
hf_overrides=hf_overrides,
|
hf_overrides=hf_overrides,
|
||||||
|
enforce_eager=True,
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -156,8 +160,9 @@ def test_on_the_fly_quant_config_file(vllm_runner):
|
|||||||
pt_load_map_location="cuda:0",
|
pt_load_map_location="cuda:0",
|
||||||
quantization="torchao",
|
quantization="torchao",
|
||||||
hf_overrides=hf_overrides,
|
hf_overrides=hf_overrides,
|
||||||
|
enforce_eager=True,
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -228,7 +233,7 @@ def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_
|
|||||||
"torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors"
|
"torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors"
|
||||||
)
|
)
|
||||||
with vllm_runner(model_name=model_name, dtype="bfloat16") as llm:
|
with vllm_runner(model_name=model_name, dtype="bfloat16") as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
@ -245,7 +250,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
|
|||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
|
model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
|
||||||
) as llm:
|
) as llm:
|
||||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||||
|
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user