[CI Sprint] Quantization CI Cleanup (#24130)
Signed-off-by: Alex Yun <alexyun04@gmail.com>
parent 184b12fdc6
commit f6aa122698
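Note: the hunks below all apply the same two CI speedups to the quantization tests: pass `enforce_eager=True` so each engine skips CUDA graph capture at startup, and shrink `max_tokens` so the greedy smoke generations finish sooner. A minimal standalone sketch of the resulting invocation pattern, using the public in-process `LLM` entry point rather than the test fixtures (the model name here is an illustrative stand-in):

    from vllm import LLM, SamplingParams

    # enforce_eager=True skips CUDA graph capture, trading some decode
    # throughput for much faster engine startup -- the right trade for CI.
    llm = LLM(model="facebook/opt-125m", enforce_eager=True)

    # Greedy decoding with a small token budget, mirroring
    # generate_greedy(..., max_tokens=4) in the tests.
    params = SamplingParams(temperature=0.0, max_tokens=4)
    outputs = llm.generate(["Hello my name is"], params)
    print(outputs[0].outputs[0].text)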
@@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
         example_prompts, max_tokens, num_logprobs
     )

-    with vllm_runner(model_path, dtype=dtype) as vllm_model:
+    with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs
         )
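Note: `generate_greedy_logprobs` in the hunk above returns per-token logprobs that the test compares against a HuggingFace baseline. A rough standalone equivalent with public vLLM APIs (prompt is illustrative; model name taken from the parametrize list above):

    from vllm import LLM, SamplingParams

    llm = LLM(model="neuralmagic/Llama-3.2-1B-quantized.w8a8", enforce_eager=True)

    # Request the top-10 logprobs per generated token, matching num_logprobs=10.
    params = SamplingParams(temperature=0.0, max_tokens=4, logprobs=10)
    out = llm.generate(["Hello my name is"], params)
    for step in out[0].outputs[0].logprobs:
        print(step)  # mapping of token id -> Logprob for each decode step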
@@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
     # Test loading a quantized checkpoint
     compare_two_settings(
         "neuralmagic/Qwen2-1.5B-Instruct-FP8",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
@@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
     # Test GPTQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
@@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
     # Test AWQ Marlin
     compare_two_settings(
         "Qwen/Qwen2-1.5B-Instruct-AWQ",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
     # Test wNa16
     compare_two_settings(
         "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-        [],
-        ["--cpu-offload-gb", "1"],
+        ["--enforce_eager"],
+        ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
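Note: the four `compare_two_settings` hunks make the same swap: the baseline argument list goes from empty to `["--enforce_eager"]`, so both the offload and non-offload servers run eagerly. The real helper launches two servers with the given CLI args and checks their completions agree; a simplified in-process sketch of that comparison idea (not vLLM's actual helper; `cpu_offload_gb` mirrors the `--cpu-offload-gb` flag):

    from vllm import LLM, SamplingParams

    def greedy_texts(**engine_kwargs):
        # Build an engine with the given overrides and run one greedy completion.
        llm = LLM(
            model="nm-testing/tinyllama-oneshot-w4a16-channel-v2",
            enforce_eager=True,  # the flag this commit adds to both settings
            **engine_kwargs,
        )
        params = SamplingParams(temperature=0.0, max_tokens=4)
        return [o.outputs[0].text for o in llm.generate(["Hello my name is"], params)]

    # Offloading 1 GiB of weights to CPU should not change greedy outputs.
    assert greedy_texts() == greedy_texts(cpu_offload_gb=1)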
@@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
 )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [4])
 def test_model_experts_int8_startup(
     hf_runner,
     vllm_runner,
@@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_transformers_version(on_fail="skip")

-    with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
+    with vllm_runner(
+        model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
+    ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
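Note: `experts_int8` quantizes MoE expert weights to int8 at load time, which is why this test only checks that startup and a short greedy generation succeed. A standalone sketch of the same engine flags (model name taken from the MODELS list in the hunk header):

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="ai21labs/Jamba-tiny-random",  # from the MODELS list above
        dtype="bfloat16",
        enforce_eager=True,
        quantization="experts_int8",
    )
    out = llm.generate(["Hello my name is"], SamplingParams(temperature=0.0, max_tokens=4))
    print(out[0].outputs[0].text)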
@@ -45,10 +45,10 @@ def test_model_load_and_run(
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

-    with vllm_runner(model_id) as llm:
+    with vllm_runner(model_id, enforce_eager=True) as llm:
         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])

@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(

     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
+    with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:

         def check_model(model):
             attn = model.model.layers[0].self_attn.attn
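Note: as the comment in the hunk says, `LLM.apply_model` pickles a function and runs it inside the worker that owns the model, hence the `VLLM_ALLOW_INSECURE_SERIALIZATION` escape hatch. A sketch of that inspection pattern (model name is illustrative; the attribute path comes from the hunk):

    import os

    # Required because apply_model ships a pickled function to the workers.
    os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"

    from vllm import LLM

    llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8", enforce_eager=True)

    def check_model(model):
        # Same attribute path the test inspects: layer 0's attention module.
        attn = model.model.layers[0].self_attn.attn
        print(type(attn))

    llm.apply_model(check_model)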
@@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run(

         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
-        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         print(outputs[0][1])

@@ -142,7 +142,10 @@ def test_load_fp16_model(
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

     with vllm_runner(
-        "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
+        "facebook/opt-125m",
+        quantization="fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
     ) as llm:

         def check_model(model):
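Note: this test loads an unquantized fp16 checkpoint and quantizes it to FP8 on the fly, optionally with an FP8 KV cache. A minimal standalone version of the engine configuration from the hunk (prompt is illustrative):

    from vllm import LLM, SamplingParams

    # Online FP8 quantization of an fp16 checkpoint, plus an FP8 KV cache.
    llm = LLM(
        model="facebook/opt-125m",
        quantization="fp8",
        enforce_eager=True,
        kv_cache_dtype="fp8",
    )
    out = llm.generate(["Hello my name is"], SamplingParams(temperature=0.0, max_tokens=4))
    print(out[0].outputs[0].text)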
@@ -26,7 +26,7 @@ DTYPE = ["bfloat16"]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", DTYPE)
 def test_ipex_quant(vllm_runner, model, dtype):
-    with vllm_runner(model, dtype=dtype) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
         print(output)
@@ -49,4 +49,4 @@ def test_lm_head(

     vllm_model.apply_model(check_model)

-    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
+    print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])
@@ -88,6 +88,6 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
         llm.apply_model(check_model)

         # Run a simple generation test to ensure the model works
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
         print(f"ModelOpt FP8 output: {output}")
@@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
             "facebook/opt-125m",
             dtype=dtype,
             quantization="ptpc_fp8",
+            enforce_eager=True,
             kv_cache_dtype=kv_cache_dtype,
         )
     except AssertionError as e:
@@ -65,5 +66,5 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:

         llm.apply_model(check_model)

-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import (
     get_quantization_config,
     register_quantization_config,
 )
-from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig,
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,  # noqa: E501
 )

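Note: the import reshuffle above only moves the `# noqa: E501` comment onto the line it silences. For context, these are the registry hooks the custom-quantization test exercises; a minimal sketch of the lookup side (the skeleton in the trailing comment is hypothetical and elides the abstract methods a real config must implement):

    from vllm.model_executor.layers.quantization import (
        get_quantization_config,
        register_quantization_config,
    )

    # get_quantization_config maps a registry name to its QuantizationConfig class.
    fp8_config_cls = get_quantization_config("fp8")
    print(fp8_config_cls)

    # register_quantization_config is the decorator the test uses to add a
    # "custom_quant" entry to the same registry, e.g.:
    #
    #     @register_quantization_config("custom_quant")
    #     class CustomQuantConfig(QuantizationConfig):
    #         ...  # implement the QuantizationConfig interface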
@@ -142,5 +142,5 @@ def test_custom_quant(vllm_runner, model, monkeypatch):

     llm.apply_model(check_model)

-    output = llm.generate_greedy("Hello my name is", max_tokens=20)
+    output = llm.generate_greedy("Hello my name is", max_tokens=1)
     assert output
@@ -392,7 +392,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
         assert not has_int4_preshuffled_tensor

         assert weight_attrs == [False, 1, 0, True]
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)

         assert output