[CI Sprint] Quantization CI Cleanup (#24130)

Signed-off-by: Alex Yun <alexyun04@gmail.com>
This commit is contained in:
Alex 2025-11-18 08:21:48 -06:00 committed by GitHub
parent 184b12fdc6
commit f6aa122698
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 32 additions and 26 deletions

View File

@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
],
)
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False]
@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_path, dtype=dtype) as vllm_model:
with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)

View File

@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
# Test loading a quantized checkpoint
compare_two_settings(
"neuralmagic/Qwen2-1.5B-Instruct-FP8",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)
@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
# Test GPTQ Marlin
compare_two_settings(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)
@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
# Test AWQ Marlin
compare_two_settings(
"Qwen/Qwen2-1.5B-Instruct-AWQ",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)
@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
# Test wNa16
compare_two_settings(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)

View File

@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("max_tokens", [4])
def test_model_experts_int8_startup(
hf_runner,
vllm_runner,
@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
with vllm_runner(
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@ -45,10 +45,10 @@ def test_model_load_and_run(
if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner(model_id) as llm:
with vllm_runner(model_id, enforce_eager=True) as llm:
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
print(outputs[0][1])
@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(
# `LLM.apply_model` requires pickling a function.
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:
def check_model(model):
attn = model.model.layers[0].self_attn.attn
@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run(
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
print(outputs[0][1])
@ -142,7 +142,10 @@ def test_load_fp16_model(
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner(
"facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
"facebook/opt-125m",
quantization="fp8",
enforce_eager=True,
kv_cache_dtype=kv_cache_dtype,
) as llm:
def check_model(model):

View File

@ -26,7 +26,7 @@ DTYPE = ["bfloat16"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
with vllm_runner(model, dtype=dtype) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
assert output
print(output)

View File

@ -49,4 +49,4 @@ def test_lm_head(
vllm_model.apply_model(check_model)
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])

View File

@ -88,6 +88,6 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
llm.apply_model(check_model)
# Run a simple generation test to ensure the model works
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8 output: {output}")

View File

@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
"facebook/opt-125m",
dtype=dtype,
quantization="ptpc_fp8",
enforce_eager=True,
kv_cache_dtype=kv_cache_dtype,
)
except AssertionError as e:
@ -65,5 +66,5 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20)
output = llm.generate_greedy("Hello my name is", max_tokens=4)
assert output

View File

@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import (
get_quantization_config,
register_quantization_config,
)
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig,
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, # noqa: E501
)
@ -142,5 +142,5 @@ def test_custom_quant(vllm_runner, model, monkeypatch):
llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20)
output = llm.generate_greedy("Hello my name is", max_tokens=1)
assert output

View File

@ -392,7 +392,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
assert not has_int4_preshuffled_tensor
assert weight_attrs == [False, 1, 0, True]
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
assert output