diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py
index 69632ae6cac70..9f5db82195012 100644
--- a/tests/quantization/test_auto_round.py
+++ b/tests/quantization/test_auto_round.py
@@ -26,7 +26,7 @@ MODELS = [
 )
 @pytest.mark.parametrize("model", MODELS)
 def test_auto_round(vllm_runner, model):
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
         assert output
         print(f"{output[0][1]}")
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 5aeb002238cf9..1040cf70eb81e 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -66,13 +66,6 @@ def enable_pickle(monkeypatch):
             2560,
             True,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-            "channel",
-            QuantizationType.INT,
-            2560,
-            True,
-        ),
         (
             "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
             "tensor",
@@ -138,7 +131,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
@@ -146,12 +139,9 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
     "model_path",
     [
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -211,7 +201,7 @@ def test_compressed_tensors_w8a8_logprobs(
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
@@ -219,15 +209,10 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
     "model_args",
     [
         ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
         (
             "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
             "channel",
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-            "channel",
-        ),
     ],
 )
 @pytest.mark.parametrize(
@@ -253,7 +238,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
         # this will enable VLLM_ROCM_USE_AITER_LINEAR
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    with vllm_runner(model_path, dtype=torch.float16) as llm:
+    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -268,7 +253,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
@@ -283,38 +268,6 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
             True,
             False,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-            "group",
-            128,
-            8,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-            "channel",
-            None,
-            4,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
-            "group",
-            128,
-            8,
-            False,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
-            "channel",
-            None,
-            8,
-            False,
-            False,
-        ),
         (
             "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
             "group",
@@ -330,7 +283,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
 )
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -348,7 +301,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
@@ -357,7 +310,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
 )
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -370,13 +323,13 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 def test_compressed_tensors_fp8(vllm_runner):
     model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -399,7 +352,7 @@ def test_compressed_tensors_fp8(vllm_runner):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
@@ -412,8 +365,8 @@ def test_compressed_tensors_fp8(vllm_runner):
 )
 def test_compressed_tensors_kv_cache(vllm_runner):
     model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
-        output = llm.generate_greedy("Hello world!", max_tokens=20)
+    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
@@ -465,7 +418,7 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="d
 )
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -476,7 +429,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -512,7 +465,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -528,7 +481,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -564,7 +517,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -580,7 +533,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -611,7 +564,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -622,7 +575,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -637,7 +590,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -656,7 +609,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -670,7 +623,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -689,7 +642,7 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -723,7 +676,7 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -758,7 +711,7 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
             assert proj.scheme.group_size == 128
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
@@ -792,7 +745,7 @@ def test_compressed_tensors_transforms_perplexity(
 
 def test_compressed_tensors_fp8_block_enabled(vllm_runner):
     model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
         fp8_dtype = current_platform.fp8_dtype()
 
         def check_model(model):
@@ -816,5 +769,5 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index 25d1dc59f6174..a3fb4a6953474 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -16,13 +16,6 @@ from ..utils import compare_two_settings
     reason="fp8 is not supported on this GPU type.",
 )
 def test_cpu_offload_fp8():
-    # Test quantization of an unquantized checkpoint
-    compare_two_settings(
-        "meta-llama/Llama-3.2-1B-Instruct",
-        ["--quantization", "fp8"],
-        ["--quantization", "fp8", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
     # Test loading a quantized checkpoint
     compare_two_settings(
         "neuralmagic/Qwen2-1.5B-Instruct-FP8",
@@ -46,13 +39,6 @@ def test_cpu_offload_gptq(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test GPTQ
-    compare_two_settings(
-        "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-        ["--quantization", "gptq"],
-        ["--quantization", "gptq", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
 
 
 @pytest.mark.skipif(
@@ -69,13 +55,6 @@ def test_cpu_offload_awq(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test AWQ
-    compare_two_settings(
-        "Qwen/Qwen2-1.5B-Instruct-AWQ",
-        ["--quantization", "awq"],
-        ["--quantization", "awq", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
 
 
 @pytest.mark.skipif(
@@ -92,17 +71,3 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test w4a16_marlin24
-    compare_two_settings(
-        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-        [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
-    # Test w8a8
-    compare_two_settings(
-        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-        [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 6b9a33059815f..7f863a169d5f9 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -18,7 +18,6 @@ from vllm.platforms import current_platform
 
 MODELS = [
     "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
     "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
 ]
 
@@ -49,8 +48,6 @@ def test_model_load_and_run(
 
 
 KV_CACHE_MODELS = [
-    # Deprecated AutoFP8 format using .kv_scale
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
     # AutoFP8 format using separate .k_scale and .v_scale
     "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
 ]
diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py
index c71f4b8156113..37fe2dd3243aa 100644
--- a/tests/quantization/test_gptq_dynamic.py
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -40,7 +40,9 @@ def test_gptq_with_dynamic(
         GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
     )
 
-    with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm:
+    with vllm_runner(
+        model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
+    ) as llm:
 
         def check_model(model):
             for name, submodule in model.named_modules():
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
index bae8b7f7d535b..f009a4cfb870d 100644
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -31,7 +31,9 @@ def test_lm_head(
 ) -> None:
     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model:
+    with vllm_runner(
+        model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
+    ) as vllm_model:
 
         def check_model(model):
             lm_head_layer = model.lm_head
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index 8875fdd1170aa..0af27aff9359d 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -56,7 +56,10 @@ def enable_pickle(monkeypatch):
 def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
     model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
     with vllm_runner(
-        model_path, kv_cache_dtype=kv_cache_dtype, tensor_parallel_size=tp
+        model_path,
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
+        tensor_parallel_size=tp,
     ) as llm:
 
         def check_model(model):
@@ -74,14 +77,14 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize("tp", [1])
 def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
     model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
-    with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+    with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -98,14 +101,14 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize("tp", [1])
 def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
     model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
-    with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+    with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -117,7 +120,7 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py
index 370625ed34792..195f1fbbdfc0c 100644
--- a/tests/quantization/test_rtn.py
+++ b/tests/quantization/test_rtn.py
@@ -10,7 +10,6 @@ import pytest
 from tests.quantization.utils import is_quant_method_supported
 
 MODELS = [
-    "microsoft/Phi-3-mini-4k-instruct",  # dense model
     "ai21labs/Jamba-tiny-dev",  # MoE model
 ]
@@ -30,5 +29,7 @@ def test_model_rtn_startup(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    with vllm_runner(model, dtype=dtype, quantization="rtn") as vllm_model:
+    with vllm_runner(
+        model, enforce_eager=True, dtype=dtype, quantization="rtn"
+    ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index bc24c51b57b28..cab198a2a15e2 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -19,7 +19,7 @@ def test_pre_quantized_model(vllm_runner):
         dtype="bfloat16",
         enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -39,8 +39,9 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_loca
         quantization="torchao",
         dtype="bfloat16",
         pt_load_map_location=pt_load_map_location,
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -54,8 +55,9 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
         quantization="torchao",
         dtype="bfloat16",
         pt_load_map_location="cuda:0",
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -69,8 +71,9 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
         quantization="torchao",
         dtype="bfloat16",
         pt_load_map_location="cuda:0",
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -90,7 +93,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
         dtype="bfloat16",
         pt_load_map_location="cuda:0",
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -122,8 +125,9 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner):
         pt_load_map_location="cuda:0",
         quantization="torchao",
         hf_overrides=hf_overrides,
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -156,8 +160,9 @@ def test_on_the_fly_quant_config_file(vllm_runner):
         pt_load_map_location="cuda:0",
         quantization="torchao",
         hf_overrides=hf_overrides,
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -228,7 +233,7 @@ def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_
         "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors"
     )
     with vllm_runner(model_name=model_name, dtype="bfloat16") as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output
@@ -245,7 +250,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
     with vllm_runner(
         model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
         assert output