diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index e7d902ed26aaa..31b65189b5ec3 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): "neuralmagic/Llama-3.2-1B-quantized.w8a8", ], ) -@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("num_logprobs", [10]) @pytest.mark.parametrize( "use_aiter", [True, False] if current_platform.is_rocm() else [False] @@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs( example_prompts, max_tokens, num_logprobs ) - with vllm_runner(model_path, dtype=dtype) as vllm_model: + with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs ) diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index a3fb4a6953474..1591ce1c4f5ad 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -19,8 +19,8 @@ def test_cpu_offload_fp8(): # Test loading a quantized checkpoint compare_two_settings( "neuralmagic/Qwen2-1.5B-Instruct-FP8", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch): # Test GPTQ Marlin compare_two_settings( "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch): # Test AWQ Marlin compare_two_settings( "Qwen/Qwen2-1.5B-Instruct-AWQ", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch): # Test wNa16 compare_two_settings( "nm-testing/tinyllama-oneshot-w4a16-channel-v2", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index 2a72f734e431b..b992e976ac308 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"] ) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize("max_tokens", [4]) def test_model_experts_int8_startup( hf_runner, vllm_runner, @@ -33,5 +33,7 @@ def test_model_experts_int8_startup( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_transformers_version(on_fail="skip") - with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model: + with vllm_runner( + model, dtype=dtype, enforce_eager=True, quantization="experts_int8" + ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index f02da2996ffea..7bcac9ad768e7 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -45,10 +45,10 @@ def test_model_load_and_run( if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") - with vllm_runner(model_id) as llm: + with vllm_runner(model_id, enforce_eager=True) as llm: # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy - outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) + outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4) print(outputs[0][1]) @@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run( # `LLM.apply_model` requires pickling a function. monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: + with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm: def check_model(model): attn = model.model.layers[0].self_attn.attn @@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run( # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy - outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) + outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4) print(outputs[0][1]) @@ -142,7 +142,10 @@ def test_load_fp16_model( monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") with vllm_runner( - "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype + "facebook/opt-125m", + quantization="fp8", + enforce_eager=True, + kv_cache_dtype=kv_cache_dtype, ) as llm: def check_model(model): diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index ae9b1df3377dc..4f3c52df6c283 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -26,7 +26,7 @@ DTYPE = ["bfloat16"] @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", DTYPE) def test_ipex_quant(vllm_runner, model, dtype): - with vllm_runner(model, dtype=dtype) as llm: - output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm: + output = llm.generate_greedy(["The capital of France is"], max_tokens=4) assert output print(output) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index f009a4cfb870d..d92dfaa2cc7b5 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -49,4 +49,4 @@ def test_lm_head( vllm_model.apply_model(check_model) - print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1]) + print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1]) diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index 8abf65d29784d..0298994c396f6 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -88,6 +88,6 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner): llm.apply_model(check_model) # Run a simple generation test to ensure the model works - output = llm.generate_greedy(["Hello my name is"], max_tokens=20) + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) assert output print(f"ModelOpt FP8 output: {output}") diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index e8ea4148585bf..61efd2ce66c71 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: "facebook/opt-125m", dtype=dtype, quantization="ptpc_fp8", + enforce_eager=True, kv_cache_dtype=kv_cache_dtype, ) except AssertionError as e: @@ -65,5 +66,5 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: llm.apply_model(check_model) - output = llm.generate_greedy("Hello my name is", max_tokens=20) + output = llm.generate_greedy("Hello my name is", max_tokens=4) assert output diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 8da048703df93..a09856c78559a 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import ( get_quantization_config, register_quantization_config, ) -from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 - QuantizationConfig, +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, # noqa: E501 ) @@ -142,5 +142,5 @@ def test_custom_quant(vllm_runner, model, monkeypatch): llm.apply_model(check_model) - output = llm.generate_greedy("Hello my name is", max_tokens=20) + output = llm.generate_greedy("Hello my name is", max_tokens=1) assert output diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 82413f36e997f..fb8d6130c3779 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -392,7 +392,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant( assert not has_int4_preshuffled_tensor assert weight_attrs == [False, 1, 0, True] - output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + output = llm.generate_greedy(["The capital of France is"], max_tokens=4) assert output