mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-07 15:02:18 +08:00
[misc][ci] fix quant test (#8449)
This commit is contained in:
parent
06311e2956
commit
a2469127db
@@ -10,6 +10,8 @@ import torch
|
|||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
|
||||||
|
from ..utils import fork_new_process_for_each_test
|
||||||
|
|
||||||
models_4bit_to_test = [
|
models_4bit_to_test = [
|
||||||
('huggyllama/llama-7b', 'quantize model inflight'),
|
('huggyllama/llama-7b', 'quantize model inflight'),
|
||||||
]
|
]
|
||||||
@@ -29,6 +31,7 @@ models_pre_quant_8bit_to_test = [
|
|||||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||||
model_name, description) -> None:
|
model_name, description) -> None:
|
||||||
|
|
||||||
@@ -41,6 +44,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
|||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description",
|
@pytest.mark.parametrize("model_name, description",
|
||||||
models_pre_qaunt_4bit_to_test)
|
models_pre_qaunt_4bit_to_test)
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||||
model_name, description) -> None:
|
model_name, description) -> None:
|
||||||
|
|
||||||
@@ -52,6 +56,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
|||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description",
|
@pytest.mark.parametrize("model_name, description",
|
||||||
models_pre_quant_8bit_to_test)
|
models_pre_quant_8bit_to_test)
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||||
model_name, description) -> None:
|
model_name, description) -> None:
|
||||||
|
|
||||||
@@ -77,18 +82,8 @@ def validate_generated_texts(hf_runner,
|
|||||||
model_name,
|
model_name,
|
||||||
hf_model_kwargs=None):
|
hf_model_kwargs=None):
|
||||||
|
|
||||||
if hf_model_kwargs is None:
|
# NOTE: run vLLM first, as it requires a clean process
|
||||||
hf_model_kwargs = {}
|
# when using distributed inference
|
||||||
|
|
||||||
# Run with HF runner
|
|
||||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
|
||||||
hf_outputs = llm.generate_greedy(prompts, 8)
|
|
||||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
|
||||||
|
|
||||||
# Clean up the GPU memory for the next test
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
#Run with vLLM runner
|
#Run with vLLM runner
|
||||||
with vllm_runner(model_name,
|
with vllm_runner(model_name,
|
||||||
@@ -104,6 +99,19 @@ def validate_generated_texts(hf_runner,
|
|||||||
gc.collect()
|
gc.collect()
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
if hf_model_kwargs is None:
|
||||||
|
hf_model_kwargs = {}
|
||||||
|
|
||||||
|
# Run with HF runner
|
||||||
|
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||||
|
hf_outputs = llm.generate_greedy(prompts, 8)
|
||||||
|
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||||
|
|
||||||
|
# Clean up the GPU memory for the next test
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
gc.collect()
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
# Compare the generated strings
|
# Compare the generated strings
|
||||||
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
|
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
|
||||||
hf_str = hf_log["generated_text"]
|
hf_str = hf_log["generated_text"]
|
||||||
|
|||||||
@@ -1,12 +1,10 @@
|
|||||||
import torch
|
|
||||||
|
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|
||||||
def is_quant_method_supported(quant_method: str) -> bool:
|
def is_quant_method_supported(quant_method: str) -> bool:
|
||||||
# Currently, all quantization methods require Nvidia or AMD GPUs
|
# Currently, all quantization methods require Nvidia or AMD GPUs
|
||||||
if not torch.cuda.is_available():
|
if not (current_platform.is_cuda() or current_platform.is_rocm()):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
capability = current_platform.get_device_capability()
|
capability = current_platform.get_device_capability()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user