mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 23:15:31 +08:00
[Misc] Clean up mark to fork process in BNB tests (#20692)
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
parent
1a4f35e2ea
commit
77f77a951e
@ -13,8 +13,8 @@ from transformers import BitsAndBytesConfig
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..models.utils import check_embeddings_close
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
from ...utils import compare_two_settings, multi_gpu_test
|
||||
from ..utils import check_embeddings_close
|
||||
|
||||
models_4bit_to_test = [
|
||||
("facebook/opt-125m", "quantize opt model inflight"),
|
||||
@ -42,7 +42,6 @@ models_pre_quant_8bit_to_test = [
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -56,7 +55,6 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_qaunt_4bit_to_test)
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -68,7 +66,6 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_quant_8bit_to_test)
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -76,12 +73,10 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, True)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason='Test requires at least 2 GPUs.')
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@create_new_process_for_each_test()
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -96,12 +91,10 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
vllm_tp_size=2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason='Test requires at least 2 GPUs.')
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@create_new_process_for_each_test()
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||
common_args = [
|
||||
"--disable-log-stats",
|
||||
@ -127,7 +120,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_4bit_to_embedding_test)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@create_new_process_for_each_test()
|
||||
def test_4bit_bnb_embedding_model(
|
||||
model_name,
|
||||
description,
|
||||
@ -146,6 +138,13 @@ def test_4bit_bnb_embedding_model(
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
# Inflight 4bit quantization
|
||||
with vllm_runner(model_name,
|
||||
task="embed",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes") as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True))
|
||||
with hf_runner(
|
||||
@ -156,12 +155,6 @@ def test_4bit_bnb_embedding_model(
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model_name,
|
||||
task="embed",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes") as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
Loading…
x
Reference in New Issue
Block a user