Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 01:35:01 +08:00)
[Misc] Clean up mark to fork process in BNB tests (#20692)

Signed-off-by: Isotr0py <2037008807@qq.com>

commit 77f77a951e (parent 1a4f35e2ea)
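This change drops the standalone @create_new_process_for_each_test() mark from the single-GPU BNB tests and, for the tensor-parallel and pipeline-parallel tests, replaces the hand-stacked GPU-count skipif plus fork mark with the single bundled @multi_gpu_test(num_gpus=2) mark. As a rough sketch of the idea, such a mark can be composed from exactly the two decorators this diff deletes. This is illustrative only, not vLLM's actual helper; the stand-in fork helper and its multiprocessing details below are assumptions.

    # Illustrative sketch: assumes multi_gpu_test merely bundles the two marks
    # this diff deletes. vLLM's real helper may be implemented differently.
    import functools
    import multiprocessing

    import pytest
    import torch


    def create_new_process_for_each_test():
        """Stand-in (assumed) fork helper: run the test body in a freshly
        forked process so CUDA state cannot leak between tests."""

        def decorator(test_fn):

            @functools.wraps(test_fn)
            def forked(*args, **kwargs):
                # Fork a child, run the test there, and fail loudly if it dies.
                ctx = multiprocessing.get_context("fork")
                proc = ctx.Process(target=test_fn, args=args, kwargs=kwargs)
                proc.start()
                proc.join()
                assert proc.exitcode == 0, (
                    f"forked test exited with code {proc.exitcode}")

            return forked

        return decorator


    def multi_gpu_test(*, num_gpus: int = 2):
        """Bundle the GPU-count skipif and the fork-per-test mark that the
        multi-GPU tests in this file previously stacked by hand."""

        def decorator(test_fn):
            forked = create_new_process_for_each_test()(test_fn)
            return pytest.mark.skipif(
                torch.cuda.device_count() < num_gpus,
                reason=f'Test requires at least {num_gpus} GPUs.')(forked)

        return decorator

The diff applies @multi_gpu_test as the innermost decorator, in the same position the old fork mark occupied, so the outer bitsandbytes skipif and parametrize marks are unchanged.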
@@ -13,8 +13,8 @@ from transformers import BitsAndBytesConfig
 
 from tests.quantization.utils import is_quant_method_supported
 
-from ..models.utils import check_embeddings_close
-from ..utils import compare_two_settings, create_new_process_for_each_test
+from ...utils import compare_two_settings, multi_gpu_test
+from ..utils import check_embeddings_close
 
 models_4bit_to_test = [
     ("facebook/opt-125m", "quantize opt model inflight"),
@@ -42,7 +42,6 @@ models_pre_quant_8bit_to_test = [
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
 def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
@@ -56,7 +55,6 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_qaunt_4bit_to_test)
-@create_new_process_for_each_test()
 def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                        model_name, description) -> None:
 
@@ -68,7 +66,6 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_quant_8bit_to_test)
-@create_new_process_for_each_test()
 def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
@@ -76,12 +73,10 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, True)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason='Test requires at least 2 GPUs.')
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=2)
 def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                 model_name, description) -> None:
 
@@ -96,12 +91,10 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              vllm_tp_size=2)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason='Test requires at least 2 GPUs.')
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=2)
 def test_load_pp_4bit_bnb_model(model_name, description) -> None:
     common_args = [
         "--disable-log-stats",
@@ -127,7 +120,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
 @pytest.mark.parametrize("model_name, description",
                          models_4bit_to_embedding_test)
 @pytest.mark.parametrize("dtype", ["half"])
-@create_new_process_for_each_test()
 def test_4bit_bnb_embedding_model(
     model_name,
     description,
@@ -146,6 +138,13 @@ def test_4bit_bnb_embedding_model(
     example_prompts = [str(s).strip() for s in example_prompts]
 
     # Inflight 4bit quantization
+    with vllm_runner(model_name,
+                     task="embed",
+                     dtype=dtype,
+                     gpu_memory_utilization=0.5,
+                     quantization="bitsandbytes") as vllm_model:
+        vllm_outputs = vllm_model.embed(example_prompts)
+
     hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
         load_in_4bit=True))
     with hf_runner(
@@ -156,12 +155,6 @@ def test_4bit_bnb_embedding_model(
     ) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
 
-    with vllm_runner(model_name,
-                     task="embed",
-                     dtype=dtype,
-                     gpu_memory_utilization=0.5,
-                     quantization="bitsandbytes") as vllm_model:
-        vllm_outputs = vllm_model.embed(example_prompts)
-
     check_embeddings_close(
         embeddings_0_lst=hf_outputs,
         embeddings_1_lst=vllm_outputs,
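Note on the last two hunks: with the fork mark removed from test_4bit_bnb_embedding_model, the vLLM and HF runners now share one process, and the inflight-quantization vLLM run moves ahead of the HF run. A plausible reading is that exiting the vllm_runner context manager tears down the engine and frees its GPU memory before the HF model is loaded.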