mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-06 03:55:42 +08:00
Consolidate Llama model usage in tests (#13094)
This commit is contained in:
parent
40932d7a05
commit
f2b20fe491
@ -17,7 +17,7 @@ from ..utils import multi_gpu_test
|
|||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"google/gemma-2-2b-it",
|
"google/gemma-2-2b-it",
|
||||||
"meta-llama/Llama-3.2-1B",
|
"meta-llama/Llama-3.2-1B-Instruct",
|
||||||
]
|
]
|
||||||
|
|
||||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||||
@ -96,12 +96,12 @@ def test_models(
|
|||||||
"test_suite", [
|
"test_suite", [
|
||||||
("facebook/opt-125m", "ray", "", "L4"),
|
("facebook/opt-125m", "ray", "", "L4"),
|
||||||
("facebook/opt-125m", "mp", "", "L4"),
|
("facebook/opt-125m", "mp", "", "L4"),
|
||||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
|
||||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
|
||||||
("facebook/opt-125m", "ray", "", "A100"),
|
("facebook/opt-125m", "ray", "", "A100"),
|
||||||
("facebook/opt-125m", "mp", "", "A100"),
|
("facebook/opt-125m", "mp", "", "A100"),
|
||||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
|
||||||
])
|
])
|
||||||
def test_models_distributed(
|
def test_models_distributed(
|
||||||
hf_runner,
|
hf_runner,
|
||||||
@ -116,7 +116,7 @@ def test_models_distributed(
|
|||||||
if test_suite != TARGET_TEST_SUITE:
|
if test_suite != TARGET_TEST_SUITE:
|
||||||
pytest.skip(f"Skip test for {test_suite}")
|
pytest.skip(f"Skip test for {test_suite}")
|
||||||
|
|
||||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||||
# test ray adag
|
# test ray adag
|
||||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from ..utils import multi_gpu_test
|
|||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"facebook/opt-125m",
|
"facebook/opt-125m",
|
||||||
"meta-llama/Llama-3.2-1B",
|
"meta-llama/Llama-3.2-1B-Instruct",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@ -92,7 +92,7 @@ def test_models_distributed(
|
|||||||
) -> None:
|
) -> None:
|
||||||
override_backend_env_variable(monkeypatch, attention_backend)
|
override_backend_env_variable(monkeypatch, attention_backend)
|
||||||
|
|
||||||
if (model == "meta-llama/Llama-2-7b-hf"
|
if (model == "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
and distributed_executor_backend == "ray"):
|
and distributed_executor_backend == "ray"):
|
||||||
# test ray adag
|
# test ray adag
|
||||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||||
@ -221,7 +221,7 @@ def test_with_prefix_caching(
|
|||||||
Checks exact match decode with and without prefix caching
|
Checks exact match decode with and without prefix caching
|
||||||
with chunked prefill enabled.
|
with chunked prefill enabled.
|
||||||
"""
|
"""
|
||||||
model = "meta-llama/Llama-2-7b-chat-hf"
|
model = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
# The common prompt has 142 tokens with Llama-2 tokenizer.
|
# The common prompt had 142 tokens with the Llama-2 tokenizer; NOTE(review): count not re-verified for the Llama-3.2 tokenizer now used by this test.
|
||||||
common_prompt = "You are a helpful AI assistant " * 20
|
common_prompt = "You are a helpful AI assistant " * 20
|
||||||
unique_prompts = [
|
unique_prompts = [
|
||||||
|
|||||||
@ -4,5 +4,5 @@ from ..utils import compare_two_settings
|
|||||||
|
|
||||||
|
|
||||||
def test_cpu_offload():
|
def test_cpu_offload():
|
||||||
compare_two_settings("meta-llama/Llama-3.2-1B", [],
|
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
|
||||||
["--cpu-offload-gb", "1"])
|
["--cpu-offload-gb", "1"])
|
||||||
|
|||||||
@ -118,7 +118,7 @@ def test_cumem_with_cudagraph():
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model",
|
"model",
|
||||||
[
|
[
|
||||||
"meta-llama/Llama-3.2-1B", # sleep mode with safetensors
|
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
|
||||||
"facebook/opt-125m" # sleep mode with pytorch checkpoint
|
"facebook/opt-125m" # sleep mode with pytorch checkpoint
|
||||||
])
|
])
|
||||||
def test_end_to_end(model):
|
def test_end_to_end(model):
|
||||||
|
|||||||
@ -26,7 +26,7 @@ class TestSetting:
|
|||||||
test_settings = [
|
test_settings = [
|
||||||
# basic llama model
|
# basic llama model
|
||||||
TestSetting(
|
TestSetting(
|
||||||
model="meta-llama/Llama-3.2-1B",
|
model="meta-llama/Llama-3.2-1B-Instruct",
|
||||||
model_args=[],
|
model_args=[],
|
||||||
pp_size=2,
|
pp_size=2,
|
||||||
tp_size=2,
|
tp_size=2,
|
||||||
|
|||||||
@ -6,7 +6,6 @@ import torch
|
|||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.config import CompilationLevel
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
TEST_MODELS = [
|
TEST_MODELS = [
|
||||||
@ -15,14 +14,14 @@ TEST_MODELS = [
|
|||||||
"dtype": torch.float16,
|
"dtype": torch.float16,
|
||||||
"quantization": "compressed-tensors"
|
"quantization": "compressed-tensors"
|
||||||
}),
|
}),
|
||||||
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
|
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
|
||||||
"dtype": torch.float16,
|
"dtype": torch.float16,
|
||||||
"quantization": "fp8"
|
|
||||||
}),
|
|
||||||
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
|
|
||||||
"quantization": "compressed-tensors"
|
"quantization": "compressed-tensors"
|
||||||
}),
|
}),
|
||||||
("meta-llama/Meta-Llama-3-8B", {}),
|
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
|
||||||
|
"quantization": "compressed-tensors"
|
||||||
|
}),
|
||||||
|
("meta-llama/Llama-3.2-1B-Instruct", {}),
|
||||||
]
|
]
|
||||||
|
|
||||||
if is_quant_method_supported("aqlm"):
|
if is_quant_method_supported("aqlm"):
|
||||||
@ -69,11 +68,6 @@ def check_full_graph_support(model,
|
|||||||
# make sure these models can be captured in full graph mode
|
# make sure these models can be captured in full graph mode
|
||||||
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
|
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
|
||||||
|
|
||||||
# The base meta llama uses too much memory.
|
|
||||||
if (model == "meta-llama/Meta-Llama-3-8B"
|
|
||||||
and optimization_level >= CompilationLevel.PIECEWISE):
|
|
||||||
return
|
|
||||||
|
|
||||||
print(f"MODEL={model}")
|
print(f"MODEL={model}")
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
|
|||||||
@ -162,7 +162,7 @@ TEXT_GENERATION_MODELS = {
|
|||||||
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
|
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
|
||||||
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
|
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
|
||||||
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
|
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
|
||||||
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
|
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
|
||||||
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
|
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
|
||||||
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
|
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
|
||||||
# Uses Llama
|
# Uses Llama
|
||||||
@ -230,7 +230,7 @@ MULTIMODAL_MODELS = {
|
|||||||
TEST_MODELS = [
|
TEST_MODELS = [
|
||||||
# [LANGUAGE GENERATION]
|
# [LANGUAGE GENERATION]
|
||||||
"microsoft/Phi-3.5-MoE-instruct",
|
"microsoft/Phi-3.5-MoE-instruct",
|
||||||
"meta-llama/Meta-Llama-3-8B",
|
"meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"ibm/PowerLM-3b",
|
"ibm/PowerLM-3b",
|
||||||
# [LANGUAGE EMBEDDING]
|
# [LANGUAGE EMBEDDING]
|
||||||
"intfloat/e5-mistral-7b-instruct",
|
"intfloat/e5-mistral-7b-instruct",
|
||||||
|
|||||||
@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
|
|||||||
OpenAIServingModels)
|
OpenAIServingModels)
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
|
||||||
MODEL_NAME = "meta-llama/Llama-2-7b"
|
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
|
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
|
||||||
LORA_LOADING_SUCCESS_MESSAGE = (
|
LORA_LOADING_SUCCESS_MESSAGE = (
|
||||||
"Success: LoRA adapter '{lora_name}' added successfully.")
|
"Success: LoRA adapter '{lora_name}' added successfully.")
|
||||||
|
|||||||
@ -5,7 +5,7 @@ import pytest
|
|||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
MODEL_NAME = "meta-llama/Llama-3.2-1B"
|
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
@ -28,7 +28,7 @@ def setup_servers():
|
|||||||
"-m",
|
"-m",
|
||||||
"vllm.entrypoints.openai.api_server",
|
"vllm.entrypoints.openai.api_server",
|
||||||
"--model",
|
"--model",
|
||||||
"meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"--port",
|
"--port",
|
||||||
"8100",
|
"8100",
|
||||||
"--gpu-memory-utilization",
|
"--gpu-memory-utilization",
|
||||||
@ -49,7 +49,7 @@ def setup_servers():
|
|||||||
"-m",
|
"-m",
|
||||||
"vllm.entrypoints.openai.api_server",
|
"vllm.entrypoints.openai.api_server",
|
||||||
"--model",
|
"--model",
|
||||||
"meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"--port",
|
"--port",
|
||||||
"8200",
|
"8200",
|
||||||
"--gpu-memory-utilization",
|
"--gpu-memory-utilization",
|
||||||
@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
|
|||||||
response = requests.post("http://localhost:8100/v1/completions",
|
response = requests.post("http://localhost:8100/v1/completions",
|
||||||
headers={"Content-Type": "application/json"},
|
headers={"Content-Type": "application/json"},
|
||||||
json={
|
json={
|
||||||
"model":
|
"model": "meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"max_tokens": 1,
|
"max_tokens": 1,
|
||||||
"temperature": 0
|
"temperature": 0
|
||||||
@ -112,8 +111,7 @@ def test_disaggregated_prefilling(prompt):
|
|||||||
response = requests.post("http://localhost:8200/v1/completions",
|
response = requests.post("http://localhost:8200/v1/completions",
|
||||||
headers={"Content-Type": "application/json"},
|
headers={"Content-Type": "application/json"},
|
||||||
json={
|
json={
|
||||||
"model":
|
"model": "meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"max_tokens": 10,
|
"max_tokens": 10,
|
||||||
"temperature": 0
|
"temperature": 0
|
||||||
|
|||||||
@ -26,12 +26,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
|||||||
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
|
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
|
||||||
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
|
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
|
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
|
||||||
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
|
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
|
||||||
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
|
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"meta-llama/Llama-3.2-1B-Instruct"),
|
"meta-llama/Llama-3.2-1B-Instruct"),
|
||||||
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
|
# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
|
||||||
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
|
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"meta-llama/Llama-2-7b-chat-hf")
|
"meta-llama/Llama-3.2-1B-Instruct")
|
||||||
])
|
])
|
||||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||||
@pytest.mark.parametrize("max_tokens", [4])
|
@pytest.mark.parametrize("max_tokens", [4])
|
||||||
|
|||||||
@ -141,7 +141,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
|
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
|
||||||
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
|
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
|
||||||
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
|
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
|
||||||
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
|
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
|
||||||
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
|
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
|
||||||
is_available_online=False),
|
is_available_online=False),
|
||||||
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
|
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
|
||||||
|
|||||||
@ -99,7 +99,7 @@ def test_register_quantization_config():
|
|||||||
|
|
||||||
@pytest.mark.parametrize(argnames="model",
|
@pytest.mark.parametrize(argnames="model",
|
||||||
argvalues=[
|
argvalues=[
|
||||||
"meta-llama/Meta-Llama-3-8B-Instruct",
|
"meta-llama/Llama-3.2-1B-Instruct",
|
||||||
])
|
])
|
||||||
def test_custom_quant(vllm_runner, model):
|
def test_custom_quant(vllm_runner, model):
|
||||||
"""Test infer with the custom quantization method."""
|
"""Test infer with the custom quantization method."""
|
||||||
|
|||||||
@ -10,7 +10,7 @@ from vllm import SamplingParams
|
|||||||
|
|
||||||
# We also test with llama because it has generation_config to specify EOS
|
# We also test with llama because it has generation_config to specify EOS
|
||||||
# (past regression).
|
# (past regression).
|
||||||
MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
|
MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
|
|||||||
@ -8,7 +8,7 @@ from .conftest import get_output_from_llm_generator
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("common_llm_kwargs", [{
|
@pytest.mark.parametrize("common_llm_kwargs", [{
|
||||||
"model": "meta-llama/Llama-2-7b-chat-hf",
|
"model": "meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"speculative_model": "JackFram/llama-68m",
|
"speculative_model": "JackFram/llama-68m",
|
||||||
"num_speculative_tokens": 5,
|
"num_speculative_tokens": 5,
|
||||||
}])
|
}])
|
||||||
@ -27,8 +27,8 @@ from .conftest import get_output_from_llm_generator
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
# Speculative max model len > target max model len should raise.
|
# Speculative max model len > target max model len should raise.
|
||||||
# https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
|
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
|
||||||
"speculative_max_model_len": 4096 + 1,
|
"speculative_max_model_len": 131072 + 1,
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||||
|
|||||||
@ -251,7 +251,7 @@ def test_rope_customization():
|
|||||||
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
|
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
|
||||||
("facebook/opt-125m", False),
|
("facebook/opt-125m", False),
|
||||||
("facebook/bart-base", True),
|
("facebook/bart-base", True),
|
||||||
("meta-llama/Llama-3.2-1B", False),
|
("meta-llama/Llama-3.2-1B-Instruct", False),
|
||||||
("meta-llama/Llama-3.2-11B-Vision", True),
|
("meta-llama/Llama-3.2-11B-Vision", True),
|
||||||
])
|
])
|
||||||
def test_is_encoder_decoder(model_id, is_encoder_decoder):
|
def test_is_encoder_decoder(model_id, is_encoder_decoder):
|
||||||
|
|||||||
@ -46,9 +46,9 @@ def test_filter_subtensors():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def llama_2_7b_files():
|
def llama_3p2_1b_files():
|
||||||
with TemporaryDirectory() as cache_dir:
|
with TemporaryDirectory() as cache_dir:
|
||||||
input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
|
input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
|
||||||
cache_dir=cache_dir,
|
cache_dir=cache_dir,
|
||||||
ignore_patterns=["*.bin*", "original/*"])
|
ignore_patterns=["*.bin*", "original/*"])
|
||||||
|
|
||||||
@ -81,13 +81,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
|
|||||||
@pytest.mark.parametrize("enable_lora", [False, True])
|
@pytest.mark.parametrize("enable_lora", [False, True])
|
||||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||||
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
|
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
|
||||||
llama_2_7b_files):
|
llama_3p2_1b_files):
|
||||||
if num_gpus_available < tp_size:
|
if num_gpus_available < tp_size:
|
||||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||||
|
|
||||||
weights_patterns = ("*.safetensors", )
|
weights_patterns = ("*.safetensors", )
|
||||||
gpu_memory_utilization = 0.8
|
gpu_memory_utilization = 0.8
|
||||||
input_dir = llama_2_7b_files
|
input_dir = llama_3p2_1b_files
|
||||||
ctx = mp.get_context("spawn")
|
ctx = mp.get_context("spawn")
|
||||||
|
|
||||||
# Run in separate processes for memory & CUDA isolation
|
# Run in separate processes for memory & CUDA isolation
|
||||||
|
|||||||
@ -31,7 +31,7 @@ TOKENIZERS = [
|
|||||||
"bigscience/bloom-560m",
|
"bigscience/bloom-560m",
|
||||||
"mosaicml/mpt-7b",
|
"mosaicml/mpt-7b",
|
||||||
"tiiuae/falcon-7b",
|
"tiiuae/falcon-7b",
|
||||||
"meta-llama/Llama-2-7b-hf",
|
"meta-llama/Llama-3.2-1B-Instruct",
|
||||||
"codellama/CodeLlama-7b-hf",
|
"codellama/CodeLlama-7b-hf",
|
||||||
"mistralai/Pixtral-12B-2409",
|
"mistralai/Pixtral-12B-2409",
|
||||||
]
|
]
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
|
|||||||
|
|
||||||
|
|
||||||
def test_get_llama3_eos_token():
|
def test_get_llama3_eos_token():
|
||||||
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
|
model_name = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
|
|
||||||
tokenizer = get_tokenizer(model_name)
|
tokenizer = get_tokenizer(model_name)
|
||||||
assert tokenizer.eos_token_id == 128009
|
assert tokenizer.eos_token_id == 128009
|
||||||
@ -17,7 +17,7 @@ def test_get_llama3_eos_token():
|
|||||||
generation_config = try_get_generation_config(model_name,
|
generation_config = try_get_generation_config(model_name,
|
||||||
trust_remote_code=False)
|
trust_remote_code=False)
|
||||||
assert generation_config is not None
|
assert generation_config is not None
|
||||||
assert generation_config.eos_token_id == [128001, 128009]
|
assert generation_config.eos_token_id == [128001, 128008, 128009]
|
||||||
|
|
||||||
|
|
||||||
def test_get_blip2_eos_token():
|
def test_get_blip2_eos_token():
|
||||||
|
|||||||
@ -17,7 +17,7 @@ if not current_platform.is_cuda():
|
|||||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||||
allow_module_level=True)
|
allow_module_level=True)
|
||||||
|
|
||||||
ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
|
ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
disable_log_requests=True)
|
disable_log_requests=True)
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@ from vllm import SamplingParams
|
|||||||
|
|
||||||
from ...conftest import VllmRunner
|
from ...conftest import VllmRunner
|
||||||
|
|
||||||
MODEL = "meta-llama/Llama-3.2-1B"
|
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
DTYPE = "half"
|
DTYPE = "half"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@ RTOL = 0.03
|
|||||||
EXPECTED_VALUE = 0.62
|
EXPECTED_VALUE = 0.62
|
||||||
|
|
||||||
# FIXME(rob): enable prefix caching once supported.
|
# FIXME(rob): enable prefix caching once supported.
|
||||||
MODEL = "meta-llama/Llama-3.2-1B"
|
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
|
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
|
||||||
SERVER_ARGS = [
|
SERVER_ARGS = [
|
||||||
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
|
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user