mirror of https://git.datalinker.icu/vllm-project/vllm.git
[ci][test] add correctness test for cpu offloading (#6549)
This commit is contained in:
parent 2d4733ba2d
commit f53b8f0d05
@@ -46,6 +46,7 @@ steps:
   commands:
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
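The added command runs the new CPU-offload test alongside the existing basic-correctness suite in CI. A hedged sketch of reproducing that step locally through pytest's Python entry point (the path is assumed to be relative to the vLLM tests/ directory; this snippet is illustrative and not part of the commit):

# Illustrative only (not part of this commit): run the new CI step locally.
# Assumes the working directory is the vLLM `tests/` directory.
import pytest

if __name__ == "__main__":
    # Mirrors the pipeline command `pytest -v -s basic_correctness/test_cpu_offload.py`.
    raise SystemExit(pytest.main(["-v", "-s", "basic_correctness/test_cpu_offload.py"]))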
tests/basic_correctness/test_cpu_offload.py — Normal file (8 lines added)
@@ -0,0 +1,8 @@
+from ..utils import compare_two_settings
+
+
+def test_cpu_offload():
+    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
+                         ["--cpu-offload-gb", "4"])
+    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+                         [], ["--cpu-offload-gb", "1"])
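The new test starts the OpenAI-compatible server twice per model, once with no extra arguments and once with --cpu-offload-gb, which offloads a portion of the model weights to CPU memory, and asserts that both settings produce identical API results. As a minimal sketch, the same setting can also be exercised through the offline entry point, assuming the cpu_offload_gb engine argument that the CLI flag maps to (illustrative only, not part of this commit):

# Minimal sketch (not part of this commit): greedy generation with CPU offload
# via the offline API. Assumes `cpu_offload_gb` is accepted as an engine
# argument by vllm.LLM, mirroring the `--cpu-offload-gb` server flag.
from vllm import LLM, SamplingParams

def generate_with_offload(model: str, offload_gb: float) -> str:
    llm = LLM(model=model, cpu_offload_gb=offload_gb)
    params = SamplingParams(max_tokens=5, temperature=0.0)
    outputs = llm.generate(["Hello, my name is"], params)
    return outputs[0].outputs[0].text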
@@ -1,7 +1,6 @@
 import pytest
-from transformers import AutoTokenizer
 
-from ..utils import RemoteOpenAIServer
+from ..utils import compare_two_settings
 
 
 @pytest.mark.parametrize(
@@ -13,7 +12,6 @@ from ..utils import RemoteOpenAIServer
     (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"),
 ])
 def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
     pp_args = [
         # use half precision for speed and memory savings in CI environment
@@ -48,85 +46,4 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
         pp_args.append("--enforce-eager")
         tp_args.append("--enforce-eager")
 
-    prompt = "Hello, my name is"
-    token_ids = tokenizer(prompt)["input_ids"]
-    results = []
-    for args in (pp_args, tp_args):
-        with RemoteOpenAIServer(MODEL_NAME, args) as server:
-            client = server.get_client()
-
-            # test models list
-            models = client.models.list()
-            models = models.data
-            served_model = models[0]
-            results.append({
-                "test": "models_list",
-                "id": served_model.id,
-                "root": served_model.root,
-            })
-
-            # test with text prompt
-            completion = client.completions.create(model=MODEL_NAME,
-                                                   prompt=prompt,
-                                                   max_tokens=5,
-                                                   temperature=0.0)
-
-            results.append({
-                "test": "single_completion",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test using token IDs
-            completion = client.completions.create(
-                model=MODEL_NAME,
-                prompt=token_ids,
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "token_ids",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test simple list
-            batch = client.completions.create(
-                model=MODEL_NAME,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "simple_list",
-                "text0": batch.choices[0].text,
-                "text1": batch.choices[1].text,
-            })
-
-            # test streaming
-            batch = client.completions.create(
-                model=MODEL_NAME,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-                stream=True,
-            )
-            texts = [""] * 2
-            for chunk in batch:
-                assert len(chunk.choices) == 1
-                choice = chunk.choices[0]
-                texts[choice.index] += choice.text
-            results.append({
-                "test": "streaming",
-                "texts": texts,
-            })
-
-    n = len(results) // 2
-    pp_results = results[:n]
-    tp_results = results[n:]
-    for pp, tp in zip(pp_results, tp_results):
-        assert pp == tp
+    compare_two_settings(MODEL_NAME, pp_args, tp_args)
@@ -10,6 +10,7 @@ from typing import Any, Dict, List
 import openai
 import ray
 import requests
+from transformers import AutoTokenizer
 
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -124,6 +125,99 @@ class RemoteOpenAIServer:
         )
 
 
+def compare_two_settings(model: str, arg1: List[str], arg2: List[str]):
+    """
+    Launch API server with two different sets of arguments and compare the
+    results of the API calls. The arguments are after the model name.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model)
+
+    prompt = "Hello, my name is"
+    token_ids = tokenizer(prompt)["input_ids"]
+    results = []
+    for args in (arg1, arg2):
+        with RemoteOpenAIServer(model, args) as server:
+            client = server.get_client()
+
+            # test models list
+            models = client.models.list()
+            models = models.data
+            served_model = models[0]
+            results.append({
+                "test": "models_list",
+                "id": served_model.id,
+                "root": served_model.root,
+            })
+
+            # test with text prompt
+            completion = client.completions.create(model=model,
+                                                   prompt=prompt,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+            results.append({
+                "test": "single_completion",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test using token IDs
+            completion = client.completions.create(
+                model=model,
+                prompt=token_ids,
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "token_ids",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test simple list
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "simple_list",
+                "text0": batch.choices[0].text,
+                "text1": batch.choices[1].text,
+            })
+
+            # test streaming
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+                stream=True,
+            )
+            texts = [""] * 2
+            for chunk in batch:
+                assert len(chunk.choices) == 1
+                choice = chunk.choices[0]
+                texts[choice.index] += choice.text
+            results.append({
+                "test": "streaming",
+                "texts": texts,
+            })
+
+    n = len(results) // 2
+    arg1_results = results[:n]
+    arg2_results = results[n:]
+    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
+        assert arg1_result == arg2_result, \
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+
+
 def init_test_distributed_environment(
     tp_size: int,
     pp_size: int,
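compare_two_settings launches the server once per argument list and records model listing, a text-prompt completion, a token-ID completion, a batched completion, and a streamed completion, then asserts that the two runs match pairwise. The helper is generic, so other tests can reuse it; a hypothetical caller, sketched for illustration only (not part of this commit):

# Hypothetical usage (not part of this commit): compare eager execution
# against the default configuration with the new helper.
from ..utils import compare_two_settings

def test_enforce_eager_matches_default():
    compare_two_settings("meta-llama/Meta-Llama-3-8B",
                         ["--enforce-eager"], [])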