mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 21:35:01 +08:00
[CI/Build] Add gpt-oss LoRA test (#27870)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
parent
3857eb8725
commit
0384aa7150
@ -441,7 +441,7 @@ steps:
|
||||
--ignore=lora/test_llm_with_multi_loras.py \
|
||||
--ignore=lora/test_olmoe_tp.py \
|
||||
--ignore=lora/test_deepseekv2_tp.py \
|
||||
--ignore=lora/test_gptoss.py \
|
||||
--ignore=lora/test_gptoss_tp.py \
|
||||
--ignore=lora/test_qwen3moe_tp.py
|
||||
parallelism: 4
|
||||
|
||||
@ -1217,6 +1217,8 @@ steps:
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
timeout_in_minutes: 45
|
||||
|
||||
@ -417,7 +417,7 @@ steps:
|
||||
--ignore=lora/test_llm_with_multi_loras.py \
|
||||
--ignore=lora/test_olmoe_tp.py \
|
||||
--ignore=lora/test_deepseekv2_tp.py \
|
||||
--ignore=lora/test_gptoss.py \
|
||||
--ignore=lora/test_gptoss_tp.py \
|
||||
--ignore=lora/test_qwen3moe_tp.py
|
||||
|
||||
parallelism: 4
|
||||
@ -1119,6 +1119,7 @@ steps:
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
|
||||
@ -237,7 +237,7 @@ def deepseekv2_lora_files():
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def gptoss20b_lora_files():
|
||||
return snapshot_download(repo_id="LevinZheng/gpt-oss-20b-lora-adapter")
|
||||
return snapshot_download(repo_id="jeeejeee/gpt-oss-20b-lora-adapter-text2sql")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
||||
@ -1,6 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# NOTE To avoid overloading the CI pipeline, this test script will
|
||||
# not be triggered on CI and is primarily intended for local testing
|
||||
# and verification.
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
|
||||
@ -1,52 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "openai/gpt-oss-20b"
|
||||
|
||||
PROMPT_TEMPLATE = "<|begin▁of▁sentence|>You are a helpful assistant.\n\nUser: {context}\n\nAssistant:" # noqa: E501
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(context="Who are you?"),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
# FIXME: Load gpt-oss adapter
|
||||
def test_gptoss20b_lora(gptoss20b_lora_files):
|
||||
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
|
||||
# Otherwise, the lora-test will fail due to CUDA OOM.
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
expected_lora_output = [
|
||||
"I am an AI language model developed by OpenAI. "
|
||||
"I am here to help you with any questions or "
|
||||
"tasks you may have."
|
||||
]
|
||||
|
||||
output1 = do_sample(llm, gptoss20b_lora_files, lora_id=1)
|
||||
print(output1)
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert output1[i].startswith(expected_lora_output[i])
|
||||
106
tests/lora/test_gptoss_tp.py
Normal file
106
tests/lora/test_gptoss_tp.py
Normal file
@ -0,0 +1,106 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODEL_PATH = "openai/gpt-oss-20b"
|
||||
|
||||
PROMPT_TEMPLATE = """<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
|
||||
Knowledge cutoff: 2024-06
|
||||
Current date: 2025-10-29
|
||||
|
||||
Reasoning: medium
|
||||
|
||||
# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
|
||||
"
|
||||
##Instruction:
|
||||
farm contains tables such as city, farm, farm_competition, competition_record. Table city has columns such as City_ID, Official_Name, Status, Area_km_2, Population, Census_Ranking. City_ID is the primary key.
|
||||
Table farm has columns such as Farm_ID, Year, Total_Horses, Working_Horses, Total_Cattle, Oxen, Bulls, Cows, Pigs, Sheep_and_Goats. Farm_ID is the primary key.
|
||||
Table farm_competition has columns such as Competition_ID, Year, Theme, Host_city_ID, Hosts. Competition_ID is the primary key.
|
||||
Table competition_record has columns such as Competition_ID, Farm_ID, Rank. Competition_ID is the primary key.
|
||||
The Host_city_ID of farm_competition is the foreign key of City_ID of city.
|
||||
The Farm_ID of competition_record is the foreign key of Farm_ID of farm.
|
||||
The Competition_ID of competition_record is the foreign key of Competition_ID of farm_competition.
|
||||
|
||||
|
||||
###Input:
|
||||
{context}
|
||||
|
||||
###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
"SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
|
||||
"SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
|
||||
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
|
||||
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
|
||||
]
|
||||
|
||||
|
||||
def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="What is the average number of working horses of farms with more than 5000 total number of horses?" # noqa: E501
|
||||
), # noqa: E501
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Give the average number of working horses on farms with more than 5000 total horses." # noqa: E501
|
||||
), # noqa: E501
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="What are the maximum and minimum number of cows across all farms."
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Return the maximum and minimum number of cows across all farms."
|
||||
),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
|
||||
|
||||
|
||||
def test_gpt_oss_lora(gptoss20b_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=8,
|
||||
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
|
||||
cudagraph_specialize_lora=False,
|
||||
),
|
||||
)
|
||||
|
||||
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
|
||||
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_gpt_oss_lora_tp2(gptoss20b_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=8,
|
||||
tensor_parallel_size=2,
|
||||
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
|
||||
cudagraph_specialize_lora=False,
|
||||
),
|
||||
)
|
||||
|
||||
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
|
||||
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
|
||||
@ -1,6 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
# NOTE To avoid overloading the CI pipeline, this test script will not
|
||||
# be triggered on CI and is primarily intended for local testing and verification.
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user