mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-11 23:55:42 +08:00
[Core] Remove tokenizer group in vLLM (#24078)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
This commit is contained in:
parent
c15309a730
commit
6c47f6bfa4
@ -1,10 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from transformers import PreTrainedTokenizer
|
|
||||||
|
|
||||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||||
from vllm.inputs import token_inputs
|
from vllm.inputs import token_inputs
|
||||||
@ -54,10 +51,7 @@ def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
|
|||||||
- When the EOS token should be ignored, and the sequence continues
|
- When the EOS token should be ignored, and the sequence continues
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tokenizer = MagicMock(spec=PreTrainedTokenizer)
|
stop_checker = StopChecker(max_model_len=1024)
|
||||||
get_tokenizer_for_seq = MagicMock(return_value=tokenizer)
|
|
||||||
stop_checker = StopChecker(max_model_len=1024,
|
|
||||||
get_tokenizer_for_seq=get_tokenizer_for_seq)
|
|
||||||
|
|
||||||
seq = sequence_with_eos(
|
seq = sequence_with_eos(
|
||||||
text=text_wo_eos,
|
text=text_wo_eos,
|
||||||
|
|||||||
@ -58,16 +58,13 @@ def deepseek_r1_qwen_tokenizer():
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def stop_checker():
|
def stop_checker():
|
||||||
return StopChecker(max_model_len=10,
|
return StopChecker(max_model_len=10)
|
||||||
get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def stop_checker_with_reasoner():
|
def stop_checker_with_reasoner():
|
||||||
reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer)
|
reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer)
|
||||||
return StopChecker(max_model_len=10,
|
return StopChecker(max_model_len=10, reasoner=reasoner)
|
||||||
get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer,
|
|
||||||
reasoner=reasoner)
|
|
||||||
|
|
||||||
|
|
||||||
def test_eos_token_stopping(stop_checker):
|
def test_eos_token_stopping(stop_checker):
|
||||||
|
|||||||
@ -208,25 +208,3 @@ def zephyr_lora_files():
|
|||||||
"""Download zephyr LoRA files once per test session."""
|
"""Download zephyr LoRA files once per test session."""
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
|
return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def zephyr_lora_added_tokens_files(zephyr_lora_files):
|
|
||||||
"""Create zephyr LoRA files with added tokens once per test session."""
|
|
||||||
import shutil
|
|
||||||
from tempfile import TemporaryDirectory
|
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
|
||||||
|
|
||||||
tmp_dir = TemporaryDirectory()
|
|
||||||
tmp_model_dir = f"{tmp_dir.name}/zephyr"
|
|
||||||
shutil.copytree(zephyr_lora_files, tmp_model_dir)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
|
|
||||||
# Copy tokenizer to adapter and add some unique tokens
|
|
||||||
# 32000, 32001, 32002
|
|
||||||
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
|
|
||||||
special_tokens=True)
|
|
||||||
assert added == 3
|
|
||||||
tokenizer.save_pretrained(tmp_model_dir)
|
|
||||||
yield tmp_model_dir
|
|
||||||
tmp_dir.cleanup()
|
|
||||||
|
|||||||
@ -29,11 +29,7 @@ def monkeypatch_module():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module", params=[False, True])
|
@pytest.fixture(scope="module", params=[False, True])
|
||||||
def server(
|
def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811
|
||||||
request,
|
|
||||||
monkeypatch_module,
|
|
||||||
zephyr_lora_files, #noqa: F811
|
|
||||||
zephyr_lora_added_tokens_files): # noqa: F811
|
|
||||||
|
|
||||||
use_v1 = request.param
|
use_v1 = request.param
|
||||||
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
|
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
|
||||||
@ -49,7 +45,6 @@ def server(
|
|||||||
"--enable-lora",
|
"--enable-lora",
|
||||||
"--lora-modules",
|
"--lora-modules",
|
||||||
f"zephyr-lora={zephyr_lora_files}",
|
f"zephyr-lora={zephyr_lora_files}",
|
||||||
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
|
|
||||||
"--max-lora-rank",
|
"--max-lora-rank",
|
||||||
"64",
|
"64",
|
||||||
"--max-cpu-loras",
|
"--max-cpu-loras",
|
||||||
@ -79,7 +74,7 @@ async def client(server):
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
# first test base model, then test loras
|
# first test base model, then test loras
|
||||||
"model_name",
|
"model_name",
|
||||||
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
|
[MODEL_NAME, "zephyr-lora"],
|
||||||
)
|
)
|
||||||
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
|
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
|
||||||
messages = [{
|
messages = [{
|
||||||
|
|||||||
@ -27,7 +27,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
|
def default_server_args(zephyr_lora_files):
|
||||||
return [
|
return [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
@ -41,7 +41,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
|
|||||||
"--enable-lora",
|
"--enable-lora",
|
||||||
"--lora-modules",
|
"--lora-modules",
|
||||||
f"zephyr-lora={zephyr_lora_files}",
|
f"zephyr-lora={zephyr_lora_files}",
|
||||||
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
|
|
||||||
"--max-lora-rank",
|
"--max-lora-rank",
|
||||||
"64",
|
"64",
|
||||||
"--max-cpu-loras",
|
"--max-cpu-loras",
|
||||||
@ -87,7 +86,7 @@ async def client(server):
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
# first test base model, then test loras
|
# first test base model, then test loras
|
||||||
"model_name",
|
"model_name",
|
||||||
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
|
[MODEL_NAME, "zephyr-lora"],
|
||||||
)
|
)
|
||||||
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
|
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
|
||||||
completion = await client.completions.create(model=model_name,
|
completion = await client.completions.create(model=model_name,
|
||||||
@ -115,20 +114,6 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
|
|||||||
assert completion.choices[0].prompt_logprobs is None
|
assert completion.choices[0].prompt_logprobs is None
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
|
|
||||||
# test using token IDs
|
|
||||||
completion = await client.completions.create(
|
|
||||||
model="zephyr-lora2",
|
|
||||||
prompt=[0, 0, 32000, 32001, 32002],
|
|
||||||
echo=True,
|
|
||||||
max_tokens=5,
|
|
||||||
temperature=0.0,
|
|
||||||
)
|
|
||||||
# Added tokens should appear in tokenized prompt
|
|
||||||
assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
|
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
|
||||||
# test using token IDs
|
# test using token IDs
|
||||||
@ -147,7 +132,7 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
# first test base model, then test loras
|
# first test base model, then test loras
|
||||||
"model_name",
|
"model_name",
|
||||||
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
|
[MODEL_NAME, "zephyr-lora"],
|
||||||
)
|
)
|
||||||
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
|
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
|
||||||
# test using token IDs
|
# test using token IDs
|
||||||
@ -713,7 +698,7 @@ async def test_guided_grammar(client: openai.AsyncOpenAI,
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
# first test base model, then test loras
|
# first test base model, then test loras
|
||||||
"model_name",
|
"model_name",
|
||||||
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
|
[MODEL_NAME, "zephyr-lora"],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("logprobs_arg", [1, 0])
|
@pytest.mark.parametrize("logprobs_arg", [1, 0])
|
||||||
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
|
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
|
||||||
|
|||||||
@ -21,10 +21,7 @@ CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def default_server_args(
|
def default_server_args() -> list[str]:
|
||||||
zephyr_lora_files,
|
|
||||||
zephyr_lora_added_tokens_files,
|
|
||||||
) -> list[str]:
|
|
||||||
return [
|
return [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
|
|||||||
@ -67,12 +67,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
|
|||||||
"base_model_name": MODEL_NAME
|
"base_model_name": MODEL_NAME
|
||||||
}
|
}
|
||||||
|
|
||||||
lora_module_2 = {
|
|
||||||
"name": "zephyr-lora2",
|
|
||||||
"path": zephyr_lora_files,
|
|
||||||
"base_model_name": MODEL_NAME
|
|
||||||
}
|
|
||||||
|
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
@ -84,7 +78,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
|
|||||||
"--enable-lora",
|
"--enable-lora",
|
||||||
"--lora-modules",
|
"--lora-modules",
|
||||||
json.dumps(lora_module_1),
|
json.dumps(lora_module_1),
|
||||||
json.dumps(lora_module_2),
|
|
||||||
"--max-lora-rank",
|
"--max-lora-rank",
|
||||||
"64",
|
"64",
|
||||||
"--max-cpu-loras",
|
"--max-cpu-loras",
|
||||||
@ -121,7 +114,6 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI,
|
|||||||
for lora_model in lora_models)
|
for lora_model in lora_models)
|
||||||
assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
|
assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
|
||||||
assert lora_models[0].id == "zephyr-lora"
|
assert lora_models[0].id == "zephyr-lora"
|
||||||
assert lora_models[1].id == "zephyr-lora2"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
@ -26,7 +26,6 @@ def server(zephyr_lora_files):
|
|||||||
"--enable-lora",
|
"--enable-lora",
|
||||||
"--lora-modules",
|
"--lora-modules",
|
||||||
f"zephyr-lora={zephyr_lora_files}",
|
f"zephyr-lora={zephyr_lora_files}",
|
||||||
f"zephyr-lora2={zephyr_lora_files}",
|
|
||||||
"--max-lora-rank",
|
"--max-lora-rank",
|
||||||
"64",
|
"64",
|
||||||
"--max-cpu-loras",
|
"--max-cpu-loras",
|
||||||
@ -56,4 +55,3 @@ async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
|
|||||||
assert all(lora_model.root == zephyr_lora_files
|
assert all(lora_model.root == zephyr_lora_files
|
||||||
for lora_model in lora_models)
|
for lora_model in lora_models)
|
||||||
assert lora_models[0].id == "zephyr-lora"
|
assert lora_models[0].id == "zephyr-lora"
|
||||||
assert lora_models[1].id == "zephyr-lora2"
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server(zephyr_lora_added_tokens_files: str): # noqa: F811
|
def server():
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
@ -24,12 +24,6 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811
|
|||||||
"--enforce-eager",
|
"--enforce-eager",
|
||||||
"--max-num-seqs",
|
"--max-num-seqs",
|
||||||
"128",
|
"128",
|
||||||
# lora config
|
|
||||||
"--enable-lora",
|
|
||||||
"--lora-modules",
|
|
||||||
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
|
|
||||||
"--max-lora-rank",
|
|
||||||
"64",
|
|
||||||
"--enable-tokenizer-info-endpoint",
|
"--enable-tokenizer-info-endpoint",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -38,10 +32,8 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def tokenizer_name(model_name: str,
|
def tokenizer_name(model_name: str):
|
||||||
zephyr_lora_added_tokens_files: str): # noqa: F811
|
return model_name
|
||||||
return zephyr_lora_added_tokens_files if (
|
|
||||||
model_name == "zephyr-lora2") else model_name
|
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
@pytest_asyncio.fixture
|
||||||
@ -53,7 +45,7 @@ async def client(server):
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_name,tokenizer_name",
|
"model_name,tokenizer_name",
|
||||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
[(MODEL_NAME, MODEL_NAME)],
|
||||||
indirect=["tokenizer_name"],
|
indirect=["tokenizer_name"],
|
||||||
)
|
)
|
||||||
async def test_tokenize_completions(
|
async def test_tokenize_completions(
|
||||||
@ -86,7 +78,7 @@ async def test_tokenize_completions(
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_name,tokenizer_name",
|
"model_name,tokenizer_name",
|
||||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
[(MODEL_NAME, MODEL_NAME)],
|
||||||
indirect=["tokenizer_name"],
|
indirect=["tokenizer_name"],
|
||||||
)
|
)
|
||||||
async def test_tokenize_chat(
|
async def test_tokenize_chat(
|
||||||
@ -148,7 +140,7 @@ async def test_tokenize_chat(
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_name,tokenizer_name",
|
"model_name,tokenizer_name",
|
||||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
[(MODEL_NAME, MODEL_NAME)],
|
||||||
indirect=["tokenizer_name"],
|
indirect=["tokenizer_name"],
|
||||||
)
|
)
|
||||||
async def test_tokenize_chat_with_tools(
|
async def test_tokenize_chat_with_tools(
|
||||||
@ -225,7 +217,7 @@ async def test_tokenize_chat_with_tools(
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_name, tokenizer_name",
|
"model_name, tokenizer_name",
|
||||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
[(MODEL_NAME, MODEL_NAME)],
|
||||||
indirect=["tokenizer_name"],
|
indirect=["tokenizer_name"],
|
||||||
)
|
)
|
||||||
async def test_tokenize_with_return_token_strs(
|
async def test_tokenize_with_return_token_strs(
|
||||||
@ -260,7 +252,7 @@ async def test_tokenize_with_return_token_strs(
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_name,tokenizer_name",
|
"model_name,tokenizer_name",
|
||||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
[(MODEL_NAME, MODEL_NAME)],
|
||||||
indirect=["tokenizer_name"],
|
indirect=["tokenizer_name"],
|
||||||
)
|
)
|
||||||
async def test_detokenize(
|
async def test_detokenize(
|
||||||
@ -287,7 +279,7 @@ async def test_detokenize(
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_name,tokenizer_name",
|
"model_name,tokenizer_name",
|
||||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
[(MODEL_NAME, MODEL_NAME)],
|
||||||
indirect=["tokenizer_name"],
|
indirect=["tokenizer_name"],
|
||||||
)
|
)
|
||||||
async def test_tokenizer_info_basic(
|
async def test_tokenizer_info_basic(
|
||||||
|
|||||||
@ -18,6 +18,8 @@ SERVER_ARGS = [
|
|||||||
"--enable-lora",
|
"--enable-lora",
|
||||||
"--lora-modules",
|
"--lora-modules",
|
||||||
f"{LORA_MODEL}={LORA_MODEL}",
|
f"{LORA_MODEL}={LORA_MODEL}",
|
||||||
|
"--tokenizer",
|
||||||
|
f"{LORA_MODEL}",
|
||||||
]
|
]
|
||||||
|
|
||||||
TOOLS = [{
|
TOOLS = [{
|
||||||
|
|||||||
@ -23,7 +23,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
|
|||||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||||
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
|
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
|
||||||
encode_video_base64)
|
encode_video_base64)
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
|
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
|
||||||
|
|
||||||
from ..models.registry import HF_EXAMPLE_MODELS
|
from ..models.registry import HF_EXAMPLE_MODELS
|
||||||
@ -69,12 +69,7 @@ def phi3v_model_config_mm_interleaved():
|
|||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def phi3v_tokenizer():
|
def phi3v_tokenizer():
|
||||||
return TokenizerGroup(
|
return get_tokenizer(PHI3V_MODEL_ID)
|
||||||
tokenizer_id=PHI3V_MODEL_ID,
|
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
@ -91,12 +86,7 @@ def qwen2_audio_model_config():
|
|||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def qwen2_audio_tokenizer():
|
def qwen2_audio_tokenizer():
|
||||||
return TokenizerGroup(
|
return get_tokenizer(QWEN2AUDIO_MODEL_ID)
|
||||||
tokenizer_id=QWEN2AUDIO_MODEL_ID,
|
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
@ -115,12 +105,7 @@ def qwen25omni_model_config_mm_interleaved():
|
|||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def qwen25omni_tokenizer():
|
def qwen25omni_tokenizer():
|
||||||
return TokenizerGroup(
|
return get_tokenizer(QWEN25OMNI_MODEL_ID)
|
||||||
tokenizer_id=QWEN25OMNI_MODEL_ID,
|
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
@ -136,12 +121,7 @@ def mistral_model_config():
|
|||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def mistral_tokenizer():
|
def mistral_tokenizer():
|
||||||
return TokenizerGroup(
|
return get_tokenizer(MISTRAL_MODEL_ID)
|
||||||
tokenizer_id=MISTRAL_MODEL_ID,
|
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
@ -2250,15 +2230,11 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
|||||||
enforce_eager=model_info.enforce_eager,
|
enforce_eager=model_info.enforce_eager,
|
||||||
dtype=model_info.dtype)
|
dtype=model_info.dtype)
|
||||||
|
|
||||||
# Build the tokenizer group and grab the underlying tokenizer
|
# Build the tokenizer
|
||||||
tokenizer_group = TokenizerGroup(
|
tokenizer = get_tokenizer(
|
||||||
model,
|
model,
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
tokenizer = tokenizer_group.tokenizer
|
|
||||||
|
|
||||||
tools = ([{
|
tools = ([{
|
||||||
"type": "function",
|
"type": "function",
|
||||||
@ -2307,14 +2283,10 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
|||||||
enforce_eager=model_info.enforce_eager,
|
enforce_eager=model_info.enforce_eager,
|
||||||
dtype=model_info.dtype)
|
dtype=model_info.dtype)
|
||||||
|
|
||||||
tokenizer_group = TokenizerGroup(
|
tokenizer = get_tokenizer(
|
||||||
model,
|
model,
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
tokenizer = tokenizer_group.tokenizer
|
|
||||||
|
|
||||||
# Test detecting the tokenizer's chat_template
|
# Test detecting the tokenizer's chat_template
|
||||||
chat_template = resolve_hf_chat_template(
|
chat_template = resolve_hf_chat_template(
|
||||||
@ -2368,14 +2340,10 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
|||||||
enforce_eager=model_info.enforce_eager,
|
enforce_eager=model_info.enforce_eager,
|
||||||
dtype=model_info.dtype)
|
dtype=model_info.dtype)
|
||||||
|
|
||||||
tokenizer_group = TokenizerGroup(
|
tokenizer = get_tokenizer(
|
||||||
model_config.tokenizer,
|
model_config.tokenizer,
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
tokenizer = tokenizer_group.tokenizer
|
|
||||||
|
|
||||||
# Test detecting the tokenizer's chat_template
|
# Test detecting the tokenizer's chat_template
|
||||||
chat_template = resolve_hf_chat_template(
|
chat_template = resolve_hf_chat_template(
|
||||||
@ -2432,14 +2400,10 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
|||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
tokenizer_group = TokenizerGroup(
|
dummy_tokenizer = get_tokenizer(
|
||||||
PHI3V_MODEL_ID, # Dummy
|
PHI3V_MODEL_ID, # Dummy
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=5,
|
|
||||||
max_input_length=None,
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
dummy_tokenizer = tokenizer_group.tokenizer
|
|
||||||
dummy_tokenizer.chat_template = None
|
dummy_tokenizer.chat_template = None
|
||||||
|
|
||||||
chat_template = load_chat_template(EXAMPLES_DIR / template_path)
|
chat_template = load_chat_template(EXAMPLES_DIR / template_path)
|
||||||
|
|||||||
@ -13,14 +13,6 @@ from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
|
|||||||
|
|
||||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||||
|
|
||||||
EXPECTED_NO_LORA_OUTPUT = [
|
|
||||||
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501
|
|
||||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501
|
|
||||||
"\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501
|
|
||||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501
|
|
||||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501
|
|
||||||
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501
|
|
||||||
]
|
|
||||||
EXPECTED_LORA_OUTPUT = [
|
EXPECTED_LORA_OUTPUT = [
|
||||||
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
|
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
|
||||||
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501
|
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501
|
||||||
@ -79,23 +71,12 @@ def generate_and_test(llm,
|
|||||||
sql_lora_files,
|
sql_lora_files,
|
||||||
tensorizer_config_dict: Union[dict, None] = None):
|
tensorizer_config_dict: Union[dict, None] = None):
|
||||||
print("lora adapter created")
|
print("lora adapter created")
|
||||||
assert do_sample(llm,
|
|
||||||
sql_lora_files,
|
|
||||||
tensorizer_config_dict=tensorizer_config_dict,
|
|
||||||
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
|
||||||
|
|
||||||
print("lora 1")
|
print("lora 1")
|
||||||
assert do_sample(llm,
|
assert do_sample(llm,
|
||||||
sql_lora_files,
|
sql_lora_files,
|
||||||
tensorizer_config_dict=tensorizer_config_dict,
|
tensorizer_config_dict=tensorizer_config_dict,
|
||||||
lora_id=1) == EXPECTED_LORA_OUTPUT
|
lora_id=1) == EXPECTED_LORA_OUTPUT
|
||||||
|
|
||||||
print("no lora")
|
|
||||||
assert do_sample(llm,
|
|
||||||
sql_lora_files,
|
|
||||||
tensorizer_config_dict=tensorizer_config_dict,
|
|
||||||
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
|
||||||
|
|
||||||
print("lora 2")
|
print("lora 2")
|
||||||
assert do_sample(llm,
|
assert do_sample(llm,
|
||||||
sql_lora_files,
|
sql_lora_files,
|
||||||
@ -110,6 +91,7 @@ def test_llama_lora(sql_lora_files):
|
|||||||
|
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
MODEL_PATH,
|
MODEL_PATH,
|
||||||
|
tokenizer=sql_lora_files,
|
||||||
enable_lora=True,
|
enable_lora=True,
|
||||||
# also test odd max_num_seqs
|
# also test odd max_num_seqs
|
||||||
max_num_seqs=13,
|
max_num_seqs=13,
|
||||||
@ -123,6 +105,7 @@ def test_llama_lora_tp4(sql_lora_files):
|
|||||||
|
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
MODEL_PATH,
|
MODEL_PATH,
|
||||||
|
tokenizer=sql_lora_files,
|
||||||
enable_lora=True,
|
enable_lora=True,
|
||||||
max_num_seqs=16,
|
max_num_seqs=16,
|
||||||
max_loras=4,
|
max_loras=4,
|
||||||
@ -137,6 +120,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
|||||||
|
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
MODEL_PATH,
|
MODEL_PATH,
|
||||||
|
tokenizer=sql_lora_files,
|
||||||
enable_lora=True,
|
enable_lora=True,
|
||||||
max_num_seqs=16,
|
max_num_seqs=16,
|
||||||
max_loras=4,
|
max_loras=4,
|
||||||
@ -184,6 +168,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
|
|||||||
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
|
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
|
||||||
|
|
||||||
loaded_llm = LLM(model=model_ref,
|
loaded_llm = LLM(model=model_ref,
|
||||||
|
tokenizer=sql_lora_files,
|
||||||
load_format="tensorizer",
|
load_format="tensorizer",
|
||||||
enable_lora=True,
|
enable_lora=True,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
@ -195,11 +180,6 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
|
|||||||
tc_as_dict = tensorizer_config.to_serializable()
|
tc_as_dict = tensorizer_config.to_serializable()
|
||||||
|
|
||||||
print("lora adapter created")
|
print("lora adapter created")
|
||||||
assert do_sample(loaded_llm,
|
|
||||||
sql_lora_files,
|
|
||||||
tensorizer_config_dict=tc_as_dict,
|
|
||||||
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
|
||||||
|
|
||||||
print("lora 1")
|
print("lora 1")
|
||||||
assert do_sample(loaded_llm,
|
assert do_sample(loaded_llm,
|
||||||
sql_lora_files,
|
sql_lora_files,
|
||||||
|
|||||||
@ -1,135 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
|
||||||
from vllm.config.lora import LoRAConfig
|
|
||||||
from vllm.lora.request import LoRARequest
|
|
||||||
from vllm.sampling_params import SamplingParams
|
|
||||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
|
||||||
from vllm.v1.engine.processor import Processor
|
|
||||||
|
|
||||||
|
|
||||||
def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
|
|
||||||
sql_lora_files):
|
|
||||||
"""
|
|
||||||
Test that we properly resolve the range of allowed token ids for lora
|
|
||||||
adapters that define additional tokens.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Set up a base model compatible with the sql_lora_files adapter and
|
|
||||||
# a known number of tokens in the base model.
|
|
||||||
model_config = ModelConfig(
|
|
||||||
model=llama_2_7b_base_huggingface_id,
|
|
||||||
tokenizer=llama_2_7b_base_huggingface_id,
|
|
||||||
tokenizer_mode="auto",
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
cache_config=CacheConfig(),
|
|
||||||
device_config=DeviceConfig(),
|
|
||||||
lora_config=LoRAConfig(),
|
|
||||||
)
|
|
||||||
|
|
||||||
tokenizer = init_tokenizer_from_configs(
|
|
||||||
model_config=vllm_config.model_config,
|
|
||||||
scheduler_config=vllm_config.scheduler_config,
|
|
||||||
lora_config=vllm_config.lora_config)
|
|
||||||
processor = Processor(vllm_config, tokenizer)
|
|
||||||
|
|
||||||
lora_request = LoRARequest("1", 1, str(sql_lora_files))
|
|
||||||
request_id = "1"
|
|
||||||
prompt = "a prompt"
|
|
||||||
|
|
||||||
# tokens added in the lora adapter should not raise an error
|
|
||||||
lora_token_ids = [32000, 32001, 32002, 32003]
|
|
||||||
processor.process_inputs(
|
|
||||||
request_id,
|
|
||||||
prompt,
|
|
||||||
params=SamplingParams(allowed_token_ids=lora_token_ids),
|
|
||||||
lora_request=lora_request)
|
|
||||||
|
|
||||||
# tokens in the base model should not raise an error
|
|
||||||
base_token_ids = [1000, 1001, 1002, 1003]
|
|
||||||
processor.process_inputs(
|
|
||||||
request_id,
|
|
||||||
prompt,
|
|
||||||
params=SamplingParams(allowed_token_ids=base_token_ids),
|
|
||||||
lora_request=lora_request)
|
|
||||||
|
|
||||||
# tokens not in the lora adapter should raise an error
|
|
||||||
invalid_token_ids = [35000, 35001, 35002, 35003]
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
processor.process_inputs(
|
|
||||||
request_id,
|
|
||||||
prompt,
|
|
||||||
params=SamplingParams(allowed_token_ids=invalid_token_ids),
|
|
||||||
lora_request=lora_request)
|
|
||||||
|
|
||||||
# tokens in the lora adapter with no lora request should raise an error
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
processor.process_inputs(
|
|
||||||
request_id,
|
|
||||||
prompt,
|
|
||||||
params=SamplingParams(allowed_token_ids=lora_token_ids),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_allowed_token_ids_with_lora_adapter_no_vocab(
|
|
||||||
qwen25vl_base_huggingface_id, qwen25vl_lora_files):
|
|
||||||
"""
|
|
||||||
Test that we properly resolve the range of allowed token ids for lora
|
|
||||||
adapters that do not define additional tokens.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Set up a base model compatible with the qwen25vl_lora_files adapter and
|
|
||||||
# a known number of tokens in the base model.
|
|
||||||
model_config = ModelConfig(
|
|
||||||
model=qwen25vl_base_huggingface_id,
|
|
||||||
tokenizer=qwen25vl_base_huggingface_id,
|
|
||||||
tokenizer_mode="auto",
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
cache_config=CacheConfig(),
|
|
||||||
device_config=DeviceConfig(),
|
|
||||||
lora_config=LoRAConfig(),
|
|
||||||
)
|
|
||||||
|
|
||||||
tokenizer = init_tokenizer_from_configs(
|
|
||||||
model_config=vllm_config.model_config,
|
|
||||||
scheduler_config=vllm_config.scheduler_config,
|
|
||||||
lora_config=vllm_config.lora_config)
|
|
||||||
processor = Processor(vllm_config, tokenizer)
|
|
||||||
|
|
||||||
lora_request = LoRARequest("1", 1, str(qwen25vl_lora_files))
|
|
||||||
request_id = "1"
|
|
||||||
prompt = "a prompt"
|
|
||||||
|
|
||||||
# tokens in the base model should not raise an error
|
|
||||||
base_token_ids = [1000, 1001, 1002, 1003]
|
|
||||||
processor.process_inputs(
|
|
||||||
request_id,
|
|
||||||
prompt,
|
|
||||||
params=SamplingParams(allowed_token_ids=base_token_ids),
|
|
||||||
lora_request=lora_request)
|
|
||||||
|
|
||||||
# tokens in the base model with no lora request should not raise an error
|
|
||||||
base_token_ids = [1000, 1001, 1002, 1003]
|
|
||||||
processor.process_inputs(
|
|
||||||
request_id,
|
|
||||||
prompt,
|
|
||||||
params=SamplingParams(allowed_token_ids=base_token_ids),
|
|
||||||
)
|
|
||||||
|
|
||||||
# tokens not in the base model should raise an error
|
|
||||||
invalid_token_ids = [200000, 200001, 200002, 200003]
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
processor.process_inputs(
|
|
||||||
request_id,
|
|
||||||
prompt,
|
|
||||||
params=SamplingParams(allowed_token_ids=invalid_token_ids),
|
|
||||||
lora_request=lora_request)
|
|
||||||
@ -82,31 +82,20 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
|||||||
gpu_memory_utilization=0.2, #avoid OOM
|
gpu_memory_utilization=0.2, #avoid OOM
|
||||||
quantization=model.quantization,
|
quantization=model.quantization,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
enable_chunked_prefill=True)
|
enable_chunked_prefill=True,
|
||||||
|
tokenizer=tinyllama_lora_files)
|
||||||
|
|
||||||
if model.quantization is None:
|
if model.quantization is None:
|
||||||
expected_no_lora_output = [
|
|
||||||
"Here are some examples of orange-brown colors",
|
|
||||||
"I'm sorry, I don't have"
|
|
||||||
]
|
|
||||||
expected_lora_output = [
|
expected_lora_output = [
|
||||||
"#ff8050",
|
"#ff8050",
|
||||||
"#ff8080",
|
"#ff8080",
|
||||||
]
|
]
|
||||||
elif model.quantization == "awq":
|
elif model.quantization == "awq":
|
||||||
expected_no_lora_output = [
|
|
||||||
"I'm sorry, I don't understand",
|
|
||||||
"I'm sorry, I don't understand",
|
|
||||||
]
|
|
||||||
expected_lora_output = [
|
expected_lora_output = [
|
||||||
"#f07700: A v",
|
"#f07700: A v",
|
||||||
"#f00000: A v",
|
"#f00000: A v",
|
||||||
]
|
]
|
||||||
elif model.quantization == "gptq":
|
elif model.quantization == "gptq":
|
||||||
expected_no_lora_output = [
|
|
||||||
"I'm sorry, I don't have",
|
|
||||||
"I'm sorry, I don't have",
|
|
||||||
]
|
|
||||||
expected_lora_output = [
|
expected_lora_output = [
|
||||||
"#f08800: This is",
|
"#f08800: This is",
|
||||||
"#f07788 \n#",
|
"#f07788 \n#",
|
||||||
@ -117,7 +106,6 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
|||||||
# Assert that the outputs changed.
|
# Assert that the outputs changed.
|
||||||
if (model.quantization == "gptq"
|
if (model.quantization == "gptq"
|
||||||
and expected_output is expected_lora_output):
|
and expected_output is expected_lora_output):
|
||||||
assert output != expected_no_lora_output
|
|
||||||
for i, o in enumerate(output):
|
for i, o in enumerate(output):
|
||||||
assert o.startswith(
|
assert o.startswith(
|
||||||
'#'), f"Expected example {i} to start with # but got {o}"
|
'#'), f"Expected example {i} to start with # but got {o}"
|
||||||
@ -127,12 +115,6 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
|||||||
max_tokens = 10
|
max_tokens = 10
|
||||||
|
|
||||||
print("lora adapter created")
|
print("lora adapter created")
|
||||||
output = do_sample(llm,
|
|
||||||
tinyllama_lora_files,
|
|
||||||
lora_id=0,
|
|
||||||
max_tokens=max_tokens)
|
|
||||||
expect_match(output, expected_no_lora_output)
|
|
||||||
|
|
||||||
print("lora 1")
|
print("lora 1")
|
||||||
output = do_sample(llm,
|
output = do_sample(llm,
|
||||||
tinyllama_lora_files,
|
tinyllama_lora_files,
|
||||||
@ -140,13 +122,6 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
|||||||
max_tokens=max_tokens)
|
max_tokens=max_tokens)
|
||||||
expect_match(output, expected_lora_output)
|
expect_match(output, expected_lora_output)
|
||||||
|
|
||||||
print("no lora")
|
|
||||||
output = do_sample(llm,
|
|
||||||
tinyllama_lora_files,
|
|
||||||
lora_id=0,
|
|
||||||
max_tokens=max_tokens)
|
|
||||||
expect_match(output, expected_no_lora_output)
|
|
||||||
|
|
||||||
print("lora 2")
|
print("lora 2")
|
||||||
output = do_sample(llm,
|
output = do_sample(llm,
|
||||||
tinyllama_lora_files,
|
tinyllama_lora_files,
|
||||||
|
|||||||
@ -1,72 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
|
||||||
|
|
||||||
from vllm.lora.request import LoRARequest
|
|
||||||
from vllm.transformers_utils.tokenizer import get_lora_tokenizer
|
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
|
|
||||||
async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
|
|
||||||
reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
|
|
||||||
tokenizer_group = TokenizerGroup(
|
|
||||||
tokenizer_id="gpt2",
|
|
||||||
enable_lora=True,
|
|
||||||
max_num_seqs=1,
|
|
||||||
max_loras=1,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
lora_request = LoRARequest("1", 1, sql_lora_files)
|
|
||||||
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
|
|
||||||
prompt="prompt", lora_request=lora_request)
|
|
||||||
assert reference_tokenizer.encode(
|
|
||||||
"prompt") == await tokenizer_group.encode_async(
|
|
||||||
prompt="prompt", lora_request=lora_request)
|
|
||||||
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
|
|
||||||
PreTrainedTokenizerBase)
|
|
||||||
assert tokenizer_group.get_lora_tokenizer(
|
|
||||||
None) == await tokenizer_group.get_lora_tokenizer_async(None)
|
|
||||||
|
|
||||||
assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request),
|
|
||||||
PreTrainedTokenizerBase)
|
|
||||||
assert tokenizer_group.get_lora_tokenizer(
|
|
||||||
lora_request) != tokenizer_group.get_lora_tokenizer(None)
|
|
||||||
assert tokenizer_group.get_lora_tokenizer(
|
|
||||||
lora_request) == await tokenizer_group.get_lora_tokenizer_async(
|
|
||||||
lora_request)
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_lora_tokenizer(sql_lora_files, tmp_path):
|
|
||||||
lora_request = None
|
|
||||||
tokenizer = get_lora_tokenizer(lora_request)
|
|
||||||
assert not tokenizer
|
|
||||||
|
|
||||||
lora_request = LoRARequest("1", 1, sql_lora_files)
|
|
||||||
tokenizer = get_lora_tokenizer(lora_request)
|
|
||||||
assert tokenizer.get_added_vocab()
|
|
||||||
|
|
||||||
lora_request = LoRARequest("1", 1, str(tmp_path))
|
|
||||||
tokenizer = get_lora_tokenizer(lora_request)
|
|
||||||
assert not tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("enable_lora", [True, False])
|
|
||||||
@pytest.mark.parametrize("max_num_seqs", [1, 2])
|
|
||||||
@pytest.mark.parametrize("max_loras", [1, 2])
|
|
||||||
def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras):
|
|
||||||
tokenizer_group = TokenizerGroup(
|
|
||||||
tokenizer_id="gpt2",
|
|
||||||
enable_lora=enable_lora,
|
|
||||||
max_num_seqs=max_num_seqs,
|
|
||||||
max_loras=max_loras,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
if enable_lora:
|
|
||||||
assert tokenizer_group.lora_tokenizers.capacity == max(
|
|
||||||
max_num_seqs, max_loras)
|
|
||||||
else:
|
|
||||||
assert tokenizer_group.lora_tokenizers.capacity == 0
|
|
||||||
@ -11,7 +11,7 @@ import pytest
|
|||||||
from vllm.inputs import token_inputs
|
from vllm.inputs import token_inputs
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.sequence import Sequence
|
from vllm.sequence import Sequence
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
|
||||||
# Make two prefixes with different first blocks.
|
# Make two prefixes with different first blocks.
|
||||||
prefix_start = [("You are an expert"), ("You are a")]
|
prefix_start = [("You are an expert"), ("You are a")]
|
||||||
@ -47,12 +47,7 @@ def flatten_2d(li):
|
|||||||
def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
|
def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
|
||||||
concurrent_lora_int_ids: list[Optional[int]]):
|
concurrent_lora_int_ids: list[Optional[int]]):
|
||||||
|
|
||||||
tokenizer = TokenizerGroup(
|
tokenizer = get_tokenizer("facebook/opt-125m")
|
||||||
tokenizer_id="facebook/opt-125m",
|
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=max_num_seqs,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
hashes: list[list[list[int]]] = []
|
hashes: list[list[list[int]]] = []
|
||||||
|
|
||||||
@ -76,7 +71,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
|
|||||||
inputs=token_inputs(prompt_token_ids,
|
inputs=token_inputs(prompt_token_ids,
|
||||||
prompt=prompt),
|
prompt=prompt),
|
||||||
block_size=block_size,
|
block_size=block_size,
|
||||||
eos_token_id=tokenizer.tokenizer.eos_token_id,
|
eos_token_id=tokenizer.eos_token_id,
|
||||||
lora_request=lora_request)
|
lora_request=lora_request)
|
||||||
|
|
||||||
num_blocks = len(prompt_token_ids) // block_size
|
num_blocks = len(prompt_token_ids) // block_size
|
||||||
|
|||||||
@ -11,7 +11,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
|||||||
from vllm.inputs import token_inputs
|
from vllm.inputs import token_inputs
|
||||||
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
|
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
|
||||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
|
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer,
|
from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer,
|
||||||
@ -221,17 +221,14 @@ def test_oov_decode(tokenizer, fast):
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def detokenizer(tokenizer_name: str) -> Detokenizer:
|
def detokenizer(tokenizer_name: str) -> Detokenizer:
|
||||||
tokenizer_group = TokenizerGroup(
|
tokenizer = get_tokenizer(
|
||||||
tokenizer_id=tokenizer_name,
|
tokenizer_name,
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=100,
|
|
||||||
max_input_length=None,
|
|
||||||
tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
|
tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
|
||||||
trust_remote_code=False,
|
trust_remote_code=False,
|
||||||
revision=None,
|
revision=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
return Detokenizer(tokenizer_group)
|
return Detokenizer(tokenizer)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(name="complete_sequence_token_ids")
|
@pytest.fixture(name="complete_sequence_token_ids")
|
||||||
@ -312,8 +309,7 @@ def test_decode_prompt_logprobs(complete_sequence: str,
|
|||||||
# don't support that.
|
# don't support that.
|
||||||
if complete_sequence not in SPECIAL_TOKS_TRUTH:
|
if complete_sequence not in SPECIAL_TOKS_TRUTH:
|
||||||
skip_special_tokens = True
|
skip_special_tokens = True
|
||||||
elif not isinstance(detokenizer.tokenizer_group.get_lora_tokenizer(None),
|
elif not isinstance(detokenizer.tokenizer, MistralTokenizer):
|
||||||
MistralTokenizer):
|
|
||||||
skip_special_tokens = False
|
skip_special_tokens = False
|
||||||
else:
|
else:
|
||||||
pytest.skip("MistralTokenizers don't support "
|
pytest.skip("MistralTokenizers don't support "
|
||||||
@ -339,7 +335,7 @@ def test_decode_prompt_logprobs(complete_sequence: str,
|
|||||||
|
|
||||||
# decoded_prompt_logprobs doesn't contain the first token.
|
# decoded_prompt_logprobs doesn't contain the first token.
|
||||||
token_ids = complete_sequence_token_ids
|
token_ids = complete_sequence_token_ids
|
||||||
tokenizer = detokenizer.get_tokenizer_for_seq(seq)
|
tokenizer = detokenizer.tokenizer
|
||||||
text_full = tokenizer.decode(token_ids,
|
text_full = tokenizer.decode(token_ids,
|
||||||
skip_special_tokens=skip_special_tokens)
|
skip_special_tokens=skip_special_tokens)
|
||||||
text_first = tokenizer.decode(token_ids[0],
|
text_first = tokenizer.decode(token_ids[0],
|
||||||
|
|||||||
@ -1,27 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
|
||||||
|
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_tokenizer_group():
|
|
||||||
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
|
||||||
tokenizer_group = TokenizerGroup(
|
|
||||||
tokenizer_id="gpt2",
|
|
||||||
enable_lora=False,
|
|
||||||
max_num_seqs=1,
|
|
||||||
max_input_length=None,
|
|
||||||
)
|
|
||||||
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
|
|
||||||
prompt="prompt", lora_request=None)
|
|
||||||
assert reference_tokenizer.encode(
|
|
||||||
"prompt") == await tokenizer_group.encode_async(prompt="prompt",
|
|
||||||
lora_request=None)
|
|
||||||
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
|
|
||||||
PreTrainedTokenizerBase)
|
|
||||||
assert tokenizer_group.get_lora_tokenizer(
|
|
||||||
None) == await tokenizer_group.get_lora_tokenizer_async(None)
|
|
||||||
@ -57,6 +57,10 @@ class TestTokenizer(TokenizerBase):
|
|||||||
def max_token_id(self) -> int:
|
def max_token_id(self) -> int:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def truncation_side(self) -> str:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
text: Union[str, list[str], list[int]],
|
text: Union[str, list[str], list[int]],
|
||||||
|
|||||||
@ -12,7 +12,6 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
|
|||||||
generate_dummy_prompt_logprobs_tensors,
|
generate_dummy_prompt_logprobs_tensors,
|
||||||
generate_dummy_sample_logprobs)
|
generate_dummy_sample_logprobs)
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
|
||||||
|
|
||||||
from ...distributed.conftest import publisher_config, random_port # noqa: F401
|
from ...distributed.conftest import publisher_config, random_port # noqa: F401
|
||||||
|
|
||||||
@ -48,9 +47,6 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors:
|
|||||||
]
|
]
|
||||||
return DummyOutputProcessorTestVectors(
|
return DummyOutputProcessorTestVectors(
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
tokenizer_group=init_tokenizer_from_configs(
|
|
||||||
vllm_config.model_config, vllm_config.scheduler_config,
|
|
||||||
vllm_config.lora_config),
|
|
||||||
vllm_config=vllm_config,
|
vllm_config=vllm_config,
|
||||||
full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS],
|
full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS],
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
|
|||||||
@ -43,7 +43,7 @@ def _ref_convert_id_to_token(
|
|||||||
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
||||||
def test_incremental_detokenization(request_output_kind: RequestOutputKind,
|
def test_incremental_detokenization(request_output_kind: RequestOutputKind,
|
||||||
dummy_test_vectors):
|
dummy_test_vectors):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
|
||||||
log_stats=False)
|
log_stats=False)
|
||||||
engine_core = MockEngineCore(
|
engine_core = MockEngineCore(
|
||||||
tokens_list=dummy_test_vectors.generation_tokens)
|
tokens_list=dummy_test_vectors.generation_tokens)
|
||||||
@ -382,7 +382,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
|
|||||||
num_sample_logprobs: Optional[int],
|
num_sample_logprobs: Optional[int],
|
||||||
num_prompt_logprobs: Optional[int],
|
num_prompt_logprobs: Optional[int],
|
||||||
dummy_test_vectors):
|
dummy_test_vectors):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
|
||||||
log_stats=False)
|
log_stats=False)
|
||||||
engine_core = MockEngineCore(
|
engine_core = MockEngineCore(
|
||||||
tokens_list=dummy_test_vectors.generation_tokens,
|
tokens_list=dummy_test_vectors.generation_tokens,
|
||||||
@ -535,7 +535,7 @@ def test_stop_token(include_stop_str_in_output: bool,
|
|||||||
) # '<|end_of_text|>'
|
) # '<|end_of_text|>'
|
||||||
stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>'
|
stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>'
|
||||||
|
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
|
||||||
log_stats=False)
|
log_stats=False)
|
||||||
# Dummy engine core outputs, with control tokens suffixed to test stops
|
# Dummy engine core outputs, with control tokens suffixed to test stops
|
||||||
suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids)
|
suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids)
|
||||||
@ -642,7 +642,7 @@ def test_stop_token(include_stop_str_in_output: bool,
|
|||||||
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
|
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
|
||||||
def test_stop_string(include_stop_str_in_output: bool,
|
def test_stop_string(include_stop_str_in_output: bool,
|
||||||
num_sample_logprobs: Optional[int], dummy_test_vectors):
|
num_sample_logprobs: Optional[int], dummy_test_vectors):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
|
||||||
log_stats=False)
|
log_stats=False)
|
||||||
engine_core = MockEngineCore(
|
engine_core = MockEngineCore(
|
||||||
tokens_list=dummy_test_vectors.generation_tokens,
|
tokens_list=dummy_test_vectors.generation_tokens,
|
||||||
@ -763,7 +763,7 @@ def test_stop_string(include_stop_str_in_output: bool,
|
|||||||
|
|
||||||
|
|
||||||
def test_iteration_stats(dummy_test_vectors):
|
def test_iteration_stats(dummy_test_vectors):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
|
||||||
log_stats=True)
|
log_stats=True)
|
||||||
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
|
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
|
||||||
engine_core_timestamp = time.monotonic()
|
engine_core_timestamp = time.monotonic()
|
||||||
|
|||||||
@ -9,7 +9,6 @@ import torch
|
|||||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||||
|
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
|
||||||
from vllm.v1.engine import EngineCoreOutput, FinishReason
|
from vllm.v1.engine import EngineCoreOutput, FinishReason
|
||||||
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
|
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
|
||||||
|
|
||||||
@ -296,7 +295,6 @@ def generate_dummy_prompt_logprobs_tensors(
|
|||||||
class DummyOutputProcessorTestVectors:
|
class DummyOutputProcessorTestVectors:
|
||||||
"""Dummy test vectors for output processor tests"""
|
"""Dummy test vectors for output processor tests"""
|
||||||
tokenizer: GeneralTokenizerType
|
tokenizer: GeneralTokenizerType
|
||||||
tokenizer_group: TokenizerGroup
|
|
||||||
vllm_config: EngineArgs
|
vllm_config: EngineArgs
|
||||||
full_tokens: list[list[int]] # Prompt + generated tokens
|
full_tokens: list[list[int]] # Prompt + generated tokens
|
||||||
prompt_tokens: list[list[int]]
|
prompt_tokens: list[list[int]]
|
||||||
|
|||||||
@ -582,7 +582,7 @@ def test_structured_output_with_reasoning_matrices(
|
|||||||
reasoning_parser=reasoning_parser,
|
reasoning_parser=reasoning_parser,
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
)
|
)
|
||||||
tokenizer = llm.get_tokenizer(None)
|
tokenizer = llm.get_tokenizer()
|
||||||
reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
|
reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
|
||||||
tokenizer=tokenizer)
|
tokenizer=tokenizer)
|
||||||
|
|
||||||
|
|||||||
@ -37,7 +37,7 @@ from vllm.lora.request import LoRARequest
|
|||||||
from vllm.lora.utils import get_adapter_absolute_path
|
from vllm.lora.utils import get_adapter_absolute_path
|
||||||
from vllm.multimodal import MultiModalDataDict
|
from vllm.multimodal import MultiModalDataDict
|
||||||
from vllm.multimodal.image import convert_image_mode
|
from vllm.multimodal.image import convert_image_mode
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.utils import PlaceholderModule
|
from vllm.utils import PlaceholderModule
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -155,34 +155,26 @@ class BenchmarkDataset(ABC):
|
|||||||
|
|
||||||
def get_random_lora_request(
|
def get_random_lora_request(
|
||||||
self,
|
self,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
max_loras: Optional[int] = None,
|
max_loras: Optional[int] = None,
|
||||||
lora_path: Optional[str] = None,
|
lora_path: Optional[str] = None,
|
||||||
) -> tuple[Optional[LoRARequest], AnyTokenizer]:
|
) -> Optional[LoRARequest]:
|
||||||
"""
|
"""
|
||||||
Optionally select a random LoRA request and return its associated
|
Optionally select a random LoRA request.
|
||||||
tokenizer.
|
|
||||||
|
|
||||||
This method is used when LoRA parameters are provided. It randomly
|
This method is used when LoRA parameters are provided. It randomly
|
||||||
selects a LoRA based on max_loras and retrieves a cached tokenizer for
|
selects a LoRA based on max_loras.
|
||||||
that LoRA if available. Otherwise, it returns the base tokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
|
|
||||||
LoRA is selected.
|
|
||||||
max_loras (Optional[int]): The maximum number of LoRAs available.
|
max_loras (Optional[int]): The maximum number of LoRAs available.
|
||||||
If `None`, LoRA is not used.
|
If `None`, LoRA is not used.
|
||||||
lora_path (Optional[str]): Path to the LoRA parameters on disk.
|
lora_path (Optional[str]): Path to the LoRA parameters on disk.
|
||||||
If `None`, LoRA is not used.
|
If `None`, LoRA is not used.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A tuple with the following elements:
|
A new [LoRARequest][] (or `None` if not applicable).
|
||||||
- A new [LoRARequest][] (or `None` if not applicable).
|
|
||||||
- The tokenizer associated with the LoRA request
|
|
||||||
(or the base tokenizer).
|
|
||||||
"""
|
"""
|
||||||
if max_loras is None or lora_path is None:
|
if max_loras is None or lora_path is None:
|
||||||
return None, tokenizer
|
return None
|
||||||
|
|
||||||
# Generate a random LoRA ID in the range [1, max_loras].
|
# Generate a random LoRA ID in the range [1, max_loras].
|
||||||
lora_id = random.randint(1, max_loras)
|
lora_id = random.randint(1, max_loras)
|
||||||
@ -191,11 +183,7 @@ class BenchmarkDataset(ABC):
|
|||||||
lora_int_id=lora_id,
|
lora_int_id=lora_id,
|
||||||
lora_path=lora_path_on_disk(lora_path),
|
lora_path=lora_path_on_disk(lora_path),
|
||||||
)
|
)
|
||||||
if lora_id not in lora_tokenizer_cache:
|
return lora_request
|
||||||
lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
|
|
||||||
# Return lora_request and the cached tokenizer if available; otherwise,
|
|
||||||
# return the base tokenizer
|
|
||||||
return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def sample(self, tokenizer: PreTrainedTokenizerBase,
|
def sample(self, tokenizer: PreTrainedTokenizerBase,
|
||||||
@ -982,8 +970,8 @@ class ShareGPTDataset(BenchmarkDataset):
|
|||||||
entry["conversations"][1]["value"],
|
entry["conversations"][1]["value"],
|
||||||
)
|
)
|
||||||
|
|
||||||
lora_request, tokenizer = self.get_random_lora_request(
|
lora_request = self.get_random_lora_request(
|
||||||
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
|
max_loras=max_loras, lora_path=lora_path)
|
||||||
prompt_ids = tokenizer(prompt).input_ids
|
prompt_ids = tokenizer(prompt).input_ids
|
||||||
completion_ids = tokenizer(completion).input_ids
|
completion_ids = tokenizer(completion).input_ids
|
||||||
prompt_len = len(prompt_ids)
|
prompt_len = len(prompt_ids)
|
||||||
@ -1882,8 +1870,8 @@ class BurstGPTDataset(BenchmarkDataset):
|
|||||||
for i in range(num_requests):
|
for i in range(num_requests):
|
||||||
input_len = int(data[i][2])
|
input_len = int(data[i][2])
|
||||||
output_len = int(data[i][3])
|
output_len = int(data[i][3])
|
||||||
lora_req, tokenizer = self.get_random_lora_request(
|
lora_req = self.get_random_lora_request(
|
||||||
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
|
max_loras=max_loras, lora_path=lora_path)
|
||||||
vocab_size = tokenizer.vocab_size
|
vocab_size = tokenizer.vocab_size
|
||||||
# Generate a synthetic prompt: a list of token IDs computed as (i +
|
# Generate a synthetic prompt: a list of token IDs computed as (i +
|
||||||
# j) modulo vocab_size.
|
# j) modulo vocab_size.
|
||||||
@ -2376,7 +2364,6 @@ class AIMODataset(HuggingFaceDataset):
|
|||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
multi_modal_data=None,
|
multi_modal_data=None,
|
||||||
request_id=request_id_prefix + str(ind),
|
request_id=request_id_prefix + str(ind),
|
||||||
|
|
||||||
))
|
))
|
||||||
ind += 1
|
ind += 1
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests,
|
self.maybe_oversample_requests(sampled_requests, num_requests,
|
||||||
|
|||||||
@ -390,11 +390,8 @@ class _AsyncLLMEngine(LLMEngine):
|
|||||||
"""Stop the remote worker execution loop."""
|
"""Stop the remote worker execution loop."""
|
||||||
await self.model_executor.stop_remote_worker_execution_loop_async()
|
await self.model_executor.stop_remote_worker_execution_loop_async()
|
||||||
|
|
||||||
async def get_tokenizer_async(self,
|
async def get_tokenizer_async(self) -> AnyTokenizer:
|
||||||
lora_request: Optional[LoRARequest] = None
|
return self.get_tokenizer()
|
||||||
) -> AnyTokenizer:
|
|
||||||
return await (
|
|
||||||
self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
|
|
||||||
|
|
||||||
async def add_request_async(
|
async def add_request_async(
|
||||||
self,
|
self,
|
||||||
@ -435,7 +432,6 @@ class _AsyncLLMEngine(LLMEngine):
|
|||||||
|
|
||||||
processed_inputs = await self.input_preprocessor.preprocess_async(
|
processed_inputs = await self.input_preprocessor.preprocess_async(
|
||||||
prompt,
|
prompt,
|
||||||
lora_request=lora_request,
|
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -614,11 +610,8 @@ class AsyncLLMEngine(EngineClient):
|
|||||||
async def get_input_preprocessor(self) -> InputPreprocessor:
|
async def get_input_preprocessor(self) -> InputPreprocessor:
|
||||||
return self.engine.input_preprocessor
|
return self.engine.input_preprocessor
|
||||||
|
|
||||||
async def get_tokenizer(
|
async def get_tokenizer(self) -> AnyTokenizer:
|
||||||
self,
|
return self.engine.get_tokenizer()
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
return await self.engine.get_tokenizer_async(lora_request)
|
|
||||||
|
|
||||||
def start_background_loop(self) -> None:
|
def start_background_loop(self) -> None:
|
||||||
"""Start the background loop."""
|
"""Start the background loop."""
|
||||||
|
|||||||
@ -49,9 +49,8 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
|
|||||||
from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
|
from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
|
||||||
init_tracer)
|
init_tracer)
|
||||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||||
from vllm.transformers_utils.tokenizer_group import (
|
init_tokenizer_from_configs)
|
||||||
TokenizerGroup, init_tokenizer_from_configs)
|
|
||||||
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
|
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
|
||||||
usage_message)
|
usage_message)
|
||||||
from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
|
from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
|
||||||
@ -186,7 +185,7 @@ class LLMEngine:
|
|||||||
|
|
||||||
return outputs_
|
return outputs_
|
||||||
|
|
||||||
tokenizer: Optional[TokenizerGroup]
|
tokenizer: Optional[AnyTokenizer]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -233,18 +232,9 @@ class LLMEngine:
|
|||||||
if self.model_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
self.tokenizer = None
|
self.tokenizer = None
|
||||||
self.detokenizer = None
|
self.detokenizer = None
|
||||||
tokenizer_group = None
|
|
||||||
else:
|
else:
|
||||||
self.tokenizer = self._init_tokenizer()
|
self.tokenizer = self._init_tokenizer()
|
||||||
self.detokenizer = Detokenizer(self.tokenizer)
|
self.detokenizer = Detokenizer(self.tokenizer)
|
||||||
tokenizer_group = self.get_tokenizer_group()
|
|
||||||
|
|
||||||
# Ensure that the function doesn't contain a reference to self,
|
|
||||||
# to avoid engine GC issues
|
|
||||||
def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
|
|
||||||
assert tokenizer_group, ("tokenizer_group cannot be None, "
|
|
||||||
"make sure skip_tokenizer_init is False")
|
|
||||||
return tokenizer_group.get_lora_tokenizer(sequence.lora_request)
|
|
||||||
|
|
||||||
self.seq_counter = Counter()
|
self.seq_counter = Counter()
|
||||||
self.generation_config_fields = (
|
self.generation_config_fields = (
|
||||||
@ -389,10 +379,8 @@ class LLMEngine:
|
|||||||
self.detokenizer,
|
self.detokenizer,
|
||||||
self.scheduler,
|
self.scheduler,
|
||||||
self.seq_counter,
|
self.seq_counter,
|
||||||
get_tokenizer_for_seq,
|
|
||||||
stop_checker=StopChecker(
|
stop_checker=StopChecker(
|
||||||
self.scheduler_config.max_model_len,
|
self.scheduler_config.max_model_len,
|
||||||
get_tokenizer_for_seq,
|
|
||||||
self.reasoner if self.decoding_config.reasoning_backend
|
self.reasoner if self.decoding_config.reasoning_backend
|
||||||
and self.tokenizer else None,
|
and self.tokenizer else None,
|
||||||
),
|
),
|
||||||
@ -521,24 +509,15 @@ class LLMEngine:
|
|||||||
if model_executor := getattr(self, "model_executor", None):
|
if model_executor := getattr(self, "model_executor", None):
|
||||||
model_executor.shutdown()
|
model_executor.shutdown()
|
||||||
|
|
||||||
def get_tokenizer_group(self) -> TokenizerGroup:
|
def get_tokenizer(self) -> AnyTokenizer:
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError("Unable to get tokenizer because "
|
raise ValueError("Unable to get tokenizer because "
|
||||||
"skip_tokenizer_init is True")
|
"skip_tokenizer_init is True")
|
||||||
|
|
||||||
return self.tokenizer
|
return self.tokenizer
|
||||||
|
|
||||||
def get_tokenizer(
|
def _init_tokenizer(self) -> AnyTokenizer:
|
||||||
self,
|
return init_tokenizer_from_configs(model_config=self.model_config)
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
return self.get_tokenizer_group().get_lora_tokenizer(lora_request)
|
|
||||||
|
|
||||||
def _init_tokenizer(self) -> TokenizerGroup:
|
|
||||||
return init_tokenizer_from_configs(
|
|
||||||
model_config=self.model_config,
|
|
||||||
scheduler_config=self.scheduler_config,
|
|
||||||
lora_config=self.lora_config)
|
|
||||||
|
|
||||||
def _verify_args(self) -> None:
|
def _verify_args(self) -> None:
|
||||||
self.model_config.verify_with_parallel_config(self.parallel_config)
|
self.model_config.verify_with_parallel_config(self.parallel_config)
|
||||||
@ -574,11 +553,11 @@ class LLMEngine:
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self._validate_model_inputs(processed_inputs, lora_request)
|
self._validate_model_inputs(processed_inputs)
|
||||||
# Create the sequences.
|
# Create the sequences.
|
||||||
block_size = self.cache_config.block_size
|
block_size = self.cache_config.block_size
|
||||||
seq_id = next(self.seq_counter)
|
seq_id = next(self.seq_counter)
|
||||||
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
|
eos_token_id = self.input_preprocessor.get_eos_token_id()
|
||||||
|
|
||||||
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
|
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
|
||||||
|
|
||||||
@ -700,7 +679,6 @@ class LLMEngine:
|
|||||||
processed_inputs = self.input_preprocessor.preprocess(
|
processed_inputs = self.input_preprocessor.preprocess(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self._add_processed_request(
|
self._add_processed_request(
|
||||||
@ -1739,29 +1717,22 @@ class LLMEngine:
|
|||||||
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
|
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
|
||||||
metrics.model_execute_time)
|
metrics.model_execute_time)
|
||||||
|
|
||||||
def _validate_model_inputs(self, inputs: ProcessorInputs,
|
def _validate_model_inputs(self, inputs: ProcessorInputs):
|
||||||
lora_request: Optional[LoRARequest]):
|
|
||||||
encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
|
encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
|
||||||
|
|
||||||
if encoder_inputs is not None:
|
if encoder_inputs is not None:
|
||||||
self._validate_model_input(encoder_inputs,
|
self._validate_model_input(encoder_inputs, prompt_type="encoder")
|
||||||
lora_request,
|
|
||||||
prompt_type="encoder")
|
|
||||||
|
|
||||||
self._validate_model_input(decoder_inputs,
|
self._validate_model_input(decoder_inputs, prompt_type="decoder")
|
||||||
lora_request,
|
|
||||||
prompt_type="decoder")
|
|
||||||
|
|
||||||
def _validate_model_input(
|
def _validate_model_input(
|
||||||
self,
|
self,
|
||||||
prompt_inputs: SingletonInputs,
|
prompt_inputs: SingletonInputs,
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
*,
|
*,
|
||||||
prompt_type: Literal["encoder", "decoder"],
|
prompt_type: Literal["encoder", "decoder"],
|
||||||
):
|
):
|
||||||
model_config = self.model_config
|
model_config = self.model_config
|
||||||
tokenizer = (None if self.tokenizer is None else
|
tokenizer = self.tokenizer
|
||||||
self.tokenizer.get_lora_tokenizer(lora_request))
|
|
||||||
|
|
||||||
prompt_ids = prompt_inputs.get("prompt_token_ids", [])
|
prompt_ids = prompt_inputs.get("prompt_token_ids", [])
|
||||||
if not prompt_ids:
|
if not prompt_ids:
|
||||||
@ -1822,7 +1793,7 @@ class LLMEngine:
|
|||||||
logits_processors = []
|
logits_processors = []
|
||||||
|
|
||||||
if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
|
if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
|
||||||
tokenizer = self.get_tokenizer(lora_request=lora_request)
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processors = get_openai_logits_processors(
|
processors = get_openai_logits_processors(
|
||||||
logit_bias=sampling_params.logit_bias,
|
logit_bias=sampling_params.logit_bias,
|
||||||
@ -1835,7 +1806,7 @@ class LLMEngine:
|
|||||||
sampling_params.allowed_token_ids = None
|
sampling_params.allowed_token_ids = None
|
||||||
|
|
||||||
if len(sampling_params.bad_words) > 0:
|
if len(sampling_params.bad_words) > 0:
|
||||||
tokenizer = self.get_tokenizer(lora_request)
|
tokenizer = self.get_tokenizer()
|
||||||
processors = get_bad_words_logits_processors(
|
processors = get_bad_words_logits_processors(
|
||||||
bad_words=sampling_params.bad_words, tokenizer=tokenizer)
|
bad_words=sampling_params.bad_words, tokenizer=tokenizer)
|
||||||
logits_processors.extend(processors)
|
logits_processors.extend(processors)
|
||||||
|
|||||||
@ -2,14 +2,13 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Callable, List
|
from typing import List
|
||||||
|
|
||||||
from vllm.config import SchedulerConfig
|
from vllm.config import SchedulerConfig
|
||||||
from vllm.core.scheduler import Scheduler
|
from vllm.core.scheduler import Scheduler
|
||||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||||
from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
|
from vllm.sequence import SequenceGroup, SequenceGroupOutput
|
||||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
|
||||||
from vllm.utils import Counter
|
from vllm.utils import Counter
|
||||||
|
|
||||||
|
|
||||||
@ -31,7 +30,6 @@ class SequenceGroupOutputProcessor(ABC):
|
|||||||
detokenizer: Detokenizer,
|
detokenizer: Detokenizer,
|
||||||
scheduler: List[Scheduler],
|
scheduler: List[Scheduler],
|
||||||
seq_counter: Counter,
|
seq_counter: Counter,
|
||||||
get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer],
|
|
||||||
stop_checker: "StopChecker",
|
stop_checker: "StopChecker",
|
||||||
):
|
):
|
||||||
"""Create an output processor.
|
"""Create an output processor.
|
||||||
|
|||||||
@ -1,13 +1,12 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Callable, List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.reasoning import ReasoningParser
|
from vllm.reasoning import ReasoningParser
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.sequence import Sequence, SequenceStatus
|
from vllm.sequence import Sequence, SequenceStatus
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
|
||||||
|
|
||||||
|
|
||||||
class StopChecker:
|
class StopChecker:
|
||||||
@ -20,12 +19,10 @@ class StopChecker:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
max_model_len: int,
|
max_model_len: int,
|
||||||
get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer],
|
|
||||||
reasoner: Optional[ReasoningParser] = None,
|
reasoner: Optional[ReasoningParser] = None,
|
||||||
):
|
):
|
||||||
# Do not use it directly, but use `self._get_max_model_len`.
|
# Do not use it directly, but use `self._get_max_model_len`.
|
||||||
self._max_model_len = max_model_len
|
self._max_model_len = max_model_len
|
||||||
self.get_tokenizer_for_seq = get_tokenizer_for_seq
|
|
||||||
self.reasoner = reasoner
|
self.reasoner = reasoner
|
||||||
|
|
||||||
def _get_max_model_len(self, lora_req: Optional[LoRARequest]):
|
def _get_max_model_len(self, lora_req: Optional[LoRARequest]):
|
||||||
|
|||||||
@ -76,8 +76,7 @@ class EngineClient(ABC):
|
|||||||
include_stop_str_in_output = params.include_stop_str_in_output
|
include_stop_str_in_output = params.include_stop_str_in_output
|
||||||
|
|
||||||
preprocessor = await self.get_input_preprocessor()
|
preprocessor = await self.get_input_preprocessor()
|
||||||
tokenizer_group = preprocessor.get_tokenizer_group()
|
tokenizer = preprocessor.get_tokenizer()
|
||||||
tokenizer = await tokenizer_group.get_lora_tokenizer_async()
|
|
||||||
eos_token_id = tokenizer.eos_token_id
|
eos_token_id = tokenizer.eos_token_id
|
||||||
|
|
||||||
if is_explicit_encoder_decoder_prompt(prompt):
|
if is_explicit_encoder_decoder_prompt(prompt):
|
||||||
@ -260,11 +259,8 @@ class EngineClient(ABC):
|
|||||||
...
|
...
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def get_tokenizer(
|
async def get_tokenizer(self) -> AnyTokenizer:
|
||||||
self,
|
"""Get the tokenizer"""
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
"""Get the appropriate tokenizer for the request"""
|
|
||||||
...
|
...
|
||||||
|
|
||||||
async def get_io_processor(self) -> IOProcessor:
|
async def get_io_processor(self) -> IOProcessor:
|
||||||
|
|||||||
@ -301,23 +301,17 @@ class LLM:
|
|||||||
self.io_processor = get_io_processor(self.llm_engine.vllm_config,
|
self.io_processor = get_io_processor(self.llm_engine.vllm_config,
|
||||||
io_processor_plugin)
|
io_processor_plugin)
|
||||||
|
|
||||||
def get_tokenizer(
|
def get_tokenizer(self) -> AnyTokenizer:
|
||||||
self,
|
return self.llm_engine.get_tokenizer()
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
return self.llm_engine.get_tokenizer_group().get_lora_tokenizer(
|
|
||||||
lora_request)
|
|
||||||
|
|
||||||
def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
|
def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
|
||||||
tokenizer_group = self.llm_engine.get_tokenizer_group()
|
|
||||||
|
|
||||||
# While CachedTokenizer is dynamic, have no choice but
|
# While CachedTokenizer is dynamic, have no choice but
|
||||||
# compare class name. Misjudgment will arise from
|
# compare class name. Misjudgment will arise from
|
||||||
# user-defined tokenizer started with 'Cached'
|
# user-defined tokenizer started with 'Cached'
|
||||||
if tokenizer.__class__.__name__.startswith("Cached"):
|
if tokenizer.__class__.__name__.startswith("Cached"):
|
||||||
tokenizer_group.tokenizer = tokenizer
|
self.llm_engine.tokenizer = tokenizer
|
||||||
else:
|
else:
|
||||||
tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)
|
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
|
||||||
|
|
||||||
def get_default_sampling_params(self) -> SamplingParams:
|
def get_default_sampling_params(self) -> SamplingParams:
|
||||||
if self.default_sampling_params is None:
|
if self.default_sampling_params is None:
|
||||||
@ -707,7 +701,6 @@ class LLM:
|
|||||||
self,
|
self,
|
||||||
messages: Union[list[ChatCompletionMessageParam],
|
messages: Union[list[ChatCompletionMessageParam],
|
||||||
list[list[ChatCompletionMessageParam]]],
|
list[list[ChatCompletionMessageParam]]],
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
chat_template: Optional[str] = None,
|
chat_template: Optional[str] = None,
|
||||||
chat_template_content_format: ChatTemplateContentFormatOption = "auto",
|
chat_template_content_format: ChatTemplateContentFormatOption = "auto",
|
||||||
add_generation_prompt: bool = True,
|
add_generation_prompt: bool = True,
|
||||||
@ -739,7 +732,7 @@ class LLM:
|
|||||||
cast(list[ChatCompletionMessageParam], messages)
|
cast(list[ChatCompletionMessageParam], messages)
|
||||||
]
|
]
|
||||||
|
|
||||||
tokenizer = self.get_tokenizer(lora_request)
|
tokenizer = self.get_tokenizer()
|
||||||
model_config = self.llm_engine.get_model_config()
|
model_config = self.llm_engine.get_model_config()
|
||||||
resolved_content_format = resolve_chat_template_content_format(
|
resolved_content_format = resolve_chat_template_content_format(
|
||||||
chat_template,
|
chat_template,
|
||||||
@ -872,7 +865,6 @@ class LLM:
|
|||||||
|
|
||||||
prompts = self.preprocess_chat(
|
prompts = self.preprocess_chat(
|
||||||
messages=messages,
|
messages=messages,
|
||||||
lora_request=lora_request,
|
|
||||||
chat_template=chat_template,
|
chat_template=chat_template,
|
||||||
chat_template_content_format=chat_template_content_format,
|
chat_template_content_format=chat_template_content_format,
|
||||||
add_generation_prompt=add_generation_prompt,
|
add_generation_prompt=add_generation_prompt,
|
||||||
|
|||||||
@ -188,7 +188,7 @@ class OpenAIServingChat(OpenAIServing):
|
|||||||
|
|
||||||
model_name = self.models.model_name(lora_request)
|
model_name = self.models.model_name(lora_request)
|
||||||
|
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
|
|
||||||
tool_parser = self.tool_parser
|
tool_parser = self.tool_parser
|
||||||
|
|
||||||
|
|||||||
@ -50,10 +50,7 @@ class ClassificationMixin(OpenAIServing):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ctx.lora_request = self._maybe_get_adapters(ctx.request)
|
ctx.tokenizer = await self.engine_client.get_tokenizer()
|
||||||
|
|
||||||
ctx.tokenizer = await self.engine_client.get_tokenizer(
|
|
||||||
ctx.lora_request)
|
|
||||||
|
|
||||||
renderer = self._get_renderer(ctx.tokenizer)
|
renderer = self._get_renderer(ctx.tokenizer)
|
||||||
ctx.engine_prompts = await renderer.render_prompt(
|
ctx.engine_prompts = await renderer.render_prompt(
|
||||||
|
|||||||
@ -127,8 +127,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
if self.model_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
else:
|
else:
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
)
|
|
||||||
renderer = self._get_renderer(tokenizer)
|
renderer = self._get_renderer(tokenizer)
|
||||||
|
|
||||||
engine_prompts = await renderer.render_prompt_and_embeds(
|
engine_prompts = await renderer.render_prompt_and_embeds(
|
||||||
|
|||||||
@ -76,8 +76,7 @@ class EmbeddingMixin(OpenAIServing):
|
|||||||
try:
|
try:
|
||||||
ctx.lora_request = self._maybe_get_adapters(ctx.request)
|
ctx.lora_request = self._maybe_get_adapters(ctx.request)
|
||||||
|
|
||||||
tokenizer = await self.engine_client.get_tokenizer(ctx.lora_request
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
)
|
|
||||||
renderer = self._get_renderer(tokenizer)
|
renderer = self._get_renderer(tokenizer)
|
||||||
|
|
||||||
if isinstance(ctx.request, EmbeddingChatRequest):
|
if isinstance(ctx.request, EmbeddingChatRequest):
|
||||||
|
|||||||
@ -103,8 +103,7 @@ class OpenAIServingPooling(OpenAIServing):
|
|||||||
if self.model_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
else:
|
else:
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
)
|
|
||||||
renderer = self._get_renderer(tokenizer)
|
renderer = self._get_renderer(tokenizer)
|
||||||
|
|
||||||
if getattr(request, "dimensions", None) is not None:
|
if getattr(request, "dimensions", None) is not None:
|
||||||
|
|||||||
@ -240,7 +240,7 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
try:
|
try:
|
||||||
lora_request = self._maybe_get_adapters(request)
|
lora_request = self._maybe_get_adapters(request)
|
||||||
model_name = self.models.model_name(lora_request)
|
model_name = self.models.model_name(lora_request)
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
|
|
||||||
if self.use_harmony:
|
if self.use_harmony:
|
||||||
messages, request_prompts, engine_prompts = (
|
messages, request_prompts, engine_prompts = (
|
||||||
|
|||||||
@ -269,7 +269,7 @@ class ServingScores(OpenAIServing):
|
|||||||
) -> Union[list[PoolingRequestOutput], ErrorResponse]:
|
) -> Union[list[PoolingRequestOutput], ErrorResponse]:
|
||||||
lora_request = self._maybe_get_adapters(request)
|
lora_request = self._maybe_get_adapters(request)
|
||||||
|
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
|
|
||||||
truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
|
truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
|
||||||
None)
|
None)
|
||||||
|
|||||||
@ -65,7 +65,7 @@ class OpenAIServingTokenization(OpenAIServing):
|
|||||||
try:
|
try:
|
||||||
lora_request = self._maybe_get_adapters(request)
|
lora_request = self._maybe_get_adapters(request)
|
||||||
|
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
renderer = self._get_renderer(tokenizer)
|
renderer = self._get_renderer(tokenizer)
|
||||||
|
|
||||||
if isinstance(request, TokenizeChatRequest):
|
if isinstance(request, TokenizeChatRequest):
|
||||||
@ -130,7 +130,7 @@ class OpenAIServingTokenization(OpenAIServing):
|
|||||||
|
|
||||||
lora_request = self._maybe_get_adapters(request)
|
lora_request = self._maybe_get_adapters(request)
|
||||||
|
|
||||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
|
|
||||||
self._log_inputs(request_id,
|
self._log_inputs(request_id,
|
||||||
request.tokens,
|
request.tokens,
|
||||||
|
|||||||
@ -9,13 +9,11 @@ from typing_extensions import assert_never
|
|||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||||
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
||||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
|
||||||
MultiModalInputs, MultiModalUUIDDict)
|
MultiModalInputs, MultiModalUUIDDict)
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
|
||||||
|
|
||||||
from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
|
from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
|
||||||
EncoderDecoderInputs, ProcessorInputs, PromptType,
|
EncoderDecoderInputs, ProcessorInputs, PromptType,
|
||||||
@ -31,7 +29,7 @@ class InputPreprocessor:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_config: ModelConfig,
|
model_config: ModelConfig,
|
||||||
tokenizer: Optional[TokenizerGroup],
|
tokenizer: Optional[AnyTokenizer],
|
||||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None,
|
mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -42,32 +40,28 @@ class InputPreprocessor:
|
|||||||
self.mm_registry = mm_registry
|
self.mm_registry = mm_registry
|
||||||
self.mm_processor_cache = mm_processor_cache
|
self.mm_processor_cache = mm_processor_cache
|
||||||
|
|
||||||
def get_tokenizer_group(self) -> TokenizerGroup:
|
def get_tokenizer(self) -> AnyTokenizer:
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError("You cannot pass text prompts when "
|
raise ValueError("You cannot pass text prompts when "
|
||||||
"`skip_tokenizer_init` is True")
|
"`skip_tokenizer_init` is True")
|
||||||
|
|
||||||
return self.tokenizer
|
return self.tokenizer
|
||||||
|
|
||||||
def get_bos_token_id(self,
|
def get_bos_token_id(self) -> Optional[int]:
|
||||||
lora_request: Optional[LoRARequest] = None
|
|
||||||
) -> Optional[int]:
|
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
logger.warning("Using None for BOS token id because tokenizer "
|
logger.warning("Using None for BOS token id because tokenizer "
|
||||||
"is not initialized")
|
"is not initialized")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
|
return self.tokenizer.bos_token_id
|
||||||
|
|
||||||
def get_eos_token_id(self,
|
def get_eos_token_id(self) -> Optional[int]:
|
||||||
lora_request: Optional[LoRARequest] = None
|
|
||||||
) -> Optional[int]:
|
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
logger.warning("Using None for EOS token id because tokenizer "
|
logger.warning("Using None for EOS token id because tokenizer "
|
||||||
"is not initialized")
|
"is not initialized")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
|
return self.tokenizer.eos_token_id
|
||||||
|
|
||||||
def get_decoder_start_token_id(self) -> Optional[int]:
|
def get_decoder_start_token_id(self) -> Optional[int]:
|
||||||
"""
|
"""
|
||||||
@ -190,14 +184,13 @@ class InputPreprocessor:
|
|||||||
def _tokenize_prompt(
|
def _tokenize_prompt(
|
||||||
self,
|
self,
|
||||||
prompt: str,
|
prompt: str,
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
"""
|
"""
|
||||||
Apply the model's tokenizer to a text prompt, returning the
|
Apply the model's tokenizer to a text prompt, returning the
|
||||||
corresponding token IDs.
|
corresponding token IDs.
|
||||||
"""
|
"""
|
||||||
tokenizer = self.get_tokenizer_group()
|
tokenizer = self.get_tokenizer()
|
||||||
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
|
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
|
||||||
|
|
||||||
encoder_config = self.model_config.encoder_config
|
encoder_config = self.model_config.encoder_config
|
||||||
@ -205,50 +198,39 @@ class InputPreprocessor:
|
|||||||
if encoder_config and encoder_config.get("do_lower_case", False):
|
if encoder_config and encoder_config.get("do_lower_case", False):
|
||||||
prompt = prompt.lower()
|
prompt = prompt.lower()
|
||||||
|
|
||||||
return tokenizer.encode(prompt=prompt,
|
return tokenizer.encode(prompt, **tokenization_kwargs)
|
||||||
lora_request=lora_request,
|
|
||||||
**tokenization_kwargs)
|
|
||||||
|
|
||||||
async def _tokenize_prompt_async(
|
async def _tokenize_prompt_async(
|
||||||
self,
|
self,
|
||||||
prompt: str,
|
prompt: str,
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
"""
|
"""
|
||||||
Async version of
|
Async version of
|
||||||
[`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt].
|
[`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt].
|
||||||
"""
|
"""
|
||||||
tokenizer = self.get_tokenizer_group()
|
tokenizer = self.get_tokenizer()
|
||||||
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
|
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
|
||||||
|
|
||||||
return await tokenizer.encode_async(prompt=prompt,
|
return tokenizer.encode(prompt, **tokenization_kwargs)
|
||||||
lora_request=lora_request,
|
|
||||||
**tokenization_kwargs)
|
|
||||||
|
|
||||||
def _get_mm_tokenizer(
|
def _get_mm_tokenizer(self) -> AnyTokenizer:
|
||||||
self,
|
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
# PrithviGeoSpatialMAE needs to be initialized without a tokenizer
|
# PrithviGeoSpatialMAE needs to be initialized without a tokenizer
|
||||||
# while using also multi-modal input
|
# while using also multi-modal input
|
||||||
if not self.tokenizer:
|
if not self.tokenizer:
|
||||||
return cast(AnyTokenizer, object()) # Dummy
|
return cast(AnyTokenizer, object()) # Dummy
|
||||||
|
|
||||||
tokenizer_group = self.get_tokenizer_group()
|
tokenizer = self.get_tokenizer()
|
||||||
return tokenizer_group.get_lora_tokenizer(lora_request)
|
return tokenizer
|
||||||
|
|
||||||
async def _get_mm_tokenizer_async(
|
async def _get_mm_tokenizer_async(self) -> AnyTokenizer:
|
||||||
self,
|
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
# PrithviGeoSpatialMAE needs to be initialized without a tokenizer
|
# PrithviGeoSpatialMAE needs to be initialized without a tokenizer
|
||||||
# while using also multi-modal input
|
# while using also multi-modal input
|
||||||
if not self.tokenizer:
|
if not self.tokenizer:
|
||||||
return cast(AnyTokenizer, object()) # Dummy
|
return cast(AnyTokenizer, object()) # Dummy
|
||||||
|
|
||||||
tokenizer_group = self.get_tokenizer_group()
|
tokenizer = self.get_tokenizer()
|
||||||
return await tokenizer_group.get_lora_tokenizer_async(lora_request)
|
return tokenizer
|
||||||
|
|
||||||
def _process_multimodal(
|
def _process_multimodal(
|
||||||
self,
|
self,
|
||||||
@ -256,7 +238,6 @@ class InputPreprocessor:
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
@ -264,7 +245,7 @@ class InputPreprocessor:
|
|||||||
Apply the model's multi-modal processor to a multi-modal prompt,
|
Apply the model's multi-modal processor to a multi-modal prompt,
|
||||||
returning the corresponding token IDs and metadata.
|
returning the corresponding token IDs and metadata.
|
||||||
"""
|
"""
|
||||||
tokenizer = self._get_mm_tokenizer(lora_request)
|
tokenizer = self._get_mm_tokenizer()
|
||||||
|
|
||||||
mm_processor = self.mm_registry.create_processor(
|
mm_processor = self.mm_registry.create_processor(
|
||||||
self.model_config,
|
self.model_config,
|
||||||
@ -299,7 +280,6 @@ class InputPreprocessor:
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
@ -307,7 +287,7 @@ class InputPreprocessor:
|
|||||||
Async version of
|
Async version of
|
||||||
[`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal].
|
[`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal].
|
||||||
"""
|
"""
|
||||||
tokenizer = await self._get_mm_tokenizer_async(lora_request)
|
tokenizer = await self._get_mm_tokenizer_async()
|
||||||
|
|
||||||
mm_processor = self.mm_registry.create_processor(
|
mm_processor = self.mm_registry.create_processor(
|
||||||
self.model_config,
|
self.model_config,
|
||||||
@ -386,7 +366,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
parsed_content: TokensPrompt,
|
parsed_content: TokensPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
@ -400,7 +379,6 @@ class InputPreprocessor:
|
|||||||
multi_modal_data,
|
multi_modal_data,
|
||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -415,7 +393,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
parsed_content: TokensPrompt,
|
parsed_content: TokensPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
@ -429,7 +406,6 @@ class InputPreprocessor:
|
|||||||
multi_modal_data,
|
multi_modal_data,
|
||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -444,7 +420,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
parsed_content: TextPrompt,
|
parsed_content: TextPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
@ -457,13 +432,11 @@ class InputPreprocessor:
|
|||||||
multi_modal_data,
|
multi_modal_data,
|
||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
prompt_token_ids = self._tokenize_prompt(
|
prompt_token_ids = self._tokenize_prompt(
|
||||||
prompt_text,
|
prompt_text,
|
||||||
lora_request=lora_request,
|
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
)
|
)
|
||||||
inputs = token_inputs(
|
inputs = token_inputs(
|
||||||
@ -480,7 +453,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
parsed_content: TextPrompt,
|
parsed_content: TextPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
@ -493,13 +465,11 @@ class InputPreprocessor:
|
|||||||
multi_modal_data,
|
multi_modal_data,
|
||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
prompt_token_ids = await self._tokenize_prompt_async(
|
prompt_token_ids = await self._tokenize_prompt_async(
|
||||||
prompt_text,
|
prompt_text,
|
||||||
lora_request=lora_request,
|
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
)
|
)
|
||||||
inputs = token_inputs(
|
inputs = token_inputs(
|
||||||
@ -516,7 +486,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> SingletonInputs:
|
) -> SingletonInputs:
|
||||||
@ -526,7 +495,6 @@ class InputPreprocessor:
|
|||||||
Arguments:
|
Arguments:
|
||||||
|
|
||||||
* prompt: single encoder or decoder input prompt
|
* prompt: single encoder or decoder input prompt
|
||||||
* lora_request: this is only valid for decoder prompts
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
@ -539,21 +507,18 @@ class InputPreprocessor:
|
|||||||
if parsed["type"] == "tokens":
|
if parsed["type"] == "tokens":
|
||||||
return self._process_tokens(
|
return self._process_tokens(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
if parsed["type"] == "text":
|
if parsed["type"] == "text":
|
||||||
return self._process_text(
|
return self._process_text(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
if parsed["type"] == "str":
|
if parsed["type"] == "str":
|
||||||
return self._process_text(
|
return self._process_text(
|
||||||
TextPrompt(prompt=parsed["content"]),
|
TextPrompt(prompt=parsed["content"]),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -563,7 +528,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> SingletonInputs:
|
) -> SingletonInputs:
|
||||||
@ -578,21 +542,18 @@ class InputPreprocessor:
|
|||||||
if parsed["type"] == "tokens":
|
if parsed["type"] == "tokens":
|
||||||
return await self._process_tokens_async(
|
return await self._process_tokens_async(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
if parsed["type"] == "text":
|
if parsed["type"] == "text":
|
||||||
return await self._process_text_async(
|
return await self._process_text_async(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
if parsed["type"] == "str":
|
if parsed["type"] == "str":
|
||||||
return await self._process_text_async(
|
return await self._process_text_async(
|
||||||
TextPrompt(prompt=parsed["content"]),
|
TextPrompt(prompt=parsed["content"]),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -844,7 +805,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> DecoderOnlyInputs:
|
) -> DecoderOnlyInputs:
|
||||||
@ -856,7 +816,6 @@ class InputPreprocessor:
|
|||||||
Arguments:
|
Arguments:
|
||||||
|
|
||||||
* prompt: input prompt
|
* prompt: input prompt
|
||||||
* lora_request
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
@ -866,7 +825,6 @@ class InputPreprocessor:
|
|||||||
prompt_comps = self._prompt_to_llm_inputs(
|
prompt_comps = self._prompt_to_llm_inputs(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -876,7 +834,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> DecoderOnlyInputs:
|
) -> DecoderOnlyInputs:
|
||||||
@ -887,7 +844,6 @@ class InputPreprocessor:
|
|||||||
prompt_comps = await self._prompt_to_llm_inputs_async(
|
prompt_comps = await self._prompt_to_llm_inputs_async(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -897,7 +853,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> ProcessorInputs:
|
) -> ProcessorInputs:
|
||||||
@ -919,7 +874,6 @@ class InputPreprocessor:
|
|||||||
return self._process_decoder_only_prompt(
|
return self._process_decoder_only_prompt(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -927,7 +881,6 @@ class InputPreprocessor:
|
|||||||
self,
|
self,
|
||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
*,
|
*,
|
||||||
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
mm_uuids: Optional[MultiModalUUIDDict] = None,
|
||||||
) -> ProcessorInputs:
|
) -> ProcessorInputs:
|
||||||
@ -952,7 +905,6 @@ class InputPreprocessor:
|
|||||||
return await self._process_decoder_only_prompt_async(
|
return await self._process_decoder_only_prompt_async(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -10,18 +10,13 @@ from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence,
|
|||||||
from .detokenizer_utils import (convert_prompt_ids_to_tokens,
|
from .detokenizer_utils import (convert_prompt_ids_to_tokens,
|
||||||
detokenize_incrementally)
|
detokenize_incrementally)
|
||||||
from .tokenizer import AnyTokenizer
|
from .tokenizer import AnyTokenizer
|
||||||
from .tokenizer_group import TokenizerGroup
|
|
||||||
|
|
||||||
|
|
||||||
class Detokenizer:
|
class Detokenizer:
|
||||||
"""Provides methods to decode the output of a model into text."""
|
"""Provides methods to decode the output of a model into text."""
|
||||||
|
|
||||||
def __init__(self, tokenizer_group: TokenizerGroup):
|
def __init__(self, tokenizer: AnyTokenizer):
|
||||||
self.tokenizer_group = tokenizer_group
|
self.tokenizer = tokenizer
|
||||||
|
|
||||||
def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer:
|
|
||||||
"""Returns the HF tokenizer to use for a given sequence."""
|
|
||||||
return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
|
|
||||||
|
|
||||||
def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
|
def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
|
||||||
prompt_logprobs: list[Optional[dict[
|
prompt_logprobs: list[Optional[dict[
|
||||||
@ -46,7 +41,6 @@ class Detokenizer:
|
|||||||
# Only prompt, without the generated token.
|
# Only prompt, without the generated token.
|
||||||
all_token_ids = seq.get_token_ids()
|
all_token_ids = seq.get_token_ids()
|
||||||
prompt_token_ids = all_token_ids[:-1]
|
prompt_token_ids = all_token_ids[:-1]
|
||||||
tokenizer = self.get_tokenizer_for_seq(seq)
|
|
||||||
prefix_offset = 0
|
prefix_offset = 0
|
||||||
read_offset = 0
|
read_offset = 0
|
||||||
next_iter_prefix_offset = 0
|
next_iter_prefix_offset = 0
|
||||||
@ -70,7 +64,7 @@ class Detokenizer:
|
|||||||
prompt_token_ids[:token_position] + [token_id])
|
prompt_token_ids[:token_position] + [token_id])
|
||||||
(new_tokens, new_text, new_prefix_offset,
|
(new_tokens, new_text, new_prefix_offset,
|
||||||
new_read_offset) = detokenize_incrementally(
|
new_read_offset) = detokenize_incrementally(
|
||||||
tokenizer=tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
all_input_ids=prompt_token_ids_with_token,
|
all_input_ids=prompt_token_ids_with_token,
|
||||||
prev_tokens=prev_tokens,
|
prev_tokens=prev_tokens,
|
||||||
prefix_offset=prefix_offset,
|
prefix_offset=prefix_offset,
|
||||||
@ -111,7 +105,6 @@ class Detokenizer:
|
|||||||
"""
|
"""
|
||||||
all_input_ids = seq.get_token_ids()
|
all_input_ids = seq.get_token_ids()
|
||||||
token_id_generated_this_iteration = all_input_ids[-1]
|
token_id_generated_this_iteration = all_input_ids[-1]
|
||||||
tokenizer = self.get_tokenizer_for_seq(seq)
|
|
||||||
|
|
||||||
# Convert prompt token IDs to tokens if necessary.
|
# Convert prompt token IDs to tokens if necessary.
|
||||||
# Do it here so that we don't have to repeat this
|
# Do it here so that we don't have to repeat this
|
||||||
@ -119,14 +112,14 @@ class Detokenizer:
|
|||||||
if seq.tokens is None:
|
if seq.tokens is None:
|
||||||
(seq.tokens, seq.prefix_offset,
|
(seq.tokens, seq.prefix_offset,
|
||||||
seq.read_offset) = convert_prompt_ids_to_tokens(
|
seq.read_offset) = convert_prompt_ids_to_tokens(
|
||||||
tokenizer=tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
prompt_ids=all_input_ids[:-1],
|
prompt_ids=all_input_ids[:-1],
|
||||||
skip_special_tokens=prms.skip_special_tokens,
|
skip_special_tokens=prms.skip_special_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
(new_tokens, new_decoded_token_text, prefix_offset,
|
(new_tokens, new_decoded_token_text, prefix_offset,
|
||||||
read_offset) = detokenize_incrementally(
|
read_offset) = detokenize_incrementally(
|
||||||
tokenizer=tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
all_input_ids=all_input_ids,
|
all_input_ids=all_input_ids,
|
||||||
prev_tokens=seq.tokens,
|
prev_tokens=seq.tokens,
|
||||||
prefix_offset=seq.prefix_offset,
|
prefix_offset=seq.prefix_offset,
|
||||||
@ -150,7 +143,7 @@ class Detokenizer:
|
|||||||
and token_id != VLLM_INVALID_TOKEN_ID):
|
and token_id != VLLM_INVALID_TOKEN_ID):
|
||||||
all_input_ids_with_logprob = previous_tokens + [token_id]
|
all_input_ids_with_logprob = previous_tokens + [token_id]
|
||||||
(_, new_text, _, _) = detokenize_incrementally(
|
(_, new_text, _, _) = detokenize_incrementally(
|
||||||
tokenizer=tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
all_input_ids=all_input_ids_with_logprob,
|
all_input_ids=all_input_ids_with_logprob,
|
||||||
prev_tokens=seq.tokens,
|
prev_tokens=seq.tokens,
|
||||||
prefix_offset=seq.prefix_offset,
|
prefix_offset=seq.prefix_offset,
|
||||||
|
|||||||
@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Any, Optional, Union
|
|||||||
import huggingface_hub
|
import huggingface_hub
|
||||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||||
PreTrainedTokenizerFast)
|
PreTrainedTokenizerFast)
|
||||||
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
from vllm import envs
|
from vllm import envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -19,7 +20,6 @@ from vllm.transformers_utils.config import (
|
|||||||
get_sentence_transformer_tokenizer_config)
|
get_sentence_transformer_tokenizer_config)
|
||||||
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
||||||
from vllm.transformers_utils.utils import check_gguf_file
|
from vllm.transformers_utils.utils import check_gguf_file
|
||||||
from vllm.utils import make_async
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
@ -274,20 +274,19 @@ def cached_tokenizer_from_config(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_lora_tokenizer(lora_request: LoRARequest, *args,
|
def init_tokenizer_from_configs(model_config: ModelConfig):
|
||||||
**kwargs) -> Optional[AnyTokenizer]:
|
runner_type = model_config.runner_type
|
||||||
if lora_request is None:
|
if runner_type == "generate" or runner_type == "draft":
|
||||||
return None
|
truncation_side = "left"
|
||||||
try:
|
elif runner_type == "pooling":
|
||||||
tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
|
truncation_side = "right"
|
||||||
except Exception as e:
|
else:
|
||||||
# No tokenizer was found in the LoRA folder,
|
assert_never(runner_type)
|
||||||
# use base model tokenizer
|
|
||||||
logger.warning(
|
|
||||||
"No tokenizer found in %s, using base model tokenizer instead. "
|
|
||||||
"(Exception: %s)", lora_request.lora_path, e)
|
|
||||||
tokenizer = None
|
|
||||||
return tokenizer
|
|
||||||
|
|
||||||
|
return get_tokenizer(
|
||||||
get_lora_tokenizer_async = make_async(get_lora_tokenizer)
|
model_config.tokenizer,
|
||||||
|
tokenizer_mode=model_config.tokenizer_mode,
|
||||||
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
|
revision=model_config.tokenizer_revision,
|
||||||
|
truncation_side=truncation_side,
|
||||||
|
)
|
||||||
|
|||||||
@ -61,6 +61,11 @@ class TokenizerBase(ABC):
|
|||||||
def max_token_id(self) -> int:
|
def max_token_id(self) -> int:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@property
|
||||||
|
@abstractmethod
|
||||||
|
def truncation_side(self) -> str:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def __len__(self) -> int:
|
def __len__(self) -> int:
|
||||||
return self.vocab_size
|
return self.vocab_size
|
||||||
|
|
||||||
|
|||||||
@ -1,132 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from typing_extensions import assert_never
|
|
||||||
|
|
||||||
from vllm.config import ModelConfig, SchedulerConfig
|
|
||||||
from vllm.config.lora import LoRAConfig
|
|
||||||
from vllm.lora.request import LoRARequest
|
|
||||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
|
|
||||||
get_lora_tokenizer,
|
|
||||||
get_lora_tokenizer_async,
|
|
||||||
get_tokenizer)
|
|
||||||
from vllm.utils import LRUCache
|
|
||||||
|
|
||||||
|
|
||||||
class TokenizerGroup:
|
|
||||||
"""A group of tokenizers that can be used for LoRA adapters."""
|
|
||||||
|
|
||||||
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
|
|
||||||
max_input_length: Optional[int], **tokenizer_config):
|
|
||||||
self.tokenizer_id = tokenizer_id
|
|
||||||
self.tokenizer_config = tokenizer_config
|
|
||||||
self.enable_lora = enable_lora
|
|
||||||
self.max_input_length = max_input_length
|
|
||||||
self.truncation_side = tokenizer_config.get("truncation_side", "left")
|
|
||||||
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
|
|
||||||
max_loras = tokenizer_config.get("max_loras", 0)
|
|
||||||
self.lora_tokenizers = LRUCache[int, AnyTokenizer](
|
|
||||||
capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
|
|
||||||
|
|
||||||
def get_max_input_len(self,
|
|
||||||
lora_request: Optional[LoRARequest] = None
|
|
||||||
) -> Optional[int]:
|
|
||||||
"""Get the maximum input length for the LoRA request."""
|
|
||||||
return self.max_input_length
|
|
||||||
|
|
||||||
def _raise_if_input_too_long(self,
|
|
||||||
encoded_tokens: list[int],
|
|
||||||
lora_request: Optional[LoRARequest] = None):
|
|
||||||
input_length = len(encoded_tokens)
|
|
||||||
if lora_request:
|
|
||||||
max_input_length = (lora_request.long_lora_max_len
|
|
||||||
or self.max_input_length)
|
|
||||||
else:
|
|
||||||
max_input_length = self.max_input_length
|
|
||||||
if max_input_length is not None and input_length > max_input_length:
|
|
||||||
raise ValueError("Input too long.", input_length, max_input_length)
|
|
||||||
|
|
||||||
def encode(self,
|
|
||||||
prompt: str,
|
|
||||||
max_length: Optional[int] = None,
|
|
||||||
truncation: Optional[bool] = None,
|
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
add_special_tokens: Optional[bool] = None) -> list[int]:
|
|
||||||
|
|
||||||
tokenizer = self.get_lora_tokenizer(lora_request)
|
|
||||||
ret = encode_tokens(tokenizer,
|
|
||||||
prompt,
|
|
||||||
max_length=max_length,
|
|
||||||
truncation=truncation,
|
|
||||||
add_special_tokens=add_special_tokens)
|
|
||||||
self._raise_if_input_too_long(ret, lora_request)
|
|
||||||
return ret
|
|
||||||
|
|
||||||
async def encode_async(
|
|
||||||
self,
|
|
||||||
prompt: str,
|
|
||||||
max_length: Optional[int] = None,
|
|
||||||
truncation: Optional[bool] = None,
|
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
add_special_tokens: Optional[bool] = None) -> list[int]:
|
|
||||||
tokenizer = await self.get_lora_tokenizer_async(lora_request)
|
|
||||||
ret = encode_tokens(tokenizer,
|
|
||||||
prompt,
|
|
||||||
max_length=max_length,
|
|
||||||
truncation=truncation,
|
|
||||||
add_special_tokens=add_special_tokens)
|
|
||||||
self._raise_if_input_too_long(ret, lora_request)
|
|
||||||
return ret
|
|
||||||
|
|
||||||
def get_lora_tokenizer(
|
|
||||||
self,
|
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
if not lora_request or not self.enable_lora:
|
|
||||||
return self.tokenizer
|
|
||||||
if lora_request.lora_int_id not in self.lora_tokenizers:
|
|
||||||
tokenizer = (get_lora_tokenizer(
|
|
||||||
lora_request, **self.tokenizer_config) or self.tokenizer)
|
|
||||||
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
|
||||||
return tokenizer
|
|
||||||
else:
|
|
||||||
return self.lora_tokenizers[lora_request.lora_int_id]
|
|
||||||
|
|
||||||
async def get_lora_tokenizer_async(
|
|
||||||
self,
|
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
if not lora_request or not self.enable_lora:
|
|
||||||
return self.tokenizer
|
|
||||||
if lora_request.lora_int_id not in self.lora_tokenizers:
|
|
||||||
tokenizer = (await get_lora_tokenizer_async(
|
|
||||||
lora_request, **self.tokenizer_config) or self.tokenizer)
|
|
||||||
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
|
||||||
return tokenizer
|
|
||||||
else:
|
|
||||||
return self.lora_tokenizers[lora_request.lora_int_id]
|
|
||||||
|
|
||||||
|
|
||||||
def init_tokenizer_from_configs(model_config: ModelConfig,
|
|
||||||
scheduler_config: SchedulerConfig,
|
|
||||||
lora_config: Optional[LoRAConfig]):
|
|
||||||
runner_type = model_config.runner_type
|
|
||||||
if runner_type == "generate" or runner_type == "draft":
|
|
||||||
truncation_side = "left"
|
|
||||||
elif runner_type == "pooling":
|
|
||||||
truncation_side = "right"
|
|
||||||
else:
|
|
||||||
assert_never(runner_type)
|
|
||||||
|
|
||||||
return TokenizerGroup(
|
|
||||||
tokenizer_id=model_config.tokenizer,
|
|
||||||
enable_lora=bool(lora_config),
|
|
||||||
max_num_seqs=scheduler_config.max_num_seqs,
|
|
||||||
max_loras=lora_config.max_loras if lora_config else 0,
|
|
||||||
max_input_length=None,
|
|
||||||
tokenizer_mode=model_config.tokenizer_mode,
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
|
||||||
revision=model_config.tokenizer_revision,
|
|
||||||
truncation_side=truncation_side)
|
|
||||||
@ -327,6 +327,10 @@ class MistralTokenizer(TokenizerBase):
|
|||||||
def max_token_id(self) -> int:
|
def max_token_id(self) -> int:
|
||||||
return self._max_token_id
|
return self._max_token_id
|
||||||
|
|
||||||
|
@property
|
||||||
|
def truncation_side(self) -> str:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def __len__(self) -> int:
|
def __len__(self) -> int:
|
||||||
return self.vocab_size
|
return self.vocab_size
|
||||||
|
|
||||||
|
|||||||
@ -29,8 +29,8 @@ from vllm.tasks import SupportedTask
|
|||||||
from vllm.tracing import init_tracer
|
from vllm.tracing import init_tracer
|
||||||
from vllm.transformers_utils.config import (
|
from vllm.transformers_utils.config import (
|
||||||
maybe_register_config_serialize_by_value)
|
maybe_register_config_serialize_by_value)
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
init_tokenizer_from_configs)
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils import (Device, as_list, cancel_task_threadsafe, cdiv,
|
from vllm.utils import (Device, as_list, cancel_task_threadsafe, cdiv,
|
||||||
deprecate_kwargs)
|
deprecate_kwargs)
|
||||||
@ -112,9 +112,7 @@ class AsyncLLM(EngineClient):
|
|||||||
else:
|
else:
|
||||||
# Tokenizer (+ ensure liveness if running in another process).
|
# Tokenizer (+ ensure liveness if running in another process).
|
||||||
self.tokenizer = init_tokenizer_from_configs(
|
self.tokenizer = init_tokenizer_from_configs(
|
||||||
model_config=vllm_config.model_config,
|
model_config=vllm_config.model_config)
|
||||||
scheduler_config=vllm_config.scheduler_config,
|
|
||||||
lora_config=vllm_config.lora_config)
|
|
||||||
|
|
||||||
# Processor (converts Inputs --> EngineCoreRequests).
|
# Processor (converts Inputs --> EngineCoreRequests).
|
||||||
self.processor = Processor(
|
self.processor = Processor(
|
||||||
@ -596,15 +594,12 @@ class AsyncLLM(EngineClient):
|
|||||||
async def get_input_preprocessor(self) -> InputPreprocessor:
|
async def get_input_preprocessor(self) -> InputPreprocessor:
|
||||||
return self.processor.input_preprocessor
|
return self.processor.input_preprocessor
|
||||||
|
|
||||||
async def get_tokenizer(
|
async def get_tokenizer(self) -> AnyTokenizer:
|
||||||
self,
|
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
) -> AnyTokenizer:
|
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError("Unable to get tokenizer because "
|
raise ValueError("Unable to get tokenizer because "
|
||||||
"skip_tokenizer_init is True")
|
"skip_tokenizer_init is True")
|
||||||
|
|
||||||
return self.tokenizer.get_lora_tokenizer(lora_request)
|
return self.tokenizer
|
||||||
|
|
||||||
async def is_tracing_enabled(self) -> bool:
|
async def is_tracing_enabled(self) -> bool:
|
||||||
return self.observability_config.otlp_traces_endpoint is not None
|
return self.observability_config.otlp_traces_endpoint is not None
|
||||||
|
|||||||
@ -20,8 +20,8 @@ from vllm.pooling_params import PoolingParams
|
|||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.tasks import SupportedTask
|
from vllm.tasks import SupportedTask
|
||||||
from vllm.tracing import init_tracer
|
from vllm.tracing import init_tracer
|
||||||
from vllm.transformers_utils.tokenizer_group import (
|
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||||
TokenizerGroup, init_tokenizer_from_configs)
|
init_tokenizer_from_configs)
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils import Device
|
from vllm.utils import Device
|
||||||
from vllm.v1.engine.core_client import EngineCoreClient
|
from vllm.v1.engine.core_client import EngineCoreClient
|
||||||
@ -89,9 +89,7 @@ class LLMEngine:
|
|||||||
else:
|
else:
|
||||||
# Tokenizer (+ ensure liveness if running in another process).
|
# Tokenizer (+ ensure liveness if running in another process).
|
||||||
self.tokenizer = init_tokenizer_from_configs(
|
self.tokenizer = init_tokenizer_from_configs(
|
||||||
model_config=vllm_config.model_config,
|
model_config=vllm_config.model_config)
|
||||||
scheduler_config=vllm_config.scheduler_config,
|
|
||||||
lora_config=vllm_config.lora_config)
|
|
||||||
|
|
||||||
# Processor (convert Inputs --> EngineCoreRequests)
|
# Processor (convert Inputs --> EngineCoreRequests)
|
||||||
self.processor = Processor(vllm_config=vllm_config,
|
self.processor = Processor(vllm_config=vllm_config,
|
||||||
@ -297,7 +295,7 @@ class LLMEngine:
|
|||||||
assert self.log_stats, "Stat logging disabled"
|
assert self.log_stats, "Stat logging disabled"
|
||||||
return get_metrics_snapshot()
|
return get_metrics_snapshot()
|
||||||
|
|
||||||
def get_tokenizer_group(self) -> TokenizerGroup:
|
def get_tokenizer(self) -> AnyTokenizer:
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError("Unable to get tokenizer because "
|
raise ValueError("Unable to get tokenizer because "
|
||||||
"skip_tokenizer_init is True")
|
"skip_tokenizer_init is True")
|
||||||
|
|||||||
@ -14,7 +14,6 @@ from vllm.sampling_params import RequestOutputKind
|
|||||||
from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
|
from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
|
||||||
extract_trace_context)
|
extract_trace_context)
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
|
||||||
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
|
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
|
||||||
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
|
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
|
||||||
from vllm.v1.engine.logprobs import LogprobsProcessor
|
from vllm.v1.engine.logprobs import LogprobsProcessor
|
||||||
@ -290,7 +289,7 @@ class RequestState:
|
|||||||
class OutputProcessor:
|
class OutputProcessor:
|
||||||
"""Process EngineCoreOutputs into RequestOutputs."""
|
"""Process EngineCoreOutputs into RequestOutputs."""
|
||||||
|
|
||||||
def __init__(self, tokenizer: TokenizerGroup, log_stats: bool):
|
def __init__(self, tokenizer: AnyTokenizer, log_stats: bool):
|
||||||
self.log_stats = log_stats
|
self.log_stats = log_stats
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.request_states: dict[str, RequestState] = {}
|
self.request_states: dict[str, RequestState] = {}
|
||||||
@ -347,10 +346,7 @@ class OutputProcessor:
|
|||||||
if request_id in self.request_states:
|
if request_id in self.request_states:
|
||||||
raise ValueError(f"Request id {request_id} already running.")
|
raise ValueError(f"Request id {request_id} already running.")
|
||||||
|
|
||||||
tokenizer = None if not self.tokenizer else \
|
req_state = RequestState.from_new_request(tokenizer=self.tokenizer,
|
||||||
self.tokenizer.get_lora_tokenizer(request.lora_request)
|
|
||||||
|
|
||||||
req_state = RequestState.from_new_request(tokenizer=tokenizer,
|
|
||||||
request=request,
|
request=request,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
parent_req=parent_req,
|
parent_req=parent_req,
|
||||||
|
|||||||
@ -9,6 +9,7 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
|
from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
|
||||||
from vllm.inputs.parse import split_enc_dec_inputs
|
from vllm.inputs.parse import split_enc_dec_inputs
|
||||||
from vllm.inputs.preprocess import InputPreprocessor
|
from vllm.inputs.preprocess import InputPreprocessor
|
||||||
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||||
from vllm.multimodal.cache import processor_cache_from_config
|
from vllm.multimodal.cache import processor_cache_from_config
|
||||||
@ -17,7 +18,7 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor
|
|||||||
from vllm.multimodal.utils import argsort_mm_positions
|
from vllm.multimodal.utils import argsort_mm_positions
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.structured_output.backend_guidance import (
|
from vllm.v1.structured_output.backend_guidance import (
|
||||||
validate_guidance_grammar)
|
validate_guidance_grammar)
|
||||||
@ -28,13 +29,15 @@ from vllm.v1.structured_output.backend_outlines import (
|
|||||||
from vllm.v1.structured_output.backend_xgrammar import (
|
from vllm.v1.structured_output.backend_xgrammar import (
|
||||||
validate_xgrammar_grammar)
|
validate_xgrammar_grammar)
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Processor:
|
class Processor:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vllm_config: VllmConfig,
|
vllm_config: VllmConfig,
|
||||||
tokenizer: TokenizerGroup,
|
tokenizer: AnyTokenizer,
|
||||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
):
|
):
|
||||||
|
|
||||||
@ -90,7 +93,6 @@ class Processor:
|
|||||||
def _validate_sampling_params(
|
def _validate_sampling_params(
|
||||||
self,
|
self,
|
||||||
params: SamplingParams,
|
params: SamplingParams,
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
) -> None:
|
) -> None:
|
||||||
self._validate_structured_output(params)
|
self._validate_structured_output(params)
|
||||||
self._validate_logit_bias(params)
|
self._validate_logit_bias(params)
|
||||||
@ -103,8 +105,7 @@ class Processor:
|
|||||||
# When skip_tokenizer_init=True, we can't validate token IDs
|
# When skip_tokenizer_init=True, we can't validate token IDs
|
||||||
# Skip validation and let the model handle invalid tokens
|
# Skip validation and let the model handle invalid tokens
|
||||||
return
|
return
|
||||||
tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
|
vocab_size = len(self.tokenizer)
|
||||||
vocab_size = len(tokenizer)
|
|
||||||
if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
|
if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"allowed_token_ids contains out-of-vocab token id!")
|
"allowed_token_ids contains out-of-vocab token id!")
|
||||||
@ -144,7 +145,6 @@ class Processor:
|
|||||||
def _validate_params(
|
def _validate_params(
|
||||||
self,
|
self,
|
||||||
params: Union[SamplingParams, PoolingParams],
|
params: Union[SamplingParams, PoolingParams],
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Validate supported SamplingParam.
|
Validate supported SamplingParam.
|
||||||
@ -155,7 +155,7 @@ class Processor:
|
|||||||
return
|
return
|
||||||
|
|
||||||
self._validate_logprobs(params)
|
self._validate_logprobs(params)
|
||||||
self._validate_sampling_params(params, lora_request)
|
self._validate_sampling_params(params)
|
||||||
self._validate_supported_sampling_params(params)
|
self._validate_supported_sampling_params(params)
|
||||||
|
|
||||||
def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
|
def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
|
||||||
@ -202,10 +202,22 @@ class Processor:
|
|||||||
_validate_single_prompt(prompt) # type: ignore[arg-type]
|
_validate_single_prompt(prompt) # type: ignore[arg-type]
|
||||||
|
|
||||||
def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
|
def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
|
||||||
if lora_request is not None and not self.lora_config:
|
if lora_request is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# LoRA request passed in while LoRA is not enabled
|
||||||
|
if not self.lora_config:
|
||||||
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
|
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
|
||||||
"not enabled!")
|
"not enabled!")
|
||||||
|
|
||||||
|
if self.tokenizer is not None:
|
||||||
|
logger.warning_once(
|
||||||
|
"vLLM has deprecated support for supporting different "
|
||||||
|
"tokenizers for different LoRAs. By default, vLLM uses base "
|
||||||
|
"model's tokenizer. If you are using a LoRA "
|
||||||
|
"with its own tokenizer, consider specifying `--tokenizer "
|
||||||
|
"[lora_path]` to use the LoRA tokenizer.")
|
||||||
|
|
||||||
def _validate_structured_output(self, params: SamplingParams) -> None:
|
def _validate_structured_output(self, params: SamplingParams) -> None:
|
||||||
if not params.guided_decoding or not self.decoding_config:
|
if not params.guided_decoding or not self.decoding_config:
|
||||||
return
|
return
|
||||||
@ -326,7 +338,7 @@ class Processor:
|
|||||||
|
|
||||||
# TODO(woosuk): Support pooling models.
|
# TODO(woosuk): Support pooling models.
|
||||||
self._validate_lora(lora_request)
|
self._validate_lora(lora_request)
|
||||||
self._validate_params(params, lora_request)
|
self._validate_params(params)
|
||||||
|
|
||||||
data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
|
data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
|
||||||
if data_parallel_rank is not None and not (0 <= data_parallel_rank <
|
if data_parallel_rank is not None and not (0 <= data_parallel_rank <
|
||||||
@ -365,7 +377,6 @@ class Processor:
|
|||||||
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
@ -375,9 +386,9 @@ class Processor:
|
|||||||
processed_inputs=processed_inputs,
|
processed_inputs=processed_inputs,
|
||||||
)
|
)
|
||||||
|
|
||||||
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
|
eos_token_id = self.input_preprocessor.get_eos_token_id()
|
||||||
|
|
||||||
self._validate_model_inputs(processed_inputs, lora_request)
|
self._validate_model_inputs(processed_inputs)
|
||||||
|
|
||||||
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
|
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
|
||||||
|
|
||||||
@ -394,8 +405,7 @@ class Processor:
|
|||||||
sampling_params.update_from_generation_config(
|
sampling_params.update_from_generation_config(
|
||||||
self.generation_config_fields, eos_token_id)
|
self.generation_config_fields, eos_token_id)
|
||||||
if self.tokenizer is not None:
|
if self.tokenizer is not None:
|
||||||
sampling_params.update_from_tokenizer(
|
sampling_params.update_from_tokenizer(self.tokenizer)
|
||||||
self.tokenizer.get_lora_tokenizer(lora_request))
|
|
||||||
else:
|
else:
|
||||||
pooling_params = params.clone()
|
pooling_params = params.clone()
|
||||||
|
|
||||||
@ -436,24 +446,17 @@ class Processor:
|
|||||||
trace_headers=trace_headers,
|
trace_headers=trace_headers,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _validate_model_inputs(self,
|
def _validate_model_inputs(self, inputs: ProcessorInputs):
|
||||||
inputs: ProcessorInputs,
|
|
||||||
lora_request: Optional[LoRARequest] = None):
|
|
||||||
encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
|
encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
|
||||||
|
|
||||||
if encoder_inputs is not None:
|
if encoder_inputs is not None:
|
||||||
self._validate_model_input(encoder_inputs,
|
self._validate_model_input(encoder_inputs, prompt_type="encoder")
|
||||||
lora_request,
|
|
||||||
prompt_type="encoder")
|
|
||||||
|
|
||||||
self._validate_model_input(decoder_inputs,
|
self._validate_model_input(decoder_inputs, prompt_type="decoder")
|
||||||
lora_request,
|
|
||||||
prompt_type="decoder")
|
|
||||||
|
|
||||||
def _validate_model_input(
|
def _validate_model_input(
|
||||||
self,
|
self,
|
||||||
prompt_inputs: SingletonInputs,
|
prompt_inputs: SingletonInputs,
|
||||||
lora_request: Optional[LoRARequest],
|
|
||||||
*,
|
*,
|
||||||
prompt_type: Literal["encoder", "decoder"],
|
prompt_type: Literal["encoder", "decoder"],
|
||||||
):
|
):
|
||||||
@ -469,7 +472,7 @@ class Processor:
|
|||||||
if self.model_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
else:
|
else:
|
||||||
tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
|
tokenizer = self.tokenizer
|
||||||
max_input_id = max(prompt_ids, default=0)
|
max_input_id = max(prompt_ids, default=0)
|
||||||
|
|
||||||
# NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
|
# NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Optional
|
|||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.reasoning import ReasoningParserManager
|
from vllm.reasoning import ReasoningParserManager
|
||||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
|
||||||
from vllm.utils import LazyLoader
|
from vllm.utils import LazyLoader
|
||||||
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
|
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
|
||||||
from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
|
from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
|
||||||
@ -60,10 +60,7 @@ class StructuredOutputManager:
|
|||||||
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
|
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
|
||||||
self.executor = ThreadPoolExecutor(max_workers=max_workers)
|
self.executor = ThreadPoolExecutor(max_workers=max_workers)
|
||||||
self.tokenizer = init_tokenizer_from_configs(
|
self.tokenizer = init_tokenizer_from_configs(
|
||||||
model_config=self.vllm_config.model_config,
|
model_config=self.vllm_config.model_config)
|
||||||
scheduler_config=self.vllm_config.scheduler_config,
|
|
||||||
lora_config=self.vllm_config.lora_config,
|
|
||||||
).get_lora_tokenizer(None)
|
|
||||||
reasoning_backend = \
|
reasoning_backend = \
|
||||||
self.vllm_config.decoding_config.reasoning_backend
|
self.vllm_config.decoding_config.reasoning_backend
|
||||||
if reasoning_backend:
|
if reasoning_backend:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user