mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 12:57:07 +08:00
Merge branch 'main' into v1-sched-interface-2
This commit is contained in:
commit
6b42a56d46
@ -365,6 +365,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
block_quant_shape: List[int] = None,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
current_platform.seed_everything(self.seed)
|
||||
dtype_str = get_config_dtype_str(dtype,
|
||||
@ -385,10 +386,17 @@ class BenchmarkWorker:
|
||||
else:
|
||||
config = op_config[min(op_config.keys(),
|
||||
key=lambda x: abs(x - num_tokens))]
|
||||
kernel_time = benchmark_config(config, num_tokens, num_experts,
|
||||
shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8_w8a8,
|
||||
use_int8_w8a16)
|
||||
kernel_time = benchmark_config(config,
|
||||
num_tokens,
|
||||
num_experts,
|
||||
shard_intermediate_size,
|
||||
hidden_size,
|
||||
topk,
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
block_quant_shape=block_quant_shape)
|
||||
return config, kernel_time
|
||||
|
||||
def tune(
|
||||
@ -487,6 +495,14 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int,
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def get_weight_block_size_safety(config, default_value=None):
    """Look up ``weight_block_size`` in a model config without raising.

    Reads ``config.quantization_config`` (treated as ``{}`` when the
    attribute is absent). If that value is a ``dict``, its
    ``'weight_block_size'`` entry is returned, falling back to
    ``default_value`` when the key is missing. Any non-dict value yields
    ``default_value``.
    """
    quant_cfg = getattr(config, 'quantization_config', {})
    if not isinstance(quant_cfg, dict):
        return default_value
    return quant_cfg.get('weight_block_size', default_value)
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
block_quant_shape = None
|
||||
@ -508,7 +524,7 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
block_quant_shape = config.quantization_config['weight_block_size']
|
||||
block_quant_shape = get_weight_block_size_safety(config)
|
||||
elif config.architectures[0] == "Qwen2MoeForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
|
||||
@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
* ✅︎\*
|
||||
* ⚠️
|
||||
- * `GLM4VForCausalLM`<sup>^</sup>
|
||||
* GLM-4V
|
||||
* T + I
|
||||
@ -786,9 +786,9 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
*
|
||||
* ✅︎
|
||||
- * `InternVLChatModel`
|
||||
* InternVL 2.5, Mono-InternVL, InternVL 2.0
|
||||
* InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0
|
||||
* T + I<sup>E+</sup>
|
||||
* `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
|
||||
* `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
@ -856,12 +856,12 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `PaliGemmaForConditionalGeneration`
|
||||
* PaliGemma ⚠️, PaliGemma 2 ⚠️
|
||||
* PaliGemma, PaliGemma 2
|
||||
* T + I<sup>E</sup>
|
||||
* `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
* ⚠️
|
||||
- * `Phi3VForCausalLM`
|
||||
* Phi-3-Vision, Phi-3.5-Vision
|
||||
* T + I<sup>E+</sup>
|
||||
@ -926,34 +926,15 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
|
||||
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
|
||||
|
||||
:::{warning}
|
||||
vLLM does not currently support PrefixLM attention mask, so our PaliGemma implementation uses regular causal attention, which causes the model output to be unstable.
|
||||
|
||||
We may deprecate this model series in a future release.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support backends other than FlashAttention.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
|
||||
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
To use Qwen2.5-VL series models, you have to install Hugging Face Transformers library from source via `pip install git+https://github.com/huggingface/transformers`.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
:::{important}
|
||||
To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
|
||||
`pip install git+https://github.com/huggingface/transformers`.
|
||||
The earliest commit that supports this is [`50d3530aa04e7a7d003e6b255a98f79fd0447357`](https://github.com/huggingface/transformers/commit/50d3530aa04e7a7d003e6b255a98f79fd0447357).
|
||||
|
||||
Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
|
||||
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": true}'`.
|
||||
:::
|
||||
|
||||
:::{warning}
|
||||
Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
|
||||
However, there are differences in how they handle text + image inputs:
|
||||
|
||||
@ -969,9 +950,23 @@ V1 currently uses a simplified attention pattern:
|
||||
- Will be updated in the future to support the correct behavior
|
||||
|
||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||
:::
|
||||
|
||||
Additionally, vLLM's current Gemma 3 implementation does not support the pan-and-scan image pre-processing algorithm, which helps handle images with skewed aspect ratios by intelligently cropping them into multiple views.
|
||||
Without this feature, model performance may degrade when processing images that deviate significantly from square dimensions.
|
||||
:::{note}
|
||||
`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support backends other than FlashAttention.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
|
||||
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
|
||||
:::
|
||||
|
||||
:::{warning}
|
||||
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
|
||||
:::
|
||||
|
||||
### Pooling Models
|
||||
|
||||
@ -123,10 +123,14 @@ def run_gemma3(questions: list[str], modality: str):
|
||||
assert modality == "image"
|
||||
model_name = "google/gemma-3-4b-it"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
# Default is False; setting it to True is not supported in V1 yet
|
||||
mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompts = [("<bos><start_of_turn>user\n"
|
||||
f"<start_of_image>{question}<end_of_turn>\n"
|
||||
|
||||
@ -83,10 +83,14 @@ def load_deepseek_vl2(question: str, image_urls: list[str]):
|
||||
def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "google/gemma-3-4b-it"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)})
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
# Default is False; setting it to True is not supported in V1 yet
|
||||
mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [{
|
||||
|
||||
@ -1,131 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import shutil
|
||||
from os import path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from huggingface_hub import snapshot_download
|
||||
from safetensors.torch import load_file, save_file
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
|
||||
ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
|
||||
LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
|
||||
|
||||
PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse wrapper that requests ``run_with_both_engines_lora`` per test.

    Simple wrapper so each test in this module runs with both engines.
    It can be promoted up to conftest.py to apply to every test in a
    package.
    """
|
||||
|
||||
|
||||
def llama3_1_8b_chess_lora_path():
    """Return the ``snapshot_download`` result for the chess LoRA adapter repo."""
    chess_lora_repo = "mkopecki/chess-lora-adapter-llama-3.1-8b"
    return snapshot_download(repo_id=chess_lora_repo)
|
||||
|
||||
|
||||
# A Llama LoRA adapter can't be used without transforming its module names,
# because Ultravox nests the language model.
def transform_module_names_for_ultravox(state_dict):
    """Return a new dict whose keys are re-rooted under ``language_model``.

    Every occurrence of ``"base_model.model"`` in a key is replaced with
    ``"base_model.model.language_model"``; values are carried over as-is.
    """
    old_root = "base_model.model"
    new_root = "base_model.model.language_model"
    return {
        name.replace(old_root, new_root): tensor
        for name, tensor in state_dict.items()
    }
|
||||
|
||||
|
||||
def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
    """Write an Ultravox-compatible copy of a LoRA adapter into ``target_path``.

    Loads ``adapter_model.safetensors`` from ``source_repo``, renames its
    keys with ``transform_module_names_for_ultravox``, saves the result under
    the same file name in ``target_path``, and copies ``adapter_config.json``
    across unchanged.

    Returns:
        ``target_path``, as passed in.
    """
    weights_name = "adapter_model.safetensors"
    llama_weights = load_file(path.join(source_repo, weights_name))
    ultravox_weights = transform_module_names_for_ultravox(llama_weights)
    save_file(ultravox_weights, path.join(target_path, weights_name))

    # The adapter config is copied verbatim.
    config_name = "adapter_config.json"
    shutil.copyfile(path.join(source_repo, config_name),
                    path.join(target_path, config_name))
    return target_path
|
||||
|
||||
|
||||
def _get_prompt(audio_count, question, placeholder, model_name) -> str:
    """Render a single user turn with the chat template of ``model_name``.

    The user content is ``placeholder`` followed by a newline, repeated
    ``audio_count`` times, then ``question``. The template is applied with
    ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    chat_tokenizer = AutoTokenizer.from_pretrained(model_name)
    audio_prefix = f"{placeholder}\n" * audio_count
    user_turn = {'role': 'user', 'content': f"{audio_prefix}{question}"}
    return chat_tokenizer.apply_chat_template([user_turn],
                                              tokenize=False,
                                              add_generation_prompt=True)
|
||||
|
||||
|
||||
def test_ultravox_lora(vllm_runner):
    """Compare Ultravox + re-keyed Llama LoRA against Llama + the same LoRA.

    Greedy outputs are generated once by Ultravox with the transformed
    adapter and once by Llama with the original adapter, then compared with
    ``check_outputs_equal``.

    TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
    """
    # NOTE(review): block nesting below was reconstructed from statement
    # order; confirm against the original file's indentation.

    # Workaround to prevent device mismatch in Whisper.
    # Can be removed when it is fixed upstream in transformers
    # https://github.com/huggingface/transformers/pull/35866
    torch.set_default_device("cpu")

    llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()
    # The transformed adapter only needs to exist while Ultravox generates.
    with TemporaryDirectory() as temp_ultravox_lora_dir:
        llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
            llama3_1_8b_chess_lora, temp_ultravox_lora_dir)
        with vllm_runner(
                ULTRAVOX_MODEL_NAME,
                enforce_eager=True,
                max_num_seqs=2,
                enable_lora=True,
                max_loras=1,
                max_lora_rank=128,
                dtype="bfloat16",
                max_model_len=1024,
        ) as vllm_model:
            # audio_count is 0, so the prompt is text-only.
            ultravox_outputs: list[tuple[
                list[int], str]] = vllm_model.generate_greedy(
                    [
                        _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
                                    ULTRAVOX_MODEL_NAME)
                    ],
                    256,
                    lora_request=LoRARequest(str(1), 1,
                                             llama3_1_8b_ultravox_chess_lora),
                )

    # run llama with the original (untransformed) lora to compare outputs
    # with above
    with vllm_runner(
            LLMA_MODEL_NAME,
            enforce_eager=True,
            max_num_seqs=2,
            enable_lora=True,
            max_loras=1,
            max_lora_rank=128,
            dtype="bfloat16",
            max_model_len=1024,
    ) as vllm_model:
        llama_outputs: list[tuple[list[int], str]] = (
            vllm_model.generate_greedy(
                [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
                256,
                lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
            ))

    check_outputs_equal(
        outputs_0_lst=ultravox_outputs,
        outputs_1_lst=llama_outputs,
        name_0="ultravox",
        name_1="llama",
    )
|
||||
@ -9,7 +9,7 @@ from pathlib import PosixPath
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import AutoModelForVision2Seq
|
||||
from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
@ -234,16 +234,41 @@ VLM_TEST_SETTINGS = {
|
||||
num_logprobs=10,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
),
|
||||
"gemma3": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
|
||||
}),
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
# TODO: Use AutoModelForVision2Seq once transformers supports this
|
||||
auto_cls=AutoModelForPreTraining,
|
||||
dtype="bfloat16",
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
|
||||
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||
),
|
||||
"glm4v": VLMTestInfo(
|
||||
models=["THUDM/glm-4v-9b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
|
||||
}),
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
patch_hf_runner=model_utils.glm_patch_hf_runner,
|
||||
patch_hf_runner=model_utils.glm4v_patch_hf_runner,
|
||||
# The image embeddings match with HF but the outputs of the language
|
||||
# decoder are only consistent up to 2 decimal places.
|
||||
# So, we need to reduce the number of tokens for the test to pass.
|
||||
max_tokens=8,
|
||||
num_logprobs=10,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"h2ovl": VLMTestInfo(
|
||||
|
||||
@ -61,7 +61,9 @@ def run_test(
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
vllm_runner_kwargs_: dict[str, Any] = {}
|
||||
vllm_runner_kwargs_: dict[str, Any] = {
|
||||
"disable_mm_preprocessor_cache": True,
|
||||
}
|
||||
if model_info.tokenizer:
|
||||
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
|
||||
if model_info.tokenizer_mode:
|
||||
|
||||
@ -304,8 +304,20 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return hf_model
|
||||
|
||||
|
||||
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4."""
|
||||
def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch an HfRunner for Gemma 3 and return that same instance.

    ``hf_model.processor`` is replaced by a wrapper that forwards every call
    to the original processor with ``do_pan_and_scan=True`` added.
    """
    original_processor = hf_model.processor

    def processor_with_pan_and_scan(*args, **kwargs):
        return original_processor(*args, do_pan_and_scan=True, **kwargs)

    hf_model.processor = processor_with_pan_and_scan
    return hf_model
|
||||
|
||||
|
||||
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
|
||||
hf_processor = hf_model.processor
|
||||
patch_padding_side(hf_processor)
|
||||
|
||||
@ -313,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
if images is None:
|
||||
return hf_processor(*args, **kwargs)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
|
||||
contents = re.findall(
|
||||
r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
|
||||
text,
|
||||
)
|
||||
assert len(contents) == len(images)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": images,
|
||||
"content": text
|
||||
}],
|
||||
"image": image,
|
||||
"content": content
|
||||
} for image, content in zip(images, contents)],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
|
||||
@ -96,14 +96,14 @@ def _run_check(
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
config = processor.info.get_hf_config()
|
||||
|
||||
prompt = "<image>" * len(images)
|
||||
mm_data = {"image": images}
|
||||
|
||||
total_expected_num_patches = sum(
|
||||
_get_expected_num_patches(config, image, len(images), min_num, max_num)
|
||||
for image in images)
|
||||
|
||||
processed_inputs = processor.apply("<image>" * len(images), mm_data,
|
||||
mm_processor_kwargs)
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
@ -152,9 +152,7 @@ def test_processor_override(
|
||||
}
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
model_id,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": len(size_factors)},
|
||||
)
|
||||
|
||||
@ -9,10 +9,8 @@ from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||
@ -25,7 +23,7 @@ models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
model_id: str,
|
||||
mm_processor_kwargs: dict[str, object],
|
||||
expected_toks_per_img: int,
|
||||
num_imgs: int,
|
||||
@ -36,9 +34,7 @@ def test_processor_override(
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the custom input processor.
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
model_id,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
|
||||
@ -56,14 +56,14 @@ def _run_check(
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
config = processor.info.get_hf_config()
|
||||
|
||||
prompt = "<image>" * len(images)
|
||||
mm_data = {"image": images}
|
||||
|
||||
total_expected_num_patches = sum(
|
||||
_get_expected_num_patches(config, image, len(images), min_num, max_num)
|
||||
for image in images)
|
||||
|
||||
processed_inputs = processor.apply("<image>" * len(images), mm_data,
|
||||
mm_processor_kwargs)
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
@ -109,9 +109,7 @@ def test_processor_override(
|
||||
}
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
model_id,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": len(size_factors)},
|
||||
)
|
||||
|
||||
@ -36,8 +36,7 @@ def _validate_image_max_tokens_one(
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
def test_processor_max_tokens(model_id):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
@ -166,8 +164,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
@pytest.mark.parametrize("num_imgs", [1])
|
||||
def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
|
||||
@ -37,8 +37,7 @@ def _validate_image_max_tokens_one(
|
||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
def test_processor_max_tokens(model_id):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
@ -167,8 +165,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
@pytest.mark.parametrize("num_imgs", [1])
|
||||
def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
|
||||
@ -35,9 +35,7 @@ def test_processor_override(
|
||||
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
model_id,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
|
||||
@ -30,8 +30,7 @@ def test_processor_override(
|
||||
):
|
||||
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
model_id,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
|
||||
@ -10,6 +10,8 @@ from vllm.config import ModelConfig, TaskOption
|
||||
from vllm.inputs import InputContext
|
||||
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
TokensText = tuple[list[int], str]
|
||||
|
||||
|
||||
@ -250,10 +252,8 @@ def check_logprobs_close(
|
||||
|
||||
|
||||
def build_model_context(
|
||||
model_name: str,
|
||||
model_id: str,
|
||||
task: TaskOption = "auto",
|
||||
tokenizer_name: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
mm_processor_kwargs: Optional[dict] = None,
|
||||
limit_mm_per_prompt: Optional[dict] = None,
|
||||
@ -262,9 +262,7 @@ def build_model_context(
|
||||
"""Creates an InputContext for a given model.
|
||||
|
||||
Args:
|
||||
model_name: Name of the model being considered.
|
||||
tokenizer_name: Name of the tokenizer being considered.
|
||||
trust_remote_code: Whether or not to allow loading remote code.
|
||||
model_id: ID of the model being considered.
|
||||
mm_processor_kwargs: optional processor kwargs for to be leveraged
|
||||
in the input processor, mapper, dummy data creation, etc.
|
||||
limit_mm_per_prompt: Multimodal limits.
|
||||
@ -272,21 +270,24 @@ def build_model_context(
|
||||
Returns:
|
||||
InputContext for the model being considered.
|
||||
"""
|
||||
if tokenizer_name is None:
|
||||
tokenizer_name = model_name
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
if dtype is None:
|
||||
dtype = "half"
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_name,
|
||||
model_id,
|
||||
task=task,
|
||||
tokenizer=tokenizer_name,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
dtype=dtype,
|
||||
seed=0,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
)
|
||||
return InputContext(model_config)
|
||||
|
||||
@ -470,22 +470,184 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
|
||||
assert not output_processor.has_unfinished_requests()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "include_stop_str_in_output,stop_token_type,ignore_eos,num_sample_logprobs",
    [(False, "stop_token_ids", False, None),
     (True, "stop_token_ids", False, None),
     (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
     (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
     (False, "eos_token_id", False, None), (True, "eos_token_id", False, None),
     (False, "eos_token_id", True, None)])
def test_stop_token(include_stop_str_in_output: bool,
                    num_sample_logprobs: Optional[int], stop_token_type: str,
                    ignore_eos: bool, dummy_test_vectors):
    """Test output processor EOS/stop token handling.

    Send mock engine core request to mock engine core and pass core outputs
    to output processor. Validate output processor tokens, text and
    (if enabled) sample logprobs. Batch-size one.

    The test emulates a scenario where a model outputs text tokens followed
    by two identical control tokens:
    <token><token>...<token><control><control>

    If EOS is under test, the control tokens are EOS; otherwise, they are
    some other token id.

    Test behavior:

    * If EOS is under test and `ignore_eos=True`, the detokenized string
      should be <token><token>...<token><control><control> and the finish
      reason should be "length" (i.e. no stop occurs)

    * else, if `include_stop_str_in_output==True`, the detokenized
      string should be <token><token>...<token><control> and the finish
      reason should be "stop" (i.e. first control token causes stop
      and is represented in output text)

    * else, the detokenized string should be
      <token><token>...<token> and the finish reason should be "stop"
      (i.e. first control token causes stop but is not represented
      in output text.)

    Note: some test details are tuned for meta-llama/Llama-3.2-1B,
    another model should work only if the test is modified.

    Args:
        include_stop_str_in_output: stop token str appears in output text
        num_sample_logprobs: number of sample logprobs (`None` for no logprobs)
        stop_token_type: "eos_token_id" for EOS, "stop_token_ids" for stop token
        ignore_eos: if True, EOS stops are disabled
        dummy_test_vectors: dummy engine core outputs and other data structures
    """
    # The hard-coded stop token id and control-token strings below are
    # specific to this tokenizer, so fail loudly for any other model.
    model_id = dummy_test_vectors.tokenizer.name_or_path
    if model_id != 'meta-llama/Llama-3.2-1B':
        raise AssertionError("Test requires meta-llama/Llama-3.2-1B but "
                             f"{model_id} is in use.")
    do_logprobs = num_sample_logprobs is not None
    # EOS under test; if False, stop_token_ids under test
    is_eos_test = stop_token_type == "eos_token_id"
    # EOS under test but ignore_eos enabled
    is_eos_ignore_test = is_eos_test and ignore_eos
    eos_token_id = (
        dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None
    )  # '<|end_of_text|>'
    stop_token_ids = [128009] if not is_eos_test else None  # '<|eot_id|>'

    output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
                                       log_stats=False)
    # Dummy engine core outputs, with control tokens suffixed to test stops
    suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids)
    assert suffix_token is not None and isinstance(suffix_token[0], int)
    generation_string = dummy_test_vectors.generation_strings[0]
    # Append the control token twice to the first generation sequence.
    generation_tokens = (dummy_test_vectors.generation_tokens[0] +
                         2 * suffix_token)
    if do_logprobs:
        # Pad the logprobs with two copies of the last entry so their count
        # matches the two appended control tokens.
        generation_logprobs = (
            dummy_test_vectors.generation_logprobs[0] +
            2 * [dummy_test_vectors.generation_logprobs[0][-1]])
    prompt_string = dummy_test_vectors.prompt_strings[0]
    prompt_tokens = dummy_test_vectors.prompt_tokens[0]
    engine_core = MockEngineCore(
        tokens_list=[generation_tokens],
        generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
        prompt_logprobs_raw=None,
        eos_token_id=eos_token_id,
        stop_token_ids=stop_token_ids,
        ignore_eos=ignore_eos)

    # Make request.
    request_id = "request-0"
    request = EngineCoreRequest(
        request_id=request_id,
        prompt=prompt_string,
        prompt_token_ids=prompt_tokens,
        arrival_time=0,
        mm_inputs=None,
        mm_hashes=None,
        mm_placeholders=None,
        eos_token_id=eos_token_id,
        lora_request=None,
        sampling_params=SamplingParams(
            skip_special_tokens=False,
            spaces_between_special_tokens=False,
            output_kind=RequestOutputKind.DELTA,
            stop=[],
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_stop_str_in_output,
            logprobs=num_sample_logprobs,
            prompt_logprobs=None,
            ignore_eos=ignore_eos,
        ))

    # Add request to the output processor.
    output_processor.add_request(request)

    # Loop over engine core steps; run output processor
    gen_string = ""
    gen_tokens = []
    gen_logprobs = []
    while True:
        # Mock output from the EngineCore.
        outputs = engine_core.get_outputs()
        # An empty step means the mock engine core has nothing left to emit.
        if len(outputs) == 0:
            break

        # Step the output processor.
        processed_outputs = output_processor.process_outputs(outputs)
        request_outputs = processed_outputs.request_outputs
        assert len(request_outputs) == 1
        # Stop token does not rely on abort
        assert not processed_outputs.reqs_to_abort

        # Update tracking.
        request_output = request_outputs[0]
        if request_output.finished:
            finish_reason = ("length" if is_eos_ignore_test else "stop")
            assert request_output.outputs[0].finish_reason == finish_reason

        # Outputs are accumulated across steps (output_kind is DELTA).
        gen_string += request_output.outputs[0].text
        gen_tokens.extend(request_output.outputs[0].token_ids)
        if do_logprobs:
            gen_logprobs.extend(request_output.outputs[0].logprobs)

    # Validate generated text
    control_token = '<|end_of_text|>' if is_eos_test else '<|eot_id|>'
    if is_eos_ignore_test:
        # Length-based stop; expect full string
        ref_str = generation_string + 2 * control_token
    elif include_stop_str_in_output:
        # Stop token triggered; include in output
        ref_str = generation_string + control_token
    else:
        # Stop token triggered but not in output
        ref_str = generation_string
    assert gen_string == ref_str, (f"{gen_string=}, {ref_str=}")

    if do_logprobs:
        # Validate number of sample logprobs
        num_tokens = len(gen_tokens)
        num_logprobs = len(gen_logprobs)
        assert num_tokens == num_logprobs, (
            f"Token count ({num_tokens}) != logprobs count ({num_logprobs})")

    # Check requests are finished
    assert output_processor.get_num_unfinished_requests() == 0
    assert not output_processor.has_unfinished_requests()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
|
||||
@pytest.mark.parametrize("num_sample_logprobs",
|
||||
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
|
||||
@pytest.mark.parametrize("num_prompt_logprobs",
|
||||
[None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
|
||||
def test_stop_string(include_stop_str_in_output: bool,
|
||||
num_sample_logprobs: Optional[int],
|
||||
num_prompt_logprobs: Optional[int], dummy_test_vectors):
|
||||
num_sample_logprobs: Optional[int], dummy_test_vectors):
|
||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
|
||||
log_stats=False)
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=dummy_test_vectors.generation_tokens,
|
||||
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
|
||||
if num_sample_logprobs else None,
|
||||
prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs
|
||||
if num_prompt_logprobs else None)
|
||||
prompt_logprobs_raw=None)
|
||||
|
||||
# Make N requests.
|
||||
request_id_list = [
|
||||
@ -510,7 +672,7 @@ def test_stop_string(include_stop_str_in_output: bool,
|
||||
stop=STOP_STRINGS,
|
||||
include_stop_str_in_output=include_stop_str_in_output,
|
||||
logprobs=num_sample_logprobs,
|
||||
prompt_logprobs=num_prompt_logprobs,
|
||||
prompt_logprobs=None,
|
||||
)) for idx, (prompt, prompt_tokens) in enumerate(
|
||||
zip(dummy_test_vectors.prompt_strings,
|
||||
dummy_test_vectors.prompt_tokens))
|
||||
@ -594,8 +756,7 @@ def test_stop_string(include_stop_str_in_output: bool,
|
||||
# Confirmed tracked logprobs match what we expect
|
||||
_validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs,
|
||||
gen_cumulative_logprobs, dummy_test_vectors,
|
||||
request_id_list, num_sample_logprobs,
|
||||
num_prompt_logprobs)
|
||||
request_id_list, num_sample_logprobs, None)
|
||||
|
||||
assert output_processor.get_num_unfinished_requests() == 0
|
||||
assert not output_processor.has_unfinished_requests()
|
||||
|
||||
@ -20,7 +20,7 @@ NUM_SAMPLE_LOGPROBS_UNDER_TEST = 5
|
||||
# Number of prompt logprobs to request when testing prompt logprobs
|
||||
NUM_PROMPT_LOGPROBS_UNDER_TEST = 7
|
||||
|
||||
TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
|
||||
TOKENIZER_NAME = "meta-llama/Llama-3.2-1B"
|
||||
|
||||
FULL_STRINGS = [
|
||||
"My name is Robert from Neural Magic and I love working on vLLM so much!",
|
||||
@ -330,13 +330,21 @@ class MockEngineCore:
|
||||
# each matrix has dimensions
|
||||
# (num prompt toks) x (num prompt logprobs+1)
|
||||
prompt_logprobs_raw: Optional[list[LogprobsTensors]] = None,
|
||||
eos_token_id: Optional[int] = None,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
ignore_eos: bool = False,
|
||||
) -> None:
|
||||
self.num_requests = len(tokens_list)
|
||||
self.tokens_list = tokens_list
|
||||
self.current_idx = 0
|
||||
self.generated_logprobs_raw = generated_logprobs_raw
|
||||
self.do_logprobs = generated_logprobs_raw is not None
|
||||
self.prompt_logprobs_raw = prompt_logprobs_raw
|
||||
self.do_prompt_logprobs = prompt_logprobs_raw is not None
|
||||
self.request_finished = [False for _ in range(self.num_requests)]
|
||||
self.eos_token_id = eos_token_id
|
||||
self.stop_token_ids = stop_token_ids
|
||||
self.ignore_eos = ignore_eos
|
||||
|
||||
def get_outputs(self) -> list[EngineCoreOutput]:
|
||||
do_logprobs = self.do_logprobs
|
||||
@ -345,7 +353,7 @@ class MockEngineCore:
|
||||
|
||||
outputs = []
|
||||
for req_idx, token_ids in enumerate(self.tokens_list):
|
||||
if len(token_ids) > token_idx:
|
||||
if not self.request_finished[req_idx]:
|
||||
if do_logprobs:
|
||||
assert self.generated_logprobs_raw is not None
|
||||
(logprobs_token_ids_, logprobs_, sampled_token_ranks_) = (
|
||||
@ -365,14 +373,23 @@ class MockEngineCore:
|
||||
prompt_logprobs = None
|
||||
else:
|
||||
prompt_logprobs = None
|
||||
new_token_id = token_ids[token_idx]
|
||||
output = EngineCoreOutput(
|
||||
request_id=f"request-{req_idx}",
|
||||
new_token_ids=[token_ids[token_idx]],
|
||||
new_token_ids=[new_token_id],
|
||||
new_logprobs=logprobs,
|
||||
new_prompt_logprobs_tensors=prompt_logprobs,
|
||||
)
|
||||
if token_idx == len(token_ids) - 1:
|
||||
output.finish_reason = FinishReason.LENGTH
|
||||
self.request_finished[req_idx] = True
|
||||
if not self.ignore_eos and new_token_id == self.eos_token_id:
|
||||
output.finish_reason = FinishReason.STOP
|
||||
self.request_finished[req_idx] = True
|
||||
if new_token_id in (self.stop_token_ids or ()):
|
||||
output.finish_reason = FinishReason.STOP
|
||||
output.stop_reason = new_token_id
|
||||
self.request_finished[req_idx] = True
|
||||
outputs.append(output)
|
||||
|
||||
self.current_idx += 1
|
||||
|
||||
@ -286,14 +286,18 @@ class ModelConfig:
|
||||
if rope_scaling is not None:
|
||||
hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
|
||||
hf_overrides_kw.update(hf_override)
|
||||
msg = ("`--rope-scaling` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_override!r}'`")
|
||||
hf_overrides_str = json.dumps(hf_overrides)
|
||||
msg = (
|
||||
"`--rope-scaling` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
|
||||
warnings.warn(DeprecationWarning(msg), stacklevel=2)
|
||||
if rope_theta is not None:
|
||||
hf_override = {"rope_theta": rope_theta}
|
||||
hf_overrides_kw.update(hf_override)
|
||||
msg = ("`--rope-theta` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_override!r}'`")
|
||||
hf_overrides_str = json.dumps(hf_overrides)
|
||||
msg = (
|
||||
"`--rope-theta` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
|
||||
warnings.warn(DeprecationWarning(msg), stacklevel=2)
|
||||
|
||||
self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
|
||||
|
||||
@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
hf_config = self._model_config.hf_config
|
||||
model_type = hf_config.model_type
|
||||
|
||||
if modality in ["image", "image_embeds"]:
|
||||
if modality in ("image", "image_embeds"):
|
||||
if model_type == "chatglm":
|
||||
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
|
||||
if model_type == "phi3_v":
|
||||
# Workaround since this token is not defined in the tokenizer
|
||||
return f"<|image_{current_count}|>"
|
||||
@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
|
||||
if model_type in ("minicpmo", "minicpmv"):
|
||||
return "(<image>./</image>)"
|
||||
if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
|
||||
"pixtral"):
|
||||
if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
|
||||
# These models do not use image tokens in the prompt
|
||||
return None
|
||||
if model_type == "qwen":
|
||||
|
||||
@ -348,7 +348,11 @@ class InputRegistry:
|
||||
dummy_factory = self._get_dummy_data_factory(model_cls)
|
||||
mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
|
||||
mm_processor_kwargs = get_allowed_kwarg_only_overrides(
|
||||
dummy_factory, overrides=model_config.mm_processor_kwargs)
|
||||
dummy_factory,
|
||||
overrides=model_config.mm_processor_kwargs,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
dummy_data = dummy_factory(InputContext(model_config), seq_len,
|
||||
_MultiModalCounts(mm_counts),
|
||||
@ -381,6 +385,7 @@ class InputRegistry:
|
||||
self,
|
||||
ctx: InputContext,
|
||||
inputs: ProcessorInputs,
|
||||
**kwargs: object,
|
||||
) -> ProcessorInputs:
|
||||
"""The default input processor is a no-op."""
|
||||
return inputs
|
||||
@ -447,6 +452,8 @@ class InputRegistry:
|
||||
model_config.mm_processor_kwargs,
|
||||
inputs.get("mm_processor_kwargs", {}), # type: ignore
|
||||
processor,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
processed_inputs = processor(
|
||||
|
||||
@ -98,6 +98,13 @@ MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
|
||||
|
||||
def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
|
||||
qweight_type: int) -> torch.Tensor:
|
||||
# HACK: when doing chunked prefill we don't generate output tokens
|
||||
# so input to logits generator is empty which causes invalid parameter
|
||||
if x.shape[0] == 0:
|
||||
return torch.empty(x.shape[0],
|
||||
qweight.shape[0],
|
||||
dtype=x.dtype,
|
||||
device=x.device)
|
||||
# there is no need to call any kernel for fp16/bf16
|
||||
if qweight_type in UNQUANTIZED_TYPES:
|
||||
return x @ qweight.T
|
||||
|
||||
@ -161,8 +161,13 @@ class RotaryEmbedding(CustomOp):
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
self.cos_sin_cache = self.cos_sin_cache.to(query.device,
|
||||
dtype=query.dtype)
|
||||
# __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
|
||||
# is expensive, so avoid calling it if possible
|
||||
if self.cos_sin_cache.device != query.device or \
|
||||
self.cos_sin_cache.dtype != query.dtype:
|
||||
self.cos_sin_cache = self.cos_sin_cache.to(query.device,
|
||||
dtype=query.dtype)
|
||||
|
||||
# ops.rotary_embedding()/batched_rotary_embedding()
|
||||
# are in-place operations that update the query and key tensors.
|
||||
if offsets is not None:
|
||||
|
||||
@ -33,7 +33,7 @@ from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import LayerBlockType
|
||||
|
||||
from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
|
||||
SupportsV0Only)
|
||||
SupportsQuant, SupportsV0Only)
|
||||
from .utils import (is_pp_missing_parameter,
|
||||
make_empty_intermediate_tensors_factory, make_layers,
|
||||
maybe_prefix)
|
||||
@ -367,7 +367,7 @@ class BambaModel(nn.Module):
|
||||
|
||||
|
||||
class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
|
||||
IsHybrid, SupportsV0Only):
|
||||
IsHybrid, SupportsV0Only, SupportsQuant):
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
|
||||
@ -44,7 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .interfaces import SupportsV0Only
|
||||
from .interfaces import SupportsQuant, SupportsV0Only
|
||||
from .utils import maybe_prefix
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@ -697,7 +697,7 @@ class BartDecoder(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BartModel(nn.Module):
|
||||
class BartModel(nn.Module, SupportsQuant):
|
||||
_tied_weights_keys = [
|
||||
"encoder.embed_tokens.weight", "decoder.embed_tokens.weight"
|
||||
]
|
||||
@ -763,7 +763,8 @@ class BartModel(nn.Module):
|
||||
return decoder_outputs
|
||||
|
||||
|
||||
class BartForConditionalGeneration(nn.Module, SupportsV0Only):
|
||||
class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant):
|
||||
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
|
||||
base_model_prefix = "model"
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# Adapted from
|
||||
# https://github.com/THUDM/ChatGLM2-6B
|
||||
"""Inference-only ChatGLM model compatible with THUDM weights."""
|
||||
import json
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
|
||||
"The configuration of this model indicates that it supports "
|
||||
"vision inputs, but you instantiated the text-only version "
|
||||
"of this model. Please use the vision model by setting "
|
||||
f"`--hf-overrides {hf_overrides!r}`")
|
||||
f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
|
||||
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import math
|
||||
from typing import (Any, Iterable, Literal, Mapping, Optional, Sequence, Set,
|
||||
Tuple, TypedDict, Union)
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import BatchFeature, Gemma3Config, ProcessorMixin
|
||||
from transformers import BatchFeature, Gemma3Config, Gemma3Processor
|
||||
from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
@ -14,10 +16,11 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
||||
NestedTensors)
|
||||
from vllm.multimodal.parse import ImageSize, MultiModalDataItems
|
||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||
MultiModalDataItems)
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
BaseProcessingInfo, PromptReplacement,
|
||||
PromptUpdate, PromptUpdateDetails)
|
||||
PromptUpdate, encode_tokens)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
@ -31,8 +34,15 @@ logger = init_logger(__name__)
|
||||
|
||||
class Gemma3ImagePixelInputs(TypedDict):
    """Pixel-value image inputs passed to the Gemma3 vision tower.

    ``pixel_values`` holds every crop of every image flattened along the
    first dimension; ``num_crops`` records the per-image crop count.
    """

    type: Literal["pixel_values"]
    pixel_values: torch.Tensor
    """
    Shape: `(num_crops_total, num_channels, height, width)`

    `num_crops_total` is the total number of crops
    over each image over each prompt in the batch.
    """
    num_crops: torch.Tensor
    """Shape: `(batch_size * num_images,)`"""
|
||||
|
||||
|
||||
Gemma3ImageInputs = Gemma3ImagePixelInputs
|
||||
@ -40,6 +50,9 @@ Gemma3ImageInputs = Gemma3ImagePixelInputs
|
||||
|
||||
class Gemma3ProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_hf_processor(self, **kwargs: object):
    """Return the HF processor for this model, requested as a
    ``Gemma3Processor``.

    All keyword arguments are forwarded unchanged to
    ``self.ctx.get_hf_processor``.
    """
    return self.ctx.get_hf_processor(Gemma3Processor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    """Return the supported modalities and their per-prompt limits.

    Only ``"image"`` is listed, with a limit of ``None``.
    """
    return {"image": None}
|
||||
|
||||
@ -48,22 +61,160 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
hf_config = self.ctx.get_hf_config()
|
||||
return {"image": hf_config.mm_tokens_per_image}
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
|
||||
def _resolve_image_kwargs(
    self,
    processor: Gemma3Processor,
    keys: set[str],
) -> dict[str, Any]:
    """Resolve the effective value of each image-processing option in *keys*.

    For every key, the attribute of the same name on
    ``processor.image_processor`` is used; when that attribute is ``None``
    the value falls back to the ``images_kwargs`` section of the kwargs
    merged by ``processor._merge_kwargs``.

    Args:
        processor: The HF processor whose settings are inspected.
        keys: Names of the image-processor options to resolve.

    Returns:
        A dict mapping each requested key to its resolved value.
    """
    image_processor = processor.image_processor
    kwargs = processor._merge_kwargs(
        Gemma3ProcessorKwargs,
        tokenizer_init_kwargs=processor.tokenizer.init_kwargs,
    )

    images_kwargs = kwargs["images_kwargs"]

    def _resolve_kw(key: str):
        # The image processor's own attribute wins; the merged kwargs are
        # consulted only when that attribute is unset (None).
        val = getattr(image_processor, key)
        if val is None:
            val = images_kwargs[key]

        return val

    return {k: _resolve_kw(k) for k in keys}
|
||||
|
||||
def get_num_crops(
    self,
    *,
    image_width: int,
    image_height: int,
    processor: Optional[Gemma3Processor],
) -> int:
    """Return how many pan-and-scan crops are produced for one image.

    The computation follows ``Gemma3ImageProcessor.pan_and_scan``.

    Args:
        image_width: Width of the image.
        image_height: Height of the image.
        processor: HF processor to read settings from; when ``None`` it is
            obtained via ``self.get_hf_processor()``.

    Returns:
        ``0`` when pan-and-scan is disabled, when the long/short side ratio
        is below ``pan_and_scan_min_ratio_to_activate``, or when the
        resulting crops would be smaller than
        ``pan_and_scan_min_crop_size``; otherwise the number of crops.
    """
    if processor is None:
        processor = self.get_hf_processor()

    images_kwargs = self._resolve_image_kwargs(
        processor, {
            "do_pan_and_scan", "pan_and_scan_min_crop_size",
            "pan_and_scan_max_num_crops",
            "pan_and_scan_min_ratio_to_activate"
        })

    do_pan_and_scan = images_kwargs["do_pan_and_scan"]
    pan_and_scan_min_crop_size = images_kwargs[
        "pan_and_scan_min_crop_size"]
    pan_and_scan_max_num_crops = images_kwargs[
        "pan_and_scan_max_num_crops"]
    pan_and_scan_min_ratio_to_activate = images_kwargs[
        "pan_and_scan_min_ratio_to_activate"]

    if not do_pan_and_scan:
        return 0

    # Based on Gemma3ImageProcessor.pan_and_scan
    if image_width >= image_height:
        # Landscape (or square): crops are taken along the width.
        if image_width / image_height < pan_and_scan_min_ratio_to_activate:
            return 0

        num_crops_w = min(
            int(math.floor(image_width / pan_and_scan_min_crop_size)),
            int(math.floor(image_width / image_height + 0.5)),
        )

        # At least 2 crops, at most the configured maximum.
        num_crops_w = max(2, num_crops_w)
        num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
        num_crops_h = 1
    else:
        # Portrait: crops are taken along the height.
        if image_height / image_width < pan_and_scan_min_ratio_to_activate:
            return 0

        num_crops_h = min(
            int(math.floor(image_height / pan_and_scan_min_crop_size)),
            int(math.floor(image_height / image_width + 0.5)),
        )

        num_crops_h = max(2, num_crops_h)
        num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
        num_crops_w = 1

    crop_size_w = int(math.ceil(image_width / num_crops_w))
    crop_size_h = int(math.ceil(image_height / num_crops_h))

    # Crops that would be smaller than the minimum size are not produced.
    if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
        return 0

    return num_crops_w * num_crops_h
|
||||
|
||||
def get_image_repl(
    self,
    *,
    image_width: int,
    image_height: int,
    processor: Optional[Gemma3Processor],
) -> str:
    """Build the prompt text that replaces one image placeholder.

    Args:
        image_width: Width of the image.
        image_height: Height of the image.
        processor: HF processor to use; when ``None`` it is obtained via
            ``self.get_hf_processor()``.

    Returns:
        With no crops, just ``processor.full_image_sequence``. Otherwise a
        sentence containing one image sequence for the original image and
        one per crop, separated by single spaces.
    """
    if processor is None:
        processor = self.get_hf_processor()

    image_token = processor.boi_token

    num_crops = self.get_num_crops(
        image_width=image_width,
        image_height=image_height,
        processor=processor,
    )

    if num_crops == 0:
        image_text = image_token
    else:
        crops_image_tokens = " ".join(image_token
                                      for _ in range(num_crops))
        image_text = (
            f"Here is the original image {image_token} and here are some "
            f"crops to help you see better {crops_image_tokens}")

    # Expand every placeholder token into the full image token sequence.
    return image_text.replace(image_token, processor.full_image_sequence)
|
||||
|
||||
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
    processor: Optional[Gemma3Processor],
) -> int:
    """Return the number of prompt tokens one image of this size occupies.

    The count is obtained by building the replacement text with
    ``self.get_image_repl`` and tokenizing it without special tokens.

    Args:
        image_width: Width of the image.
        image_height: Height of the image.
        processor: HF processor forwarded to ``get_image_repl``.
    """
    tokenizer = self.get_tokenizer()
    image_repl = self.get_image_repl(
        image_width=image_width,
        image_height=image_height,
        processor=processor,
    )

    image_repl_tokens = encode_tokens(
        tokenizer,
        image_repl,
        add_special_tokens=False,
    )
    return len(image_repl_tokens)
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
    """Return an image size chosen to yield the largest feature count.

    The height is scaled by the resolved ``pan_and_scan_max_num_crops``
    setting so the aspect ratio is ``max_num_crops:1``.
    """
    processor = self.get_hf_processor()

    images_kwargs = self._resolve_image_kwargs(
        processor, {"pan_and_scan_max_num_crops"})
    max_num_crops = images_kwargs["pan_and_scan_max_num_crops"]

    # Result in the max possible feature size (h:w = max_num_crops:1)
    return ImageSize(height=50 * max_num_crops, width=50)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
    """Return the token count for the image size with the most features.

    Uses ``self.get_image_size_with_most_features()`` for the size and
    ``self.get_num_image_tokens`` (with ``processor=None``) for the count.
    """
    # The size unpacks as (width, height).
    target_width, target_height = self.get_image_size_with_most_features()

    return self.get_num_image_tokens(
        image_width=target_width,
        image_height=target_height,
        processor=None,
    )
|
||||
|
||||
|
||||
class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
|
||||
@ -73,10 +224,11 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> ProcessorInputs:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
boi_token = tokenizer.boi_token
|
||||
processor = self.info.get_hf_processor()
|
||||
image_token = processor.boi_token
|
||||
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
target_width, target_height = \
|
||||
self.info.get_image_size_with_most_features()
|
||||
|
||||
@ -86,8 +238,13 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
|
||||
# NOTE: We need to separate the image tokens here because
|
||||
# encode("\n\n\n\n") != encode("\n\n") * 2, which interferes
|
||||
# with the detection of prompt updates when the image tokens are
|
||||
# right next to each other
|
||||
return ProcessorInputs(
|
||||
prompt_text=" ".join([boi_token] * num_images),
|
||||
prompt_text=" ".join([image_token] * num_images),
|
||||
mm_data=mm_data,
|
||||
)
|
||||
|
||||
@ -100,22 +257,49 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
# TODO(woosuk): Support pan-and-scan.
|
||||
img_kwargs = mm_kwargs.get("images_kwargs", {})
|
||||
img_kwargs["do_pan_and_scan"] = False
|
||||
mm_kwargs["images_kwargs"] = img_kwargs
|
||||
return super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt,
|
||||
mm_data,
|
||||
mm_kwargs,
|
||||
)
|
||||
|
||||
# HF processor pops the `num_crops` kwarg, which is needed by vLLM
|
||||
if (images := mm_data.get("images")) is not None:
|
||||
assert isinstance(images, list)
|
||||
|
||||
parsed_images = (self._get_data_parser().parse_mm_data({
|
||||
"image":
|
||||
images
|
||||
}).get_items("image", ImageProcessorItems))
|
||||
image_sizes = [
|
||||
parsed_images.get_image_size(i)
|
||||
for i in range(len(parsed_images))
|
||||
]
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
|
||||
num_crops = [
|
||||
self.info.get_num_crops(image_width=size.width,
|
||||
image_height=size.height,
|
||||
processor=hf_processor)
|
||||
for size in image_sizes
|
||||
]
|
||||
|
||||
processed_outputs["num_crops"] = torch.tensor(num_crops)
|
||||
|
||||
return processed_outputs
|
||||
|
||||
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    """Describe how the HF processor outputs map onto image items.

    ``pixel_values`` is a flat field whose per-image slice size is
    ``num_crops + 1``; ``num_crops`` is batched per image.
    """
    # An empty tensor is used when the processor produced no crop counts.
    num_crops = hf_inputs.get("num_crops", torch.empty(0))

    return dict(
        pixel_values=MultiModalFieldConfig.flat_from_sizes(
            "image", num_crops + 1),
        num_crops=MultiModalFieldConfig.batched("image"),
    )
|
||||
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
@ -123,25 +307,23 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
|
||||
hf_processor_mm_kwargs: Mapping[str, Any],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
hf_config = self.info.get_hf_config()
|
||||
|
||||
boi_token = tokenizer.boi_token
|
||||
image_token = tokenizer.image_token
|
||||
mm_tokens_per_image = hf_config.mm_tokens_per_image
|
||||
image_tokens_expanded = "".join([image_token] * mm_tokens_per_image)
|
||||
image_token = hf_processor.boi_token
|
||||
|
||||
def get_replacement_gemma3(item_idx: int):
|
||||
return PromptUpdateDetails(
|
||||
full=hf_processor.full_image_sequence,
|
||||
features=image_tokens_expanded,
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
|
||||
image_size = images.get_image_size(item_idx)
|
||||
return self.info.get_image_repl(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
processor=hf_processor,
|
||||
)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=boi_token,
|
||||
target=image_token,
|
||||
replacement=get_replacement_gemma3,
|
||||
)
|
||||
]
|
||||
@ -254,19 +436,27 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
def _parse_and_validate_image_input(
|
||||
self, **kwargs: object) -> Optional[Gemma3ImageInputs]:
|
||||
pixel_values = kwargs.pop("pixel_values", None)
|
||||
num_crops = kwargs.pop("num_crops", None)
|
||||
image_embeds = kwargs.pop("image_embeds", None)
|
||||
assert image_embeds is None, "Gemma3 does not support image_embeds."
|
||||
if pixel_values is None:
|
||||
return None
|
||||
|
||||
if not isinstance(pixel_values, (torch.Tensor, list[torch.Tensor])):
|
||||
if not isinstance(pixel_values, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of pixel values. "
|
||||
f"Got type: {type(pixel_values)}")
|
||||
|
||||
if not isinstance(num_crops, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of num_crops values. "
|
||||
f"Got type: {type(num_crops)}")
|
||||
|
||||
pixel_values = flatten_bn(pixel_values, concat=True)
|
||||
num_crops = flatten_bn(num_crops, concat=True)
|
||||
|
||||
return Gemma3ImagePixelInputs(
|
||||
type="pixel_values",
|
||||
data=self._validate_pixel_values(pixel_values),
|
||||
pixel_values=self._validate_pixel_values(pixel_values),
|
||||
num_crops=num_crops,
|
||||
)
|
||||
|
||||
def _image_pixels_to_features(
|
||||
@ -283,7 +473,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
image_input: Gemma3ImageInputs,
|
||||
) -> torch.Tensor:
|
||||
assert self.vision_tower is not None
|
||||
pixel_values = image_input["data"]
|
||||
|
||||
pixel_values = image_input["pixel_values"]
|
||||
vision_outputs = self._image_pixels_to_features(
|
||||
self.vision_tower,
|
||||
pixel_values,
|
||||
|
||||
@ -981,5 +981,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
                                               torch.Tensor]]) -> Set[str]:
    """Load checkpoint weights via ``AutoWeightsLoader``.

    Weights whose names start with any of ``skip_prefixes`` are passed to
    the loader as prefixes to skip.

    Args:
        weights: Iterable of ``(name, tensor)`` pairs.

    Returns:
        Whatever ``AutoWeightsLoader.load_weights`` returns for *weights*.
    """
    # unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B
    skip_prefixes = [
        "action_embed", "temporal_embed", "track_embed",
        "track_embed_decoder", "box_token", "cg_criterion", "cg_model",
        "loc_encoder", "loc_decoder", "sam", "temporal_token",
        "track_token"
    ]
    loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
    return loader.load_weights(weights)
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
# Copyright (c) Alibaba Cloud.
|
||||
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
|
||||
"""Inference-only QWen model compatible with HuggingFace weights."""
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
|
||||
"The configuration of this model indicates that it supports "
|
||||
"vision inputs, but you instantiated the text-only version "
|
||||
"of this model. Please use the vision model by setting "
|
||||
f"`--hf-overrides {hf_overrides!r}`")
|
||||
f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
|
||||
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
|
||||
@ -226,7 +226,11 @@ class MultiModalPlugin(ABC):
|
||||
|
||||
if callable(max_mm_tokens):
|
||||
mm_processor_kwargs = get_allowed_kwarg_only_overrides(
|
||||
max_mm_tokens, overrides=model_config.mm_processor_kwargs)
|
||||
max_mm_tokens,
|
||||
overrides=model_config.mm_processor_kwargs,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
max_mm_tokens = max_mm_tokens(InputContext(model_config),
|
||||
**mm_processor_kwargs)
|
||||
|
||||
|
||||
@ -1488,11 +1488,11 @@ def get_allowed_kwarg_only_overrides(
|
||||
if requires_kw_only:
|
||||
logger.warning(
|
||||
"The following intended overrides are not keyword-only args "
|
||||
"and and will be dropped: %s", dropped_keys)
|
||||
"and will be dropped: %s", dropped_keys)
|
||||
else:
|
||||
logger.warning(
|
||||
"The following intended overrides are not keyword args "
|
||||
"and and will be dropped: %s", dropped_keys)
|
||||
"and will be dropped: %s", dropped_keys)
|
||||
|
||||
return filtered_overrides
|
||||
|
||||
|
||||
@ -556,9 +556,6 @@ class Scheduler(SchedulerInterface):
|
||||
if spec_token_ids is not None:
|
||||
request.spec_token_ids = spec_token_ids[req_index]
|
||||
|
||||
# Get prompt logprobs for this request.
|
||||
prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
|
||||
|
||||
stopped = False
|
||||
new_logprobs = None
|
||||
new_token_ids: list[int] = []
|
||||
@ -591,6 +588,8 @@ class Scheduler(SchedulerInterface):
|
||||
new_token_ids,
|
||||
)
|
||||
|
||||
# Get prompt logprobs for this request.
|
||||
prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
|
||||
# Transmit partial if chunked prefill & prompt logprobs is enabled
|
||||
if new_token_ids or prompt_logprobs_tensors is not None:
|
||||
# Add EngineCoreOutput for this Request.
|
||||
@ -644,8 +643,7 @@ class Scheduler(SchedulerInterface):
|
||||
|
||||
if request.status == RequestStatus.RUNNING:
|
||||
self.running.remove(request)
|
||||
if request.request_id in self.scheduled_req_ids:
|
||||
self.scheduled_req_ids.remove(request.request_id)
|
||||
self.scheduled_req_ids.discard(request.request_id)
|
||||
else:
|
||||
self.waiting.remove(request)
|
||||
request.status = finished_status
|
||||
|
||||
@ -88,7 +88,8 @@ class IncrementalDetokenizer:
|
||||
stop_buffer_length=stop_buffer_length,
|
||||
)
|
||||
|
||||
def update(self, new_token_ids: list[int]) -> Optional[str]:
|
||||
def update(self, new_token_ids: list[int],
|
||||
stop_terminated: bool) -> Optional[str]:
|
||||
"""
|
||||
Update RequestState for the request_id by:
|
||||
1) Detokenize the new token ids incrementally.
|
||||
@ -96,11 +97,22 @@ class IncrementalDetokenizer:
|
||||
|
||||
Return matched stop string or None.
|
||||
"""
|
||||
|
||||
if not new_token_ids:
|
||||
# Skip detokenization if no new token ids
|
||||
return None
|
||||
if self.tokenizer is None:
|
||||
# Skip detokenization if no tokenizer
|
||||
self.token_ids.extend(new_token_ids)
|
||||
return None
|
||||
|
||||
if stop_terminated and not self.include_stop_str_in_output:
|
||||
# If stop-terminated, exclude last token from detokenization
|
||||
# based on include_stop_str_in_output parameter.
|
||||
skipped_stop_token_id = new_token_ids[-1]
|
||||
new_token_ids = new_token_ids[:-1]
|
||||
else:
|
||||
skipped_stop_token_id = None
|
||||
|
||||
# 1) Detokenize the new token ids incrementally.
|
||||
# TODO(woosuk): This method becomes very inefficient when the number of
|
||||
# new_token_ids is more than 1. We need to optimize this.
|
||||
@ -127,7 +139,14 @@ class IncrementalDetokenizer:
|
||||
|
||||
self.output_text += decoded_text
|
||||
|
||||
# 2) Evaluate stop criteria.
|
||||
if stop_terminated:
|
||||
if skipped_stop_token_id is not None:
|
||||
# Cleanup after skipping detokenization
|
||||
self.token_ids.append(skipped_stop_token_id)
|
||||
# Stop token triggered; skip stop string check
|
||||
return None
|
||||
|
||||
# 2) Evaluate stop strings.
|
||||
stop_string = None
|
||||
if self.stop:
|
||||
stop = StopChecker.check_stop_strings(
|
||||
|
||||
@ -299,9 +299,9 @@ class OutputProcessor:
|
||||
# in the EngineCore.
|
||||
req_state.is_prefilling = not new_token_ids
|
||||
|
||||
# 2) Detokenize the token ids into text and check for stop
|
||||
# strings.
|
||||
stop_string = req_state.detokenizer.update(new_token_ids)
|
||||
# 2) Detokenize the token ids into text and perform stop checks.
|
||||
stop_string = req_state.detokenizer.update(
|
||||
new_token_ids, finish_reason == FinishReason.STOP)
|
||||
if stop_string and finish_reason != FinishReason.STOP:
|
||||
finish_reason = FinishReason.STOP
|
||||
stop_reason = stop_string
|
||||
|
||||
@ -27,7 +27,6 @@ logger = init_logger(__name__)
|
||||
class StructuredOutputManager:
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
self.vocab_size = vllm_config.model_config.get_vocab_size()
|
||||
self.vllm_config = vllm_config
|
||||
self.init_complete = False
|
||||
|
||||
@ -41,6 +40,7 @@ class StructuredOutputManager:
|
||||
tokenizer_group.ping()
|
||||
|
||||
tokenizer = tokenizer_group.get_lora_tokenizer(None)
|
||||
self.vocab_size = tokenizer.max_token_id
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
# NOTE: ideally, xgrammar should handle this accordingly.
|
||||
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user