diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 233fc35d2cf59..491f8c3962f73 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -365,6 +365,7 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
+ block_quant_shape: List[int] = None,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
@@ -385,10 +386,17 @@ class BenchmarkWorker:
else:
config = op_config[min(op_config.keys(),
key=lambda x: abs(x - num_tokens))]
- kernel_time = benchmark_config(config, num_tokens, num_experts,
- shard_intermediate_size, hidden_size,
- topk, dtype, use_fp8_w8a8,
- use_int8_w8a16)
+ kernel_time = benchmark_config(config,
+ num_tokens,
+ num_experts,
+ shard_intermediate_size,
+ hidden_size,
+ topk,
+ dtype,
+ use_fp8_w8a8,
+ use_int8_w8a16,
+ num_iters=100,
+ block_quant_shape=block_quant_shape)
return config, kernel_time
def tune(
@@ -487,6 +495,14 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int,
f.write("\n")
+def get_weight_block_size_safety(config, default_value=None):
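+    """Return `weight_block_size` from the config's quantization_config,
+    or `default_value` when it is absent."""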
+
+ quantization_config = getattr(config, 'quantization_config', {})
+ if isinstance(quantization_config, dict):
+ return quantization_config.get('weight_block_size', default_value)
+ return default_value
+
+
def main(args: argparse.Namespace):
print(args)
block_quant_shape = None
@@ -508,7 +524,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
- block_quant_shape = config.quantization_config['weight_block_size']
+ block_quant_shape = get_weight_block_size_safety(config)
elif config.architectures[0] == "Qwen2MoeForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 98e7572981dee..bcbd7bf9600c5 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
* ✅︎
* ✅︎
- * ✅︎\*
+ * ⚠️
- * `GLM4VForCausalLM`^
* GLM-4V
* T + I
@@ -786,9 +786,9 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
- * `InternVLChatModel`
- * InternVL 2.5, Mono-InternVL, InternVL 2.0
+ * InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0
* T + IE+
- * `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
+ * `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
*
* ✅︎
* ✅︎
@@ -856,12 +856,12 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
* ✅︎
- * `PaliGemmaForConditionalGeneration`
- * PaliGemma ⚠️, PaliGemma 2 ⚠️
+ * PaliGemma, PaliGemma 2
* T + IE
* `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.
*
* ✅︎
- * ✅︎
+ * ⚠️
- * `Phi3VForCausalLM`
* Phi-3-Vision, Phi-3.5-Vision
* T + IE+
@@ -926,34 +926,15 @@ See [this page](#generative-models) for more information on how to use generativ
E Pre-computed embeddings can be inputted for this modality.
+ Multiple items can be inputted per text prompt for this modality.
-:::{warning}
-vLLM does not currently support PrefixLM attention mask, so our PaliGemma implementation uses regular causal attention, which causes the model output to be unstable.
-
-We may deprecate this model series in a future release.
-:::
-
-:::{note}
-`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support backends other than FlashAttention.
-:::
-
-:::{note}
-To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
-:::
-
-:::{note}
-The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
-For more details, please see:
-:::
-
-:::{note}
-To use Qwen2.5-VL series models, you have to install Hugging Face Transformers library from source via `pip install git+https://github.com/huggingface/transformers`.
-:::
-
-:::{note}
+:::{important}
To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers`.
-The earliest commit that supports this is [`50d3530aa04e7a7d003e6b255a98f79fd0447357`](https://github.com/huggingface/transformers/commit/50d3530aa04e7a7d003e6b255a98f79fd0447357).
+Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
+You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": true}'`.
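+For offline inference, pass `mm_processor_kwargs={"do_pan_and_scan": True}` to the `LLM` constructor instead.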
+:::
+
+:::{warning}
Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
However, there are differences in how they handle text + image inputs:
@@ -969,9 +950,23 @@ V1 currently uses a simplified attention pattern:
- Will be updated in the future to support the correct behavior
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
+:::
-Additionally, vLLM's current Gemma 3 implementation does not support the pan-and-scan image pre-processing algorithm, which helps handle images with skewed aspect ratios by intelligently cropping them into multiple views.
-Without this feature, model performance may degrade when processing images that deviate significantly from square dimensions.
+:::{note}
+`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support backends other than FlashAttention.
+:::
+
+:::{note}
+To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+:::
+
+:::{note}
+The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
+For more details, please see:
+:::
+
+:::{warning}
+Our PaliGemma implementation has the same problem as Gemma 3 (see above) on both V0 and V1.
:::
### Pooling Models
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 39acab4765a30..432cda5e24396 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -123,10 +123,14 @@ def run_gemma3(questions: list[str], modality: str):
assert modality == "image"
model_name = "google/gemma-3-4b-it"
- llm = LLM(model=model_name,
- max_model_len=2048,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ llm = LLM(
+ model=model_name,
+ max_model_len=2048,
+ max_num_seqs=2,
+        # Default is False; enabling it is not supported on V1 yet
+ mm_processor_kwargs={"do_pan_and_scan": True},
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompts = [("user\n"
f"{question}\n"
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 4963e6a8c4e72..b47004aa96156 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -83,10 +83,14 @@ def load_deepseek_vl2(question: str, image_urls: list[str]):
def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
model_name = "google/gemma-3-4b-it"
- llm = LLM(model=model_name,
- max_model_len=8192,
- max_num_seqs=2,
- limit_mm_per_prompt={"image": len(image_urls)})
+ llm = LLM(
+ model=model_name,
+ max_model_len=8192,
+ max_num_seqs=2,
+        # Default is False; enabling it is not supported on V1 yet
+ mm_processor_kwargs={"do_pan_and_scan": True},
+ limit_mm_per_prompt={"image": len(image_urls)},
+ )
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py
deleted file mode 100644
index 2faabcb031f78..0000000000000
--- a/tests/lora/test_ultravox.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import shutil
-from os import path
-from tempfile import TemporaryDirectory
-
-import pytest
-import torch
-from huggingface_hub import snapshot_download
-from safetensors.torch import load_file, save_file
-from transformers import AutoTokenizer
-
-from vllm.lora.request import LoRARequest
-
-from ..models.utils import check_outputs_equal
-
-ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
-LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
-
-VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
-
-PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
-
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
- # Simple autouse wrapper to run both engines for each test
- # This can be promoted up to conftest.py to run for every
- # test in a package
- pass
-
-
-def llama3_1_8b_chess_lora_path():
- return snapshot_download(
- repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")
-
-
-# can't use llama lora adapter without module name transformation
-# because ultravox nest language model
-def transform_module_names_for_ultravox(state_dict):
- transformed_state_dict = {}
- for key, value in state_dict.items():
- new_key = key.replace("base_model.model",
- "base_model.model.language_model")
- transformed_state_dict[new_key] = value
- return transformed_state_dict
-
-
-def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
- tensor_file = "adapter_model.safetensors"
- state_dict = load_file(path.join(source_repo, tensor_file))
- transformed_state_dict = transform_module_names_for_ultravox(state_dict)
-
- save_file(transformed_state_dict, path.join(target_path, tensor_file))
-
- config_file = "adapter_config.json"
- shutil.copyfile(path.join(source_repo, config_file),
- path.join(target_path, config_file))
- return target_path
-
-
-def _get_prompt(audio_count, question, placeholder, model_name) -> str:
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- placeholder = f"{placeholder}\n" * audio_count
-
- return tokenizer.apply_chat_template([{
- 'role': 'user',
- 'content': f"{placeholder}{question}"
- }],
- tokenize=False,
- add_generation_prompt=True)
-
-
-def test_ultravox_lora(vllm_runner):
- """
- TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
- """
- # Workaround to prevent device mismatch in Whisper.
- # Can be removed when it is fixed upstream in transformer
- # https://github.com/huggingface/transformers/pull/35866
- torch.set_default_device("cpu")
-
- llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()
- with TemporaryDirectory() as temp_ultravox_lora_dir:
- llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
- llama3_1_8b_chess_lora, temp_ultravox_lora_dir)
- with vllm_runner(
- ULTRAVOX_MODEL_NAME,
- enforce_eager=True,
- max_num_seqs=2,
- enable_lora=True,
- max_loras=1,
- max_lora_rank=128,
- dtype="bfloat16",
- max_model_len=1024,
- ) as vllm_model:
- ultravox_outputs: list[tuple[
- list[int], str]] = vllm_model.generate_greedy(
- [
- _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
- ULTRAVOX_MODEL_NAME)
- ],
- 256,
- lora_request=LoRARequest(str(1), 1,
- llama3_1_8b_ultravox_chess_lora),
- )
-
- # run llama with and without lora to compare outputs with above
- with vllm_runner(
- LLMA_MODEL_NAME,
- enforce_eager=True,
- max_num_seqs=2,
- enable_lora=True,
- max_loras=1,
- max_lora_rank=128,
- dtype="bfloat16",
- max_model_len=1024,
- ) as vllm_model:
- llama_outputs: list[tuple[list[int], str]] = (
- vllm_model.generate_greedy(
- [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
- 256,
- lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
- ))
-
- check_outputs_equal(
- outputs_0_lst=ultravox_outputs,
- outputs_1_lst=llama_outputs,
- name_0="ultravox",
- name_1="llama",
- )
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 2540933bbc23c..84a5260ad9a08 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,7 +9,7 @@ from pathlib import PosixPath
import pytest
from packaging.version import Version
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.platforms import current_platform
@@ -234,16 +234,41 @@ VLM_TEST_SETTINGS = {
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
+ "gemma3": VLMTestInfo(
+ models=["google/gemma-3-4b-it"],
+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts({
+ "stop_sign": "What's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "What is the season?", # noqa: E501
+ }),
+        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
+ max_model_len=4096,
+ max_num_seqs=2,
+ # TODO: Use AutoModelForVision2Seq once transformers supports this
+ auto_cls=AutoModelForPreTraining,
+ dtype="bfloat16",
+ vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
+ patch_hf_runner=model_utils.gemma3_patch_hf_runner,
+ ),
"glm4v": VLMTestInfo(
models=["THUDM/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
- prompt_formatter=identity,
- img_idx_to_prompt=lambda idx: "",
+ prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts({
+ "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
+ }),
max_model_len=2048,
max_num_seqs=2,
dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
- patch_hf_runner=model_utils.glm_patch_hf_runner,
+ patch_hf_runner=model_utils.glm4v_patch_hf_runner,
+        # The image embeddings match HF's, but the language decoder outputs
+        # are only consistent up to 2 decimal places, so we reduce the
+        # number of generated tokens for the test to pass.
+ max_tokens=8,
+ num_logprobs=10,
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo(
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py
index aaad584c9cd51..31f0209b102db 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -61,7 +61,9 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
- vllm_runner_kwargs_: dict[str, Any] = {}
+ vllm_runner_kwargs_: dict[str, Any] = {
+ "disable_mm_preprocessor_cache": True,
+ }
if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode:
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 66410f66ca0d6..3b4d1237c37a1 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -304,8 +304,20 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
-def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
- """Patches and returns an instance of the HfRunner to use for GLM4."""
+def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+ """Patches and returns an instance of the HfRunner to use for Gemma 3."""
+ hf_processor = hf_model.processor
+
+ def processor(*args, **kwargs):
+ return hf_processor(*args, do_pan_and_scan=True, **kwargs)
+
+ hf_model.processor = processor
+
+ return hf_model
+
+
+def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+ """Patches and returns an instance of the HfRunner to use for GLM4V."""
hf_processor = hf_model.processor
patch_padding_side(hf_processor)
@@ -313,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if images is None:
return hf_processor(*args, **kwargs)
+ images = [images] if isinstance(images, Image) else images
+
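+        # Pair each image with the question text that follows its placeholder
+        # so that each pair can be sent as its own chat message below.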
+ contents = re.findall(
+ r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
+ text,
+ )
+ assert len(contents) == len(images)
+
return hf_processor.apply_chat_template(
[{
"role": "user",
- "image": images,
- "content": text
- }],
+ "image": image,
+ "content": content
+ } for image, content in zip(images, contents)],
add_generation_prompt=True,
tokenize=True,
return_dict=True,
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index 84471c92a293f..713fc733e21c6 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -96,14 +96,14 @@ def _run_check(
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
+ prompt = "" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
-    processed_inputs = processor.apply("<image>" * len(images), mm_data,
- mm_processor_kwargs)
+ processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
@@ -152,9 +152,7 @@ def test_processor_override(
}
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
- trust_remote_code=True,
+ model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index 0a0f1cb389380..fdbe2f17692f7 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -9,10 +9,8 @@ from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
-models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
-
-@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
@@ -25,7 +23,7 @@ models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
- model: str,
+ model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
@@ -36,9 +34,7 @@ def test_processor_override(
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx = build_model_context(
- model_name=model,
- tokenizer_name=model,
- trust_remote_code=True,
+ model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index adbc4f5b5586b..f5bd661071ac6 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -56,14 +56,14 @@ def _run_check(
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
+ prompt = "" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
-    processed_inputs = processor.apply("<image>" * len(images), mm_data,
- mm_processor_kwargs)
+ processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
@@ -109,9 +109,7 @@ def test_processor_override(
}
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
- trust_remote_code=True,
+ model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index dca25e5d4c4c6..74bca0e358996 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -36,8 +36,7 @@ def _validate_image_max_tokens_one(
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
+ model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
+ model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
@@ -166,8 +164,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
+ model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index 96abc840f0521..c27898a40b711 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -37,8 +37,7 @@ def _validate_image_max_tokens_one(
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
+ model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
+ model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
@@ -167,8 +165,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
+ model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index 420644f70842f..2f0c8e7e5492c 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -35,9 +35,7 @@ def test_processor_override(
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
- trust_remote_code=True,
+ model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index b882528aafb9c..95204c7ebb4d8 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -30,8 +30,7 @@ def test_processor_override(
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx = build_model_context(
- model_name=model_id,
- tokenizer_name=model_id,
+ model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
diff --git a/tests/models/utils.py b/tests/models/utils.py
index b0182d545f4b9..2280a6c916d95 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -10,6 +10,8 @@ from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
+from .registry import HF_EXAMPLE_MODELS
+
TokensText = tuple[list[int], str]
@@ -250,10 +252,8 @@ def check_logprobs_close(
def build_model_context(
- model_name: str,
+ model_id: str,
task: TaskOption = "auto",
- tokenizer_name: Optional[str] = None,
- trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[dict] = None,
limit_mm_per_prompt: Optional[dict] = None,
@@ -262,9 +262,7 @@ def build_model_context(
"""Creates an InputContext for a given model.
Args:
- model_name: Name of the model being considered.
- tokenizer_name: Name of the tokenizer being considered.
- trust_remote_code: Whether or not to allow loading remote code.
+ model_id: ID of the model being considered.
mm_processor_kwargs: optional processor kwargs for to be leveraged
in the input processor, mapper, dummy data creation, etc.
limit_mm_per_prompt: Multimodal limits.
@@ -272,21 +270,24 @@ def build_model_context(
Returns:
InputContext for the model being considered.
"""
- if tokenizer_name is None:
- tokenizer_name = model_name
+ model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+ model_info.check_available_online(on_fail="skip")
+ model_info.check_transformers_version(on_fail="skip")
+
if dtype is None:
dtype = "half"
model_config = ModelConfig(
- model_name,
+ model_id,
task=task,
- tokenizer=tokenizer_name,
- tokenizer_mode="auto",
- trust_remote_code=trust_remote_code,
+ tokenizer=model_info.tokenizer or model_id,
+ tokenizer_mode=model_info.tokenizer_mode,
+ trust_remote_code=model_info.trust_remote_code,
dtype=dtype,
seed=0,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
+ hf_overrides=model_info.hf_overrides,
)
return InputContext(model_config)
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 0de853ba6e5e5..388f7f45e051d 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -470,22 +470,184 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
assert not output_processor.has_unfinished_requests()
+@pytest.mark.parametrize(
+ "include_stop_str_in_output,stop_token_type,ignore_eos,num_sample_logprobs",
+ [(False, "stop_token_ids", False, None),
+ (True, "stop_token_ids", False, None),
+ (False, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
+ (True, "stop_token_ids", False, NUM_SAMPLE_LOGPROBS_UNDER_TEST),
+ (False, "eos_token_id", False, None), (True, "eos_token_id", False, None),
+ (False, "eos_token_id", True, None)])
+def test_stop_token(include_stop_str_in_output: bool,
+ num_sample_logprobs: Optional[int], stop_token_type: str,
+ ignore_eos: bool, dummy_test_vectors):
+ """Test output processor EOS/stop token handling.
+
+ Send mock engine core request to mock engine core and pass core outputs
+ to output processor. Validate output processor tokens, text and
+ (if enabled) sample logprobs. Batch-size one.
+
+ The test emulates a scenario where a model outputs text tokens followed
+ by two identical control tokens:
+ ...
+
+ If EOS is under test, the control tokens are EOS; otherwise, they are
+ some other token id.
+
+ Test behavior:
+
+ * If EOS is under test and `ignore_eos=True`, the detokenized string
+      should be <token><token>...<token><control><control> and the finish
+ reason should be "length" (i.e. no stop occurs)
+
+ * else, if `include_stop_str_in_output==True`, the detokenized
+      string should be <token><token>...<token><control> and the finish
+ reason should be "stop" (i.e. first control token causes stop
+ and is represented in output text)
+
+ * else, the detokenized string should be
+      <token><token>...<token> and the finish reason should be "stop"
+ (i.e. first control token causes stop but is not represented
+ in output text.)
+
+    Note: some test details are tuned for meta-llama/Llama-3.2-1B;
+    another model will only work if the test is modified accordingly.
+
+ Args:
+ include_stop_str_in_output: stop token str appears in output text
+ num_sample_logprobs: number of sample logprobs (`None` for no logprobs)
+ stop_token_type: "eos_token_id" for EOS, "stop_token_ids" for stop token
+ ignore_eos: if True, EOS stops are disabled
+ dummy_test_vectors: dummy engine core outputs and other data structures
+ """
+ model_id = dummy_test_vectors.tokenizer.name_or_path
+ if model_id != 'meta-llama/Llama-3.2-1B':
+ raise AssertionError("Test requires meta-llama/Llama-3.2-1B but "
+ f"{model_id} is in use.")
+ do_logprobs = num_sample_logprobs is not None
+ # EOS under test; if False, stop_token_ids under test
+ is_eos_test = stop_token_type == "eos_token_id"
+ # EOS under test but ignore_eos enabled
+ is_eos_ignore_test = is_eos_test and ignore_eos
+ eos_token_id = (
+ dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None
+ ) # '<|end_of_text|>'
+ stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>'
+
+ output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
+ log_stats=False)
+ # Dummy engine core outputs, with control tokens suffixed to test stops
+ suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids)
+ assert suffix_token is not None and isinstance(suffix_token[0], int)
+ generation_string = dummy_test_vectors.generation_strings[0]
+ generation_tokens = (dummy_test_vectors.generation_tokens[0] +
+ 2 * suffix_token)
+ if do_logprobs:
+ generation_logprobs = (
+ dummy_test_vectors.generation_logprobs[0] +
+ 2 * [dummy_test_vectors.generation_logprobs[0][-1]])
+ prompt_string = dummy_test_vectors.prompt_strings[0]
+ prompt_tokens = dummy_test_vectors.prompt_tokens[0]
+ engine_core = MockEngineCore(
+ tokens_list=[generation_tokens],
+ generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
+ prompt_logprobs_raw=None,
+ eos_token_id=eos_token_id,
+ stop_token_ids=stop_token_ids,
+ ignore_eos=ignore_eos)
+
+ # Make request.
+ request_id = "request-0"
+ request = EngineCoreRequest(
+ request_id=request_id,
+ prompt=prompt_string,
+ prompt_token_ids=prompt_tokens,
+ arrival_time=0,
+ mm_inputs=None,
+ mm_hashes=None,
+ mm_placeholders=None,
+ eos_token_id=eos_token_id,
+ lora_request=None,
+ sampling_params=SamplingParams(
+ skip_special_tokens=False,
+ spaces_between_special_tokens=False,
+ output_kind=RequestOutputKind.DELTA,
+ stop=[],
+ stop_token_ids=stop_token_ids,
+ include_stop_str_in_output=include_stop_str_in_output,
+ logprobs=num_sample_logprobs,
+ prompt_logprobs=None,
+ ignore_eos=ignore_eos,
+ ))
+
+ # Add request to the detokenizer.
+ output_processor.add_request(request)
+
+ # Loop over engine core steps; run output processor
+ gen_string = ""
+ gen_tokens = []
+ gen_logprobs = []
+ while True:
+ # Mock output from the EngineCore.
+ outputs = engine_core.get_outputs()
+ if len(outputs) == 0:
+ break
+
+ # Step the Detokenizer.
+ processed_outputs = output_processor.process_outputs(outputs)
+ request_outputs = processed_outputs.request_outputs
+ assert len(request_outputs) == 1
+ # Stop token does not rely on abort
+ assert not processed_outputs.reqs_to_abort
+
+ # Update tracking.
+ request_output = request_outputs[0]
+ if request_output.finished:
+ finish_reason = ("length" if is_eos_ignore_test else "stop")
+ assert request_output.outputs[0].finish_reason == finish_reason
+
+ gen_string += request_output.outputs[0].text
+ gen_tokens.extend(request_output.outputs[0].token_ids)
+ if do_logprobs:
+ gen_logprobs.extend(request_output.outputs[0].logprobs)
+
+ # Validate generated text
+ control_token = '<|end_of_text|>' if is_eos_test else '<|eot_id|>'
+ if is_eos_ignore_test:
+ # Length-based stop; expect full string
+ ref_str = generation_string + 2 * control_token
+ elif include_stop_str_in_output:
+ # Stop token triggered; include in output
+ ref_str = generation_string + control_token
+ else:
+ # Stop token triggered but not in output
+ ref_str = generation_string
+ assert gen_string == ref_str, (f"{gen_string=}, {ref_str=}")
+
+ if do_logprobs:
+ # Validate number of sample logprobs
+ num_tokens = len(gen_tokens)
+ num_logprobs = len(gen_logprobs)
+ assert num_tokens == num_logprobs, (
+ f"Token count ({num_tokens}) != logprobs count ({num_logprobs})")
+
+ # Check requests are finished
+ assert output_processor.get_num_unfinished_requests() == 0
+ assert not output_processor.has_unfinished_requests()
+
+
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
@pytest.mark.parametrize("num_sample_logprobs",
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
-@pytest.mark.parametrize("num_prompt_logprobs",
- [None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
def test_stop_string(include_stop_str_in_output: bool,
- num_sample_logprobs: Optional[int],
- num_prompt_logprobs: Optional[int], dummy_test_vectors):
+ num_sample_logprobs: Optional[int], dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs else None,
- prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs
- if num_prompt_logprobs else None)
+ prompt_logprobs_raw=None)
# Make N requests.
request_id_list = [
@@ -510,7 +672,7 @@ def test_stop_string(include_stop_str_in_output: bool,
stop=STOP_STRINGS,
include_stop_str_in_output=include_stop_str_in_output,
logprobs=num_sample_logprobs,
- prompt_logprobs=num_prompt_logprobs,
+ prompt_logprobs=None,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
@@ -594,8 +756,7 @@ def test_stop_string(include_stop_str_in_output: bool,
# Confirmed tracked logprobs match what we expect
_validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs,
gen_cumulative_logprobs, dummy_test_vectors,
- request_id_list, num_sample_logprobs,
- num_prompt_logprobs)
+ request_id_list, num_sample_logprobs, None)
assert output_processor.get_num_unfinished_requests() == 0
assert not output_processor.has_unfinished_requests()
diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py
index f0e344cfa6fc7..1ee93c72cd263 100644
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@@ -20,7 +20,7 @@ NUM_SAMPLE_LOGPROBS_UNDER_TEST = 5
# Number of prompt logprobs to request when testing prompt logprobs
NUM_PROMPT_LOGPROBS_UNDER_TEST = 7
-TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
+TOKENIZER_NAME = "meta-llama/Llama-3.2-1B"
FULL_STRINGS = [
"My name is Robert from Neural Magic and I love working on vLLM so much!",
@@ -330,13 +330,21 @@ class MockEngineCore:
# each matrix has dimensions
# (num prompt toks) x (num prompt logprobs+1)
prompt_logprobs_raw: Optional[list[LogprobsTensors]] = None,
+ eos_token_id: Optional[int] = None,
+ stop_token_ids: Optional[list[int]] = None,
+ ignore_eos: bool = False,
) -> None:
+ self.num_requests = len(tokens_list)
self.tokens_list = tokens_list
self.current_idx = 0
self.generated_logprobs_raw = generated_logprobs_raw
self.do_logprobs = generated_logprobs_raw is not None
self.prompt_logprobs_raw = prompt_logprobs_raw
self.do_prompt_logprobs = prompt_logprobs_raw is not None
+ self.request_finished = [False for _ in range(self.num_requests)]
+ self.eos_token_id = eos_token_id
+ self.stop_token_ids = stop_token_ids
+ self.ignore_eos = ignore_eos
def get_outputs(self) -> list[EngineCoreOutput]:
do_logprobs = self.do_logprobs
@@ -345,7 +353,7 @@ class MockEngineCore:
outputs = []
for req_idx, token_ids in enumerate(self.tokens_list):
- if len(token_ids) > token_idx:
+ if not self.request_finished[req_idx]:
if do_logprobs:
assert self.generated_logprobs_raw is not None
(logprobs_token_ids_, logprobs_, sampled_token_ranks_) = (
@@ -365,14 +373,23 @@ class MockEngineCore:
prompt_logprobs = None
else:
prompt_logprobs = None
+ new_token_id = token_ids[token_idx]
output = EngineCoreOutput(
request_id=f"request-{req_idx}",
- new_token_ids=[token_ids[token_idx]],
+ new_token_ids=[new_token_id],
new_logprobs=logprobs,
new_prompt_logprobs_tensors=prompt_logprobs,
)
if token_idx == len(token_ids) - 1:
+ output.finish_reason = FinishReason.LENGTH
+ self.request_finished[req_idx] = True
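+                # An EOS token stops the request unless ignore_eos is set.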
+ if not self.ignore_eos and new_token_id == self.eos_token_id:
output.finish_reason = FinishReason.STOP
+ self.request_finished[req_idx] = True
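+                # Custom stop tokens always stop the request; the triggering
+                # token id is surfaced to the caller via stop_reason.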
+ if new_token_id in (self.stop_token_ids or ()):
+ output.finish_reason = FinishReason.STOP
+ output.stop_reason = new_token_id
+ self.request_finished[req_idx] = True
outputs.append(output)
self.current_idx += 1
diff --git a/vllm/config.py b/vllm/config.py
index 3ac7ceabd8d3d..35411ca73ad23 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -286,14 +286,18 @@ class ModelConfig:
if rope_scaling is not None:
hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
hf_overrides_kw.update(hf_override)
- msg = ("`--rope-scaling` will be removed in a future release. "
- f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+ hf_overrides_str = json.dumps(hf_overrides)
+ msg = (
+ "`--rope-scaling` will be removed in a future release. "
+ f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2)
if rope_theta is not None:
hf_override = {"rope_theta": rope_theta}
hf_overrides_kw.update(hf_override)
- msg = ("`--rope-theta` will be removed in a future release. "
- f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+ hf_overrides_str = json.dumps(hf_overrides)
+ msg = (
+ "`--rope-theta` will be removed in a future release. "
+ f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2)
self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 61f21482f7072..4ce4fa897cc96 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
hf_config = self._model_config.hf_config
model_type = hf_config.model_type
- if modality in ["image", "image_embeds"]:
+ if modality in ("image", "image_embeds"):
+ if model_type == "chatglm":
+ return "<|begin_of_image|><|endoftext|><|end_of_image|>"
if model_type == "phi3_v":
# Workaround since this token is not defined in the tokenizer
return f"<|image_{current_count}|>"
@@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
if model_type in ("minicpmo", "minicpmv"):
return "(./)"
- if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
- "pixtral"):
+ if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
# These models do not use image tokens in the prompt
return None
if model_type == "qwen":
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index a0bd8f278fd03..b6ceb5fb82d70 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -348,7 +348,11 @@ class InputRegistry:
dummy_factory = self._get_dummy_data_factory(model_cls)
mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
mm_processor_kwargs = get_allowed_kwarg_only_overrides(
- dummy_factory, overrides=model_config.mm_processor_kwargs)
+ dummy_factory,
+ overrides=model_config.mm_processor_kwargs,
+ requires_kw_only=False,
+ allow_var_kwargs=True,
+ )
dummy_data = dummy_factory(InputContext(model_config), seq_len,
_MultiModalCounts(mm_counts),
@@ -381,6 +385,7 @@ class InputRegistry:
self,
ctx: InputContext,
inputs: ProcessorInputs,
+ **kwargs: object,
) -> ProcessorInputs:
"""The default input processor is a no-op."""
return inputs
@@ -447,6 +452,8 @@ class InputRegistry:
model_config.mm_processor_kwargs,
inputs.get("mm_processor_kwargs", {}), # type: ignore
processor,
+ requires_kw_only=False,
+ allow_var_kwargs=True,
)
processed_inputs = processor(
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 5d4c1c6ec8930..c92bcbea540a5 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -98,6 +98,13 @@ MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
qweight_type: int) -> torch.Tensor:
+    # HACK: during chunked prefill, no output tokens are generated, so the
+    # input to the logits generator is empty, which would otherwise pass an
+    # invalid parameter to the quantized matmul kernels below.
+ if x.shape[0] == 0:
+ return torch.empty(x.shape[0],
+ qweight.shape[0],
+ dtype=x.dtype,
+ device=x.device)
# there is no need to call any kernel for fp16/bf16
if qweight_type in UNQUANTIZED_TYPES:
return x @ qweight.T
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index d4b8cf25fec57..fd27775b7dc0c 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -161,8 +161,13 @@ class RotaryEmbedding(CustomOp):
) -> Tuple[torch.Tensor, torch.Tensor]:
from vllm import _custom_ops as ops
- self.cos_sin_cache = self.cos_sin_cache.to(query.device,
- dtype=query.dtype)
+ # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
+ # is expensive, so avoid calling it if possible
+ if self.cos_sin_cache.device != query.device or \
+ self.cos_sin_cache.dtype != query.dtype:
+ self.cos_sin_cache = self.cos_sin_cache.to(query.device,
+ dtype=query.dtype)
+
# ops.rotary_embedding()/batched_rotary_embedding()
# are in-place operations that update the query and key tensors.
if offsets is not None:
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index ec62e41d59f0f..61b68125e07e0 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -33,7 +33,7 @@ from vllm.sequence import IntermediateTensors
from vllm.utils import LayerBlockType
from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
- SupportsV0Only)
+ SupportsQuant, SupportsV0Only)
from .utils import (is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix)
@@ -367,7 +367,7 @@ class BambaModel(nn.Module):
class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
- IsHybrid, SupportsV0Only):
+ IsHybrid, SupportsV0Only, SupportsQuant):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 109b65d92cf90..04d6cde555e28 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -44,7 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
-from .interfaces import SupportsV0Only
+from .interfaces import SupportsQuant, SupportsV0Only
from .utils import maybe_prefix
logger = logging.get_logger(__name__)
@@ -697,7 +697,7 @@ class BartDecoder(nn.Module):
return hidden_states
-class BartModel(nn.Module):
+class BartModel(nn.Module, SupportsQuant):
_tied_weights_keys = [
"encoder.embed_tokens.weight", "decoder.embed_tokens.weight"
]
@@ -763,7 +763,8 @@ class BartModel(nn.Module):
return decoder_outputs
-class BartForConditionalGeneration(nn.Module, SupportsV0Only):
+class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant):
+ packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
base_model_prefix = "model"
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 6eca25212ee66..14dca23b3934f 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -2,6 +2,7 @@
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights."""
+import json
from typing import Iterable, Optional, Set, Tuple, Union
import torch
@@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
- f"`--hf-overrides {hf_overrides!r}`")
+ f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
super().__init__(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 121aee51786b8..ac80059cbe6d8 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
+import math
from typing import (Any, Iterable, Literal, Mapping, Optional, Sequence, Set,
Tuple, TypedDict, Union)
import torch
from torch import nn
-from transformers import BatchFeature, Gemma3Config, ProcessorMixin
+from transformers import BatchFeature, Gemma3Config, Gemma3Processor
+from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
from vllm.config import VllmConfig
from vllm.logger import init_logger
@@ -14,10 +16,11 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
-from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
+ MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
- PromptUpdate, PromptUpdateDetails)
+ PromptUpdate, encode_tokens)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
@@ -31,8 +34,15 @@ logger = init_logger(__name__)
class Gemma3ImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
- data: torch.Tensor
- """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+ pixel_values: torch.Tensor
+ """
+ Shape: `(num_crops_total, num_channels, height, width)`
+
+    `num_crops_total` is the total number of crops summed over
+    all images of all prompts in the batch.
+ """
+ num_crops: torch.Tensor
+ """Shape: `(batch_size * num_images,)`"""
Gemma3ImageInputs = Gemma3ImagePixelInputs
@@ -40,6 +50,9 @@ Gemma3ImageInputs = Gemma3ImagePixelInputs
class Gemma3ProcessingInfo(BaseProcessingInfo):
+ def get_hf_processor(self, **kwargs: object):
+ return self.ctx.get_hf_processor(Gemma3Processor, **kwargs)
+
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
@@ -48,22 +61,160 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
- hf_config = self.ctx.get_hf_config()
- return {"image": hf_config.mm_tokens_per_image}
+ return {"image": self.get_max_image_tokens()}
+
+ def _resolve_image_kwargs(
+ self,
+ processor: Gemma3Processor,
+ keys: set[str],
+ ) -> dict[str, Any]:
+ image_processor = processor.image_processor
+ kwargs = processor._merge_kwargs(
+ Gemma3ProcessorKwargs,
+ tokenizer_init_kwargs=processor.tokenizer.init_kwargs,
+ )
+
+ images_kwargs = kwargs["images_kwargs"]
+
+ def _resolve_kw(key: str):
+ val = getattr(image_processor, key)
+ if val is None:
+ val = images_kwargs[key]
+
+ return val
+
+ return {k: _resolve_kw(k) for k in keys}
+
+ def get_num_crops(
+ self,
+ *,
+ image_width: int,
+ image_height: int,
+ processor: Optional[Gemma3Processor],
+ ) -> int:
+ if processor is None:
+ processor = self.get_hf_processor()
+
+ images_kwargs = self._resolve_image_kwargs(
+ processor, {
+ "do_pan_and_scan", "pan_and_scan_min_crop_size",
+ "pan_and_scan_max_num_crops",
+ "pan_and_scan_min_ratio_to_activate"
+ })
+
+ do_pan_and_scan = images_kwargs["do_pan_and_scan"]
+ pan_and_scan_min_crop_size = images_kwargs[
+ "pan_and_scan_min_crop_size"]
+ pan_and_scan_max_num_crops = images_kwargs[
+ "pan_and_scan_max_num_crops"]
+ pan_and_scan_min_ratio_to_activate = images_kwargs[
+ "pan_and_scan_min_ratio_to_activate"]
+
+ if not do_pan_and_scan:
+ return 0
+
+ # Based on Gemma3ImageProcessor.pan_and_scan
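+        # Landscape images are split along the width; the portrait branch
+        # below mirrors the same logic along the height.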
+ if image_width >= image_height:
+ if image_width / image_height < pan_and_scan_min_ratio_to_activate:
+ return 0
+
+ num_crops_w = min(
+ int(math.floor(image_width / pan_and_scan_min_crop_size)),
+ int(math.floor(image_width / image_height + 0.5)),
+ )
+
+ num_crops_w = max(2, num_crops_w)
+ num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
+ num_crops_h = 1
+ else:
+ if image_height / image_width < pan_and_scan_min_ratio_to_activate:
+ return 0
+
+ num_crops_h = min(
+ int(math.floor(image_height / pan_and_scan_min_crop_size)),
+ int(math.floor(image_height / image_width + 0.5)),
+ )
+
+ num_crops_h = max(2, num_crops_h)
+ num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
+ num_crops_w = 1
+
+ crop_size_w = int(math.ceil(image_width / num_crops_w))
+ crop_size_h = int(math.ceil(image_height / num_crops_h))
+
+ if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
+ return 0
+
+ return num_crops_w * num_crops_h
+
+ def get_image_repl(
+ self,
+ *,
+ image_width: int,
+ image_height: int,
+ processor: Optional[Gemma3Processor],
+ ) -> str:
+ if processor is None:
+ processor = self.get_hf_processor()
+
+ image_token = processor.boi_token
+
+ num_crops = self.get_num_crops(
+ image_width=image_width,
+ image_height=image_height,
+ processor=processor,
+ )
+
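+        # Without pan-and-scan crops, a single image placeholder suffices;
+        # otherwise reproduce the HF processor's framing text for the crops.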
+ if num_crops == 0:
+ image_text = image_token
+ else:
+ crops_image_tokens = " ".join(image_token
+ for _ in range(num_crops))
+ image_text = (
+ f"Here is the original image {image_token} and here are some "
+ f"crops to help you see better {crops_image_tokens}")
+
+ return image_text.replace(image_token, processor.full_image_sequence)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
- processor: Optional[ProcessorMixin],
+ processor: Optional[Gemma3Processor],
) -> int:
- hf_config = self.ctx.get_hf_config()
- return hf_config.mm_tokens_per_image
+ tokenizer = self.get_tokenizer()
+ image_repl = self.get_image_repl(
+ image_width=image_width,
+ image_height=image_height,
+ processor=processor,
+ )
+
+ image_repl_tokens = encode_tokens(
+ tokenizer,
+ image_repl,
+ add_special_tokens=False,
+ )
+ return len(image_repl_tokens)
def get_image_size_with_most_features(self) -> ImageSize:
- # Result in the max possible feature size (h:w = 16:1)
- return ImageSize(height=8000, width=50)
+ processor = self.get_hf_processor()
+
+ images_kwargs = self._resolve_image_kwargs(
+ processor, {"pan_and_scan_max_num_crops"})
+ max_num_crops = images_kwargs["pan_and_scan_max_num_crops"]
+
+ # Result in the max possible feature size (h:w = max_num_crops:1)
+ return ImageSize(height=50 * max_num_crops, width=50)
+
+ def get_max_image_tokens(self) -> int:
+ target_width, target_height = self.get_image_size_with_most_features()
+
+ return self.get_num_image_tokens(
+ image_width=target_width,
+ image_height=target_height,
+ processor=None,
+ )
class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
@@ -73,10 +224,11 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
- tokenizer = self.info.get_tokenizer()
- boi_token = tokenizer.boi_token
+ processor = self.info.get_hf_processor()
+ image_token = processor.boi_token
num_images = mm_counts.get("image", 0)
+
target_width, target_height = \
self.info.get_image_size_with_most_features()
@@ -86,8 +238,13 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
height=target_height,
num_images=num_images)
}
+
+ # NOTE: We need to separate the image tokens here because
+ # encode("\n\n\n\n") != encode("\n\n") * 2, which interferes
+ # with the detection of prompt updates when the image tokens are
+ # right next to each other
return ProcessorInputs(
- prompt_text=" ".join([boi_token] * num_images),
+ prompt_text=" ".join([image_token] * num_images),
mm_data=mm_data,
)
@@ -100,22 +257,49 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
- # TODO(woosuk): Support pan-and-scan.
- img_kwargs = mm_kwargs.get("images_kwargs", {})
- img_kwargs["do_pan_and_scan"] = False
- mm_kwargs["images_kwargs"] = img_kwargs
- return super()._call_hf_processor(
- prompt=prompt,
- mm_data=mm_data,
- mm_kwargs=mm_kwargs,
+ processed_outputs = super()._call_hf_processor(
+ prompt,
+ mm_data,
+ mm_kwargs,
)
+ # HF processor pops the `num_crops` kwarg, which is needed by vLLM
+ if (images := mm_data.get("images")) is not None:
+ assert isinstance(images, list)
+
+ parsed_images = (self._get_data_parser().parse_mm_data({
+ "image":
+ images
+ }).get_items("image", ImageProcessorItems))
+ image_sizes = [
+ parsed_images.get_image_size(i)
+ for i in range(len(parsed_images))
+ ]
+ hf_processor = self.info.get_hf_processor(**mm_kwargs)
+
+ num_crops = [
+ self.info.get_num_crops(image_width=size.width,
+ image_height=size.height,
+ processor=hf_processor)
+ for size in image_sizes
+ ]
+
+ processed_outputs["num_crops"] = torch.tensor(num_crops)
+
+ return processed_outputs
+
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
- return dict(pixel_values=MultiModalFieldConfig.batched("image"))
+ num_crops = hf_inputs.get("num_crops", torch.empty(0))
+
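+        # Each image contributes the full image plus its crops, hence
+        # num_crops + 1 pixel_values slices per image.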
+ return dict(
+ pixel_values=MultiModalFieldConfig.flat_from_sizes(
+ "image", num_crops + 1),
+ num_crops=MultiModalFieldConfig.batched("image"),
+ )
def _get_prompt_updates(
self,
@@ -123,25 +307,23 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
- tokenizer = self.info.get_tokenizer()
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
- hf_config = self.info.get_hf_config()
-
- boi_token = tokenizer.boi_token
- image_token = tokenizer.image_token
- mm_tokens_per_image = hf_config.mm_tokens_per_image
- image_tokens_expanded = "".join([image_token] * mm_tokens_per_image)
+ image_token = hf_processor.boi_token
def get_replacement_gemma3(item_idx: int):
- return PromptUpdateDetails(
- full=hf_processor.full_image_sequence,
- features=image_tokens_expanded,
+ images = mm_items.get_items("image", ImageProcessorItems)
+
+ image_size = images.get_image_size(item_idx)
+ return self.info.get_image_repl(
+ image_width=image_size.width,
+ image_height=image_size.height,
+ processor=hf_processor,
)
return [
PromptReplacement(
modality="image",
- target=boi_token,
+ target=image_token,
replacement=get_replacement_gemma3,
)
]
@@ -254,19 +436,27 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal,
def _parse_and_validate_image_input(
self, **kwargs: object) -> Optional[Gemma3ImageInputs]:
pixel_values = kwargs.pop("pixel_values", None)
+ num_crops = kwargs.pop("num_crops", None)
image_embeds = kwargs.pop("image_embeds", None)
assert image_embeds is None, "Gemma3 does not support image_embeds."
if pixel_values is None:
return None
- if not isinstance(pixel_values, (torch.Tensor, list[torch.Tensor])):
+ if not isinstance(pixel_values, (torch.Tensor, list)):
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")
+ if not isinstance(num_crops, (torch.Tensor, list)):
+ raise ValueError("Incorrect type of num_crops values. "
+ f"Got type: {type(num_crops)}")
+
pixel_values = flatten_bn(pixel_values, concat=True)
+ num_crops = flatten_bn(num_crops, concat=True)
+
return Gemma3ImagePixelInputs(
type="pixel_values",
- data=self._validate_pixel_values(pixel_values),
+ pixel_values=self._validate_pixel_values(pixel_values),
+ num_crops=num_crops,
)
def _image_pixels_to_features(
@@ -283,7 +473,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal,
image_input: Gemma3ImageInputs,
) -> torch.Tensor:
assert self.vision_tower is not None
- pixel_values = image_input["data"]
+
+ pixel_values = image_input["pixel_values"]
vision_outputs = self._image_pixels_to_features(
self.vision_tower,
pixel_values,
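The renamed keys imply an inputs container along these lines (a sketch inferred from the diff; the real `Gemma3ImagePixelInputs` is defined elsewhere in the file):

```python
from typing import Literal, TypedDict

import torch

class Gemma3ImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    # Flattened over all images and crops: shape (sum(num_crops + 1), 3, H, W).
    pixel_values: torch.Tensor
    # Per-image crop counts, shape (num_images,); needed to regroup the above.
    num_crops: torch.Tensor
```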
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 1aa8455bad821..fcaf7fecaafc9 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -981,5 +981,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
def load_weights(self, weights: Iterable[Tuple[str,
torch.Tensor]]) -> Set[str]:
- loader = AutoWeightsLoader(self)
+ # Skip modules that appear in OpenGVLab/InternVideo2_5_Chat_8B
+ # checkpoints but are unused here.
+ skip_prefixes = [
+ "action_embed", "temporal_embed", "track_embed",
+ "track_embed_decoder", "box_token", "cg_criterion", "cg_model",
+ "loc_encoder", "loc_decoder", "sam", "temporal_token",
+ "track_token"
+ ]
+ loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
return loader.load_weights(weights)
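A rough sketch of what prefix-based skipping amounts to during weight loading (illustrative only, not `AutoWeightsLoader`'s actual code):

```python
from typing import Iterable, Iterator, Tuple

import torch

def skip_prefixed_weights(
    weights: Iterable[Tuple[str, torch.Tensor]],
    skip_prefixes: list[str],
) -> Iterator[Tuple[str, torch.Tensor]]:
    # Drop checkpoint tensors (e.g. "sam.*", "track_embed.*") that the
    # vLLM module tree has no parameters for.
    for name, tensor in weights:
        if any(name.startswith(prefix) for prefix in skip_prefixes):
            continue
        yield name, tensor
```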
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 96abfb9d1096c..a33739a8eef9d 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -5,7 +5,7 @@
# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
"""Inference-only QWen model compatible with HuggingFace weights."""
-
+import json
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
import torch
@@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
- f"`--hf-overrides {hf_overrides!r}`")
+ f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
super().__init__(vllm_config=vllm_config, prefix=prefix)
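Why `json.dumps` here: `repr()` renders a Python dict with single quotes, which `--hf-overrides` (a JSON-valued flag) rejects. Illustrative, with a hypothetical override value:

```python
import json

hf_overrides = {"architectures": ["QwenVLForConditionalGeneration"]}  # hypothetical

print(repr(hf_overrides))
# {'architectures': ['QwenVLForConditionalGeneration']}  <- not valid JSON

print(json.dumps(hf_overrides))
# {"architectures": ["QwenVLForConditionalGeneration"]}  <- valid JSON
```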
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index e0b160a65047a..5159b0bca8c1c 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -226,7 +226,11 @@ class MultiModalPlugin(ABC):
if callable(max_mm_tokens):
mm_processor_kwargs = get_allowed_kwarg_only_overrides(
- max_mm_tokens, overrides=model_config.mm_processor_kwargs)
+ max_mm_tokens,
+ overrides=model_config.mm_processor_kwargs,
+ requires_kw_only=False,
+ allow_var_kwargs=True,
+ )
max_mm_tokens = max_mm_tokens(InputContext(model_config),
**mm_processor_kwargs)
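A sketch of the relaxed filtering these two flags request (assumed semantics, not the exact body of `get_allowed_kwarg_only_overrides`): with `requires_kw_only=False`, positional-or-keyword parameters also accept overrides, and with `allow_var_kwargs=True`, a `**kwargs` catch-all accepts anything.

```python
import inspect
from typing import Any, Callable, Mapping

def filter_overrides(
    fn: Callable,
    overrides: Mapping[str, Any],
    requires_kw_only: bool = True,
    allow_var_kwargs: bool = False,
) -> dict[str, Any]:
    params = inspect.signature(fn).parameters
    has_var_kw = any(p.kind is inspect.Parameter.VAR_KEYWORD
                     for p in params.values())
    allowed_kinds = {inspect.Parameter.KEYWORD_ONLY}
    if not requires_kw_only:
        allowed_kinds.add(inspect.Parameter.POSITIONAL_OR_KEYWORD)
    return {
        k: v
        for k, v in overrides.items()
        if (k in params and params[k].kind in allowed_kinds) or
        (has_var_kw and allow_var_kwargs)
    }
```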
diff --git a/vllm/utils.py b/vllm/utils.py
index 9cad2b8854a23..a8eba27dbcdbd 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1488,11 +1488,11 @@ def get_allowed_kwarg_only_overrides(
if requires_kw_only:
logger.warning(
"The following intended overrides are not keyword-only args "
- "and and will be dropped: %s", dropped_keys)
+ "and will be dropped: %s", dropped_keys)
else:
logger.warning(
"The following intended overrides are not keyword args "
- "and and will be dropped: %s", dropped_keys)
+ "and will be dropped: %s", dropped_keys)
return filtered_overrides
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 458abedbe71ba..db9fc05652e2e 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -556,9 +556,6 @@ class Scheduler(SchedulerInterface):
if spec_token_ids is not None:
request.spec_token_ids = spec_token_ids[req_index]
- # Get prompt logprobs for this request.
- prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
-
stopped = False
new_logprobs = None
new_token_ids: list[int] = []
@@ -591,6 +588,8 @@ class Scheduler(SchedulerInterface):
new_token_ids,
)
+ # Get prompt logprobs for this request.
+ prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
# Transmit partial if chunked prefill & prompt logprobs is enabled
if new_token_ids or prompt_logprobs_tensors is not None:
# Add EngineCoreOutput for this Request.
@@ -644,8 +643,7 @@ class Scheduler(SchedulerInterface):
if request.status == RequestStatus.RUNNING:
self.running.remove(request)
- if request.request_id in self.scheduled_req_ids:
- self.scheduled_req_ids.remove(request.request_id)
+ self.scheduled_req_ids.discard(request.request_id)
else:
self.waiting.remove(request)
request.status = finished_status
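`set.discard()` is the no-op-if-absent variant of `remove()`, which is why the membership check can go away:

```python
scheduled_req_ids = {"req-1"}

scheduled_req_ids.discard("req-2")  # absent: silently does nothing
scheduled_req_ids.discard("req-1")  # present: removed as usual

# The replaced pattern needed a guard to avoid KeyError:
# if "req-2" in scheduled_req_ids:
#     scheduled_req_ids.remove("req-2")
```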
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 92754920b62d1..bf06a17507b21 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -88,7 +88,8 @@ class IncrementalDetokenizer:
stop_buffer_length=stop_buffer_length,
)
- def update(self, new_token_ids: list[int]) -> Optional[str]:
+ def update(self, new_token_ids: list[int],
+ stop_terminated: bool) -> Optional[str]:
"""
Update RequestState for the request_id by:
1) Detokenize the new token ids incrementally.
@@ -96,11 +97,22 @@ class IncrementalDetokenizer:
Return matched stop string or None.
"""
-
+ if not new_token_ids:
+ # Skip detokenization if no new token ids
+ return None
if self.tokenizer is None:
+ # Skip detokenization if no tokenizer
self.token_ids.extend(new_token_ids)
return None
+ if stop_terminated and not self.include_stop_str_in_output:
+ # The request finished on a stop token; unless
+ # include_stop_str_in_output is set, exclude that final token
+ # from detokenization.
+ skipped_stop_token_id = new_token_ids[-1]
+ new_token_ids = new_token_ids[:-1]
+ else:
+ skipped_stop_token_id = None
+
# 1) Detokenize the new token ids incrementally.
# TODO(woosuk): This method becomes very inefficient when the number of
# new_token_ids is more than 1. We need to optimize this.
@@ -127,7 +139,14 @@ class IncrementalDetokenizer:
self.output_text += decoded_text
- # 2) Evaluate stop criteria.
+ if stop_terminated:
+ if skipped_stop_token_id is not None:
+ # Cleanup after skipping detokenization
+ self.token_ids.append(skipped_stop_token_id)
+ # Stop token triggered; skip stop string check
+ return None
+
+ # 2) Evaluate stop strings.
stop_string = None
if self.stop:
stop = StopChecker.check_stop_strings(
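A self-contained trace of the new stop-terminated path (hypothetical token ids; the real method also runs the incremental decode between these steps):

```python
new_token_ids = [101, 102, 2]   # hypothetical; 2 is the stop token id
include_stop_str_in_output = False
stop_terminated = True
token_ids: list[int] = []

if stop_terminated and not include_stop_str_in_output:
    skipped_stop_token_id = new_token_ids[-1]
    new_token_ids = new_token_ids[:-1]      # detokenize only [101, 102]
else:
    skipped_stop_token_id = None

# ... incremental detokenization of new_token_ids happens here ...
token_ids.extend(new_token_ids)

if skipped_stop_token_id is not None:
    # The stop token still belongs to the sequence's token ids;
    # it just never reaches the decoded text.
    token_ids.append(skipped_stop_token_id)

assert token_ids == [101, 102, 2]
```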
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 83180b66bea0d..04235eda09266 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -299,9 +299,9 @@ class OutputProcessor:
# in the EngineCore.
req_state.is_prefilling = not new_token_ids
- # 2) Detokenize the token ids into text and check for stop
- # strings.
- stop_string = req_state.detokenizer.update(new_token_ids)
+ # 2) Detokenize the token ids into text and perform stop checks.
+ stop_string = req_state.detokenizer.update(
+ new_token_ids, finish_reason == FinishReason.STOP)
if stop_string and finish_reason != FinishReason.STOP:
finish_reason = FinishReason.STOP
stop_reason = stop_string
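The call-site contract in miniature (a sketch with a stand-in enum, not vLLM's `FinishReason`): a stop-string match promotes any non-STOP finish reason to STOP and records the matched string as the stop reason.

```python
from enum import Enum
from typing import Optional, Tuple

class FinishReason(Enum):  # stand-in for vLLM's enum
    STOP = "stop"
    LENGTH = "length"

def resolve_finish(
    finish_reason: Optional[FinishReason],
    stop_string: Optional[str],
) -> Tuple[Optional[FinishReason], Optional[str]]:
    if stop_string and finish_reason != FinishReason.STOP:
        return FinishReason.STOP, stop_string
    return finish_reason, None

assert resolve_finish(FinishReason.LENGTH, "</s>") == (FinishReason.STOP, "</s>")
assert resolve_finish(None, None) == (None, None)
```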
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 45fec1122cce3..a341d74c5812b 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -27,7 +27,6 @@ logger = init_logger(__name__)
class StructuredOutputManager:
def __init__(self, vllm_config: VllmConfig):
- self.vocab_size = vllm_config.model_config.get_vocab_size()
self.vllm_config = vllm_config
self.init_complete = False
@@ -41,6 +40,7 @@ class StructuredOutputManager:
tokenizer_group.ping()
tokenizer = tokenizer_group.get_lora_tokenizer(None)
+ self.vocab_size = tokenizer.max_token_id
if isinstance(tokenizer, MistralTokenizer):
# NOTE: ideally, xgrammar should handle this accordingly.
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
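Why source the vocab size from the tokenizer rather than the model config (hypothetical numbers, rationale inferred from the change): the model's embedding table is often padded past the highest real token id, and grammar backends need the tokenizer's view of the vocabulary.

```python
# Hypothetical sizes for illustration only.
model_config_vocab_size = 32064   # embedding rows, padded for alignment
tokenizer_max_token_id = 32011    # highest id the tokenizer can emit

# Sizing grammar bitmasks from the padded figure would track token ids
# that can never appear; the tokenizer-derived bound is the safe one.
assert tokenizer_max_token_id <= model_config_vocab_size
```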