diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 35aa3bfdd12b..cae4a88de163 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -664,9 +664,9 @@ Text Generation (``--task generate``)
     - ✅︎
     - ✅︎
   * - :code:`PaliGemmaForConditionalGeneration`
-    - PaliGemma
+    - PaliGemma, PaliGemma 2
     - T + I\ :sup:`E`
-    - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
+    - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc.
     -
     - ✅︎
     -
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 5e210126dc8f..c430f42fdc81 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -137,6 +137,18 @@ def run_paligemma(question: str, modality: str):
     return llm, prompt, stop_token_ids
 
 
+# PaliGemma 2
+def run_paligemma2(question: str, modality: str):
+    assert modality == "image"
+
+    # PaliGemma 2 has special prompt format for VQA
+    prompt = "caption en"
+    llm = LLM(model="google/paligemma2-3b-ft-docci-448",
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # Chameleon
 def run_chameleon(question: str, modality: str):
     assert modality == "image"
@@ -473,6 +485,7 @@ model_example_map = {
     "fuyu": run_fuyu,
     "phi3_v": run_phi3v,
     "paligemma": run_paligemma,
+    "paligemma2": run_paligemma2,
     "chameleon": run_chameleon,
     "minicpmv": run_minicpmv,
     "blip-2": run_blip2,
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 253e689e50a3..f9ad0c67adab 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -105,6 +105,11 @@ def input_processor_for_paligemma(ctx: InputContext,
         orig_prompt_ids.remove(hf_config.image_token_index)
 
     new_prompt = f"{image_token_str_pad}{bos_token}{orig_prompt}\n"
+
+    # The PaliGemma 2 tokenizer does not include a starting BOS token
+    if orig_prompt_ids[0] != hf_config.bos_token_id:
+        orig_prompt_ids = [hf_config.bos_token_id] + orig_prompt_ids
+
     new_token_ids = image_token_ids_pad + orig_prompt_ids + [108]  #newline
 
     # NOTE: Create a defensive copy of the original inputs
@@ -149,7 +154,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
             projection_dim=config.vision_config.projection_dim)
 
         self.quant_config = quant_config
-        config.text_config.architectures = ["GemmaForCausalLM"]
+
+        if config.text_config.model_type == "gemma":
+            config.text_config.architectures = ["GemmaForCausalLM"]
+        else:
+            config.text_config.architectures = ["Gemma2ForCausalLM"]
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,
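
# ---------------------------------------------------------------------------
# Usage sketch (not part of the patch above): a minimal, hedged example of
# exercising the new PaliGemma 2 path through vLLM's offline API, mirroring
# run_paligemma2 from the diff. The checkpoint name and the bare "caption en"
# prompt are taken from the patch; the placeholder image and sampling settings
# below are illustrative assumptions, not values from the repository.
from PIL import Image

from vllm import LLM, SamplingParams

# PaliGemma 2 checkpoint added to the supported-models table in this change.
llm = LLM(model="google/paligemma2-3b-ft-docci-448")

# PaliGemma-style prompts are bare task prefixes; the input processor in
# paligemma.py inserts the image tokens, BOS token, and trailing newline.
prompt = "caption en"

# Placeholder image; in practice, load a real RGB photo instead.
image = Image.new("RGB", (448, 448))

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
# ---------------------------------------------------------------------------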