From 8865da157bea1b25afc4ff28031858d557363290 Mon Sep 17 00:00:00 2001
From: "sangho.lee"
Date: Wed, 15 Oct 2025 02:13:59 -0500
Subject: [PATCH] [Bugfix][Multi Modal] Fix incorrect Molmo token processing
 (#26873)

Signed-off-by: sanghol
---
 vllm/model_executor/models/molmo.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 106aaf413e99b..dce94d181c4cd 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1264,13 +1264,16 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
     ) -> list[int]:
         processor = self.info.get_hf_processor()
 
-        # Apply the chat template to the tokens
+        # The chat template is already applied to the prompt tokens
+        # Use message_format="none" to avoid applying it again
+        # Prepend an empty space if `always_start_with_space` is True
         tokens = processor.processor.get_tokens_input(  # type: ignore
             self.info.get_tokenizer().decode(prompt_tokens),
-            message_format=processor.message_format,
+            message_format="none",
             always_start_with_space=processor.always_start_with_space,
         )
 
+        # Prepend a BOS token id to the tokens
         processed_data = self.info.ctx.call_hf_processor(
             processor,  # type: ignore
             dict(tokens=tokens),