[Bugfix] Update Florence-2 tokenizer to make grounding tasks work (#16734)

Signed-off-by: Isotr0py <2037008807@qq.com>
Author: Isotr0py, 2025-04-17 12:17:39 +08:00 (committed by GitHub)
Commit: cb072ce93b (parent: 95aca283b4)
5 changed files with 16 additions and 10 deletions

File 1 of 5

@@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple):
 def run_florence2():
     engine_args = EngineArgs(
         model="microsoft/Florence-2-large",
-        tokenizer="facebook/bart-large",
+        tokenizer="Isotr0py/Florence-2-tokenizer",
         max_num_seqs=8,
         trust_remote_code=True,
         limit_mm_per_prompt={"image": 1},
@@ -165,6 +165,7 @@ def main(args):
         temperature=0,
         top_p=1.0,
         max_tokens=64,
+        skip_special_tokens=False,
     )
     start = time.time()

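Why skip_special_tokens=False matters here: Florence-2 grounding tasks encode box coordinates as special tokens (<loc_0> through <loc_999>), which the detokenizer would silently drop if special tokens were skipped. A minimal sketch, assuming the converted tokenizer registers the <loc_*> tokens as special tokens:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Isotr0py/Florence-2-tokenizer")
    ids = tok.encode("car<loc_52><loc_333><loc_932><loc_774>",
                     add_special_tokens=False)
    print(tok.decode(ids, skip_special_tokens=True))   # "car" -- boxes lost
    print(tok.decode(ids, skip_special_tokens=False))  # full grounded string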
File 2 of 5

@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="microsoft/Florence-2-large",
-        tokenizer="facebook/bart-large",
+        tokenizer="Isotr0py/Florence-2-tokenizer",
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,

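The same swap appears in each example script. Before wiring the tokenizer into EngineArgs it can be sanity-checked offline; a hedged sketch, assuming the converted repo exposes the task and location tokens that the original model repo's tokenizer config was missing:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Isotr0py/Florence-2-tokenizer")
    vocab = tok.get_vocab()
    # Task and location tokens should map to single ids, not subword splits.
    assert "<OD>" in vocab
    assert all(f"<loc_{i}>" in vocab for i in (0, 500, 999))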
File 3 of 5

@@ -925,6 +925,7 @@ class VllmRunner:
         max_tokens: int,
         num_logprobs: int,
         num_prompt_logprobs: Optional[int] = None,
+        skip_special_tokens: bool = True,
     ) -> Union[list[TokensTextLogprobs],
                list[TokensTextLogprobsPromptLogprobs]]:
         greedy_logprobs_params = SamplingParams(
@@ -932,6 +933,7 @@
             max_tokens=max_tokens,
             logprobs=num_logprobs,
             prompt_logprobs=(num_prompt_logprobs),
+            skip_special_tokens=skip_special_tokens,
         )
         '''
         Greedy logprobs generation for vLLM encoder/decoder models

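The new parameter simply threads through to SamplingParams, letting encoder/decoder tests keep special tokens in the decoded text. A usage sketch (VllmRunner is the test harness imported below; the tokenizer_name kwarg and the bare text prompt are assumptions for brevity):

    prompts = ["<OD>"]  # the real tests pair task prompts with images
    with VllmRunner("microsoft/Florence-2-base",
                    tokenizer_name="Isotr0py/Florence-2-tokenizer",
                    trust_remote_code=True) as vllm_model:
        outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts,
            max_tokens=64,
            num_logprobs=5,
            skip_special_tokens=False,  # keep <loc_*> tokens in the text
        )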
File 4 of 5

@@ -13,12 +13,12 @@ from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
 MODELS = ["microsoft/Florence-2-base"]
-# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
-# Therefore, we borrow the BartTokenizer from the original Bart model
-TOKENIZER = "facebook/bart-base"
+# Florence-2 model repo's tokenizer config is missing some special tokens.
+# Therefore, we use a converted tokenizer from a forked repo
+TOKENIZER = "Isotr0py/Florence-2-tokenizer"

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
-    "<CAPTION>",  # special task token
+    "<OD>",  # special task token which will output special tokens
     "cherry_blossom":
     "Describe in detail what is shown in the image.",
 })
@@ -45,7 +45,6 @@ def hf_to_vllm_output(hf_output: tuple[list[int], str,
     output_ids, output_str, out_logprobs = hf_output
     output_str = output_str.replace("</s>", "").replace("<s>", "")
-    output_ids = [ids for ids in output_ids if ids not in [0, 2]]
     return output_ids, output_str, out_logprobs
@@ -71,8 +70,11 @@ def run_test(
                      enforce_eager=True) as vllm_model:
         vllm_outputs_per_case = [
             vllm_model.generate_encoder_decoder_greedy_logprobs(
-                prompts, max_tokens, num_logprobs=num_logprobs)
-            for prompts in inputs
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                skip_special_tokens=False,
+            ) for prompts in inputs
         ]

     hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]
@@ -93,6 +95,7 @@ def run_test(
         outputs_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
+        num_outputs_0_skip_tokens=1,
     )

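With special tokens preserved, an <OD> response comes back as a raw string such as "car<loc_52><loc_333><loc_932><loc_774>...". A hedged post-processing sketch (parse_od_output is an illustrative helper, not part of this change); Florence-2 quantizes each coordinate into 1000 bins, so values are rescaled by the image size:

    import re

    def parse_od_output(text: str, width: int, height: int):
        # Parse 'label<loc_x1><loc_y1><loc_x2><loc_y2>' runs into boxes.
        results = []
        for label, locs in re.findall(r"([^<]+)((?:<loc_\d+>){4})", text):
            x1, y1, x2, y2 = (int(v) for v in re.findall(r"<loc_(\d+)>", locs))
            results.append((label.strip(),
                            (x1 * width / 1000, y1 * height / 1000,
                             x2 * width / 1000, y2 * height / 1000)))
        return results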
File 5 of 5

@@ -366,7 +366,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
     # Therefore, we borrow the BartTokenizer from the original Bart model
     "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
-                                                         tokenizer="facebook/bart-base",
+                                                         tokenizer="Isotr0py/Florence-2-tokenizer",
                                                          trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501