[Model] Add PaddleOCR-VL Model Support (#27758)

Signed-off-by: zhangyue <zhangyue66@baidu.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: zhangyue66 <zhangyue66@baidu.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
zhang-prog 2025-11-03 19:04:22 +08:00 committed by GitHub
parent 32257297dd
commit 40b69e33e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 1475 additions and 0 deletions

View File

@ -675,6 +675,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
| `PaddleOCRVLForConditionalGeneration` | PaddleOCR-VL | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |

View File

@ -1242,6 +1242,32 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
)
# PaddleOCR-VL
def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for PaddleOCR-VL image inference.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; must be ``"image"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the
        formatted prompts.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="PaddlePaddle/PaddleOCR-VL",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    # A single image slot, appended directly after the question text.
    image_slot = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
    prompts = []
    for text in questions:
        prompts.append(
            f"<|begin_of_sentence|>User: {text}{image_slot}\nAssistant: "
        )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1817,6 +1843,7 @@ model_example_map = {
"NVLM_D": run_nvlm_d,
"ovis": run_ovis,
"ovis2_5": run_ovis2_5,
"paddleocr_vl": run_paddleocr_vl,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,

View File

@ -801,6 +801,27 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare engine arguments, prompt and images for multi-image PaddleOCR-VL.

    Args:
        question: Question text placed in the user turn of the prompt.
        image_urls: URLs of the images to fetch; one image placeholder is
            emitted per URL and the per-prompt image limit is set to their
            count.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the formatted
        prompt and the fetched images.
    """
    image_count = len(image_urls)

    engine_args = EngineArgs(
        model="PaddlePaddle/PaddleOCR-VL",
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": image_count},
    )

    # One placeholder group per image, concatenated right after the question.
    image_slots = "".join(
        "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" for _ in image_urls
    )
    prompt = f"<|begin_of_sentence|>User: {question}{image_slots}\nAssistant: "

    images = list(map(fetch_image, image_urls))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b"
@ -1312,6 +1333,7 @@ model_example_map = {
"NVLM_D": load_nvlm_d,
"ovis": load_ovis,
"ovis2_5": load_ovis2_5,
"paddleocr_vl": load_paddleocr_vl,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"phi4_multimodal": load_phi4_multimodal,

View File

@ -712,6 +712,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
},
),
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
"PaddlePaddle/PaddleOCR-VL",
trust_remote_code=True,
),
"PaliGemmaForConditionalGeneration": _HfExamplesInfo(
"google/paligemma-3b-mix-224",
extras={"v2": "google/paligemma2-3b-ft-docci-448"},

View File

@ -23,12 +23,22 @@
# limitations under the License.
"""Inference-only Ernie model compatible with HuggingFace weights."""
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.model_executor.models.llama import LlamaForCausalLM
from .utils import PPMissingLayer
# Wrap the class with vLLM's torch.compile support decorator, explicitly
# declaring which dimension of each listed argument is dynamic.
@support_torch_compile(
# set dynamic_arg_dims to support mrope
# Dimension 0 is dynamic for input_ids, intermediate_tensors and
# inputs_embeds; positions uses its last dimension (-1) instead.
# NOTE(review): presumably positions carries an extra leading axis under
# mrope, which is why its dynamic axis is the last one - confirm.
dynamic_arg_dims={
"input_ids": 0,
"positions": -1,
"intermediate_tensors": 0,
"inputs_embeds": 0,
}
)
# Ernie 4.5 causal LM implemented as a subclass of LlamaForCausalLM.
class Ernie4_5ForCausalLM(LlamaForCausalLM):
# Keyword-only constructor; forwards vllm_config and prefix unchanged to the
# LlamaForCausalLM initializer.
# NOTE(review): this diff hunk ends here; the constructor body may continue
# beyond the visible lines.
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)

File diff suppressed because it is too large Load Diff

View File

@ -340,6 +340,10 @@ _MULTIMODAL_MODELS = {
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
"Ovis": ("ovis", "Ovis"),
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
"PaddleOCRVLForConditionalGeneration": (
"paddleocr_vl",
"PaddleOCRVLForConditionalGeneration",
),
"PaliGemmaForConditionalGeneration": (
"paligemma",
"PaliGemmaForConditionalGeneration",