mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-21 01:09:09 +08:00
[Model] Add PaddleOCR-VL Model Support (#27758)
Signed-off-by: zhangyue <zhangyue66@baidu.com> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: zhangyue66 <zhangyue66@baidu.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent
32257297dd
commit
40b69e33e7
@ -675,6 +675,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
|
||||
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
|
||||
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
@ -1242,6 +1242,32 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# PaddleOCR-VL
|
||||
def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "PaddlePaddle/PaddleOCR-VL"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
|
||||
prompts = [
|
||||
(f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: ")
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# PaliGemma
|
||||
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1817,6 +1843,7 @@ model_example_map = {
|
||||
"NVLM_D": run_nvlm_d,
|
||||
"ovis": run_ovis,
|
||||
"ovis2_5": run_ovis2_5,
|
||||
"paddleocr_vl": run_paddleocr_vl,
|
||||
"paligemma": run_paligemma,
|
||||
"paligemma2": run_paligemma2,
|
||||
"phi3_v": run_phi3v,
|
||||
|
||||
@ -801,6 +801,27 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "PaddlePaddle/PaddleOCR-VL"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls)
|
||||
prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: "
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "mistral-community/pixtral-12b"
|
||||
|
||||
@ -1312,6 +1333,7 @@ model_example_map = {
|
||||
"NVLM_D": load_nvlm_d,
|
||||
"ovis": load_ovis,
|
||||
"ovis2_5": load_ovis2_5,
|
||||
"paddleocr_vl": load_paddleocr_vl,
|
||||
"phi3_v": load_phi3v,
|
||||
"phi4_mm": load_phi4mm,
|
||||
"phi4_multimodal": load_phi4_multimodal,
|
||||
|
||||
@ -712,6 +712,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
},
|
||||
),
|
||||
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
|
||||
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
|
||||
"PaddlePaddle/PaddleOCR-VL",
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"PaliGemmaForConditionalGeneration": _HfExamplesInfo(
|
||||
"google/paligemma-3b-mix-224",
|
||||
extras={"v2": "google/paligemma2-3b-ft-docci-448"},
|
||||
|
||||
@ -23,12 +23,22 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only Erine model compatible with HuggingFace weights."""
|
||||
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.models.llama import LlamaForCausalLM
|
||||
|
||||
from .utils import PPMissingLayer
|
||||
|
||||
|
||||
@support_torch_compile(
|
||||
# set dynamic_arg_dims to support mrope
|
||||
dynamic_arg_dims={
|
||||
"input_ids": 0,
|
||||
"positions": -1,
|
||||
"intermediate_tensors": 0,
|
||||
"inputs_embeds": 0,
|
||||
}
|
||||
)
|
||||
class Ernie4_5ForCausalLM(LlamaForCausalLM):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
1407
vllm/model_executor/models/paddleocr_vl.py
Normal file
1407
vllm/model_executor/models/paddleocr_vl.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -340,6 +340,10 @@ _MULTIMODAL_MODELS = {
|
||||
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
|
||||
"Ovis": ("ovis", "Ovis"),
|
||||
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
|
||||
"PaddleOCRVLForConditionalGeneration": (
|
||||
"paddleocr_vl",
|
||||
"PaddleOCRVLForConditionalGeneration",
|
||||
),
|
||||
"PaliGemmaForConditionalGeneration": (
|
||||
"paligemma",
|
||||
"PaliGemmaForConditionalGeneration",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user