[Model][VLM] Support Keye-VL-8B-Preview (#20126)
Signed-off-by: Kwai-Keye <Keye@kuaishou.com>
parent 2e7cbf2d7d
commit 8452946c06
@@ -559,6 +559,7 @@ Specified using `--task generate`.
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
 | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
 | `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
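For context (not part of the diff): with this row in place, the checkpoint can be exercised directly through vLLM's offline API. A minimal sketch, assuming the checkpoint is downloadable and using a hypothetical local image path "demo.jpg":

# Sketch only: single-image inference with the newly supported model,
# using the same engine arguments and prompt format the examples below add.
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(
    model="Kwai-Keye/Keye-VL-8B-Preview",
    trust_remote_code=True,            # the checkpoint ships custom code
    max_model_len=8192,
    limit_mm_per_prompt={"image": 1},
)

prompt = (
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    "What is in this image?<|im_end|>\n"
    "<|im_start|>assistant\n"
)

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": Image.open("demo.jpg")}},
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)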
@@ -429,6 +429,37 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     )


+# Keye-VL
+def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        trust_remote_code=True,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Kimi-VL
 def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
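The runner above builds the ChatML-style prompt by hand rather than via a chat template; as with the Qwen2-VL family whose placeholders Keye reuses, the single `<|image_pad|>` (or `<|video_pad|>`) token is expanded by the model's processor to match the visual input. For one question, the rendered prompt is, e.g.:

# Rendered prompt for modality="image", question="Describe the scene.":
prompt = (
    "<|im_start|>user\n"
    "<|vision_start|><|image_pad|><|vision_end|>"
    "Describe the scene.<|im_end|>\n"
    "<|im_start|>assistant\n"
)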
@@ -1154,6 +1185,7 @@ model_example_map = {
     "h2ovl_chat": run_h2ovl,
     "idefics3": run_idefics3,
     "internvl_chat": run_internvl,
+    "keye_vl": run_keye_vl,
     "kimi_vl": run_kimi_vl,
     "llava": run_llava,
     "llava-next": run_llava_next,
@@ -423,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "moonshotai/Kimi-VL-A3B-Instruct"

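Note the design split: unlike the single-image runner, this loader delegates prompt construction to the checkpoint's own chat template via AutoProcessor, so the placeholder count always matches len(image_urls). A standalone sketch of that rendering step, with hypothetical image URLs:

# Sketch only: render a multi-image Keye-VL prompt with the HF processor.
from transformers import AutoProcessor

model_name = "Kwai-Keye/Keye-VL-8B-Preview"
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

image_urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]  # hypothetical
messages = [{
    "role": "user",
    "content": [
        *[{"type": "image", "image": url} for url in image_urls],
        {"type": "text", "text": "Compare the two images."},
    ],
}]

# tokenize=False returns the raw prompt string; vLLM tokenizes it itself.
print(processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
))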
@@ -862,6 +899,7 @@ model_example_map = {
     "h2ovl_chat": load_h2ovl,
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
+    "keye_vl": load_keye_vl,
     "kimi_vl": load_kimi_vl,
     "llava": load_llava,
     "llava-next": load_llava_next,
@@ -351,6 +351,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
     "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                         {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
+    "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview",  # noqa: E501
+                                        trust_remote_code=True),
     "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
                                         extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
                                         trust_remote_code=True,
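This entry only records which HF checkpoint exercises the new architecture in the model tests. Illustratively (not the actual test code), it enables smoke checks along these lines:

# Illustrative only: confirm the registered checkpoint's config resolves
# with trust_remote_code, roughly what the example-model registry asserts.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(
    "Kwai-Keye/Keye-VL-8B-Preview", trust_remote_code=True
)
print(type(cfg).__name__)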
@@ -540,7 +540,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
             return "<image>"
         if model_type in ("mllama", "llama4"):
             return "<|image|>"
-        if model_type in ("qwen2_vl", "qwen2_5_vl"):
+        if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if model_type == "qwen2_5_omni":
             return "<|vision_start|><|IMAGE|><|vision_end|>"
@@ -570,7 +570,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
             return "<video>"
         if model_type == "glm4v":
             return "<|begin_of_video|><|video|><|end_of_video|>"
-        if model_type in ("qwen2_vl", "qwen2_5_vl"):
+        if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
             return "<|vision_start|><|video_pad|><|vision_end|>"
         if model_type == "qwen2_5_omni":
             return "<|vision_start|><|VIDEO|><|vision_end|>"
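The effect of these two one-line changes: when the OpenAI-compatible server receives an image or video content part for a model whose model_type is "keye" (or "Keye"), it now injects the same placeholder string used for the Qwen2-VL family. A client-side sketch against a locally served instance, with a hypothetical image URL:

# Sketch only: the placeholder substitution happens server-side; the client
# sends a standard multimodal chat request.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="Kwai-Keye/Keye-VL-8B-Preview",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": "https://example.com/cat.jpg"}},  # hypothetical
            {"type": "text", "text": "What is shown here?"},
        ],
    }],
)
print(resp.choices[0].message.content)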
vllm/model_executor/models/keye.py (new file, 1725 additions)
File diff suppressed because it is too large.
@@ -197,6 +197,7 @@ _MULTIMODAL_MODELS = {
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
     "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"),  # noqa: E501
+    "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
     "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
     "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
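The registry maps the architecture name to a (module, class) pair that is resolved lazily. A sketch of that resolution, assuming the standard vllm.model_executor.models package layout:

# Illustrative only: how a registry tuple resolves to the model class.
import importlib

mod_name, cls_name = ("keye", "KeyeForConditionalGeneration")
module = importlib.import_module(f"vllm.model_executor.models.{mod_name}")
model_cls = getattr(module, cls_name)
print(model_cls.__name__)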