[Doc] Add example for Step3-VL (#22061)
Signed-off-by: Roger Wang <hey@rogerw.me>
parent da31f6ad3d
commit 27a145e893
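
The diff below registers the new Step3 entries ("stepfun-ai/step3-fp8") in what appear to be the single-image and multi-image offline vision-language example scripts, and also moves several existing helpers (run_llama4, run_nemotron_vl, run_skyworkr1v, run_smolvlm, and their multi-image counterparts), apparently to keep the helpers and the model_example_map entries in alphabetical order. For quick reference, the snippet below reproduces the single-image prompt layout that the new run_step3 helper builds; it is an illustrative sketch only, and the question string is an arbitrary placeholder rather than part of the commit.

    # Illustrative sketch: the Step3 single-image prompt built by run_step3 in the diff below.
    # The question text is a placeholder, not part of the commit.
    question = "What is the content of this image?"
    prompt = (
        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
        f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
    )
    print(prompt)
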
@@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
     )


-# SmolVLM2-2.2B-Instruct
-def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=2,
-        enforce_eager=True,
-        mm_processor_kwargs={
-            "max_image_size": {"longest_edge": 384},
-        },
-        limit_mm_per_prompt={modality: 1},
-    )
-    prompts = [
-        (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
-        for question in questions
-    ]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "internlm/Intern-S1"
@@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     )


-# Nemontron_VL
-def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
-    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        limit_mm_per_prompt={modality: 1},
-    )
-
-    assert modality == "image"
-    placeholder = "<image>"
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    messages = [
-        [{"role": "user", "content": f"{placeholder}\n{question}"}]
-        for question in questions
-    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    # Stop tokens for InternVL
-    # models variants may have different stop tokens
-    # please refer to the model card for the correct "stop words":
-    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
-    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        stop_token_ids=stop_token_ids,
-    )
-
-
 # Keye-VL
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
     )


+def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=4,
+        tensor_parallel_size=8,
+        gpu_memory_utilization=0.4,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
+            }
+        ]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
+    stop_token_ids = None
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
 # LLaVA-1.5
 def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -857,41 +828,6 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
     )


-def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-
-    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=4,
-        tensor_parallel_size=8,
-        gpu_memory_utilization=0.4,
-        limit_mm_per_prompt={modality: 1},
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
-            }
-        ]
-        for question in questions
-    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, add_generation_prompt=True, tokenize=False
-    )
-    stop_token_ids = None
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        stop_token_ids=stop_token_ids,
-    )
-
-
 # Molmo
 def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -917,6 +853,44 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
     )


+# Nemontron_VL
+def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    assert modality == "image"
+    placeholder = "<image>"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    # Stop tokens for InternVL
+    # models variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
 # NVLM-D
 def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1274,6 +1248,94 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )


+# SkyworkR1V
+def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "Skywork/Skywork-R1V-38B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    # Stop tokens for SkyworkR1V
+    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
+    stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
+# SmolVLM2-2.2B-Instruct
+def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        enforce_eager=True,
+        mm_processor_kwargs={
+            "max_image_size": {"longest_edge": 384},
+        },
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [
+        (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# Step3
+def run_step3(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "stepfun-ai/step3-fp8"
+
+    # NOTE: Below are verified configurations for step3-fp8
+    # on 8xH100 GPUs.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        gpu_memory_utilization=0.85,
+        tensor_parallel_size=8,
+        limit_mm_per_prompt={modality: 1},
+        reasoning_parser="step3",
+    )
+
+    prompts = [
+        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
+        f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # omni-research/Tarsier-7b
 def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
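
As a reviewer-facing usage note (not part of the commit): the sketch below shows how the ModelRequestData returned by the new run_step3 helper above could be consumed with vLLM's offline LLM API, mirroring what the example script does. It assumes the 8xH100 setup noted in the code, a vLLM build that recognizes Step3, and an arbitrary local image file.

    # Hedged sketch only; run_step3 is the helper added in the hunk above.
    from dataclasses import asdict

    from PIL import Image
    from vllm import LLM, SamplingParams

    req = run_step3(["What is the content of this image?"], "image")
    llm = LLM(**asdict(req.engine_args))  # expects 8 GPUs (tensor_parallel_size=8)
    outputs = llm.generate(
        {
            "prompt": req.prompts[0],
            "multi_modal_data": {"image": Image.open("example.jpg")},  # placeholder path
        },
        SamplingParams(temperature=0.2, max_tokens=256),
    )
    print(outputs[0].outputs[0].text)
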
@@ -1324,39 +1386,6 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     )


-# SkyworkR1V
-def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-
-    model_name = "Skywork/Skywork-R1V-38B"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        limit_mm_per_prompt={modality: 1},
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    messages = [
-        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
-    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    # Stop tokens for SkyworkR1V
-    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
-    stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        stop_token_ids=stop_token_ids,
-    )
-
-
 model_example_map = {
     "aria": run_aria,
     "aya_vision": run_aya_vision,
@@ -1373,9 +1402,9 @@ model_example_map = {
     "idefics3": run_idefics3,
     "interns1": run_interns1,
     "internvl_chat": run_internvl,
-    "nemotron_vl": run_nemotron_vl,
     "keye_vl": run_keye_vl,
     "kimi_vl": run_kimi_vl,
+    "llama4": run_llama4,
     "llava": run_llava,
     "llava-next": run_llava_next,
     "llava-next-video": run_llava_next_video,
@@ -1385,8 +1414,8 @@ model_example_map = {
     "minicpmv": run_minicpmv,
     "mistral3": run_mistral3,
     "mllama": run_mllama,
-    "llama4": run_llama4,
     "molmo": run_molmo,
+    "nemotron_vl": run_nemotron_vl,
     "NVLM_D": run_nvlm_d,
     "ovis": run_ovis,
     "paligemma": run_paligemma,
@@ -1401,6 +1430,7 @@ model_example_map = {
     "qwen2_5_omni": run_qwen2_5_omni,
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,
+    "step3": run_step3,
     "tarsier": run_tarsier,
     "tarsier2": run_tarsier2,
 }
@@ -197,6 +197,53 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_hyperclovax_seed_vision(
+    question: str, image_urls: list[str]
+) -> ModelRequestData:
+    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=16384,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    message = {"role": "user", "content": list()}
+    for _image_url in image_urls:
+        message["content"].append(
+            {
+                "type": "image",
+                "image": _image_url,
+                "ocr": "",
+                "lens_keywords": "",
+                "lens_local_keywords": "",
+            }
+        )
+    message["content"].append(
+        {
+            "type": "text",
+            "text": question,
+        }
+    )
+
+    prompt = tokenizer.apply_chat_template(
+        [
+            message,
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

@@ -225,34 +272,6 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
     )


-def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-
-    # The configuration below has been confirmed to launch on a single L40 GPU.
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=16,
-        enforce_eager=True,
-        limit_mm_per_prompt={"image": len(image_urls)},
-        mm_processor_kwargs={
-            "max_image_size": {"longest_edge": 384},
-        },
-    )
-
-    placeholders = "\n".join(
-        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
-    )
-    prompt = (
-        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
-    )
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
-    )
-
-
 def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "internlm/Intern-S1"

@@ -316,49 +335,36 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )


-def load_hyperclovax_seed_vision(
-    question: str, image_urls: list[str]
-) -> ModelRequestData:
-    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
-        max_model_len=16384,
+        max_model_len=131072,
+        tensor_parallel_size=8,
         limit_mm_per_prompt={"image": len(image_urls)},
     )

-    message = {"role": "user", "content": list()}
-    for _image_url in image_urls:
-        message["content"].append(
-            {
-                "type": "image",
-                "image": _image_url,
-                "ocr": "",
-                "lens_keywords": "",
-                "lens_local_keywords": "",
-            }
-        )
-    message["content"].append(
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
         {
-            "type": "text",
-            "text": question,
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
         }
-    )
+    ]

-    prompt = tokenizer.apply_chat_template(
-        [
-            message,
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )

     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        stop_token_ids=None,
         image_data=[fetch_image(url) for url in image_urls],
     )

@@ -463,40 +469,6 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
     )


-def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=131072,
-        tensor_parallel_size=8,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
-    placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *placeholders,
-                {"type": "text", "text": question},
-            ],
-        }
-    ]
-
-    processor = AutoProcessor.from_pretrained(model_name)
-
-    prompt = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
-    )
-
-
 def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "Kwai-Keye/Keye-VL-8B-Preview"

@@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    # The configuration below has been confirmed to launch on a single L40 GPU.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        enforce_eager=True,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={
+            "max_image_size": {"longest_edge": 384},
+        },
+    )
+
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    prompt = (
+        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
+    )
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "stepfun-ai/step3-fp8"
+
+    # NOTE: Below are verified configurations for step3-fp8
+    # on 8xH100 GPUs.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        gpu_memory_utilization=0.85,
+        tensor_parallel_size=8,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        reasoning_parser="step3",
+    )
+
+    prompt = (
+        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
+        f"{'<im_patch>' * len(image_urls)}{question} <|EOT|><|BOT|"
+        ">assistant\n<think>\n"
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "omni-research/Tarsier-7b"

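
Similarly (also not part of the commit), a hedged sketch of how the multi-image load_step3 entry above could be exercised directly: the prompt repeats <im_patch> once per image, and the fetched images are passed as a list under the "image" key of multi_modal_data. The image URLs and sampling settings below are placeholders; the engine configuration comes from the helper itself.

    # Hedged sketch only; load_step3 is the helper added in the hunk above.
    from dataclasses import asdict

    from vllm import LLM, SamplingParams

    image_urls = [
        "https://example.com/a.jpg",  # placeholder URLs; any two reachable images
        "https://example.com/b.jpg",
    ]
    req = load_step3("What do these two images have in common?", image_urls)
    llm = LLM(**asdict(req.engine_args))
    outputs = llm.generate(
        {
            "prompt": req.prompt,
            "multi_modal_data": {"image": req.image_data},  # list of fetched images
        },
        SamplingParams(temperature=0.2, max_tokens=256),
    )
    print(outputs[0].outputs[0].text)
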
@@ -1006,16 +1034,16 @@ model_example_map = {
     "deepseek_vl_v2": load_deepseek_vl2,
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
+    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "idefics3": load_idefics3,
     "interns1": load_interns1,
     "internvl_chat": load_internvl,
-    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "keye_vl": load_keye_vl,
     "kimi_vl": load_kimi_vl,
+    "llama4": load_llama4,
     "llava": load_llava,
     "llava-next": load_llava_next,
     "llava-onevision": load_llava_onevision,
-    "llama4": load_llama4,
     "mistral3": load_mistral3,
     "mllama": load_mllama,
     "NVLM_D": load_nvlm_d,
@@ -1028,6 +1056,7 @@ model_example_map = {
     "qwen2_vl": load_qwen2_vl,
     "qwen2_5_vl": load_qwen2_5_vl,
     "smolvlm": load_smolvlm,
+    "step3": load_step3,
     "tarsier": load_tarsier,
     "tarsier2": load_tarsier2,
 }