From 27a145e8931582fc74c1f46e0e4630c610b96160 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 1 Aug 2025 01:35:49 -0700 Subject: [PATCH] [Doc] Add example for Step3-VL (#22061) Signed-off-by: Roger Wang --- examples/offline_inference/vision_language.py | 298 ++++++++++-------- .../vision_language_multi_image.py | 215 +++++++------ 2 files changed, 286 insertions(+), 227 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 0edcd0407747c..a75b8e2b047d8 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ) -# SmolVLM2-2.2B-Instruct -def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - enforce_eager=True, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [ - (f"<|im_start|>User:{question}\nAssistant:") - for question in questions - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Intern-S1 def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ) -# Nemontron_VL -def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=8192, - limit_mm_per_prompt={modality: 1}, - ) - - assert modality == "image" - placeholder = "" - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [ - [{"role": "user", "content": f"{placeholder}\n{question}"}] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Stop tokens for InternVL - # models variants may have different stop tokens - # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # Keye-VL def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: ) +def run_llama4(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=4, + tensor_parallel_size=8, + gpu_memory_utilization=0.4, + limit_mm_per_prompt={modality: 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], + } + ] + for question in questions + ] + 
prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + stop_token_ids = None + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # LLaVA-1.5 def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -857,41 +828,6 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ) -def run_llama4(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=4, - tensor_parallel_size=8, - gpu_memory_utilization=0.4, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [ - [ - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], - } - ] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - stop_token_ids = None - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -917,6 +853,44 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: ) +# Nemontron_VL +def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={modality: 1}, + ) + + assert modality == "image" + placeholder = "" + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Stop tokens for InternVL + # models variants may have different stop tokens + # please refer to the model card for the correct "stop words": + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # NVLM-D def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1274,6 +1248,94 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# SkyworkR1V +def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "Skywork/Skywork-R1V-38B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Stop tokens for SkyworkR1V + # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py + 
stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + +# SmolVLM2-2.2B-Instruct +def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [ + (f"<|im_start|>User:{question}\nAssistant:") + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + +# Step3 +def run_step3(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "stepfun-ai/step3-fp8" + + # NOTE: Below are verified configurations for step3-fp8 + # on 8xH100 GPUs. + engine_args = EngineArgs( + model=model_name, + max_num_batched_tokens=4096, + gpu_memory_utilization=0.85, + tensor_parallel_size=8, + limit_mm_per_prompt={modality: 1}, + reasoning_parser="step3", + ) + + prompts = [ + "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n " + f"{question} <|EOT|><|BOT|>assistant\n\n" + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # omni-research/Tarsier-7b def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1324,39 +1386,6 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: ) -# SkyworkR1V -def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "Skywork/Skywork-R1V-38B" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [ - [{"role": "user", "content": f"\n{question}"}] for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Stop tokens for SkyworkR1V - # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py - stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - model_example_map = { "aria": run_aria, "aya_vision": run_aya_vision, @@ -1373,9 +1402,9 @@ model_example_map = { "idefics3": run_idefics3, "interns1": run_interns1, "internvl_chat": run_internvl, - "nemotron_vl": run_nemotron_vl, "keye_vl": run_keye_vl, "kimi_vl": run_kimi_vl, + "llama4": run_llama4, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, @@ -1385,8 +1414,8 @@ model_example_map = { "minicpmv": run_minicpmv, "mistral3": run_mistral3, "mllama": run_mllama, - "llama4": run_llama4, "molmo": run_molmo, + "nemotron_vl": run_nemotron_vl, "NVLM_D": run_nvlm_d, "ovis": run_ovis, "paligemma": run_paligemma, @@ -1401,6 +1430,7 @@ model_example_map = { "qwen2_5_omni": run_qwen2_5_omni, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, + "step3": run_step3, "tarsier": run_tarsier, "tarsier2": run_tarsier2, } diff 
--git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index dd50f3639709e..1ab405fa14f3a 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -197,6 +197,53 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_hyperclovax_seed_vision( + question: str, image_urls: list[str] +) -> ModelRequestData: + model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=16384, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + message = {"role": "user", "content": list()} + for _image_url in image_urls: + message["content"].append( + { + "type": "image", + "image": _image_url, + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + } + ) + message["content"].append( + { + "type": "text", + "text": question, + } + ) + + prompt = tokenizer.apply_chat_template( + [ + message, + ], + tokenize=False, + add_generation_prompt=True, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceM4/Idefics3-8B-Llama3" @@ -225,34 +272,6 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. 
- engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - ) - - placeholders = "\n".join( - f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) - ) - prompt = ( - f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 - ) - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -316,49 +335,36 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_hyperclovax_seed_vision( - question: str, image_urls: list[str] -) -> ModelRequestData: - model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" engine_args = EngineArgs( model=model_name, - trust_remote_code=True, - max_model_len=16384, + max_model_len=131072, + tensor_parallel_size=8, limit_mm_per_prompt={"image": len(image_urls)}, ) - message = {"role": "user", "content": list()} - for _image_url in image_urls: - message["content"].append( - { - "type": "image", - "image": _image_url, - "ocr": "", - "lens_keywords": "", - "lens_local_keywords": "", - } - ) - message["content"].append( + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ { - "type": "text", - "text": question, + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], } - ) + ] - prompt = tokenizer.apply_chat_template( - [ - message, - ], - tokenize=False, - add_generation_prompt=True, + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True ) return ModelRequestData( engine_args=engine_args, prompt=prompt, - stop_token_ids=None, image_data=[fetch_image(url) for url in image_urls], ) @@ -463,40 +469,6 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa ) -def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=131072, - tensor_parallel_size=8, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [ - { - "role": "user", - "content": [ - *placeholders, - {"type": "text", "text": question}, - ], - } - ] - - processor = AutoProcessor.from_pretrained(model_name) - - prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + # The configuration below has been confirmed to launch on a single 
L40 GPU. + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + ) + + placeholders = "\n".join( + f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) + ) + prompt = ( + f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + ) + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_step3(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "stepfun-ai/step3-fp8" + + # NOTE: Below are verified configurations for step3-fp8 + # on 8xH100 GPUs. + engine_args = EngineArgs( + model=model_name, + max_num_batched_tokens=4096, + gpu_memory_utilization=0.85, + tensor_parallel_size=8, + limit_mm_per_prompt={"image": len(image_urls)}, + reasoning_parser="step3", + ) + + prompt = ( + "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n " + f"{'' * len(image_urls)}{question} <|EOT|><|BOT|" + ">assistant\n\n" + ) + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=image_data, + ) + + def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "omni-research/Tarsier-7b" @@ -1006,16 +1034,16 @@ model_example_map = { "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, + "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "idefics3": load_idefics3, "interns1": load_interns1, "internvl_chat": load_internvl, - "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "keye_vl": load_keye_vl, "kimi_vl": load_kimi_vl, + "llama4": load_llama4, "llava": load_llava, "llava-next": load_llava_next, "llava-onevision": load_llava_onevision, - "llama4": load_llama4, "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, @@ -1028,6 +1056,7 @@ model_example_map = { "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, "smolvlm": load_smolvlm, + "step3": load_step3, "tarsier": load_tarsier, "tarsier2": load_tarsier2, }
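Usage sketch (illustrative, not part of the diff above): the new "step3" entries are normally exercised through the example scripts themselves, e.g. python examples/offline_inference/vision_language.py --model-type step3 (flags as in the existing examples). The snippet below shows the same engine configuration driven directly through vLLM's offline LLM.chat() API. The engine arguments mirror run_step3()/load_step3() above (verified for step3-fp8 on 8xH100); the image URL, question, and sampling settings are assumptions for demonstration only.

from vllm import LLM, SamplingParams


def main() -> None:
    # Engine arguments taken from run_step3()/load_step3() in the patch;
    # per the NOTE above, step3-fp8 is verified on 8xH100 GPUs with
    # tensor_parallel_size=8.
    llm = LLM(
        model="stepfun-ai/step3-fp8",
        max_num_batched_tokens=4096,
        gpu_memory_utilization=0.85,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": 1},
        reasoning_parser="step3",
    )

    # Illustrative single-image request; the URL is a placeholder assumption.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/sample.jpg"},
                },
                {"type": "text", "text": "What is the content of this image?"},
            ],
        }
    ]

    # LLM.chat() applies the model's own chat template, so the raw Step3
    # prompt string constructed in the examples above does not need to be
    # built by hand here.
    outputs = llm.chat(messages, SamplingParams(temperature=0.2, max_tokens=256))
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    main()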