diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py
index a51caa2aec8b..5d7fb819d347 100644
--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
@@ -15,40 +15,46 @@ prompts = [
     "What is annapurna labs?",
 ]
 
-# Create a sampling params object.
-sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
-
-# Create an LLM.
-llm = LLM(
-    model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
-    speculative_config={
-        "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
-        "num_speculative_tokens": 5,
-        "max_model_len": 2048,
-    },
-    max_num_seqs=4,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in neuronx-distributed-inference.
-    max_model_len=2048,
-    block_size=2048,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    tensor_parallel_size=32,
-    override_neuron_config={
-        "enable_eagle_speculation": True,
-        "enable_fused_speculation": True,
-    },
-)
+
+def main():
+    # Create a sampling params object.
+    sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
 
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
+    # Create an LLM.
+    llm = LLM(
+        model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
+        speculative_config={
+            "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
+            "num_speculative_tokens": 5,
+            "max_model_len": 2048,
+        },
+        max_num_seqs=4,
+        # The max_model_len and block_size arguments are required to be same as
+        # max sequence length when targeting neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in neuronx-distributed-inference.
+        max_model_len=2048,
+        block_size=2048,
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        tensor_parallel_size=32,
+        override_neuron_config={
+            "enable_eagle_speculation": True,
+            "enable_fused_speculation": True,
+        },
+    )
+
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index c30541a598ce..16d44cbadbc9 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -6,14 +6,19 @@ This folder provides several example scripts on how to inference Qwen2.5-Omni of
 
 ```bash
 # Audio + image + video
-python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities
+python examples/offline_inference/qwen2_5_omni/only_thinker.py \
+    -q mixed_modalities
 
 # Read vision and audio inputs from a single video file
 # NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video
+VLLM_USE_V1=0 \
+python examples/offline_inference/qwen2_5_omni/only_thinker.py \
+    -q use_audio_in_video
 
 # Multiple audios
-VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios
+VLLM_USE_V1=0 \
+python examples/offline_inference/qwen2_5_omni/only_thinker.py \
+    -q multi_audios
 ```
 
 This script will run the thinker part of Qwen2.5-Omni, and generate text response.
@@ -22,11 +27,16 @@ You can also test Qwen2.5-Omni on a single modality:
 
 ```bash
 # Process audio inputs
-python examples/offline_inference/audio_language.py --model-type qwen2_5_omni
+python examples/offline_inference/audio_language.py \
+    --model-type qwen2_5_omni
 
 # Process image inputs
-python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni
+python examples/offline_inference/vision_language.py \
+    --modality image \
+    --model-type qwen2_5_omni
 
 # Process video inputs
-python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni
+python examples/offline_inference/vision_language.py \
+    --modality video \
+    --model-type qwen2_5_omni
 ```
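Note: the neuron_eagle.py hunk above is a mechanical move to the standard Python entry-point pattern: the module-level example body is wrapped in `main()` and only executed under an `if __name__ == "__main__":` guard, so importing the module (for example by docs tooling or by multiprocessing's spawn start method, which re-imports the main module) no longer runs the example. A minimal, generic sketch of the pattern, assuming a placeholder body rather than the vLLM code from the diff:

```python
def main():
    # Placeholder body; in the example above this builds SamplingParams
    # and LLM, calls llm.generate(), and prints the outputs.
    print("example body")


if __name__ == "__main__":
    # Runs only when the file is executed directly, not when imported.
    main()
```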