diff --git a/examples/offline_inference/pixtral.py b/examples/offline_inference/mistral-small.py
similarity index 92%
rename from examples/offline_inference/pixtral.py
rename to examples/offline_inference/mistral-small.py
index 5379f4562162f..43be2aa80773f 100644
--- a/examples/offline_inference/pixtral.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,14 +6,16 @@
 import argparse
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
-# This script is an offline demo for running Mistral-Small-3
+# This script is an offline demo for running Mistral-Small-3.1
 #
 # If you want to run a server/client setup, please follow this code:
 #
 # - Server:
 #
 # ```bash
-# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --tokenizer-mode mistral --config-format mistral --load-format mistral \
+#   --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
 #
 # - Client:
@@ -51,6 +53,8 @@ def run_simple_demo(args: argparse.Namespace):
     llm = LLM(
         model=model_name,
         tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
         max_model_len=4096,
         max_num_seqs=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
@@ -91,6 +95,8 @@ def run_advanced_demo(args: argparse.Namespace):
     llm = LLM(
         model=model_name,
         tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
         limit_mm_per_prompt={"image": max_img_per_msg},
         max_model_len=max_img_per_msg * max_tokens_per_img,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
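
For reference, the client half of the server/client setup mentioned in the file header (the diff is truncated right after the `# - Client:` comment) could look like the sketch below. It is not part of this change: it assumes the `vllm serve` command from the header is running locally on the default port 8000 with its OpenAI-compatible API, and the image URL and prompt are placeholders.

```python
# Minimal client sketch, assuming a local vLLM server started with the
# `vllm serve` command shown in the file header (default port 8000).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {
                    "type": "image_url",
                    # Placeholder image URL for illustration only.
                    "image_url": {"url": "https://picsum.photos/id/237/200/300"},
                },
            ],
        }
    ],
    max_tokens=256,
)
print(response.choices[0].message.content)
```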