From 86ae693f207fc9433f0b6d2c4331ef021dad50fe Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 28 Jul 2025 10:42:40 +0800 Subject: [PATCH] [Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470) Signed-off-by: DarkLight1337 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/multimodal_inputs.md | 4 +- docs/features/prompt_embeds.md | 2 +- docs/models/generative_models.md | 13 +- docs/models/pooling_models.md | 77 ++- docs/models/supported_models.md | 101 ++-- docs/serving/openai_compatible_server.md | 24 +- examples/offline_inference/basic/classify.py | 6 +- examples/offline_inference/basic/embed.py | 4 +- examples/offline_inference/basic/score.py | 6 +- .../embed_jina_embeddings_v3.py | 6 +- .../offline_inference/embed_matryoshka_fy.py | 6 +- examples/offline_inference/qwen3_reranker.py | 4 +- .../vision_language_pooling.py | 6 +- ...i_chat_completion_client_for_multimodal.py | 2 +- ...ai_chat_embedding_client_for_multimodal.py | 2 +- .../openai_cross_encoder_score.py | 2 +- ...enai_cross_encoder_score_for_multimodal.py | 2 +- .../online_serving/openai_pooling_client.py | 2 +- ...ompt_embed_inference_with_openai_client.py | 2 +- tests/compile/test_async_tp.py | 3 - tests/compile/test_basic_correctness.py | 6 +- tests/compile/test_fusion_all_reduce.py | 3 - tests/compile/test_sequence_parallelism.py | 3 - tests/conftest.py | 8 +- tests/distributed/test_expert_parallel.py | 26 +- tests/distributed/test_pipeline_parallel.py | 44 +- tests/distributed/test_sequence_parallel.py | 30 +- .../openai/correctness/test_mteb_embed.py | 3 +- .../openai/correctness/test_mteb_score.py | 3 +- .../openai/test_chat_logit_bias_validation.py | 4 - .../entrypoints/openai/test_chat_template.py | 1 + tests/entrypoints/openai/test_embedding.py | 4 +- .../openai/test_embedding_dimensions.py | 4 +- .../entrypoints/openai/test_openai_schema.py | 2 +- 
.../openai/test_optional_middleware.py | 4 +- tests/entrypoints/openai/test_pooling.py | 4 +- .../entrypoints/openai/test_skip_tokenizer.py | 4 +- tests/entrypoints/openai/test_truncation.py | 4 +- tests/entrypoints/openai/test_video.py | 2 +- tests/entrypoints/openai/test_vision.py | 2 +- .../openai/test_vision_embedding.py | 4 +- tests/entrypoints/test_chat_utils.py | 42 +- tests/lora/test_worker.py | 5 - .../model_executor/test_guided_processors.py | 10 +- .../test_model_load_with_params.py | 2 - tests/models/language/pooling/embed_utils.py | 2 +- tests/models/language/pooling/mteb_utils.py | 7 +- .../models/language/pooling/test_embedding.py | 2 +- tests/models/language/pooling/test_gritlm.py | 13 +- tests/models/language/pooling/test_jina.py | 7 +- .../pooling/test_nomic_max_model_len.py | 20 +- tests/models/language/pooling/test_scoring.py | 18 +- .../pooling/test_truncation_control.py | 6 +- .../multimodal/generation/test_common.py | 5 +- .../generation/test_granite_speech.py | 2 +- .../multimodal/generation/test_interleaved.py | 2 +- .../multimodal/generation/test_phi4mm.py | 2 +- .../multimodal/generation/test_qwen2_vl.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 6 +- .../multimodal/generation/vlm_utils/types.py | 6 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 2 +- .../multimodal/pooling/test_llava_next.py | 2 +- tests/models/multimodal/pooling/test_phi3v.py | 2 +- .../multimodal/pooling/test_prithvi_mae.py | 2 +- .../multimodal/processing/test_common.py | 5 +- tests/models/multimodal/test_mapping.py | 5 +- .../models/quantization/test_bitsandbytes.py | 2 +- tests/models/test_initialization.py | 6 +- tests/models/test_registry.py | 21 +- tests/models/test_transformers.py | 14 +- tests/models/utils.py | 7 +- tests/multimodal/test_processing.py | 25 +- tests/quantization/test_configs.py | 10 +- tests/test_config.py | 326 ++++------- tests/test_sampling_params.py | 5 - 
tests/v1/core/test_kv_cache_utils.py | 12 +- tests/v1/core/test_scheduler.py | 3 - tests/v1/core/utils.py | 3 - tests/v1/kv_connector/unit/utils.py | 3 - tests/v1/spec_decode/test_eagle.py | 9 +- tests/v1/spec_decode/test_ngram.py | 9 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 4 - tests/v1/worker/test_gpu_model_runner.py | 4 - vllm/config.py | 530 +++++++++++------- vllm/engine/arg_utils.py | 29 +- vllm/entrypoints/llm.py | 93 ++- vllm/entrypoints/openai/api_server.py | 1 - vllm/model_executor/model_loader/utils.py | 133 +---- vllm/model_executor/models/config.py | 6 +- vllm/model_executor/models/registry.py | 249 ++++++-- vllm/transformers_utils/dynamic_module.py | 60 ++ vllm/transformers_utils/tokenizer_group.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 6 +- 94 files changed, 1117 insertions(+), 1083 deletions(-) create mode 100644 vllm/transformers_utils/dynamic_module.py diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index e83dfdb11dadc..d4c8852206bba 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -343,7 +343,7 @@ Here is a simple example using Phi-3.5-Vision. First, launch the OpenAI-compatible server: ```bash -vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ +vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \ --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' ``` @@ -422,7 +422,7 @@ Instead of `image_url`, you can pass a video file via `video_url`. 
Here is a sim First, launch the OpenAI-compatible server: ```bash -vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192 +vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --runner generate --max-model-len 8192 ``` Then, you can use the OpenAI client as follows: diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 6f5616e05d8c1..83993bd0140fa 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -34,7 +34,7 @@ Prompt embeddings are passed in as base64 encoded torch tensors. First, launch the OpenAI-compatible server: ```bash -vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \ +vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \ --max-model-len 4096 --enable-prompt-embeds ``` diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 21ad115e411a3..a3ad413593f3c 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -2,12 +2,19 @@ vLLM provides first-class support for generative models, which covers most of LLMs. -In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface. +In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface. Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text. -For generative models, the only supported `--task` option is `"generate"`. -Usually, this is automatically inferred so you don't have to specify it. +## Configuration + +### Model Runner (`--runner`) + +Run a model in generation mode via the option `--runner generate`. + +!!! 
tip + There is no need to set this option in the vast majority of cases as vLLM can automatically + detect the model runner to use via `--runner auto`. ## Offline Inference diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 741ae2d79c1e5..a06d86523af1a 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -1,9 +1,9 @@ # Pooling Models -vLLM also supports pooling models, including embedding, reranking and reward models. +vLLM also supports pooling models, such as embedding, classification and reward models. In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. -These models use a [Pooler][vllm.model_executor.layers.Pooler] to extract the final hidden states of the input +These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input before returning them. !!! note @@ -11,18 +11,39 @@ before returning them. As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -If the model doesn't implement this interface, you can set `--task` which tells vLLM -to convert the model into a pooling model. +## Configuration -| `--task` | Model type | Supported pooling tasks | -|------------|----------------------|-------------------------------| -| `embed` | Embedding model | `encode`, `embed` | -| `classify` | Classification model | `encode`, `classify`, `score` | -| `reward` | Reward model | `encode` | +### Model Runner -## Pooling Tasks +Run a model in pooling mode via the option `--runner pooling`. -In vLLM, we define the following pooling tasks and corresponding APIs: +!!! tip + There is no need to set this option in the vast majority of cases as vLLM can automatically + detect the model runner to use via `--runner auto`. 
+ +### Model Conversion + +vLLM can adapt models for various pooling tasks via the option `--convert <type>`. + +If `--runner pooling` has been set (manually or automatically) but the model does not implement the +[VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface, +vLLM will attempt to automatically convert the model according to the architecture names +shown in the table below. + +| Architecture | `--convert` | Supported pooling tasks | +|-------------------------------------------------|-------------|-------------------------------| +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `encode`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `encode`, `classify`, `score` | +| `*ForRewardModeling`, `*RewardModel` | `reward` | `encode` | + +!!! tip + You can explicitly set `--convert <type>` to specify how to convert the model. + +### Pooling Tasks + +Each pooling model in vLLM supports one or more of these tasks according to +[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], +enabling the corresponding APIs: | Task | APIs | |------------|--------------------| @@ -31,11 +52,19 @@ In vLLM, we define the following pooling tasks and corresponding APIs: | `classify` | `classify` | | `score` | `score` | -\*The `score` API falls back to `embed` task if the model does not support `score` task. +\* The `score` API falls back to `embed` task if the model does not support `score` task. -Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks]. +### Pooler Configuration -By default, the pooler assigned to each task has the following attributes: +#### Predefined models + +If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`, +you can override some of its attributes via the `--override-pooler-config` option. 
+ +#### Converted models + +If the model has been converted via `--convert` (see above), +the pooler assigned to each task has the following attributes by default: | Task | Pooling Type | Normalization | Softmax | |------------|----------------|---------------|---------| @@ -43,20 +72,12 @@ By default, the pooler assigned to each task has the following attributes: | `embed` | `LAST` | ✅︎ | ❌ | | `classify` | `LAST` | ❌ | ✅︎ | -These defaults may be overridden by the model's implementation in vLLM. - When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -we attempt to override the defaults based on its Sentence Transformers configuration file (`modules.json`), -which takes priority over the model's defaults. +its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. You can further customize this via the `--override-pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. -!!! note - - The above configuration may be disregarded if the model's implementation in vLLM defines its own pooler - that is not based on [PoolerConfig][vllm.config.PoolerConfig]. - ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. @@ -70,7 +91,7 @@ It returns the extracted hidden states directly, which is useful for reward mode ```python from vllm import LLM -llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") +llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", runner="pooling") (output,) = llm.encode("Hello, my name is") data = output.outputs.data @@ -85,7 +106,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") +llm = LLM(model="intfloat/e5-mistral-7b-instruct", runner="pooling") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -102,7 +123,7 @@ It is primarily designed for classification models. 
```python from vllm import LLM -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs @@ -123,7 +144,7 @@ It is designed for embedding models and cross encoder models. Embedding models u ```python from vllm import LLM -llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") +llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") (output,) = llm.score("What is the capital of France?", "The capital of Brazil is Brasilia.") @@ -175,7 +196,7 @@ You can change the output dimensions of embedding models that support Matryoshka from vllm import LLM, PoolingParams llm = LLM(model="jinaai/jina-embeddings-v3", - task="embed", + runner="pooling", trust_remote_code=True) outputs = llm.embed(["Follow the white rabbit."], pooling_params=PoolingParams(dimensions=32)) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 355ac57094195..5d2b5ea2d9292 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -1,7 +1,6 @@ # Supported Models vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks. -If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. @@ -24,7 +23,7 @@ To check if the modeling backend is Transformers, you can simply do this: ```python from vllm import LLM -llm = LLM(model=..., task="generate") # Name or path of your model +llm = LLM(model=...) 
# Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` @@ -158,13 +157,13 @@ The [Transformers backend][transformers-backend] enables you to run models direc ```python from vllm import LLM - # For generative models (task=generate) only - llm = LLM(model=..., task="generate") # Name or path of your model + # For generative models (runner=generate) only + llm = LLM(model=..., runner="generate") # Name or path of your model output = llm.generate("Hello, my name is") print(output) - # For pooling models (task={embed,classify,reward,score}) only - llm = LLM(model=..., task="embed") # Name or path of your model + # For pooling models (runner=pooling) only + llm = LLM(model=..., runner="pooling") # Name or path of your model output = llm.encode("Hello, my name is") print(output) ``` @@ -281,13 +280,13 @@ And use with `trust_remote_code=True`. ```python from vllm import LLM -llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) +llm = LLM(model=..., revision=..., runner=..., trust_remote_code=True) -# For generative models (task=generate) only +# For generative models (runner=generate) only output = llm.generate("Hello, my name is") print(output) -# For pooling models (task={embed,classify,reward,score}) only +# For pooling models (runner=pooling) only output = llm.encode("Hello, my name is") print(output) ``` @@ -312,8 +311,6 @@ See [this page](generative_models.md) for more information on how to use generat #### Text Generation -Specified using `--task generate`. -