diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index d2fbb1870dde6..0521a22c07029 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -228,7 +228,7 @@ outputs = llm.embed(["Follow the white rabbit."],
 print(outputs[0].outputs)
 ```

-A code example can be found here:
+A code example can be found here:

 ### Online Inference

@@ -258,4 +258,4 @@ Expected output:
 {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
 ```

-An OpenAI client example can be found here:
+An OpenAI client example can be found here:
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 6295a2aa8dc2f..4f95280233ac7 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -530,7 +530,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
 ```

 !!! note
-    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: .
+    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: .

     ```bash
     vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index dfed15d4ace97..181a874efa3cb 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -239,7 +239,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api]) which will be treated as a single prompt to the model.

-Code example:
+Code example:

 #### Multi-modal inputs

@@ -313,7 +313,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
 `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details.

-Full example:
+Full example:

 #### Extra parameters

@@ -421,7 +421,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
 The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats.

-Code example:
+Code example:

 [](){ #classification-api }

@@ -431,7 +431,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
 We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.

-Code example:
+Code example:

 #### Example Requests

@@ -760,7 +760,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
 [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with popular open-source tools.

-Code example:
+Code example:

 #### Example Request
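A minimal re-rank request sketch against the Jina/Cohere-compatible endpoint documented above, assuming a reranker server started with the `vllm serve Qwen/Qwen3-Reranker-0.6B ...` command from the supported-models note; the `/rerank` path and the `query`/`documents` field names follow the Jina-style schema and are assumptions if your vLLM version differs:

```python
# Sketch only: assumes a reranker server is running locally, e.g. started with
# the `vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides ...` command above.
# The /rerank path and payload keys follow the Jina-style schema; adjust them
# if your vLLM version exposes different names.
import requests

response = requests.post(
    "http://localhost:8000/rerank",
    json={
        "model": "Qwen/Qwen3-Reranker-0.6B",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of France is Paris.",
            "Horses and cows are both animals.",
        ],
    },
)
response.raise_for_status()

# Each entry pairs a document index with a relevance score.
for result in response.json()["results"]:
    print(result)
```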
diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md
new file mode 100644
index 0000000000000..8693f5e08e0ba
--- /dev/null
+++ b/examples/offline_inference/pooling/README.md
@@ -0,0 +1,33 @@
+# Pooling models
+
+## Convert LLM to seq cls
+
+```bash
+# for BAAI/bge-reranker-v2-gemma
+# Caution: "Yes" and "yes" are two different tokens
+python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
+# for mxbai-rerank-v2
+python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
+# for Qwen3-Reranker
+python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
+```
+
+## Embed jina_embeddings_v3 usage
+
+Only the text matching task is supported for now. See
+
+```bash
+python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
+```
+
+## Embed matryoshka dimensions usage
+
+```bash
+python examples/offline_inference/pooling/embed_matryoshka_fy.py
+```
+
+## Qwen3 reranker usage
+
+```bash
+python examples/offline_inference/pooling/qwen3_reranker.py
+```
diff --git a/examples/offline_inference/convert_model_to_seq_cls.py b/examples/offline_inference/pooling/convert_model_to_seq_cls.py
similarity index 100%
rename from examples/offline_inference/convert_model_to_seq_cls.py
rename to examples/offline_inference/pooling/convert_model_to_seq_cls.py
diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py
similarity index 100%
rename from examples/offline_inference/embed_jina_embeddings_v3.py
rename to examples/offline_inference/pooling/embed_jina_embeddings_v3.py
diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/pooling/embed_matryoshka_fy.py
similarity index 100%
rename from examples/offline_inference/embed_matryoshka_fy.py
rename to examples/offline_inference/pooling/embed_matryoshka_fy.py
diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/pooling/qwen3_reranker.py
similarity index 100%
rename from examples/offline_inference/qwen3_reranker.py
rename to examples/offline_inference/pooling/qwen3_reranker.py
diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh
index f356d7d4529ea..56888c8aa0e4c 100644
--- a/examples/online_serving/openai_embedding_long_text/service.sh
+++ b/examples/online_serving/openai_embedding_long_text/service.sh
@@ -120,7 +120,7 @@ echo "   - API Key: $API_KEY"
 echo "   - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN"
 echo ""
 echo "🧪 Test the server with:"
-echo "   python examples/online_serving/openai_embedding_long_text_client.py"
+echo "   python examples/online_serving/openai_embedding_long_text/client.py"
 echo ""
 echo "📚 Enhanced features enabled:"
 echo "   ✅ Intelligent native pooling type detection"
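Once a checkpoint has been converted with `convert_model_to_seq_cls.py` as in the offline pooling README above, it can be scored offline through `LLM.score`; a minimal sketch, assuming the `./Qwen3-Reranker-0.6B-seq-cls` output path from the conversion command and glossing over any reranker-specific prompt templating:

```python
# Sketch only: the local path comes from the conversion command in the README
# above, and the exact output attribute may vary across vLLM versions.
from vllm import LLM

llm = LLM(model="./Qwen3-Reranker-0.6B-seq-cls")

query = "What is the capital of France?"
documents = [
    "The capital of France is Paris.",
    "Horses and cows are both animals.",
]

# LLM.score pairs the query with each document and returns one score per pair.
outputs = llm.score(query, documents)
for document, output in zip(documents, outputs):
    print(f"{output.outputs.score:.4f}  {document}")
```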
"๐Ÿ“š Enhanced features enabled:" echo " โœ… Intelligent native pooling type detection" diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md new file mode 100644 index 0000000000000..f7926542202d6 --- /dev/null +++ b/examples/online_serving/pooling/README.md @@ -0,0 +1,43 @@ +# Pooling models + +## Cohere rerank usage + +```bash +python examples/online_serving/pooling/cohere_rerank_client.py +``` + +## Jinaai rerank usage + +```bash +python examples/online_serving/pooling/jinaai_rerank_client.py +``` + +## Openai chat embedding for multimodal usage + +```bash +python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +``` + +## Openai classification usage + +```bash +python examples/online_serving/pooling/openai_classification_client.py +``` + +## Openai embedding usage + +```bash +python examples/online_serving/pooling/openai_embedding_client.py +``` + +## Openai embedding matryoshka dimensions usage + +```bash +python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py +``` + +## Openai pooling usage + +```bash +python examples/online_serving/pooling/openai_pooling_client.py +``` diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/pooling/cohere_rerank_client.py similarity index 100% rename from examples/online_serving/cohere_rerank_client.py rename to examples/online_serving/pooling/cohere_rerank_client.py diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/pooling/jinaai_rerank_client.py similarity index 100% rename from examples/online_serving/jinaai_rerank_client.py rename to examples/online_serving/pooling/jinaai_rerank_client.py diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py similarity index 92% rename from examples/online_serving/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 771ad8511e972..30cb3325b9b18 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -1,5 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +"""Example Python client for multimodal embedding API using vLLM API server +NOTE: + start a supported multimodal embeddings model server with `vllm serve`, e.g. + vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024 +""" import argparse import base64 diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/pooling/openai_classification_client.py similarity index 86% rename from examples/online_serving/openai_classification_client.py rename to examples/online_serving/pooling/openai_classification_client.py index b10e7acbd26c1..d8dc2ef001112 100644 --- a/examples/online_serving/openai_classification_client.py +++ b/examples/online_serving/pooling/openai_classification_client.py @@ -1,5 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Example Python client for classification API using vLLM API server +NOTE: + start a supported classification model server with `vllm serve`, e.g. 
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/pooling/openai_embedding_client.py
similarity index 82%
rename from examples/online_serving/openai_embedding_client.py
rename to examples/online_serving/pooling/openai_embedding_client.py
index 6bc390861e2ee..f5f6820d07d73 100644
--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/pooling/openai_embedding_client.py
@@ -1,5 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Example Python client for embedding API using vLLM API server
+NOTE:
+    start a supported embeddings model server with `vllm serve`, e.g.
+    vllm serve intfloat/e5-small
+"""

 from openai import OpenAI

diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
similarity index 100%
rename from examples/online_serving/openai_embedding_matryoshka_fy.py
rename to examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/pooling/openai_pooling_client.py
similarity index 89%
rename from examples/online_serving/openai_pooling_client.py
rename to examples/online_serving/pooling/openai_pooling_client.py
index 95555d41cbea5..569015746b128 100644
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/pooling/openai_pooling_client.py
@@ -4,7 +4,9 @@
 Example online usage of Pooling API.

 Run `vllm serve --runner pooling`
-to start up the server in vLLM.
+to start up the server in vLLM. e.g.
+
+vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
 """

 import argparse
@@ -23,7 +25,7 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach")
+    parser.add_argument("--model", type=str, default="internlm/internlm2-1_8b-reward")

     return parser.parse_args()
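Finally, the Pooling API that `openai_pooling_client.py` exercises can also be hit directly; a minimal sketch, assuming a server started with `vllm serve internlm/internlm2-1_8b-reward --trust-remote-code` as in the updated docstring (the `/pooling` path accepts Embeddings-API-style input, and the response layout should be treated as version-dependent):

```python
# Sketch only: assumes `vllm serve internlm/internlm2-1_8b-reward
# --trust-remote-code` is running locally; the /pooling endpoint takes
# Embeddings-API-style input and may return nested lists rather than
# 1-D vectors.
import pprint

import requests

response = requests.post(
    "http://localhost:8000/pooling",
    json={
        "model": "internlm/internlm2-1_8b-reward",
        "input": "vLLM is wonderful!",
    },
)
response.raise_for_status()
pprint.pprint(response.json())
```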