From 2eb4fe912916aea8998d085786df7abd7737e1f3 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 2 Dec 2025 23:54:28 +0800 Subject: [PATCH] [examples] Resettle pooling examples. (#29365) Signed-off-by: wang.yuqi Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 22 +-- .github/CODEOWNERS | 4 +- docs/.nav.yml | 6 +- docs/design/io_processor_plugins.md | 2 +- docs/mkdocs/hooks/generate_examples.py | 148 ++++++++++-------- docs/models/pooling_models.md | 4 +- docs/models/supported_models.md | 4 +- docs/serving/openai_compatible_server.md | 14 +- examples/offline_inference/pooling/README.md | 57 ------- examples/online_serving/pooling/README.md | 97 ------------ .../classify}/openai_classification_client.py | 0 .../embed}/embed_jina_embeddings_v3.py | 0 .../embed}/embed_matryoshka_fy.py | 0 .../embedding_requests_base64_client.py | 0 .../embed}/embedding_requests_bytes_client.py | 0 ...ai_chat_embedding_client_for_multimodal.py | 0 .../embed}/openai_embedding_client.py | 0 .../openai_embedding_long_text/README.md | 0 .../openai_embedding_long_text/client.py | 0 .../openai_embedding_long_text/service.sh | 0 .../embed}/openai_embedding_matryoshka_fy.py | 0 .../plugin/prithvi_geospatial_mae_client.py} | 0 .../prithvi_geospatial_mae_io_processor.py | 0 .../plugin/prithvi_geospatial_mae_offline.py} | 0 .../pooling/openai_pooling_client.py | 0 .../pooling}/vision_language_pooling.py | 0 .../score}/cohere_rerank_client.py | 0 .../score}/convert_model_to_seq_cls.py | 0 .../score}/jinaai_rerank_client.py | 0 .../score}/openai_cross_encoder_score.py | 0 ...enai_cross_encoder_score_for_multimodal.py | 0 .../score}/qwen3_reranker.py | 0 .../pooling => pooling/token_classify}/ner.py | 0 .../token_classify}/ner_client.py | 0 .../token_embed}/multi_vector_retrieval.py | 0 .../multi_vector_retrieval_client.py | 0 36 files changed, 109 insertions(+), 249 
deletions(-) delete mode 100644 examples/offline_inference/pooling/README.md delete mode 100644 examples/online_serving/pooling/README.md rename examples/{online_serving/pooling => pooling/classify}/openai_classification_client.py (100%) rename examples/{offline_inference/pooling => pooling/embed}/embed_jina_embeddings_v3.py (100%) rename examples/{offline_inference/pooling => pooling/embed}/embed_matryoshka_fy.py (100%) rename examples/{online_serving/pooling => pooling/embed}/embedding_requests_base64_client.py (100%) rename examples/{online_serving/pooling => pooling/embed}/embedding_requests_bytes_client.py (100%) rename examples/{online_serving/pooling => pooling/embed}/openai_chat_embedding_client_for_multimodal.py (100%) rename examples/{online_serving/pooling => pooling/embed}/openai_embedding_client.py (100%) rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/README.md (100%) rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/client.py (100%) rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/service.sh (100%) rename examples/{online_serving/pooling => pooling/embed}/openai_embedding_matryoshka_fy.py (100%) rename examples/{online_serving/pooling/prithvi_geospatial_mae.py => pooling/plugin/prithvi_geospatial_mae_client.py} (100%) rename examples/{offline_inference/pooling => pooling/plugin}/prithvi_geospatial_mae_io_processor.py (100%) rename examples/{offline_inference/pooling/prithvi_geospatial_mae.py => pooling/plugin/prithvi_geospatial_mae_offline.py} (100%) rename examples/{online_serving => pooling}/pooling/openai_pooling_client.py (100%) rename examples/{offline_inference => pooling/pooling}/vision_language_pooling.py (100%) rename examples/{online_serving/pooling => pooling/score}/cohere_rerank_client.py (100%) rename examples/{offline_inference/pooling => pooling/score}/convert_model_to_seq_cls.py (100%) rename examples/{online_serving/pooling => 
pooling/score}/jinaai_rerank_client.py (100%) rename examples/{online_serving/pooling => pooling/score}/openai_cross_encoder_score.py (100%) rename examples/{online_serving/pooling => pooling/score}/openai_cross_encoder_score_for_multimodal.py (100%) rename examples/{offline_inference/pooling => pooling/score}/qwen3_reranker.py (100%) rename examples/{offline_inference/pooling => pooling/token_classify}/ner.py (100%) rename examples/{online_serving/pooling => pooling/token_classify}/ner_client.py (100%) rename examples/{offline_inference/pooling => pooling/token_embed}/multi_vector_retrieval.py (100%) rename examples/{online_serving/pooling => pooling/token_embed}/multi_vector_retrieval_client.py (100%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9f2107fb1e5a..52c848c784e5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,20 +390,24 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test + # for basic + - python3 offline_inference/basic/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 
offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ecb10d1a450f..d6447649cd89 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -146,10 +146,10 @@ mkdocs.yaml @hmellor /requirements/kv_connectors.txt @NickLucche # Pooling models -/examples/*/pooling/ @noooop +/examples/pooling @noooop /tests/models/*/pooling* @noooop /tests/entrypoints/pooling @noooop -/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop +/vllm/entrypoints/pooling @noooop /vllm/config/pooler.py 
@noooop /vllm/pooling_params.py @noooop /vllm/model_executor/layers/pooler.py @noooop diff --git a/docs/.nav.yml b/docs/.nav.yml index d30c0f12eba4..aa98ad52be21 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -5,11 +5,7 @@ nav: - Getting Started: - getting_started/quickstart.md - getting_started/installation - - Examples: - - examples/README.md - - Offline Inference: examples/offline_inference - - Online Serving: examples/online_serving - - Others: examples/others + - Examples: examples - General: - usage/v1_guide.md - usage/* diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index b4a30cda35a0..5a86940fa9f1 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py). -An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). 
Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. +An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples. ## Using an IO Processor plugin diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6e4fb039e3a0..e886a91e6573 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools import logging -from dataclasses import dataclass, field +from dataclasses import dataclass +from functools import cached_property from pathlib import Path from typing import Literal @@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples" EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples" -def fix_case(text: str) -> str: +def title(text: str) -> str: + # Default title case + text = text.replace("_", " ").replace("/", " - ").title() + # Custom substitutions subs = { + "io": "IO", "api": "API", "cli": "CLI", "cpu": "CPU", "llm": "LLM", "mae": "MAE", + "ner": "NER", "tpu": "TPU", "gguf": "GGUF", "lora": "LoRA", @@ -48,71 +54,65 @@ class Example: Attributes: path (Path): The path to the main directory or file. 
category (str): The category of the document. - main_file (Path): The main file in the directory. - other_files (list[Path]): list of other files in the directory. - title (str): The title of the document. + + Properties:: + main_file() -> Path | None: Determines the main file in the given path. + other_files() -> list[Path]: Determines other files in the directory excluding + the main file. + title() -> str: Determines the title of the document. Methods: - __post_init__(): Initializes the main_file, other_files, and title attributes. - determine_main_file() -> Path: Determines the main file in the given path. - determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. - determine_title() -> str: Determines the title of the document. generate() -> str: Generates the documentation content. - """ # noqa: E501 + """ path: Path - category: str = None - main_file: Path = field(init=False) - other_files: list[Path] = field(init=False) - title: str = field(init=False) + category: str - def __post_init__(self): - self.main_file = self.determine_main_file() - self.other_files = self.determine_other_files() - self.title = self.determine_title() + @cached_property + def main_file(self) -> Path | None: + """Determines the main file in the given path. - @property - def is_code(self) -> bool: - return self.main_file.suffix != ".md" + If path is a file, it returns the path itself. If path is a directory, it + searches for Markdown files (*.md) in the directory and returns the first one + found. If no Markdown files are found, it returns None.""" + # Single file example + if self.path.is_file(): + return self.path + # Multi file example with a README + if md_paths := list(self.path.glob("*.md")): + return md_paths[0] + # Multi file example without a README + return None - def determine_main_file(self) -> Path: - """ - Determines the main file in the given path. - If the path is a file, it returns the path itself. 
Otherwise, it searches - for Markdown files (*.md) in the directory and returns the first one found. - Returns: - Path: The main file path, either the original path if it's a file or the first - Markdown file found in the directory. - Raises: - IndexError: If no Markdown files are found in the directory. - """ # noqa: E501 - return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop() + @cached_property + def other_files(self) -> list[Path]: + """Determine other files in the directory excluding the main file. - def determine_other_files(self) -> list[Path]: - """ - Determine other files in the directory excluding the main file. - - This method checks if the given path is a file. If it is, it returns an empty list. - Otherwise, it recursively searches through the directory and returns a list of all - files that are not the main file. - - Returns: - list[Path]: A list of Path objects representing the other files in the directory. - """ # noqa: E501 + If path is a file, it returns an empty list. 
Otherwise, it returns every file + in the directory except the main file in a list.""" + # Single file example if self.path.is_file(): return [] + # Multi file example is_other_file = lambda file: file.is_file() and file != self.main_file - return [file for file in self.path.rglob("*") if is_other_file(file)] + return sorted(file for file in self.path.rglob("*") if is_other_file(file)) - def determine_title(self) -> str: - if not self.is_code: - # Specify encoding for building on Windows - with open(self.main_file, encoding="utf-8") as f: - first_line = f.readline().strip() - match = re.match(r"^#\s+(?P<title>.+)$", first_line) - if match: - return match.group("title") - return fix_case(self.path.stem.replace("_", " ").title()) + @cached_property + def is_code(self) -> bool: + return self.main_file is not None and self.main_file.suffix != ".md" + + @cached_property + def title(self) -> str: + # Generate title from filename if no main md file found + if self.main_file is None or self.is_code: + return title(self.path.stem) + # Specify encoding for building on Windows + with open(self.main_file, encoding="utf-8") as f: + first_line = f.readline().strip() + match = re.match(r"^#\s+(?P<title>.+)$", first_line) + if match: + return match.group("title") + raise ValueError(f"Title not found in {self.main_file}") def fix_relative_links(self, content: str) -> str: """ @@ -156,24 +156,35 @@ class Example: # included files containing code fences too code_fence = "``````" - if self.is_code: - content += ( - f"{code_fence}{self.main_file.suffix[1:]}\n" - f'--8<-- "{self.main_file}"\n' - f"{code_fence}\n" - ) + if self.main_file is not None: + # Single file example or multi file example with a README + if self.is_code: + content += ( + f"{code_fence}{self.main_file.suffix[1:]}\n" + f'--8<-- "{self.main_file}"\n' + f"{code_fence}\n" + ) + else: + with open(self.main_file, encoding="utf-8") as f: + # Skip the title from md snippets as it's been included above + main_content = 
f.readlines()[1:] + content += self.fix_relative_links("".join(main_content)) + content += "\n" else: - with open(self.main_file) as f: - # Skip the title from md snippets as it's been included above - main_content = f.readlines()[1:] - content += self.fix_relative_links("".join(main_content)) - content += "\n" + # Multi file example without a README + for file in self.other_files: + file_title = title(str(file.relative_to(self.path).with_suffix(""))) + content += f"## {file_title}\n\n" + content += ( + f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n' + ) + return content if not self.other_files: return content content += "## Example materials\n\n" - for file in sorted(self.other_files): + for file in self.other_files: content += f'??? abstract "{file.relative_to(self.path)}"\n' if file.suffix != ".md": content += f" {code_fence}{file.suffix[1:]}\n" @@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): glob_patterns = ["*.py", "*.md", "*.sh"] # Find categorised examples for category in categories: + logger.info("Processing category: %s", category.stem) globs = [category.glob(pattern) for pattern in glob_patterns] for path in itertools.chain(*globs): examples.append(Example(path, category.stem)) # Find examples in subdirectories - for path in category.glob("*/*.md"): + globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns] + for path in itertools.chain(*globs): examples.append(Example(path.parent, category.stem)) # Generate the example documentation @@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): with open(doc_path, "w+", encoding="utf-8") as f: f.write(example.generate()) logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) + logger.info("Total examples generated: %d", len(examples)) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index aca865f4bf77..e2d427e8a459 100644 --- 
a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -274,7 +274,7 @@ outputs = llm.embed( print(outputs[0].outputs) ``` -A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py) +A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py) ### Online Inference @@ -304,7 +304,7 @@ Expected output: {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` -An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) +An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py) ## Deprecated Features diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 6ea2285b92bb..040107c11efc 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -568,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A ``` !!! note - Load the official original `Qwen3 Reranker` by using the following command. 
More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py). + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py). ```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' @@ -606,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) | `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | !!! note - Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py). + Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py). ## List of Multimodal Language Models diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 672663dc50b1..01453483a8d6 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -234,7 +234,7 @@ The following extra parameters are supported: Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. 
-Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py) +Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py) If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations: @@ -335,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. -Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py) +Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py) #### Extra parameters @@ -516,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. 
-Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py) +Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py) ### Classification API @@ -524,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities. -Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py) +Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py) #### Example Requests @@ -640,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). 
-Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py) +Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py) #### Single inference @@ -821,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including print("Scoring output:", response_json["data"][0]["score"]) print("Scoring output:", response_json["data"][1]["score"]) ``` -Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py) +Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py) #### Extra parameters @@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with popular open-source tools. 
-Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py) +Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py) #### Example Request diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md deleted file mode 100644 index ad78be38716b..000000000000 --- a/examples/offline_inference/pooling/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# Pooling models - -## Convert llm model to seq cls - -```bash -# for BAAI/bge-reranker-v2-gemma -# Caution: "Yes" and "yes" are two different tokens -python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls -# for mxbai-rerank-v2 -python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls -# for Qwen3-Reranker -python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls -``` - -## Embed jina_embeddings_v3 usage - -Only text matching task is supported for now. 
See <https://github.com/vllm-project/vllm/pull/16120> - -```bash -python examples/offline_inference/pooling/embed_jina_embeddings_v3.py -``` - -## Embed matryoshka dimensions usage - -```bash -python examples/offline_inference/pooling/embed_matryoshka_fy.py -``` - -## Multi vector retrieval usage - -```bash -python examples/offline_inference/pooling/multi_vector_retrieval.py -``` - -## Named Entity Recognition (NER) usage - -```bash -python examples/offline_inference/pooling/ner.py -``` - -## Prithvi Geospatial MAE usage - -```bash -python examples/offline_inference/pooling/prithvi_geospatial_mae.py -``` - -## IO Processor Plugins for Prithvi Geospatial MAE - -```bash -python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py -``` - -## Qwen3 reranker usage - -```bash -python examples/offline_inference/pooling/qwen3_reranker.py -``` diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md deleted file mode 100644 index b76ad21f0481..000000000000 --- a/examples/online_serving/pooling/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# Pooling models - -## Cohere rerank usage - -```bash -# vllm serve BAAI/bge-reranker-base -python examples/online_serving/pooling/cohere_rerank_client.py -``` - -## Embedding requests base64 encoding_format usage - -```bash -# vllm serve intfloat/e5-small -python examples/online_serving/pooling/embedding_requests_base64_client.py -``` - -## Embedding requests bytes encoding_format usage - -```bash -# vllm serve intfloat/e5-small -python examples/online_serving/pooling/embedding_requests_bytes_client.py -``` - -## Jinaai rerank usage - -```bash -# vllm serve BAAI/bge-reranker-base -python examples/online_serving/pooling/jinaai_rerank_client.py -``` - -## Multi vector retrieval usage - -```bash -# vllm serve BAAI/bge-m3 -python examples/online_serving/pooling/multi_vector_retrieval_client.py -``` - -## Named Entity Recognition (NER) usage - -```bash -# vllm serve 
boltuix/NeuroBERT-NER -python examples/online_serving/pooling/ner_client.py -``` - -## OpenAI chat embedding for multimodal usage - -```bash -python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py -``` - -## OpenAI classification usage - -```bash -# vllm serve jason9693/Qwen2.5-1.5B-apeach -python examples/online_serving/pooling/openai_classification_client.py -``` - -## OpenAI cross_encoder score usage - -```bash -# vllm serve BAAI/bge-reranker-v2-m3 -python examples/online_serving/pooling/openai_cross_encoder_score.py -``` - -## OpenAI cross_encoder score for multimodal usage - -```bash -# vllm serve jinaai/jina-reranker-m0 -python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py -``` - -## OpenAI embedding usage - -```bash -# vllm serve intfloat/e5-small -python examples/online_serving/pooling/openai_embedding_client.py -``` - -## OpenAI embedding matryoshka dimensions usage - -```bash -# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code -python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py -``` - -## OpenAI pooling usage - -```bash -# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code -python examples/online_serving/pooling/openai_pooling_client.py -``` - -## Online Prithvi Geospatial MAE usage - -```bash -python examples/online_serving/pooling/prithvi_geospatial_mae.py -``` diff --git a/examples/online_serving/pooling/openai_classification_client.py b/examples/pooling/classify/openai_classification_client.py similarity index 100% rename from examples/online_serving/pooling/openai_classification_client.py rename to examples/pooling/classify/openai_classification_client.py diff --git a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py b/examples/pooling/embed/embed_jina_embeddings_v3.py similarity index 100% rename from examples/offline_inference/pooling/embed_jina_embeddings_v3.py rename to examples/pooling/embed/embed_jina_embeddings_v3.py diff --git 
a/examples/offline_inference/pooling/embed_matryoshka_fy.py b/examples/pooling/embed/embed_matryoshka_fy.py similarity index 100% rename from examples/offline_inference/pooling/embed_matryoshka_fy.py rename to examples/pooling/embed/embed_matryoshka_fy.py diff --git a/examples/online_serving/pooling/embedding_requests_base64_client.py b/examples/pooling/embed/embedding_requests_base64_client.py similarity index 100% rename from examples/online_serving/pooling/embedding_requests_base64_client.py rename to examples/pooling/embed/embedding_requests_base64_client.py diff --git a/examples/online_serving/pooling/embedding_requests_bytes_client.py b/examples/pooling/embed/embedding_requests_bytes_client.py similarity index 100% rename from examples/online_serving/pooling/embedding_requests_bytes_client.py rename to examples/pooling/embed/embedding_requests_bytes_client.py diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py similarity index 100% rename from examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py rename to examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py diff --git a/examples/online_serving/pooling/openai_embedding_client.py b/examples/pooling/embed/openai_embedding_client.py similarity index 100% rename from examples/online_serving/pooling/openai_embedding_client.py rename to examples/pooling/embed/openai_embedding_client.py diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/pooling/embed/openai_embedding_long_text/README.md similarity index 100% rename from examples/online_serving/openai_embedding_long_text/README.md rename to examples/pooling/embed/openai_embedding_long_text/README.md diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/pooling/embed/openai_embedding_long_text/client.py similarity index 100% rename from 
examples/online_serving/openai_embedding_long_text/client.py rename to examples/pooling/embed/openai_embedding_long_text/client.py diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/pooling/embed/openai_embedding_long_text/service.sh similarity index 100% rename from examples/online_serving/openai_embedding_long_text/service.sh rename to examples/pooling/embed/openai_embedding_long_text/service.sh diff --git a/examples/online_serving/pooling/openai_embedding_matryoshka_fy.py b/examples/pooling/embed/openai_embedding_matryoshka_fy.py similarity index 100% rename from examples/online_serving/pooling/openai_embedding_matryoshka_fy.py rename to examples/pooling/embed/openai_embedding_matryoshka_fy.py diff --git a/examples/online_serving/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py similarity index 100% rename from examples/online_serving/pooling/prithvi_geospatial_mae.py rename to examples/pooling/plugin/prithvi_geospatial_mae_client.py diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py b/examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py similarity index 100% rename from examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py rename to examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_offline.py similarity index 100% rename from examples/offline_inference/pooling/prithvi_geospatial_mae.py rename to examples/pooling/plugin/prithvi_geospatial_mae_offline.py diff --git a/examples/online_serving/pooling/openai_pooling_client.py b/examples/pooling/pooling/openai_pooling_client.py similarity index 100% rename from examples/online_serving/pooling/openai_pooling_client.py rename to examples/pooling/pooling/openai_pooling_client.py diff --git a/examples/offline_inference/vision_language_pooling.py 
b/examples/pooling/pooling/vision_language_pooling.py similarity index 100% rename from examples/offline_inference/vision_language_pooling.py rename to examples/pooling/pooling/vision_language_pooling.py diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/pooling/score/cohere_rerank_client.py similarity index 100% rename from examples/online_serving/pooling/cohere_rerank_client.py rename to examples/pooling/score/cohere_rerank_client.py diff --git a/examples/offline_inference/pooling/convert_model_to_seq_cls.py b/examples/pooling/score/convert_model_to_seq_cls.py similarity index 100% rename from examples/offline_inference/pooling/convert_model_to_seq_cls.py rename to examples/pooling/score/convert_model_to_seq_cls.py diff --git a/examples/online_serving/pooling/jinaai_rerank_client.py b/examples/pooling/score/jinaai_rerank_client.py similarity index 100% rename from examples/online_serving/pooling/jinaai_rerank_client.py rename to examples/pooling/score/jinaai_rerank_client.py diff --git a/examples/online_serving/pooling/openai_cross_encoder_score.py b/examples/pooling/score/openai_cross_encoder_score.py similarity index 100% rename from examples/online_serving/pooling/openai_cross_encoder_score.py rename to examples/pooling/score/openai_cross_encoder_score.py diff --git a/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py b/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py similarity index 100% rename from examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py rename to examples/pooling/score/openai_cross_encoder_score_for_multimodal.py diff --git a/examples/offline_inference/pooling/qwen3_reranker.py b/examples/pooling/score/qwen3_reranker.py similarity index 100% rename from examples/offline_inference/pooling/qwen3_reranker.py rename to examples/pooling/score/qwen3_reranker.py diff --git a/examples/offline_inference/pooling/ner.py 
b/examples/pooling/token_classify/ner.py similarity index 100% rename from examples/offline_inference/pooling/ner.py rename to examples/pooling/token_classify/ner.py diff --git a/examples/online_serving/pooling/ner_client.py b/examples/pooling/token_classify/ner_client.py similarity index 100% rename from examples/online_serving/pooling/ner_client.py rename to examples/pooling/token_classify/ner_client.py diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/pooling/token_embed/multi_vector_retrieval.py similarity index 100% rename from examples/offline_inference/pooling/multi_vector_retrieval.py rename to examples/pooling/token_embed/multi_vector_retrieval.py diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/pooling/token_embed/multi_vector_retrieval_client.py similarity index 100% rename from examples/online_serving/pooling/multi_vector_retrieval_client.py rename to examples/pooling/token_embed/multi_vector_retrieval_client.py