From 8188196a1c8af26134d8e366ebe564c18fb95379 Mon Sep 17 00:00:00 2001 From: Kay Yan Date: Mon, 21 Jul 2025 11:13:02 +0800 Subject: [PATCH 01/16] [CI] Cleanup modelscope version constraint in Dockerfile (#21243) Signed-off-by: Kay Yan --- docker/Dockerfile | 2 +- docker/Dockerfile.xpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b06c4d33626df..d1fa92ce6d19d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -510,7 +510,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ else \ BITSANDBYTES_VERSION="0.46.1"; \ fi; \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] + uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 41b4c42e4c4b1..3130435ca7215 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer pytest 'modelscope!=1.15.0' + pip install accelerate hf_transfer pytest modelscope ENV VLLM_USAGE_SOURCE production-docker-image \ TRITON_XPU_PROFILE 1 From 92615d7fe80b68206f71b26b00583e6c530d4387 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 20 Jul 2025 21:58:07 -0700 Subject: [PATCH 02/16] [Docs] Add RFC Meeting to Issue Template (#21279) Signed-off-by: simon-mo --- .github/ISSUE_TEMPLATE/750-RFC.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index e447c077473f0..7ee57c42895ca 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ 
b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -46,7 +46,7 @@ body: - type: markdown attributes: value: > - Thanks for contributing 🎉! + Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit). - type: checkboxes id: askllm attributes: From 940af1f03a6d47415655ba32c0ba551b24161faa Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sun, 20 Jul 2025 22:29:18 -0700 Subject: [PATCH 03/16] Add the instruction to run e2e validation manually before release (#21023) Signed-off-by: Huy Do --- RELEASE.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 7f52707152128..9352e7ef706c6 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -52,3 +52,36 @@ After branch cut, we approach finalizing the release branch with clear criteria * Release branch specific changes (e.g. change version identifiers or CI fixes) Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes. + +## Manual validations + +### E2E Performance Validation + +Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI. 
+ +**Current Coverage:** +* Models: Llama3, Llama4, and Mixtral +* Hardware: NVIDIA H100 and AMD MI300x +* *Note: Coverage may change based on new model releases and hardware availability* + +**Performance Validation Process:** + +**Step 1: Get Access** +Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow. + +**Step 2: Review Benchmark Setup** +Familiarize yourself with the benchmark configurations: +* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda) +* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm) + +**Step 3: Run the Benchmark** +Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure: +* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`) +* **vLLM commit**: Set to the RC commit hash + +**Step 4: Review Results** +Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit. + +**Step 5: Performance Comparison** +Compare the current results against the previous release to verify no performance regressions have occurred. Here is an +example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms). 
From 378d33c3929aab549282ebaab193fe43918e591a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 21 Jul 2025 13:50:06 +0800 Subject: [PATCH 04/16] [Bugfix] Fix missing placeholder in logger debug (#21280) Signed-off-by: DarkLight1337 --- vllm/transformers_utils/configs/mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index e66f762eb8090..8a9c660b882fd 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -42,7 +42,7 @@ def adapt_config_dict(config_dict: dict[str, Any], config = PretrainedConfig.from_dict(config_dict) - logger.debug("Initialized config", config) + logger.debug("Initialized config %s", config) return config From 042af0c8d3f0b8b5319f34e4cb9b690981bb5da4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 21 Jul 2025 17:22:21 +0800 Subject: [PATCH 05/16] [Model][1/N] Support multiple poolers at model level (#21227) Signed-off-by: DarkLight1337 --- docs/models/pooling_models.md | 53 ++- tests/models/test_transformers.py | 2 +- .../my_gemma_embedding.py | 15 +- vllm/config.py | 8 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/model_executor/layers/pooler.py | 350 +++++++++--------- vllm/model_executor/models/adapters.py | 102 +++-- vllm/model_executor/models/bert.py | 132 +++++-- vllm/model_executor/models/gpt2.py | 16 +- vllm/model_executor/models/gritlm.py | 39 +- vllm/model_executor/models/internlm2.py | 12 +- vllm/model_executor/models/jamba.py | 29 +- vllm/model_executor/models/jina_vl.py | 18 +- vllm/model_executor/models/modernbert.py | 50 ++- vllm/model_executor/models/qwen2_rm.py | 35 +- vllm/model_executor/models/roberta.py | 44 ++- vllm/model_executor/pooling_metadata.py | 7 + vllm/v1/pool/metadata.py | 8 + vllm/v1/worker/gpu_model_runner.py | 16 +- vllm/v1/worker/tpu_model_runner.py | 7 +- vllm/worker/model_runner_base.py | 7 +- vllm/worker/pooling_model_runner.py 
| 10 +- 22 files changed, 549 insertions(+), 413 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f0de84a66f8b0..eef8f20e4e5c6 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -11,26 +11,51 @@ before returning them. As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -For pooling models, we support the following `--task` options. -The selected option sets the default pooler used to extract the final hidden states: +If the model doesn't implement this interface, you can set `--task` which tells vLLM +to convert the model into a pooling model. -| Task | Pooling Type | Normalization | Softmax | -|---------------------------------|----------------|-----------------|-----------| -| Embedding (`embed`) | `LAST` | ✅︎ | ❌ | -| Classification (`classify`) | `LAST` | ❌ | ✅︎ | -| Sentence Pair Scoring (`score`) | \* | \* | \* | +| `--task` | Model type | Supported pooling tasks | +|------------|----------------------|-------------------------------| +| `embed` | Embedding model | `encode`, `embed` | +| `classify` | Classification model | `encode`, `classify`, `score` | +| `reward` | Reward model | `encode` | -\*The default pooler is always defined by the model. +## Pooling Tasks -!!! note - If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. +In vLLM, we define the following pooling tasks and corresponding APIs: + +| Task | APIs | +|------------|--------------------| +| `encode` | `encode` | +| `embed` | `embed`, `score`\* | +| `classify` | `classify` | +| `score` | `score` | + +\*The `score` API falls back to `embed` task if the model does not support `score` task. 
+ +Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks]. + +By default, the pooler assigned to each task has the following attributes: + +| Task | Pooling Type | Normalization | Softmax | +|------------|----------------|---------------|---------| +| `encode` | `ALL` | ❌ | ❌ | +| `embed` | `LAST` | ✅︎ | ❌ | +| `classify` | `LAST` | ❌ | ✅︎ | + +These defaults may be overridden by the model's implementation in vLLM. When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). +we attempt to override the defaults based on its Sentence Transformers configuration file (`modules.json`), +which takes priority over the model's defaults. -!!! tip - You can customize the model's pooling method via the `--override-pooler-config` option, - which takes priority over both the model's and Sentence Transformers's defaults. +You can further customize this via the `--override-pooler-config` option, +which takes priority over both the model's and Sentence Transformers's defaults. + +!!! note + + The above configuration may be disregarded if the model's implementation in vLLM defines its own pooler + that is not based on [PoolerConfig][vllm.config.PoolerConfig]. 
## Offline Inference diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index b87290e96a27e..16b9bcffd2650 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -144,7 +144,7 @@ def test_quantization( "model", ["jason9693/Qwen2.5-1.5B-apeach"], ) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", ["float"]) def test_classify( hf_runner, vllm_runner, diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index 797353e4f7a8b..fc654f20fff22 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.models.gemma2 import Gemma2Model from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix from vllm.sequence import IntermediateTensors @@ -26,12 +26,13 @@ class MyGemma2Embedding(nn.Module): self.model = Gemma2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - self.pooler = Pooler.from_config_with_defaults( - vllm_config.model_config.pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False, - ) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), + }) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/config.py b/vllm/config.py index 44106dd279b6b..4cafbc9260525 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -94,7 +94,7 @@ ConfigT 
= TypeVar("ConfigT", bound=ConfigType) TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription", "draft"] -_ResolvedTask = Literal["generate", "transcription", "pooling", "embed", +_ResolvedTask = Literal["generate", "transcription", "encode", "embed", "classify", "reward", "draft"] RunnerOption = Literal["auto", "generate", "pooling", "draft"] @@ -103,7 +103,7 @@ RunnerType = Literal["generate", "pooling", "draft"] _RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = { "generate": ["generate", "transcription"], - "pooling": ["pooling", "embed", "classify", "reward"], + "pooling": ["encode", "embed", "classify", "reward"], "draft": [], } @@ -579,7 +579,7 @@ class ModelConfig: # user-selected task if runner_type == "pooling" and self.task == "auto": selected_task = all_supported_tasks[runner_type][-1] - assert selected_task != "pooling" + assert selected_task != "encode" self.task = selected_task self.supported_runner_types = supported_runner_types self.runner_type = runner_type @@ -884,7 +884,7 @@ class ModelConfig: supported_tasks = list[_ResolvedTask]() if registry.is_pooling_model(architectures): - supported_tasks.append("pooling") + supported_tasks.append("encode") # For now, users must specify the task (other than "pooling") # to use for pooling models diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3f0c1c85dee61..57240bb4f3330 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1668,7 +1668,7 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if "pooling" in model_config.supported_tasks else None + ) if "encode" in model_config.supported_tasks else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, diff --git a/vllm/model_executor/layers/pooler.py 
b/vllm/model_executor/layers/pooler.py index 6a474b8e73a35..c06cca080227e 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod +from collections.abc import Mapping, Set from dataclasses import dataclass from enum import IntEnum +from itertools import groupby from typing import Callable, Optional, TypeVar, Union import torch import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig -from typing_extensions import assert_never from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.pooling_metadata import ( # noqa: E501 @@ -21,6 +22,10 @@ from vllm.utils import resolve_obj_by_qualname from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata] +PoolingFn = Callable[ + [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata], + Union[torch.Tensor, list[torch.Tensor]]] +ClassifierFn = Callable[[torch.Tensor], torch.Tensor] class PoolingType(IntEnum): @@ -79,37 +84,81 @@ class Pooler(nn.Module, ABC): """The interface required for all poolers used in pooling models in vLLM.""" @staticmethod - def from_config_with_defaults( + def for_encode( pooler_config: PoolerConfig, - pooling_type: PoolingType, - normalize: bool, - softmax: bool, - step_tag_id: Optional[int] = None, - returned_token_ids: Optional[list[int]] = None, - ) -> "Pooler": + *, + default_pooling_type: PoolingType = PoolingType.ALL, + default_normalize: bool = False, + default_softmax: bool = False, + default_step_tag_id: Optional[int] = None, + default_returned_token_ids: Optional[list[int]] = None, + ): resolved_config = ResolvedPoolingConfig.from_config_with_defaults( pooler_config=pooler_config, - pooling_type=pooling_type, - normalize=normalize, - softmax=softmax, - 
step_tag_id=step_tag_id, - returned_token_ids=returned_token_ids, + pooling_type=default_pooling_type, + normalize=default_normalize, + softmax=default_softmax, + step_tag_id=default_step_tag_id, + returned_token_ids=default_returned_token_ids, ) - if pooling_type == PoolingType.STEP: + if resolved_config.pooling_type == PoolingType.STEP: return StepPooler.from_config(resolved_config) return SimplePooler.from_config(resolved_config) - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + @staticmethod + def for_embed( + pooler_config: PoolerConfig, + *, + default_pooling_type: PoolingType = PoolingType.LAST, + default_normalize: bool = True, + default_softmax: bool = False, + ): + resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + pooler_config=pooler_config, + pooling_type=default_pooling_type, + normalize=default_normalize, + softmax=default_softmax, + ) + + return SimplePooler.from_config(resolved_config) + + @staticmethod + def for_classify( + pooler_config: PoolerConfig, + classifier: Optional[ClassifierFn], + *, + default_pooling_type: PoolingType = PoolingType.LAST, + default_normalize: bool = False, + default_softmax: bool = True, + ): + resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + pooler_config=pooler_config, + pooling_type=default_pooling_type, + normalize=default_normalize, + softmax=default_softmax, + ) + base_pooler = SimplePooler.from_config(resolved_config) + if classifier is None: + return base_pooler + + return ClassifierPooler( + pooling=base_pooler.pooling, + classifier=classifier, + act_fn=base_pooler.head.activation, + ) + + @abstractmethod + def get_supported_tasks(self) -> Set[PoolingTask]: + """Determine which pooling tasks are supported.""" + raise NotImplementedError + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: """ - Construct the pooling parameters to use for a task, - or `None` if the task is not supported. 
+ Construct the updated pooling parameters to use for a supported task. """ - return None + return PoolingParamsUpdate() @abstractmethod def forward( @@ -127,9 +176,8 @@ def get_prompt_lens( if isinstance(pooling_metadata, V1PoolingMetadata): return pooling_metadata.prompt_lens - assert isinstance(hidden_states, torch.Tensor) return PoolingTensors.from_pooling_metadata( - pooling_metadata, hidden_states.device).prompt_lens + pooling_metadata, hidden_states[0].device).prompt_lens def get_prompt_token_ids( @@ -149,6 +197,21 @@ def get_prompt_token_ids( ] +def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]: + if isinstance(pooling_metadata, V0PoolingMetadata): + pooling_params = [p for _, p in pooling_metadata.seq_groups] + else: + pooling_params = pooling_metadata.pooling_params + + tasks: list[PoolingTask] = [ + task for pooling_param in pooling_params + if (task := pooling_param.task) is not None + ] + assert len(pooling_params) == len(tasks) + + return tasks + + def get_classification_activation_function(config: PretrainedConfig): return PoolerClassify() @@ -172,7 +235,8 @@ def get_cross_encoder_activation_function(config: PretrainedConfig): return PoolerScore() -def build_output(all_data: torch.Tensor) -> PoolerOutput: +def build_output( + all_data: Union[torch.Tensor, list[torch.Tensor]], ) -> PoolerOutput: all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data] return PoolerOutput(outputs=all_outputs) @@ -193,12 +257,12 @@ class PoolingMethod(nn.Module, ABC): raise NotImplementedError(f"Unsupported method: {pooling_type}") @abstractmethod - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: raise NotImplementedError + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate() + @abstractmethod def forward_one( self, @@ -237,16 +301,8 @@ class PoolingMethod(nn.Module, ABC): class 
CLSPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if (task == "encode" or task == "embed" or task == "classify" - or task == "score"): - return PoolingParamsUpdate() - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed", "classify", "score"} def forward_one( self, @@ -270,16 +326,8 @@ class CLSPool(PoolingMethod): class LastPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if (task == "encode" or task == "embed" or task == "classify" - or task == "score"): - return PoolingParamsUpdate() - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed", "classify", "score"} def forward_one( self, @@ -299,18 +347,8 @@ class LastPool(PoolingMethod): class AllPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - if task == "encode": - return PoolingParamsUpdate() - - # The equalities are split up to keep mypy happy - if task == "embed" or task == "classify" or task == "score": - return None - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode"} def forward_one( self, @@ -327,28 +365,13 @@ class AllPool(PoolingMethod): hidden_states: torch.Tensor, prompt_lens: torch.Tensor, ) -> Union[list[torch.Tensor], torch.Tensor]: - offset = 0 - pooled_data = list[torch.Tensor]() - - for prompt_len in prompt_lens: - pooled_data.append(hidden_states[offset:offset + prompt_len]) - offset += prompt_len - - return pooled_data + return list(hidden_states.split_with_sizes(prompt_lens.tolist())) class MeanPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy 
happy - if (task == "encode" or task == "embed" or task == "classify" - or task == "score"): - return PoolingParamsUpdate() - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed", "classify", "score"} def forward_one( self, @@ -529,24 +552,6 @@ class SimplePooler(Pooler): 3. Returns structured results as `PoolerOutput`. """ - @classmethod - def from_config_with_defaults( # type: ignore[override] - cls, - pooler_config: PoolerConfig, - pooling_type: PoolingType, - normalize: bool, - softmax: bool, - ) -> "SimplePooler": - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( - pooler_config=pooler_config, - pooling_type=pooling_type, - normalize=normalize, - softmax=softmax, - ) - assert resolved_config.pooling_type != PoolingType.STEP - - return cls.from_config(resolved_config) - @classmethod def from_config( cls, @@ -563,10 +568,10 @@ class SimplePooler(Pooler): self.pooling = pooling self.head = head - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) def forward( @@ -627,18 +632,11 @@ class StepPooler(Pooler): return pooled_data - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - if task == "encode": - return PoolingParamsUpdate(requires_token_ids=True) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode"} - # The equalities are split up to keep mypy happy - if task == "embed" or task == "classify" or task == "score": - return None - - assert_never(task) + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate(requires_token_ids=True) def forward( self, @@ -650,68 +648,43 @@ class StepPooler(Pooler): return build_output(pooled_data) 
-PoolingFn = Callable[ - [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata], - Union[torch.Tensor, list[torch.Tensor]]] -ClassifierFn = Callable[[torch.Tensor], torch.Tensor] - - -class ClassifierPooler(nn.Module): +class ClassifierPooler(Pooler): """A pooling layer for classification tasks. This layer does the following: 1. Applies a classification layer to the hidden states. 2. Optionally applies a pooler layer. - 3. Applies an activation function to the output. In the case of - classification models it is either sigmoid or softmax. In the - case of scoring models, the same behavior is configuration - dependent, as in the sentence-transformers library. + 3. Applies an activation function to the output. """ + @staticmethod + def act_fn_for_seq_cls(config: ModelConfig): + return get_classification_activation_function(config.hf_config) + + @staticmethod + def act_fn_for_cross_encoder(config: ModelConfig): + return get_cross_encoder_activation_function(config.hf_config) + def __init__( self, - config: ModelConfig, pooling: PoolingFn, classifier: ClassifierFn, - act_fn: Optional[PoolerActivation] = None, + act_fn: PoolerActivation, ) -> None: super().__init__() self.pooling = pooling self.classifier = classifier + self.act_fn = act_fn - self.classification_act_fn = get_classification_activation_function( - config.hf_config) if act_fn is None else act_fn - self.cross_encoder_act_fn = get_cross_encoder_activation_function( - config.hf_config) if act_fn is None else act_fn - - def _get_act_fn(self, task: PoolingTask): - if task == "encode" or task == "classify": - return self.classification_act_fn - if task == "score": - return self.cross_encoder_act_fn - - raise ValueError(f"Unsupported task: {task!r}") - - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if task == "encode" or task == "classify" or task == "score": - return PoolingParamsUpdate() - - if task == 
"embed": - return None - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"classify", "score"} def forward( self, hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> PoolerOutput: - """Pools sentence pair scores from the hidden_states.""" pooled_data = self.pooling(hidden_states, pooling_metadata) # apply classifier once on the full batch if possible @@ -722,28 +695,59 @@ class ClassifierPooler(nn.Module): else: pooled_output = [self.classifier(data) for data in pooled_data] - task_list: list[PoolingTask] - if isinstance(pooling_metadata, V0PoolingMetadata): - task_list = [ - task for _, pooling_param in pooling_metadata.seq_groups - if (task := pooling_param.task) is not None - ] - else: - task_list = [ - task for pooling_param in pooling_metadata.pooling_params - if (task := pooling_param.task) is not None - ] - - assert len(task_list) == len(pooled_output) - - # shape of scores: (batch_size, num_labels) - if len(set(task_list)) <= 1: - act_fn = self._get_act_fn(task_list[0]) - scores = act_fn(pooled_output) - else: - scores = torch.stack([ - self._get_act_fn(task)(vecs) - for task, vecs in zip(task_list, pooled_output) - ]) + scores = self.act_fn(pooled_output) return build_output(scores) + + +class DispatchPooler(Pooler): + """Dispatches calls to a sub-pooler based on the pooling task.""" + + def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None: + super().__init__() + + for task, pooler in poolers_by_task.items(): + if task not in pooler.get_supported_tasks(): + raise ValueError( + f"{pooler=} does not support {task=}. 
" + f"Supported tasks: {pooler.get_supported_tasks()}") + + self.poolers_by_task = poolers_by_task + + def get_supported_tasks(self) -> Set[PoolingTask]: + return set(self.poolers_by_task) + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return self.poolers_by_task[task].get_pooling_updates(task) + + def forward( + self, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + poolers_by_task = self.poolers_by_task + + if isinstance(hidden_states, list): + hidden_states_lst = hidden_states + else: + prompt_lens = get_prompt_lens(hidden_states, pooling_metadata) + hidden_states_lst = list(hidden_states.split(prompt_lens.tolist())) + + outputs = list[PoolingSequenceGroupOutput]() + offset = 0 + for task, group in groupby(get_tasks(pooling_metadata)): + if not (pooler := poolers_by_task.get(task)): + raise ValueError( + f"Unsupported task: {task} " + f"Supported tasks: {self.get_supported_tasks()}") + + num_items = len(list(group)) + group_output: PoolerOutput = pooler( + hidden_states_lst[offset:offset + num_items], + pooling_metadata[offset:offset + num_items], + ) + + outputs.extend(group_output.outputs) + offset += num_items + + return PoolerOutput(outputs) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 31b1d9a8b3c0d..867de2c68b4c5 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -13,7 +13,6 @@ from .interfaces_base import VllmModelForPooling, is_pooling_model if TYPE_CHECKING: from vllm.config import VllmConfig - from vllm.model_executor.layers.pooler import PoolingType _T = TypeVar("_T", bound=type[nn.Module]) @@ -34,16 +33,8 @@ def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str: return model_name + pooling_suffix -def _create_pooling_model_cls( - orig_cls: _T, - *, - default_pooling_type: "PoolingType", - default_normalize: bool, - 
default_softmax: bool, -) -> _T: +def _create_pooling_model_cls(orig_cls: _T) -> _T: # Lazy import - from vllm.model_executor.layers.pooler import Pooler - from .utils import AutoWeightsLoader, WeightsMapper class ModelForPooling(orig_cls, VllmModelForPooling): @@ -71,15 +62,7 @@ def _create_pooling_model_cls( self._init_pooler(vllm_config, prefix=prefix) def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): - pooler_config = vllm_config.model_config.pooler_config - assert pooler_config is not None - - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=default_pooling_type, - normalize=default_normalize, - softmax=default_softmax, - ) + raise NotImplementedError def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # TODO: Support uninitialized params tracking @@ -132,14 +115,20 @@ def as_embedding_model(cls: _T) -> _T: return cls # Lazy import - from vllm.model_executor.layers.pooler import PoolingType + from vllm.model_executor.layers.pooler import DispatchPooler, Pooler + + class ModelForEmbedding(_create_pooling_model_cls(cls)): + + def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler( + { + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), + }, ) - ModelForEmbedding = _create_pooling_model_cls( - cls, - default_pooling_type=PoolingType.LAST, - default_normalize=True, - default_softmax=False, - ) ModelForEmbedding.__name__ = \ _get_pooling_model_name(cls.__name__, "ForEmbedding") @@ -165,20 +154,14 @@ def as_seq_cls_model(cls: _T) -> _T: # Lazy import from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.pooler import (ClassifierPooler, - PoolingType, SimplePooler) + DispatchPooler, Pooler, + PoolingMethod, PoolingType) from vllm.model_executor.models.interfaces import 
SupportsCrossEncoding from vllm.sequence import IntermediateTensors from .utils import maybe_prefix - ModelForPooling = _create_pooling_model_cls( - cls, - default_pooling_type=PoolingType.LAST, - default_normalize=False, - default_softmax=True, - ) - - class ModelForSequenceClassification(ModelForPooling, + class ModelForSequenceClassification(_create_pooling_model_cls(cls), SupportsCrossEncoding): def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): @@ -198,19 +181,28 @@ def as_seq_cls_model(cls: _T) -> _T: pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - pooler = SimplePooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=True, - ) + pooling_type_str = pooler_config.pooling_type + pooling_type = (PoolingType.LAST if pooling_type_str is None else + PoolingType[pooling_type_str]) - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=pooler.pooling, - classifier=self._classifier, - act_fn=pooler.head.activation, - ) + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=PoolingMethod.from_pooling_type(pooling_type), + classifier=self._classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=PoolingMethod.from_pooling_type(pooling_type), + classifier=self._classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def _classifier(self, x: torch.Tensor): x, _ = self.score(x.float()) @@ -259,14 +251,16 @@ def as_reward_model(cls: _T) -> _T: return cls # Lazy import - from vllm.model_executor.layers.pooler import PoolingType + from vllm.model_executor.layers.pooler import DispatchPooler, Pooler - ModelForReward = _create_pooling_model_cls( - cls, - default_pooling_type=PoolingType.ALL, - default_normalize=False, - default_softmax=False, - ) + class 
ModelForReward(_create_pooling_model_cls(cls)): + + def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}, ) ModelForReward.__name__ = \ _get_pooling_model_name(cls.__name__, "ForReward") diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 006f547bb4617..9dc6115f850ec 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable +from collections.abc import Iterable, Set from typing import Optional, Union import torch @@ -17,7 +17,8 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler, +from vllm.model_executor.layers.pooler import (ClassifierPooler, + DispatchPooler, Pooler, PoolingMethod, PoolingParamsUpdate, PoolingType) @@ -92,20 +93,29 @@ class BertPooler(Pooler): self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) + def _head(self, pooled_output: torch.Tensor): + pooled_output = self.dense(pooled_output) + pooled_output = self.activation(pooled_output) + return pooled_output + def forward( self, hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[torch.Tensor, 
list[torch.Tensor]]: pooled_output = self.pooling(hidden_states, pooling_metadata) - pooled_output = self.dense(pooled_output) - pooled_output = self.activation(pooled_output) + + if isinstance(pooled_output, list): + pooled_output = [self._head(output) for output in pooled_output] + else: + pooled_output = self._head(pooled_output) + return pooled_output @@ -333,18 +343,19 @@ class BertModel(nn.Module, SupportsQuant): packed_modules_mapping = {"qkv_proj": ["query", "key", "value"]} - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = "", - embedding_class: type = BertEmbedding, - add_pooling_layer: bool = False): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + embedding_class: type[nn.Module] = BertEmbedding, + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config self.embeddings = embedding_class(config) self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder") - self.pooler = BertPooler(config) if add_pooling_layer else None def forward( self, @@ -366,8 +377,7 @@ class BertModel(nn.Module, SupportsQuant): token_type_ids=token_type_ids) return self.encoder(hidden_states) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -395,10 +405,43 @@ class BertModel(nn.Module, SupportsQuant): if name in params_dict: other_weights.append((name, loaded_weight)) - loader = AutoWeightsLoader( - self, - skip_prefixes=(["pooler."] if self.pooler is None else []), + return other_weights, loaded_stacked_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + other_weights, loaded_stacked_params = self._load_weights(weights) + + loader = AutoWeightsLoader(self, skip_prefixes=["pooler."]) + loaded_params = loader.load_weights(other_weights) + 
loaded_params.update(loaded_stacked_params) + return loaded_params + + +class BertPoolingModel(BertModel): + + is_pooling_model = True + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + embedding_class: type[nn.Module] = BertEmbedding, + ) -> None: + super().__init__( + vllm_config=vllm_config, + prefix=prefix, + embedding_class=embedding_class, ) + + config = vllm_config.model_config.hf_config + self.pooler = BertPooler(config) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + other_weights, loaded_stacked_params = self._load_weights(weights) + + loader = AutoWeightsLoader(self) loaded_params = loader.load_weights(other_weights) loaded_params.update(loaded_stacked_params) return loaded_params @@ -421,6 +464,8 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant): super().__init__() pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + self.model = self._build_model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.pooler = self._build_pooler(pooler_config) @@ -456,10 +501,15 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant): embedding_class=BertEmbedding) def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: - return Pooler.from_config_with_defaults(pooler_config, - pooling_type=PoolingType.CLS, - normalize=True, - softmax=False) + return DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "embed": + Pooler.for_embed( + pooler_config, + default_pooling_type=PoolingType.CLS, + ), + }) class BertForSequenceClassification(nn.Module, SupportsV0Only, @@ -481,16 +531,32 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only, config = vllm_config.model_config.hf_config self.num_labels = config.num_labels - self.bert = BertModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "bert"), - embedding_class=BertEmbedding, - add_pooling_layer=True) + self.bert = 
BertPoolingModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "bert"), + embedding_class=BertEmbedding) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=self.bert.pooler, - classifier=self.classifier, - ) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=self.bert.pooler, + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=self.bert.pooler, + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 82883bfa890de..98d76337395b9 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from ..layers.pooler import Pooler, PoolingType +from ..layers.pooler import DispatchPooler, Pooler from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -339,12 +339,16 @@ class GPT2ForSequenceClassification(nn.Module): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "gpt2")) self.score = nn.Linear(config.n_embd, config.num_labels, bias=False) + pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - 
normalize=False, - softmax=True) + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + Pooler.for_classify(pooler_config, classifier=None), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 8443482119b0c..8a3fbc6a49f04 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -1,17 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +from collections.abc import Set from typing import Optional, Union import numpy as np import torch import torch.nn as nn -from typing_extensions import assert_never from vllm.config import ModelConfig, VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import (Pooler, PoolerHead, - PoolerNormalize, +from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, + PoolerHead, PoolerNormalize, PoolingParamsUpdate, build_output, get_prompt_lens, get_prompt_token_ids) @@ -135,18 +134,11 @@ class GritLMMeanPool(nn.Module): return instruction_len - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if task == "encode" or task == "embed": - return PoolingParamsUpdate(requires_token_ids=True) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed"} - if task == "classify" or task == "score": - return None - - assert_never(task) + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate(requires_token_ids=True) def forward_one( self, @@ -207,10 +199,10 @@ class GritLMPooler(Pooler): self.pooling = GritLMMeanPool(model_config) self.head = PoolerHead(PoolerNormalize()) - def get_pooling_updates( - self, - task: 
PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) def forward( @@ -262,4 +254,11 @@ class GritLM(LlamaForCausalLM, SupportsV0Only): super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) - self.pooler = GritLMPooler(vllm_config.model_config) + pooler_config = vllm_config.model_config.pooler_config + if pooler_config is not None: + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "embed": + GritLMPooler(vllm_config.model_config), + }) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d9bbee0a2463c..d29779a35e5c9 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -429,12 +429,10 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM): ) pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False, - ) + assert pooler_config is not None + + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}, ) def forward( self, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 
e95f3491c6b6e..34281b2e99ee8 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -19,8 +19,8 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer -from vllm.model_executor.layers.pooler import (ClassifierPooler, PoolingType, - SimplePooler) +from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, + PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -584,16 +584,15 @@ class JambaForSequenceClassification(JambaForCausalLM): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - pooler = SimplePooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=False, - ) - - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=pooler.pooling, - classifier=self.score, - act_fn=pooler.head.activation, - ) + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + Pooler.for_classify( + pooler_config, + classifier=self.score, + default_pooling_type=PoolingType.LAST, + default_normalize=False, + default_softmax=False, + ), + }) diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py index 6b191b09b4bfd..0c4284f7daaac 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -12,7 +12,7 @@ from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import 
DispatchPooler, Pooler from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors @@ -96,11 +96,17 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration, self.score = JinaVLScorer(config) - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=True) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + Pooler.for_classify(pooler_config, classifier=None), + "score": + Pooler.for_classify(pooler_config, classifier=None), + }) @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 74986f9f57340..be1c3438d9db1 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable +from collections.abc import Iterable, Set from typing import Optional, Union import torch @@ -13,7 +13,8 @@ from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler, +from vllm.model_executor.layers.pooler import (ClassifierPooler, + DispatchPooler, Pooler, PoolingMethod, PoolingParamsUpdate, PoolingType) @@ -271,19 +272,27 @@ class ModernBertPooler(Pooler): eps=config.norm_eps, bias=config.norm_bias) - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def 
get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) + def _head(self, pooled_output: torch.Tensor): + return self.norm(self.act(self.dense(pooled_output))) + def forward( self, hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[torch.Tensor, list[torch.Tensor]]: pooled_output = self.pooling(hidden_states, pooling_metadata) - pooled_output = self.norm(self.act(self.dense(pooled_output))) + + if isinstance(pooled_output, list): + pooled_output = [self._head(output) for output in pooled_output] + else: + pooled_output = self._head(pooled_output) + return pooled_output @@ -299,11 +308,28 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, self.model = ModernBertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=ModernBertPooler(config), - classifier=self.classifier, - ) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=ModernBertPooler(config), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=ModernBertPooler(config), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 58f95d6eebfb4..f12e9a041a944 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -15,7 +15,8 @@ from torch import nn from vllm.config import VllmConfig from 
vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler +from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, + PoolingType) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -26,7 +27,7 @@ from .utils import AutoWeightsLoader, maybe_prefix class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): is_pooling_model = True - pooler: SimplePooler + pooler: Pooler packed_modules_mapping = { "qkv_proj": [ @@ -94,12 +95,12 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config.model_config.hf_config.num_labels = 1 super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + assert pooler_config is not None + + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}, ) class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): @@ -107,11 +108,17 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config.model_config.hf_config.num_labels = 2 super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.STEP, - normalize=False, - softmax=True, - step_tag_id=151651, - ) + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode( + pooler_config, + default_pooling_type=PoolingType.STEP, + default_normalize=False, + default_softmax=True, + default_step_tag_id=151651, + ) + }) diff --git a/vllm/model_executor/models/roberta.py 
b/vllm/model_executor/models/roberta.py index 7d3b56ced5c40..c6b4116440346 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -9,7 +9,8 @@ from torch import nn from transformers import RobertaConfig from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import ClassifierPooler, CLSPool +from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool, + DispatchPooler, Pooler) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel @@ -63,16 +64,10 @@ class RobertaEmbedding(nn.Module): # References: # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 - pos_list = [] - token_list = [] - offset = 0 - for seq_len in seq_lens: - pos_list.append(position_ids[offset:offset + seq_len]) - token_list.append(input_ids[offset:offset + seq_len]) - offset += seq_len - + seq_lens_list = seq_lens.tolist() new_pos_list = [] - for positions, tokens in zip(pos_list, token_list): + for positions, tokens in zip(position_ids.split(seq_lens_list), + input_ids.split(seq_lens_list)): # Verify assumption that incoming position are # always a sequence from 0 to N. 
expected_pos = torch.arange(positions.size()[0], @@ -184,15 +179,30 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, self.num_labels = config.num_labels self.roberta = BertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "bert"), - embedding_class=RobertaEmbedding, - add_pooling_layer=False) + embedding_class=RobertaEmbedding) self.classifier = RobertaClassificationHead(config) - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=CLSPool(), - classifier=self.classifier, - ) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=CLSPool(), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=CLSPool(), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index 4dd443bc26ea0..e6f1ca61dd291 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -38,6 +38,13 @@ class PoolingMetadata: f"seq_data={self.seq_data}, " f"prompt_lens={self.prompt_lens})") + def __getitem__(self, indices: slice): + return PoolingMetadata( + seq_groups=self.seq_groups[indices], + seq_data=dict(list(self.seq_data.items())[indices]), + prompt_lens=self.prompt_lens[indices], + ) + @dataclass class PoolingTensors: diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py index 5f321cd87c524..28af720d05fd1 100644 --- a/vllm/v1/pool/metadata.py +++ b/vllm/v1/pool/metadata.py @@ -15,3 +15,11 @@ class PoolingMetadata: prompt_lens: torch.Tensor prompt_token_ids: 
Optional[torch.Tensor] pooling_params: list[PoolingParams] + + def __getitem__(self, indices: slice): + return PoolingMetadata( + prompt_lens=self.prompt_lens[indices], + prompt_token_ids=None if self.prompt_token_ids is None else + self.prompt_token_ids[indices], + pooling_params=self.pooling_params[indices], + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 670e653929cea..cd66d8bcd6342 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import copy import gc import time from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Optional, Union, cast, get_args +from typing import TYPE_CHECKING, Any, Optional, Union, cast import numpy as np import torch @@ -415,15 +415,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): generator = None if pooling_params: - assert pooling_params.task is not None, ( + assert (task := pooling_params.task) is not None, ( "You did not set `task` in the API") model = cast(VllmModelForPooling, self.model) - to_update = (model.pooler.get_pooling_updates( - pooling_params.task)) - assert to_update is not None, ( - f"{pooling_params.task=} is not supported by the model") - + to_update = model.pooler.get_pooling_updates(task) to_update.apply(pooling_params) self.requests[req_id] = CachedRequestState( @@ -1122,10 +1118,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): if not is_pooling_model(model): return [] - return [ - task for task in get_args(PoolingTask) - if model.pooler.get_pooling_updates(task) - ] + return list(model.pooler.get_supported_tasks()) def apply_grammar_bitmask( self, @@ -2247,7 +2240,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): dummy_pooling_params = PoolingParams(task=dummy_task) to_update = model.pooler.get_pooling_updates(dummy_task) - assert to_update is not None to_update.apply(dummy_pooling_params) dummy_metadata = PoolingMetadata( diff --git a/vllm/v1/worker/tpu_model_runner.py 
b/vllm/v1/worker/tpu_model_runner.py index 7ed1cf41011ba..aad45b6abd128 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -3,7 +3,7 @@ import bisect import gc import time -from typing import TYPE_CHECKING, Any, Optional, cast, get_args +from typing import TYPE_CHECKING, Any, Optional, cast from unittest.mock import patch import numpy as np @@ -491,10 +491,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): if not is_pooling_model(model): return [] - return [ - task for task in get_args(PoolingTask) - if model.pooler.get_pooling_updates(task) - ] + return list(model.pooler.get_supported_tasks()) def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index b0737dfe31978..62f26ac57a98b 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -4,7 +4,7 @@ import dataclasses from abc import ABC, abstractmethod from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar, get_args) + TypeVar) import torch import torch.nn as nn @@ -230,10 +230,7 @@ class ModelRunnerBase(ABC, Generic[T]): if not is_pooling_model(model): return [] - return [ - task for task in get_args(PoolingTask) - if model.pooler.get_pooling_updates(task) - ] + return list(model.pooler.get_supported_tasks()) def execute_model( self, diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 2c3f4eb3ad4d4..d91b16be83d70 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -199,15 +199,11 @@ class PoolingModelRunner( pooling_params = seq_group_metadata.pooling_params assert pooling_params is not None - assert pooling_params.task is not None, ( + assert (task := pooling_params.task) is not None, ( "You did not set `task` in the API") - to_update = (cast(VllmModelForPooling, - self.model).pooler.get_pooling_updates( - pooling_params.task)) - assert 
to_update is not None, ( - f"{pooling_params.task=} is not supported by the model") - + model = cast(VllmModelForPooling, self.model) + to_update = model.pooler.get_pooling_updates(task) to_update.apply(pooling_params) seq_groups.append((seq_ids, pooling_params)) From be54a951a3bddedc98db3afdacc2382431a2e3d0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:23:57 +0100 Subject: [PATCH 06/16] [Docs] Fix hardcoded links in docs (#21287) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/design/v1/metrics.md | 5 ++--- docs/features/multimodal_inputs.md | 2 +- docs/features/quantization/bitblas.md | 2 +- docs/features/tool_calling.md | 2 +- docs/models/extensions/tensorizer.md | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/design/v1/metrics.md b/docs/design/v1/metrics.md index eec42d79d8206..e23308f2637cf 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../.. ### Grafana Dashboard -vLLM also provides [a reference example](https://docs.vllm.ai/en/stable/examples/online_serving/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. +vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. 
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: @@ -672,8 +672,7 @@ v0 has support for OpenTelemetry tracing: `--collect-detailed-traces` - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/) -- [User-facing - docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html) +- [User-facing docs](../../examples/online_serving/opentelemetry.md) - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) - [IBM product diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index f9df2c89c6007..e820ace4f8fe7 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -98,7 +98,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis Full example: -If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings: +If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings: ```python from vllm import LLM diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index ba014d28cde4a..6f53a448ee364 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic !!! note Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. 
- For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). + For details see [supported hardware](supported_hardware.md). Below are the steps to utilize BitBLAS with vLLM. diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 9b9d6e1360e9b..8d89dc4c8d8e4 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -95,7 +95,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha ## Required Function Calling -vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/usage/v1_guide.html#feature-model) for the V1 engine. +vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine. When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. 
diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 5aa647b199275..6ea61b080cda3 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -7,7 +7,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html). +the [vLLM example script](../../examples/others/tensorize_vllm_model.md). !!! note Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. From e6b90a2805e809022580f2c1f4928c64b5f531f1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:25:02 +0100 Subject: [PATCH 07/16] [Docs] Make tables more space efficient in `supported_models.md` (#21291) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0a2f69bd77111..33b297ef2d7da 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -314,6 +314,13 @@ See [this page](generative_models.md) for more information on how to use generat Specified using `--task generate`. + + | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | From d97841078b6e0dde8da36d5a2b8e8857a2c37944 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 21 Jul 2025 19:18:33 +0800 Subject: [PATCH 08/16] [Misc] unify variable for LLM instance (#20996) Signed-off-by: Andy Xie --- docs/configuration/model_resolution.md | 2 +- docs/features/lora.md | 4 +- docs/features/quantization/fp8.md | 10 ++- docs/features/quantization/int4.md | 3 +- docs/features/quantization/int8.md | 3 +- docs/models/pooling_models.md | 10 +-- examples/offline_inference/basic/classify.py | 4 +- examples/offline_inference/basic/embed.py | 4 +- examples/offline_inference/basic/score.py | 4 +- .../embed_jina_embeddings_v3.py | 4 +- .../offline_inference/embed_matryoshka_fy.py | 4 +- .../offline_inference/neuron_speculation.py | 12 +-- .../prithvi_geospatial_mae.py | 4 +- examples/offline_inference/qwen3_reranker.py | 8 +- .../test_basic_correctness.py | 4 +- tests/basic_correctness/test_preemption.py | 10 +-- tests/conftest.py | 32 ++++---- tests/core/test_num_computed_tokens_update.py | 2 +- tests/detokenizer/test_stop_reason.py | 2 +- tests/detokenizer/test_stop_strings.py | 42 +++++------ tests/lora/test_llama_tp.py | 20 ++--- tests/metrics/test_metrics.py | 14 ++-- .../test_model_load_with_params.py | 10 +-- .../models/language/generation/test_hybrid.py | 2 +- .../language/generation/test_mistral.py | 14 ++-- tests/models/language/pooling/mteb_utils.py | 18 ++--- tests/models/language/pooling/test_gritlm.py | 4 +- tests/models/language/pooling/test_jina.py | 4 +- .../pooling/test_nomic_max_model_len.py | 6 +- .../pooling/test_truncation_control.py | 6 +- .../multimodal/generation/test_pixtral.py | 5 +- .../multimodal/generation/test_whisper.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 2 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 2 +- tests/models/quantization/test_modelopt.py | 6 +- tests/models/quantization/test_nvfp4.py | 6 +- .../test_disable_sliding_window.py | 22 
+++--- tests/prefix_caching/test_prefix_caching.py | 6 +- tests/quantization/test_gptq_dynamic.py | 2 +- tests/quantization/test_quark.py | 4 +- .../test_register_quantization_config.py | 2 +- tests/samplers/test_ignore_eos.py | 2 +- tests/samplers/test_logits_processor.py | 10 +-- tests/samplers/test_logprobs.py | 4 +- tests/samplers/test_no_bad_words.py | 12 +-- tests/samplers/test_seeded_generate.py | 2 +- tests/tokenization/test_detokenize.py | 2 +- tests/v1/core/test_scheduler_e2e.py | 12 +-- tests/v1/engine/test_llm_engine.py | 14 ++-- tests/v1/sample/test_logprobs.py | 8 +- tests/v1/sample/test_sampling_params_e2e.py | 74 +++++++++---------- tests/v1/test_oracle.py | 6 +- 53 files changed, 237 insertions(+), 236 deletions(-) diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md index d98142a835c76..49576a8217d0a 100644 --- a/docs/configuration/model_resolution.md +++ b/docs/configuration/model_resolution.md @@ -14,7 +14,7 @@ For example: ```python from vllm import LLM -model = LLM( +llm = LLM( model="cerebras/Cerebras-GPT-1.3B", hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 ) diff --git a/docs/features/lora.md b/docs/features/lora.md index 6acfdcce44587..ea1b495138c1b 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au return tokenizer.apply_chat_template(chat, tokenize=False) - model = LLM( + llm = LLM( model=model_id, enable_lora=True, max_lora_rank=64, @@ -329,7 +329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au } - outputs = model.generate( + outputs = llm.generate( inputs, sampling_params=SamplingParams( temperature=0.2, diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index a6c0fd78e76b6..0661933acd61f 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -86,8 +86,9 @@ Load and 
run the model in `vllm`: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") -result = model.generate("Hello my name is") + +llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") +result = llm.generate("Hello my name is") print(result[0].outputs[0].text) ``` @@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei ```python from vllm import LLM -model = LLM("facebook/opt-125m", quantization="fp8") + +llm = LLM("facebook/opt-125m", quantization="fp8") # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB -result = model.generate("Hello, my name is") +result = llm.generate("Hello, my name is") print(result[0].outputs[0].text) ``` diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index f26de73c2f0fa..1df32a11ed9db 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 7e1cb3fee94a3..45fae58a64868 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index eef8f20e4e5c6..741ae2d79c1e5 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -174,11 +174,11 @@ You can change the output dimensions of embedding models that support 
Matryoshka ```python from vllm import LLM, PoolingParams -model = LLM(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) -outputs = model.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) +outputs = llm.embed(["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32)) print(outputs[0].outputs) ``` diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 219064e97429b..aaf0e83c9dee8 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -28,10 +28,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="classify" for classification models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate logits. The output is a list of ClassificationRequestOutputs. - outputs = model.classify(prompts) + outputs = llm.classify(prompts) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 1114033d5cea4..7ff9c7f5e0eb1 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -31,10 +31,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. 
print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 6a08de2d2c38c..d37527b0a131b 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -27,10 +27,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="score" for cross-encoder models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate scores. The output is a list of ScoringRequestOutputs. - outputs = model.score(text_1, texts_2) + outputs = llm.score(text_1, texts_2) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index e68128399ba21..7d78b8c63c634 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -30,11 +30,11 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. # Only text matching task is supported for now. See #16120 - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. print("\nGenerated Outputs:") diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 7f5d74d9a3ae0..50a645ba82702 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -30,10 +30,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. 
- outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32)) + outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32)) # Print the outputs. print("\nGenerated Outputs:") diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index 2ef69f29863d7..26276cba202b6 100644 --- a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -25,7 +25,7 @@ def config_buckets(): os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" -def initialize_model(): +def initialize_llm(): """Create an LLM with speculative decoding.""" return LLM( model="openlm-research/open_llama_7b", @@ -43,9 +43,9 @@ def initialize_model(): ) -def process_requests(model: LLM, sampling_params: SamplingParams): +def process_requests(llm: LLM, sampling_params: SamplingParams): """Generate texts from prompts and print them.""" - outputs = model.generate(prompts, sampling_params) + outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text @@ -53,12 +53,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams): def main(): - """Main function that sets up the model and processes prompts.""" + """Main function that sets up the llm and processes prompts.""" config_buckets() - model = initialize_model() + llm = initialize_llm() # Create a sampling params object. 
sampling_params = SamplingParams(max_tokens=100, top_k=1) - process_requests(model, sampling_params) + process_requests(llm, sampling_params) if __name__ == "__main__": diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 567c448a8c97b..6dc03e85baa99 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -140,7 +140,7 @@ datamodule_config = { class PrithviMAE: def __init__(self): print("Initializing PrithviMAE model") - self.model = LLM( + self.llm = LLM( model=os.path.join(os.path.dirname(__file__), "./model"), skip_tokenizer_init=True, dtype="float32", @@ -158,7 +158,7 @@ class PrithviMAE: prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} - outputs = self.model.encode(prompt, use_tqdm=False) + outputs = self.llm.encode(prompt, use_tqdm=False) print("################ Inference done (it took seconds) ##############") return outputs[0].outputs.data diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/qwen3_reranker.py index fe3cebc348f16..b0fd57237d472 100644 --- a/examples/offline_inference/qwen3_reranker.py +++ b/examples/offline_inference/qwen3_reranker.py @@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B" # Models converted offline using this method can not only be more efficient # and support the vllm score API, but also make the init parameters more # concise, for example. -# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") +# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") # If you want to load the official original version, the init parameters are # as follows. 
-def get_model() -> LLM: +def get_llm() -> LLM: """Initializes and returns the LLM model for Qwen3-Reranker.""" return LLM( model=model_name, @@ -77,8 +77,8 @@ def main() -> None: ] documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] - model = get_model() - outputs = model.score(queries, documents) + llm = get_llm() + outputs = llm.score(queries, documents) print("-" * 30) print([output.outputs.score for output in outputs]) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 2e103019f7af6..13ddf035a55e0 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None: monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model: - if isinstance(vllm_model.model.llm_engine, LLMEngineV1): + if isinstance(vllm_model.llm.llm_engine, LLMEngineV1): v1_test_failed_model_execution(vllm_model) def v1_test_failed_model_execution(vllm_model): - engine = vllm_model.model.llm_engine + engine = vllm_model.llm.llm_engine mocked_execute_model = Mock( side_effect=RuntimeError("Mocked Critical Error")) engine.engine_core.engine_core.model_executor.execute_model =\ diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 341a39a42b85e..db2fa2f6bef6f 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -81,7 +81,7 @@ def test_chunked_prefill_recompute( disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) for i in range(len(example_prompts)): 
@@ -118,10 +118,10 @@ def test_preemption( distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) total_preemption = ( - vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) + vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -174,12 +174,12 @@ def test_preemption_infeasible( ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) - req_outputs = vllm_model.model.generate( + req_outputs = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params, ) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) # Verify the request is ignored and not hang. 
diff --git a/tests/conftest.py b/tests/conftest.py index f3524d1fe2a67..a18dbf58c803d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -784,7 +784,7 @@ class VllmRunner: enforce_eager: Optional[bool] = False, **kwargs, ) -> None: - self.model = LLM( + self.llm = LLM( model=model_name, task=task, tokenizer=tokenizer_name, @@ -854,9 +854,9 @@ class VllmRunner: videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) outputs: list[tuple[list[list[int]], list[str]]] = [] for req_output in req_outputs: @@ -902,9 +902,9 @@ class VllmRunner: videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -924,8 +924,8 @@ class VllmRunner: ''' assert sampling_params.logprobs is not None - req_outputs = self.model.generate(encoder_decoder_prompts, - sampling_params=sampling_params) + req_outputs = self.llm.generate(encoder_decoder_prompts, + sampling_params=sampling_params) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params @@ -1018,7 +1018,7 @@ class VllmRunner: videos=videos, audios=audios) - outputs = self.model.beam_search( + outputs = self.llm.beam_search( inputs, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) returned_outputs = [] @@ -1029,7 +1029,7 @@ class VllmRunner: return returned_outputs def classify(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.classify(prompts) + req_outputs = self.llm.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] def embed(self, @@ -1044,11 +1044,11 @@ class VllmRunner: 
videos=videos, audios=audios) - req_outputs = self.model.embed(inputs, *args, **kwargs) + req_outputs = self.llm.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] def encode(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.encode(prompts) + req_outputs = self.llm.encode(prompts) return [req_output.outputs.data for req_output in req_outputs] def score( @@ -1058,18 +1058,18 @@ class VllmRunner: *args, **kwargs, ) -> list[float]: - req_outputs = self.model.score(text_1, text_2, *args, **kwargs) + req_outputs = self.llm.score(text_1, text_2, *args, **kwargs) return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - executor = self.model.llm_engine.model_executor + executor = self.llm.llm_engine.model_executor return executor.apply_model(func) def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - del self.model + del self.llm cleanup_dist_env_and_memory() diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 1b958e34df870..9e1b7913dfb99 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) - engine: LLMEngine = runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine # In multi-step + chunked-prefill there is no separate single prompt step. # What is scheduled will run for num_scheduler_steps always. 
diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py index 9716f7d72a585..1ff679789c959 100644 --- a/tests/detokenizer/test_stop_reason.py +++ b/tests/detokenizer/test_stop_reason.py @@ -28,7 +28,7 @@ def vllm_model(vllm_runner): def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) - llm = vllm_model.model + llm = vllm_model.llm # test stop token outputs = llm.generate(example_prompts, diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index efe938a20c4f4..cb87c44cc3999 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -101,42 +101,42 @@ def _stop_token_id(llm): def test_stop_strings(): # If V0, must set enforce_eager=False since we use # async output processing below. - vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) + llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) if envs.VLLM_USE_V1: - _stop_basic(vllm_model) + _stop_basic(llm) else: - _set_async_mode(vllm_model, True) - _stop_basic(vllm_model) + _set_async_mode(llm, True) + _stop_basic(llm) - _set_async_mode(vllm_model, False) - _stop_basic(vllm_model) + _set_async_mode(llm, False) + _stop_basic(llm) if envs.VLLM_USE_V1: - _stop_multi_tokens(vllm_model) + _stop_multi_tokens(llm) else: - _set_async_mode(vllm_model, True) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, True) + _stop_multi_tokens(llm) - _set_async_mode(vllm_model, False) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, False) + _stop_multi_tokens(llm) if envs.VLLM_USE_V1: - _stop_partial_token(vllm_model) + _stop_partial_token(llm) else: - _set_async_mode(vllm_model, True) - _stop_partial_token(vllm_model) + _set_async_mode(llm, True) + _stop_partial_token(llm) - _set_async_mode(vllm_model, False) - _stop_partial_token(vllm_model) + _set_async_mode(llm, False) + 
_stop_partial_token(llm) if envs.VLLM_USE_V1: # FIXME: this does not respect include_in_output=False - # _stop_token_id(vllm_model) + # _stop_token_id(llm) pass else: - _set_async_mode(vllm_model, True) - _stop_token_id(vllm_model) + _set_async_mode(llm, True) + _stop_token_id(llm) - _set_async_mode(vllm_model, False) - _stop_token_id(vllm_model) + _set_async_mode(llm, False) + _stop_token_id(llm) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index bebf44b6dfd7c..b1ad1fdd06064 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, model_uri = tmp_path / "vllm" / model_ref / suffix / model_name tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) - loaded_vllm_model = LLM(model=model_ref, - load_format="tensorizer", - enable_lora=True, - enforce_eager=True, - model_loader_extra_config=tensorizer_config, - max_num_seqs=13, - tensor_parallel_size=2, - max_loras=2) + loaded_llm = LLM(model=model_ref, + load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2) tc_as_dict = tensorizer_config.to_serializable() print("lora adapter created") - assert do_sample(loaded_vllm_model, + assert do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 1") - assert do_sample(loaded_vllm_model, + assert do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1) == EXPECTED_LORA_OUTPUT diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 54dbb747de09a..8cae8a80d38ed 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: - tokenizer = 
vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() prompt_token_counts = [ len(tokenizer.encode(p)) for p in example_prompts ] @@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens( vllm_prompt_token_count = sum(prompt_token_counts) _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_prompt_tokens.labels( **stat_logger.labels)._value.get() @@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens( disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step( disable_async_output_proc=disable_async_output_proc, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, disable_log_stats=False, gpu_memory_utilization=0.3, served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = 
vllm_model.llm.llm_engine.stat_loggers['prometheus'] metrics_tag_content = stat_logger.labels["model_name"] if envs.VLLM_CI_USE_S3: diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 1d2d9f9a65bb0..273747630215d 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.llm.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name def check_model(model): diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index e4294512338be..2238924c1b50f 100644 --- a/tests/models/language/generation/test_hybrid.py +++ 
b/tests/models/language/generation/test_hybrid.py @@ -274,7 +274,7 @@ def test_models_preemption_recompute( Tests that outputs are identical with and w/o preemptions (recompute). """ with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.model.llm_engine.scheduler[0] + scheduler = vllm_model.llm.llm_engine.scheduler[0] scheduler.ENABLE_ARTIFICIAL_PREEMPT = True preempt_vllm_outputs = vllm_model.generate_greedy( example_prompts, max_tokens) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index c70698ede37a5..81a88f2d485eb 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, load_format="mistral") as vllm_model: for prompt in SYMBOLIC_LANG_PROMPTS: msg = {"role": "user", "content": prompt} - outputs = vllm_model.model.chat([msg], - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat([msg], + sampling_params=SAMPLING_PARAMS) assert "�" not in outputs[0].outputs[0].text.strip() @@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: load_format="mistral") as vllm_model: msgs = copy.deepcopy(MSGS) - outputs = vllm_model.model.chat(msgs, - tools=TOOLS, - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat(msgs, + tools=TOOLS, + sampling_params=SAMPLING_PARAMS) - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() tool_parser = MistralToolParser(tokenizer) model_output = outputs[0].outputs[0].text.strip() @@ -308,7 +308,7 @@ def test_mistral_guided_decoding( f"Give an example JSON for an employee profile that " f"fits this schema: {SAMPLE_JSON_SCHEMA}" }] - outputs = vllm_model.model.chat(messages, sampling_params=params) + outputs = vllm_model.llm.chat(messages, sampling_params=params) generated_text = 
outputs[0].outputs[0].text json_response = json.loads(generated_text) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 6c4fde5fdfa94..97362f6416659 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder): def __init__(self, vllm_model): super().__init__() - self.model = vllm_model + self.llm = vllm_model self.rng = np.random.default_rng(seed=42) def encode( @@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder): # issues by randomizing the order. r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] - outputs = self.model.embed(sentences, use_tqdm=False) + outputs = self.llm.embed(sentences, use_tqdm=False) embeds = np.array(outputs) embeds = embeds[np.argsort(r)] return embeds @@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder): queries = [s[0] for s in sentences] corpus = [s[1] for s in sentences] - outputs = self.model.score(queries, - corpus, - truncate_prompt_tokens=-1, - use_tqdm=False) + outputs = self.llm.score(queries, + corpus, + truncate_prompt_tokens=-1, + use_tqdm=False) scores = np.array(outputs) scores = scores[np.argsort(r)] return scores @@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner, if model_info.architecture: assert (model_info.architecture - in vllm_model.model.llm_engine.model_config.architectures) + in vllm_model.llm.llm_engine.model_config.architectures) vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) - vllm_dtype = vllm_model.model.llm_engine.model_config.dtype + vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype with hf_runner(model_info.name, is_sentence_transformer=True, @@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner, max_num_seqs=8, **vllm_extra_kwargs) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config 
if model_info.architecture: assert (model_info.architecture in model_config.architectures) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 1274657991bfe..efa119bb76596 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner): task="embed", max_model_len=MAX_MODEL_LEN, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm d_rep = run_llm_encode( llm, @@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): task="generate", max_model_len=MAX_MODEL_LEN, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm sampling_params = SamplingParams(temperature=0.0, max_tokens=256) outputs = llm.generate(input, sampling_params=sampling_params) diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 9bfe7411e16b6..16c711407aeae 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -87,10 +87,10 @@ def test_matryoshka( task="embed", dtype=dtype, max_model_len=None) as vllm_model: - assert vllm_model.model.llm_engine.model_config.is_matryoshka + assert vllm_model.llm.llm_engine.model_config.is_matryoshka matryoshka_dimensions = ( - vllm_model.model.llm_engine.model_config.matryoshka_dimensions) + vllm_model.llm.llm_engine.model_config.matryoshka_dimensions) assert matryoshka_dimensions is not None if dimensions not in matryoshka_dimensions: diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 250b3a52835af..7413ef578e38c 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor) def 
test_default(model_info, vllm_runner): with vllm_runner(model_info.name, task="embed", max_model_len=None) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. @@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): # set max_model_len <= 512 with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 256 # set 512 < max_model_len <= 2048 @@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): else: with vllm_runner(model_info.name, task="embed", max_model_len=1024) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 1024 diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index 33aff1c873fc4..c7399e01c735b 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) 
prompt_tokens = vllm_output[0].prompt_token_ids @@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner, model_name, task="embed", max_model_len=max_model_len) as vllm_model: - llm_output = vllm_model.model.encode( + llm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) assert llm_output == f"""truncate_prompt_tokens value diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index 1def825ab0874..e157d6f4a79df 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -180,8 +180,7 @@ def test_chat( ) as vllm_model: outputs = [] for msg in MSGS: - output = vllm_model.model.chat(msg, - sampling_params=SAMPLING_PARAMS) + output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS) outputs.extend(output) @@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt, max_model_len=8192, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: - outputs = vllm_model.model.generate(prompt) + outputs = vllm_model.llm.generate(prompt) assert len(outputs) == 1, f"{len(outputs)=}" output: RequestOutput = outputs[0] diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 363d55153aac6..4a65e8c95204e 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -106,7 +106,7 @@ def run_test( tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm sampling_params = SamplingParams( temperature=0, diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 8c83d8f8a8a22..cf8962ce49750 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ 
b/tests/models/multimodal/generation/vlm_utils/core.py @@ -85,7 +85,7 @@ def run_test( enforce_eager=enforce_eager, task=task, **vllm_runner_kwargs_) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() vllm_kwargs: dict[str, Any] = {} if get_stop_token_ids is not None: diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py index f889eea5e8393..a6f5aeccf94e0 100644 --- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py +++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py @@ -96,7 +96,7 @@ def _run_test( dtype=dtype, enforce_eager=True, max_model_len=8192) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() texts = [ # this is necessary because vllm_model.embed will not apply any # templating to the prompt, and therefore lacks an image_pad diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py index 50c91f1f81ca2..712b6801de456 100644 --- a/tests/models/multimodal/pooling/test_jinavl_reranker.py +++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py @@ -56,7 +56,7 @@ def vllm_reranker( mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, ) as vllm_model: - outputs = vllm_model.model.score(query, documents) + outputs = vllm_model.llm.score(query, documents) return [output.outputs.score for output in outputs] diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py index 6ad526cc893f3..e23d4d9d211d8 100644 --- a/tests/models/quantization/test_modelopt.py +++ b/tests/models/quantization/test_modelopt.py @@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = { reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, 
max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index b95dad9a4effe..b3c217e729e4a 100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = { reason="modelopt_fp4 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. 
for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index f00a8f6998cbd..b940ab416e673 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -25,25 +25,25 @@ MODEL_LEN_LEN = [ @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) def test_disable_sliding_window(model_len_len, ): model, sliding_len, full_len = model_len_len - vllm_disabled_model = LLM(model, disable_sliding_window=True) - vllm_disabled_model.generate("Hi my name is") - model_config = vllm_disabled_model.llm_engine.model_config + disabled_llm = LLM(model, disable_sliding_window=True) + disabled_llm.generate("Hi my name is") + model_config = disabled_llm.llm_engine.model_config assert model_config.max_model_len == sliding_len, ( "Max len expected to equal sliding_len of %s, but got %s", sliding_len, model_config.max_model_len) - del vllm_disabled_model + del disabled_llm cleanup_dist_env_and_memory() - vllm_enabled_model = LLM(model, - enforce_eager=True, - disable_sliding_window=False, - enable_prefix_caching=False) - vllm_enabled_model.generate("Hi my name is") - model_config = vllm_enabled_model.llm_engine.model_config + enabled_llm = LLM(model, + enforce_eager=True, + disable_sliding_window=False, + enable_prefix_caching=False) + enabled_llm.generate("Hi my name is") + model_config = enabled_llm.llm_engine.model_config assert model_config.max_model_len == full_len, ( "Max len expected to equal full_len of %s, but got %s", full_len, model_config.max_model_len) - del vllm_enabled_model + del enabled_llm cleanup_dist_env_and_memory() diff --git a/tests/prefix_caching/test_prefix_caching.py 
b/tests/prefix_caching/test_prefix_caching.py index a65fc934b16ab..5bf6ed957c74e 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -93,8 +93,8 @@ def test_mixed_requests( # Run all the promopts greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - req_outputs = vllm_model.model.generate(example_prompts, - greedy_params) + req_outputs = vllm_model.llm.generate(example_prompts, + greedy_params) # Verify number of cached tokens for i in range(len(req_outputs)): @@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model): max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_batched_tokens, ) - engine: LLMEngine = runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore engine.scheduler[0] = scheduler diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 23b999e7c679b..aea50e99c1dd5 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.model.llm_engine.model_executor. + for name, submodule in (vllm_model.llm.llm_engine.model_executor. 
driver_worker.model_runner.model.named_modules()): if name == "lm_head": assert isinstance(submodule.quant_method, linear_method_cls) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 2db11cb997d19..4a0c8ba4d8a95 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -107,11 +107,11 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.model.llm_engine.model_executor. + quark_model = (quark_handle.llm.llm_engine.model_executor. driver_worker.model_runner.model) quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker. + fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. model_runner.model) fp8_state_dict = fp8_model.state_dict() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 6c541fdbeeae2..84705e92c85bb 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -111,7 +111,7 @@ def test_custom_quant(vllm_runner, model, monkeypatch): quantization="custom_quant", enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] qkv_proj = layer.self_attn.qkv_proj diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 7eb9c0b5fb8c8..ea4a17dd2306f 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -36,7 +36,7 @@ def test_ignore_eos( ignore_eos=True) for prompt in example_prompts: - ignore_eos_output = vllm_model.model.generate( + ignore_eos_output = vllm_model.llm.generate( prompt, 
sampling_params=sampling_params) output_length = len(ignore_eos_output[0].outputs[0].token_ids) assert output_length == max_tokens diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 901c875912643..123f9595e97b9 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -26,7 +26,7 @@ def test_logits_processor_force_generate( dtype: str, ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() repeat_times = 2 enforced_answers = " vLLM" vllm_token_ids = tokenizer.encode(enforced_answers, @@ -45,13 +45,13 @@ def test_logits_processor_force_generate( ) # test logits_processors when prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[0], params=params_with_logprobs, ) # test prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[1], params=SamplingParams( prompt_logprobs=3, @@ -60,11 +60,11 @@ def test_logits_processor_force_generate( ) # test grouped requests - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[2], params=SamplingParams(max_tokens=max_tokens), ) - outputs = vllm_model.model._run_engine(use_tqdm=False) + outputs = vllm_model.llm._run_engine(use_tqdm=False) assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 86c8a03eee10f..87f40b1005312 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -64,7 +64,7 @@ def test_get_prompt_logprobs( prompt_logprobs=num_top_logprobs, temperature=0.0, detokenize=detokenize) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) # Test whether logprobs are included in the results. 
@@ -174,7 +174,7 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, logprobs=None, temperature=0.0, detokenize=detokenize) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none) for i in range(len(results_logprobs_none)): diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 42b529ae169de..11803b8d7a5eb 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -20,7 +20,7 @@ def v1(run_with_both_engines): def _generate( - model: LLM, + llm: LLM, prompt: str, num_prompt_tokens: int, temperature: float = 0, @@ -32,7 +32,7 @@ def _generate( ) # [([output_token_ids, ], [output_text, ]), ] - output = model.generate([prompt], sampling_params=sampling_params) + output = llm.generate([prompt], sampling_params=sampling_params) output_token_ids = output[0][0][0][num_prompt_tokens:] # [0] first (and only) request output @@ -66,10 +66,10 @@ class TestOneTokenBadWord: assert self.target_token_id not in output_token_ids def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, @@ -156,10 +156,10 @@ class TestTwoTokenBadWord: or (self.neighbour_token_id2 in output_token_ids)) def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index b339b4b2ddf3d..5a0efd98acc16 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -49,7 +49,7 @@ def test_random_sample_with_seed( sampling_params_seed_2 = 
copy.deepcopy(sampling_params) sampling_params_seed_2.seed = 200 - llm = vllm_model.model + llm = vllm_model.llm for prompt in example_prompts: for params in ( diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index f8aeba8301b12..ccafc88461275 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -393,7 +393,7 @@ def test_decode_prompt_logprobs_chunked_prefill( logprobs=5, prompt_logprobs=5, temperature=0.0) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) for idx, result in enumerate(vllm_results): diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 85415f6ad4b69..bd0320baef871 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -14,7 +14,7 @@ PROMPT = "Hello my name is Robert and I" @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: return LLM(MODEL, enforce_eager=True, enable_prefix_caching=True, @@ -24,16 +24,16 @@ def model() -> LLM: block_size=16) -def test_concurrent_partial_prefill(model): - outputs = model.generate([PROMPT] * 3) +def test_concurrent_partial_prefill(llm): + outputs = llm.generate([PROMPT] * 3) assert len(outputs) == 3 for output in outputs: assert len(output.outputs) == 1 -def test_prefix_cache_stats_is_recorded(model): +def test_prefix_cache_stats_is_recorded(llm): # 17 tokens will make sure first 16 tokens are cached in a block input_tokens = {"prompt_token_ids": [101] * 17} - _ = model.generate([input_tokens]) - outputs = model.generate([input_tokens]) + _ = llm.generate([input_tokens]) + outputs = llm.generate([input_tokens]) assert outputs[0].num_cached_tokens == 16 diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 059106c62a204..f37686317fd14 100644 --- a/tests/v1/engine/test_llm_engine.py +++ 
b/tests/v1/engine/test_llm_engine.py @@ -112,9 +112,9 @@ def test_compatibility_with_skip_tokenizer_init( example_prompts, structured_outputs=True, ) - model: LLM = vllm_model_skip_tokenizer_init.model + llm: LLM = vllm_model_skip_tokenizer_init.llm with pytest.raises(ValueError): - _ = model.generate(example_prompts, sampling_params_list) + _ = llm.generate(example_prompts, sampling_params_list) def test_parallel_sampling(vllm_model, example_prompts) -> None: @@ -125,8 +125,8 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: example_prompt: test fixture providing prompts for testing. """ sampling_params_list, n_list = _get_test_sampling_params(example_prompts) - model: LLM = vllm_model.model - outputs = model.generate(example_prompts, sampling_params_list) + llm: LLM = vllm_model.llm + outputs = llm.generate(example_prompts, sampling_params_list) # Validate each request response for out, n in zip(outputs, n_list): @@ -166,10 +166,10 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): speculative_config=speculative_config, disable_log_stats=False, ) as vllm_model: - model: LLM = vllm_model.model + llm: LLM = vllm_model.llm sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs = model.generate(example_prompts, sampling_params) + outputs = llm.generate(example_prompts, sampling_params) n_prompts = len(example_prompts) assert len(outputs) == n_prompts @@ -180,7 +180,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): total_tokens += len(out.outputs[0].token_ids) assert total_tokens == max_tokens * n_prompts - metrics = model.get_metrics() + metrics = llm.get_metrics() def find_metric(name) -> list[Metric]: found = [] diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 69180e6e5db49..4f1f340a4ccbb 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -112,7 +112,7 @@ def _run_and_validate( max_tokens: int, do_apc: 
bool, ) -> None: - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( test_prompts, sampling_params=vllm_sampling_params) for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( @@ -288,7 +288,7 @@ def test_get_logprobs_and_prompt_logprobs( """ with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT): # Skip some test-cases to save time. @@ -378,7 +378,7 @@ def test_none_logprobs(vllm_model, example_prompts, prompt_logprobs=None, temperature=0.0, ) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none, ) @@ -408,7 +408,7 @@ def test_zero_logprobs(vllm_model, example_prompts, logprobs=0, prompt_logprobs=0, temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( + results_logprobs_zero = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_zero) for i in range(len(results_logprobs_zero)): diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index ac0f3eb58836f..f53e1e1c485d6 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -14,30 +14,30 @@ PROMPT = "Hello my name is Robert and I" @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: # Disable prefix caching so that we can test prompt logprobs. 
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949 # is merged return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False) -def test_n_gt_1(model): +def test_n_gt_1(llm): """ParallelSampling is supported.""" params = SamplingParams(n=3) - outputs = model.generate(PROMPT, params) + outputs = llm.generate(PROMPT, params) assert len(outputs[0].outputs) == 3 -def test_best_of(model): +def test_best_of(llm): """Raise a ValueError since best_of is deprecated.""" params = SamplingParams(n=2, best_of=3) with pytest.raises(ValueError): - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_penalties(model): +def test_penalties(llm): """Check that we do not get errors if applied.""" params = SamplingParams( @@ -49,18 +49,18 @@ def test_penalties(model): top_p=0.5, top_k=3, ) - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_stop(model): +def test_stop(llm): """Check that we respect the stop words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() STOP_IDX = 5 params = SamplingParams(temperature=0, stop=split_text[STOP_IDX]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should not contain the stop word. @@ -69,40 +69,40 @@ def test_stop(model): params = SamplingParams(temperature=0, stop=split_text[STOP_IDX], include_stop_str_in_output=True) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should contain the stop word. 
assert len(new_split_text) == STOP_IDX + 1 -def test_stop_token_ids(model): +def test_stop_token_ids(llm): """Check that we respect the stop token ids.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) stop_token_id_0 = output[0].outputs[0].token_ids[5] stop_token_id_1 = output[0].outputs[0].token_ids[6] stop_token_ids = [stop_token_id_1, stop_token_id_0] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 stop_token_ids = [stop_token_id_0, stop_token_id_1] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 -def test_detokenize_false(model): +def test_detokenize_false(llm): """Check that detokenize=False option works.""" - output = model.generate(PROMPT, SamplingParams(detokenize=False)) + output = llm.generate(PROMPT, SamplingParams(detokenize=False)) assert len(output[0].outputs[0].token_ids) > 0 assert len(output[0].outputs[0].text) == 0 - output = model.generate( + output = llm.generate( PROMPT, SamplingParams(detokenize=False, logprobs=3, prompt_logprobs=3)) assert len(output[0].outputs[0].token_ids) > 0 @@ -118,28 +118,28 @@ def test_detokenize_false(model): assert all(lp.decoded_token is None for lp in logprobs.values()) -def test_bad_words(model): +def test_bad_words(llm): """Check that we respect bad words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() bad_words_1 = " ".join(split_text[:2]) params = SamplingParams(temperature=0, bad_words=[bad_words_1]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, 
params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text bad_words_2 = new_text.split()[-1] params = SamplingParams(temperature=0, bad_words=[bad_words_1, bad_words_2]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text assert bad_words_2 not in new_text -def test_logits_processor(model): +def test_logits_processor(llm): """Check that we reject logits processor.""" # This sample logits processor gives infinite score to the i-th token, @@ -150,47 +150,45 @@ def test_logits_processor(model): return logits with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(logits_processors=[pick_ith])) + _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith])) -def test_allowed_token_ids(model): +def test_allowed_token_ids(llm): """Check that we can use allowed_token_ids.""" TOKEN_ID = 10 allowed_token_ids = [TOKEN_ID] - output = model.generate( - PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids)) + output = llm.generate(PROMPT, + SamplingParams(allowed_token_ids=allowed_token_ids)) assert output[0].outputs[0].token_ids[-1] == TOKEN_ID # Reject empty allowed_token_ids. with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[])) # Reject negative token id. with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) # Reject out of vocabulary. 
with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(allowed_token_ids=[10000000])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) -def test_priority(model): +def test_priority(llm): """Check that we reject requests with priority.""" # Reject all allowed token ids with pytest.raises(ValueError): - _ = model.generate(PROMPT, priority=[1]) + _ = llm.generate(PROMPT, priority=[1]) -def test_seed(model): +def test_seed(llm): """Check that seed impacts randomness.""" - out_1 = model.generate(PROMPT, SamplingParams(seed=42)) - out_2 = model.generate(PROMPT, SamplingParams(seed=42)) - out_3 = model.generate(PROMPT, SamplingParams(seed=43)) + out_1 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_2 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_3 = llm.generate(PROMPT, SamplingParams(seed=43)) assert out_1[0].outputs[0].text == out_2[0].outputs[0].text assert out_1[0].outputs[0].text != out_3[0].outputs[0].text diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 39515d710e81e..b4d4348c7fd9b 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -106,9 +106,9 @@ def test_v1_llm_by_default(monkeypatch): m.delenv("VLLM_USE_V1") # Should default to V1 for supported config. 
- model = LLM(MODEL, enforce_eager=True, enable_lora=True) - print(model.generate("Hello my name is")) - assert hasattr(model.llm_engine, "engine_core") + llm = LLM(MODEL, enforce_eager=True, enable_lora=True) + print(llm.generate("Hello my name is")) + assert hasattr(llm.llm_engine, "engine_core") m.delenv("VLLM_USE_V1") From 6b46c4b653d1d730a9b75d32b59b9d60f879b9d7 Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Mon, 21 Jul 2025 07:02:58 -0700 Subject: [PATCH 09/16] Add Nvidia ModelOpt config adaptation (#19815) Signed-off-by: Zhiyu Cheng --- tests/quantization/test_modelopt.py | 91 ++++++++ vllm/config.py | 20 +- .../layers/quantization/modelopt.py | 208 +++++++++++++++--- 3 files changed, 287 insertions(+), 32 deletions(-) create mode 100644 tests/quantization/test_modelopt.py diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py new file mode 100644 index 0000000000000..fcbfa681d75c9 --- /dev/null +++ b/tests/quantization/test_modelopt.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Test ModelOpt quantization method setup and weight loading. + +Run `pytest tests/quantization/test_modelopt.py`. +""" + +import os + +import pytest +import torch + +from tests.quantization.utils import is_quant_method_supported +from vllm.platforms import current_platform + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This module relies on V0 internals, so set VLLM_USE_V1=0. 
+ """ + if not current_platform.is_cpu(): + monkeypatch.setenv('VLLM_USE_V1', '0') + + +@pytest.mark.skipif(not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.") +def test_modelopt_fp8_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8 checkpoint loading and structure validation.""" + # TODO: provide a small publically available test checkpoint + model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/" + "TinyLlama-1.1B-Chat-v1.0-fp8-0710") + + # Skip test if checkpoint doesn't exist + if not os.path.exists(model_path): + pytest.skip(f"Test checkpoint not found at {model_path}. " + "This test requires a local ModelOpt FP8 checkpoint.") + + with vllm_runner(model_path, quantization="modelopt", + enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + # Check that ModelOpt quantization method is properly applied + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8LinearMethod) + assert isinstance(qkv_proj.quant_method, ModelOptFp8LinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8LinearMethod) + assert isinstance(gate_up_proj.quant_method, + ModelOptFp8LinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8LinearMethod) + + # Check weight dtype is FP8 + assert qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Check scales are present and have correct dtype + assert hasattr(qkv_proj, 'weight_scale') + assert hasattr(qkv_proj, 'input_scale') + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.input_scale.dtype == torch.float32 + + assert hasattr(o_proj, 'weight_scale') + assert 
hasattr(o_proj, 'input_scale') + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.input_scale.dtype == torch.float32 + + assert hasattr(gate_up_proj, 'weight_scale') + assert hasattr(gate_up_proj, 'input_scale') + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.input_scale.dtype == torch.float32 + + assert hasattr(down_proj, 'weight_scale') + assert hasattr(down_proj, 'input_scale') + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.input_scale.dtype == torch.float32 + + llm.apply_model(check_model) + + # Run a simple generation test to ensure the model works + output = llm.generate_greedy(["Hello my name is"], max_tokens=20) + assert output + print(f"ModelOpt FP8 output: {output}") diff --git a/vllm/config.py b/vllm/config.py index 4cafbc9260525..3e6aa2a93e6a7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -346,11 +346,11 @@ class ModelConfig: """Maximum number of data items per modality per prompt. Only applicable for multimodal models.""" interleave_mm_strings: bool = False - """Enable fully interleaved support for multimodal prompts, while using + """Enable fully interleaved support for multimodal prompts, while using --chat-template-content-format=string. Defaults to False.""" media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set + """Additional args passed to process media inputs, keyed by modalities. 
+ For example, to set num_frames for video, set `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ use_async_output_proc: bool = True """Whether to use async output processor.""" @@ -1000,9 +1000,13 @@ class ModelConfig: quant_cfg = self._parse_quant_hf_config() if quant_cfg is not None: + # Use the community standard 'quant_method' quant_method = quant_cfg.get("quant_method", "").lower() + + # Normalize library names quant_method = quant_method.replace("compressed_tensors", "compressed-tensors") + quant_cfg["quant_method"] = quant_method # Quantization methods which are overrides (i.e. they have a @@ -1017,6 +1021,8 @@ class ModelConfig: "awq_marlin", "ipex", "moe_wna16", + "modelopt", + "modelopt_fp4", ] quantization_methods = [ q for q in supported_quantization if q not in overrides @@ -3185,8 +3191,8 @@ class MultiModalConfig: """ media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ mm_processor_kwargs: Optional[dict[str, object]] = None @@ -4086,7 +4092,7 @@ class CompilationConfig: - True: inductor compilation is used (custom_ops disabled by default). One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - + This setting is ignored if level` can be used to directly specify the compilation level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`). - Currently, -O and -O= are supported as well but this will likely be + Currently, -O and -O= are supported as well but this will likely be removed in favor of clearer -O syntax in the future. NOTE: level 0 is the default level without any optimization. 
level 1 and 2 diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 20def70d19768..460334d77f0a8 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -75,20 +75,64 @@ class ModelOptFp8Config(QuantizationConfig): def get_config_filenames(cls) -> list[str]: return ["hf_quant_config.json"] + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: + """Detect if this ModelOpt config should be used based on + quantization config.""" + + if hf_quant_cfg is None: + return None + + # Use the community standard 'quant_method' + quant_method = hf_quant_cfg.get("quant_method", "").lower() + + # Only proceed if the method is explicitly "modelopt" + if quant_method != "modelopt": + return None + + # Look for ModelOpt-specific config structure + if "quantization" in hf_quant_cfg: + quant_config = hf_quant_cfg["quantization"] + if isinstance(quant_config, dict): + quant_algo = quant_config.get("quant_algo", "") + if "FP8" in quant_algo: + return "modelopt" + else: + # Check for compressed-tensors style config with specific quant_algo + quant_algo = hf_quant_cfg.get("quant_algo", "") + if isinstance(quant_algo, str) and "FP8" in quant_algo: + return "modelopt" + + return None + @classmethod def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": - quant_config = cls.get_from_keys(config, ["quantization"]) - quant_method = quant_config["quant_algo"] - kv_cache_quant_method = cls.get_from_keys( - config, ["quantization"]).get("kv_cache_quant_algo") - exclude_modules = cls.get_from_keys( - config, ["quantization"]).get("exclude_modules") + # Handle both ModelOpt format and compressed-tensors style format + if "quantization" in config: + # ModelOpt format: {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not 
isinstance(quant_config, dict): + raise ValueError( + "Expected 'quantization' to be a dictionary in config") + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") + exclude_modules = quant_config.get("exclude_modules") + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo", "") + kv_cache_quant_method = config.get("kv_cache_quant_algo") + exclude_modules = config.get("exclude_modules") if quant_method not in QUANT_ALGOS: - raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" - " quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration.") + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, @@ -434,7 +478,7 @@ class ModelOptNvFp4Config(QuantizationConfig): def __init__( self, is_checkpoint_nvfp4_serialized: bool, - kv_cache_quant_algo: str, + kv_cache_quant_algo: Optional[str], exclude_modules: list[str], group_size: int = 16, ) -> None: @@ -465,24 +509,138 @@ class ModelOptNvFp4Config(QuantizationConfig): def get_config_filenames(cls) -> list[str]: return ["hf_quant_config.json"] + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: + """Detect if this ModelOpt FP4 config should be used based on + quantization config.""" + if hf_quant_cfg is None: + return None + + # Use the community standard 'quant_method' + quant_method = hf_quant_cfg.get("quant_method", "").lower() + + # Only proceed if the method is explicitly "modelopt" + if 
quant_method != "modelopt": + return None + + # Look for ModelOpt-specific config structure + if "quantization" in hf_quant_cfg: + quant_config = hf_quant_cfg["quantization"] + if isinstance(quant_config, dict): + quant_algo = quant_config.get("quant_algo", "") + if "NVFP4" in quant_algo: + return "modelopt_fp4" + else: + # Check for compressed-tensors style config with specific + # quant_algo field + quant_algo = hf_quant_cfg.get("quant_algo", "") + if isinstance(quant_algo, str) and "FP4" in quant_algo.upper(): + return "modelopt_fp4" + + return None + @classmethod def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": - quant_config = cls.get_from_keys(config, ["quantization"]) - quant_method = quant_config["quant_algo"] + # Handle both traditional ModelOpt format and compressed-tensors + # style format + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError( + "Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError(f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + 
except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got " + f"{type(group_size_raw)}") from None + + exclude_modules = quant_config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got " + f"{type(exclude_modules)}") + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo", "") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError(f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got " + f"{type(group_size_raw)}") from None + + exclude_modules = config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got " + f"{type(exclude_modules)}") + if quant_method not in QUANT_ALGOS: - raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" - " quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration.") + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. 
Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration.") is_checkpoint_nvfp4_serialized = ("NVFP4" in quant_method) - if ("group_size" and "kv_cache_quant_algo" - and "exclude_modules") not in quant_config: - raise ValueError("NVFP4 quantization requires group size and " - "kv_cache_quant_algo specified in " - "hf_quant_config.json") - kv_cache_quant_algo = quant_config["kv_cache_quant_algo"] - group_size = quant_config["group_size"] - exclude_modules = quant_config["exclude_modules"] + + # For FP4, these fields are required + if is_checkpoint_nvfp4_serialized and "quantization" in config: + # Check if required fields are present in the quantization config + quant_config = config["quantization"] + required_fields = [ + "group_size", "kv_cache_quant_algo", "exclude_modules" + ] + missing_fields = [ + field for field in required_fields if field not in quant_config + ] + if missing_fields: + raise ValueError( + f"NVFP4 quantization requires the following fields in " + f"hf_quant_config.json: {missing_fields}") + return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) From 6dda13c86ba17ca6bc054293d135bad2d1ab7129 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 21 Jul 2025 08:37:49 -0700 Subject: [PATCH 10/16] [Misc] Add sliding window to flashinfer test (#21282) Signed-off-by: Woosuk Kwon --- tests/kernels/attention/test_flashinfer.py | 49 ++++++++++++++-------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 3ad6e1d32911b..8f9b4eceaa72b 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -77,6 +77,7 @@ def ref_paged_attn( @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@pytest.mark.parametrize("sliding_window", [None, 64]) 
@torch.inference_mode def test_flashinfer_decode_with_paged_kv( kv_lens: list[int], @@ -85,6 +86,7 @@ def test_flashinfer_decode_with_paged_kv( dtype: torch.dtype, block_size: int, soft_cap: Optional[float], + sliding_window: Optional[int], ) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) @@ -136,17 +138,20 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.plan(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - q_data_type=dtype, - kv_data_type=dtype, - logits_soft_cap=soft_cap) + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + window_left=sliding_window - 1 if sliding_window is not None else -1, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, + ) output = wrapper.run(query, key_value_cache) @@ -157,7 +162,8 @@ def test_flashinfer_decode_with_paged_kv( kv_lens=kv_lens, block_tables=block_tables, scale=scale, - soft_cap=soft_cap) + soft_cap=soft_cap, + sliding_window=sliding_window) torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -168,12 +174,17 @@ def test_flashinfer_decode_with_paged_kv( @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@pytest.mark.parametrize("sliding_window", [None, 64]) @torch.inference_mode -def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], - num_heads: tuple[int, int], - head_size: int, dtype: torch.dtype, - block_size: int, - soft_cap: Optional[float]) -> None: +def test_flashinfer_prefill_with_paged_kv( + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], + head_size: int, + dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float], + sliding_window: 
Optional[int], +) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) num_seqs = len(seq_lens) @@ -242,6 +253,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], num_kv_heads, head_size, block_size, + window_left=sliding_window - 1 if sliding_window is not None else -1, q_data_type=dtype, kv_data_type=dtype, logits_soft_cap=soft_cap, @@ -259,7 +271,8 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], kv_lens=kv_lens, block_tables=block_tables, scale=scale, - soft_cap=soft_cap) + soft_cap=soft_cap, + sliding_window=sliding_window) torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" From a15a50fc17f9918d2cc457e5ef50310b38c28f5f Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Tue, 22 Jul 2025 00:07:08 +0800 Subject: [PATCH 11/16] [CPU] Enable shared-memory based pipeline parallel for CPU backend (#21289) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 18 ++--- csrc/cpu/shm.cpp | 69 +++++++++++++------ docs/getting_started/installation/cpu.md | 14 ++++ .../device_communicators/cpu_communicator.py | 60 +++++++++++++++- vllm/distributed/parallel_state.py | 12 ++++ vllm/engine/arg_utils.py | 9 +-- vllm/envs.py | 7 +- vllm/platforms/cpu.py | 35 ++++------ 8 files changed, 165 insertions(+), 59 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index e3d47a0e6c16b..90cc9c8446223 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -6,6 +6,7 @@ set -ex # allow to bind to different cores CORE_RANGE=${CORE_RANGE:-48-95} +# used for TP/PP E2E test OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} @@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg 
VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -78,17 +79,16 @@ function cpu_tests() { # tests/quantization/test_ipex_quant.py" # online serving - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c ' set -e - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve 
meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + python3 benchmarks/benchmark_serving.py \ --backend vllm \ --dataset-name random \ - --model facebook/opt-125m \ + --model meta-llama/Llama-3.2-3B-Instruct \ --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" + --endpoint /v1/completions' # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp index 9adb6f27ec411..7e64e1c521980 100644 --- a/csrc/cpu/shm.cpp +++ b/csrc/cpu/shm.cpp @@ -7,7 +7,7 @@ namespace { #define MAX_SHM_RANK_NUM 8 -#define PER_THREAD_SHM_BUFFER_BYTES (2 * 1024 * 1024) +#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0); #define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1) #define MIN_THREAD_PROCESS_SIZE (256) @@ -34,9 +34,10 @@ struct KernelVecType { }; struct ThreadSHMContext { - volatile char _curr_thread_stamp; - volatile char _ready_thread_stamp; - char _padding1[6]; + volatile char _curr_thread_stamp[2]; + volatile char _ready_thread_stamp[2]; + int local_stamp_buffer_idx; + int remote_stamp_buffer_idx; int thread_id; int thread_num; int rank; @@ -45,23 +46,28 @@ struct ThreadSHMContext { int swizzled_ranks[MAX_SHM_RANK_NUM]; void* thread_shm_ptrs[MAX_SHM_RANK_NUM]; ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM]; - size_t _thread_buffer_mask; - char _padding2[56]; + size_t _thread_buffer_mask[2]; + char _padding2[40]; ThreadSHMContext(const int thread_id, const int thread_num, const int rank, const int group_size, void* thread_shm_ptr) - : _curr_thread_stamp(1), - _ready_thread_stamp(0), + : local_stamp_buffer_idx(0), + remote_stamp_buffer_idx(0), thread_id(thread_id), thread_num(thread_num), rank(rank), group_size(group_size), - _spinning_count(0), - _thread_buffer_mask(0) { + _spinning_count(0) { static_assert(sizeof(ThreadSHMContext) % 64 
== 0); TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); TORCH_CHECK((size_t)this % 64 == 0); TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); + _curr_thread_stamp[0] = 1; + _curr_thread_stamp[1] = 1; + _ready_thread_stamp[0] = 0; + _ready_thread_stamp[1] = 0; + _thread_buffer_mask[0] = 0; + _thread_buffer_mask[1] = 0; for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { shm_contexts[i] = nullptr; thread_shm_ptrs[i] = nullptr; @@ -70,6 +76,11 @@ struct ThreadSHMContext { set_context(rank, this, thread_shm_ptr); } + void set_stamp_buffer_idx(int local, int remote) { + local_stamp_buffer_idx = local; + remote_stamp_buffer_idx = remote; + } + void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) { TORCH_CHECK(rank < MAX_SHM_RANK_NUM); TORCH_CHECK(ptr); @@ -84,23 +95,27 @@ struct ThreadSHMContext { T* get_thread_shm_ptr(int rank) { return reinterpret_cast( reinterpret_cast(thread_shm_ptrs[rank]) + - (PER_THREAD_SHM_BUFFER_OFFSET & _thread_buffer_mask)); + (PER_THREAD_SHM_BUFFER_OFFSET & + _thread_buffer_mask[local_stamp_buffer_idx])); } - void next_buffer() { _thread_buffer_mask ^= 0xFFFFFFFFFFFFFFFF; } + void next_buffer() { + _thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF; + } - char get_curr_stamp() const { return _curr_thread_stamp; } + char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; } - char get_ready_stamp() const { return _ready_thread_stamp; } + char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; } void next_stamp() { _mm_mfence(); - _curr_thread_stamp += 1; + _curr_thread_stamp[local_stamp_buffer_idx] += 1; } void commit_ready_stamp() { _mm_mfence(); - _ready_thread_stamp = _curr_thread_stamp; + _ready_thread_stamp[local_stamp_buffer_idx] = + _curr_thread_stamp[local_stamp_buffer_idx]; } int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } @@ -117,10 +132,11 @@ struct ThreadSHMContext { void wait_for_one(int rank, Cond&& cond) { ThreadSHMContext* rank_ctx = shm_contexts[rank]; for (;;) { - 
char local_curr_stamp = get_curr_stamp(); - char local_ready_stamp = get_ready_stamp(); - char rank_curr_stamp = rank_ctx->get_curr_stamp(); - char rank_ready_stamp = rank_ctx->get_ready_stamp(); + char local_curr_stamp = get_curr_stamp(local_stamp_buffer_idx); + char local_ready_stamp = get_ready_stamp(local_stamp_buffer_idx); + char rank_curr_stamp = rank_ctx->get_curr_stamp(remote_stamp_buffer_idx); + char rank_ready_stamp = + rank_ctx->get_ready_stamp(remote_stamp_buffer_idx); if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp, rank_ready_stamp)) { break; @@ -361,6 +377,15 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) { } } } + +void reset_threads_stamp_buffer_idx(ThreadSHMContext* ctx, int local, + int remote) { + int thread_num = ctx->thread_num; + for (int i = 0; i < thread_num; ++i) { + ThreadSHMContext* thread_ctx = ctx + i; + thread_ctx->set_stamp_buffer_idx(local, remote); + } +} }; // namespace shm_cc_ops namespace shm_cc_ops { @@ -632,6 +657,7 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst, TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta(); metadata->bind_tensor_list(tensor_list_with_metadata); + shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 0, 1); shm_cc_ops::shm_cc_loop( ctx, metadata->total_bytes, [&](ThreadSHMContext* thread_ctx, int64_t data_offset, @@ -659,6 +685,7 @@ std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, torch::Tensor metadata_tensor = torch::empty({sizeof(TensorListMeta)}, options); + shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 1, 0); ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready); shm_cc_ops::memcpy(metadata_tensor.data_ptr(), ctx->get_thread_shm_ptr(src), @@ -677,7 +704,7 @@ std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, ctx, metadata.total_bytes, [&](ThreadSHMContext* thread_ctx, int64_t data_offset, int64_t data_elem_num, bool fast_mode) { - ctx->wait_for_one(src, 
ThreadSHMContext::check_stamp_ready); + thread_ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready); int64_t curr_shm_offset = 0; while (curr_shm_offset < data_elem_num) { MemPiece frag = metadata.get_data(data_offset + curr_shm_offset); diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index d77e7383650cf..5721195172dc1 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -166,6 +166,20 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe - This value is 4GB by default. Larger space can support more concurrent requests, longer context length. However, users should take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, the TP worker will be killed with `exitcode 9` due to out-of-memory. +### How to do performance tuning for vLLM CPU? + + - First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`. + + - Inference batch size is an important parameter for performance. A larger batch usually provides higher throughput, while a smaller batch provides lower latency. Tuning the max batch size, starting from the default value, to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM: + - `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impact on the first token performance. The default value is set as: + - Offline Inference: `4096 * world_size` + - Online Serving: `2048 * world_size` + - `--max-num-seqs`, defines the limit of sequence numbers in a single batch, has more impact on the output token performance. The default value is set as:
+ - Offline Inference: `256 * world_size` + - Online Serving: `128 * world_size` + + - vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use TP and PP together if there are enough CPU sockets and memory nodes. + ### Which quantization configs does vLLM CPU support? - vLLM CPU supports quantizations: diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py index 94effa0b2ca88..bda567f8489c5 100644 --- a/vllm/distributed/device_communicators/cpu_communicator.py +++ b/vllm/distributed/device_communicators/cpu_communicator.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from typing import Optional +from typing import Any, Optional, Union import torch from torch.distributed import ProcessGroup +from vllm.distributed.utils import pickle from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum @@ -26,7 +27,8 @@ class CpuCommunicator(DeviceCommunicatorBase): if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) and hasattr( torch.ops._C, - "init_shm_manager") and unique_name.startswith("tp"): + "init_shm_manager") and (unique_name.startswith("tp") + or unique_name.startswith("pp")): self.dist_module = _CPUSHMDistributed(self) def all_reduce(self, input_): @@ -94,6 +96,19 @@ class CpuCommunicator(DeviceCommunicatorBase): input_size[dim + 1:]) return output_tensor + def send_tensor_dict( + self, + tensor_dict: dict[str, Union[torch.Tensor, Any]], + dst: int, + ) -> None: + return self.dist_module.send_tensor_dict(tensor_dict, dst) + + def recv_tensor_dict( + self, + src: int, + ) -> dict[str, Union[torch.Tensor, Any]]: + return self.dist_module.recv_tensor_dict(src) + + class
_CPUSHMDistributed: @@ -143,3 +158,44 @@ class _CPUSHMDistributed: input: torch.Tensor, group: Optional[ProcessGroup] = None) -> None: torch.ops._C.shm_all_gather(self.handle, input, output) + + def send_tensor_dict( + self, + tensor_dict: dict[str, Union[torch.Tensor, Any]], + dst: int, + ) -> None: + key_list = list(tensor_dict.keys()) + value_list = list(tensor_dict.values()) + size_list = [] + for v in value_list: + if not isinstance(v, torch.Tensor): + raise RuntimeError( + "CpuCommunicator only supports sending tensors.") + size_list.append(v.size()) + key_size_tensor = torch.frombuffer(pickle.dumps([key_list, size_list]), + dtype=torch.uint8) + value_list.append(key_size_tensor) + + torch.ops._C.shm_send_tensor_list(self.handle, value_list, dst) + + return None + + def recv_tensor_dict( + self, + src: int, + ) -> dict[str, Union[torch.Tensor, Any]]: + tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src) + + value_list: list[torch.Tensor] = tensor_list[:-1] + key_size_tensor = tensor_list[-1] + + key_size = pickle.loads(key_size_tensor.numpy().tobytes()) + key_list = key_size[0] + size_list = key_size[1] + assert len(key_list) == len(size_list) + assert len(key_list) == len(value_list) + + tensor_dict: dict[str, torch.Tensor] = {} + for key, size, t in zip(key_list, size_list, value_list): + tensor_dict[key] = t.view(size) + return tensor_dict diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 1bb0ca79cc1da..1f7a14920c418 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -272,6 +272,9 @@ class GroupCoordinator: self.use_custom_op_call = (current_platform.is_cuda_alike() or current_platform.is_tpu()) + self.use_cpu_custom_send_recv = (current_platform.is_cpu() and hasattr( + torch.ops._C, "init_shm_manager")) + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -663,6 +666,11 @@ class GroupCoordinator: dst = 
(self.rank_in_group + 1) % self.world_size assert dst < self.world_size, f"Invalid dst rank ({dst})" + if self.use_cpu_custom_send_recv: + self.device_communicator.send_tensor_dict( # type: ignore + tensor_dict, dst) + return None + metadata_list: list[tuple[Any, Any]] = [] assert isinstance( tensor_dict, @@ -718,6 +726,10 @@ class GroupCoordinator: src = (self.rank_in_group - 1) % self.world_size assert src < self.world_size, f"Invalid src rank ({src})" + if self.use_cpu_custom_send_recv: + return self.device_communicator.recv_tensor_dict( # type: ignore + src) + recv_metadata_list = self.recv_object(src=src) tensor_dict: dict[str, Any] = {} for key, value in recv_metadata_list: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 019ff033eda2d..28b1c1c363a76 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1639,13 +1639,14 @@ class EngineArgs: # cpu specific default values. if current_platform.is_cpu(): + world_size = self.pipeline_parallel_size * self.tensor_parallel_size default_max_num_batched_tokens = { - UsageContext.LLM_CLASS: 4096, - UsageContext.OPENAI_API_SERVER: 2048, + UsageContext.LLM_CLASS: 4096 * world_size, + UsageContext.OPENAI_API_SERVER: 2048 * world_size, } default_max_num_seqs = { - UsageContext.LLM_CLASS: 128, - UsageContext.OPENAI_API_SERVER: 32, + UsageContext.LLM_CLASS: 256 * world_size, + UsageContext.OPENAI_API_SERVER: 128 * world_size, } use_context_value = usage_context.value if usage_context else None diff --git a/vllm/envs.py b/vllm/envs.py index c5f97de807a72..16f635b3ac41d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -42,7 +42,7 @@ if TYPE_CHECKING: VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None - VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None VLLM_CPU_MOE_PREPACK: bool = True 
@@ -430,9 +430,10 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), # (CPU backend only) CPU key-value cache space. - # default is 4 GiB + # default is None and will be set as 4 GB "VLLM_CPU_KVCACHE_SPACE": - lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), + lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")) + if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None, # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 70c339c9bc980..31a67183ff12c 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -104,8 +104,19 @@ class CpuPlatform(Platform): @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: - import psutil - return psutil.virtual_memory().total + import vllm.envs as envs + from vllm.utils import GiB_bytes + + kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE + if kv_cache_space is None: + kv_cache_space = 4 * GiB_bytes # type: ignore + logger.warning_once( + "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) " + "for CPU backend is not set, using 4 by default.") + else: + kv_cache_space *= GiB_bytes + + return kv_cache_space @classmethod def set_device(cls, device: torch.device) -> None: @@ -124,8 +135,6 @@ class CpuPlatform(Platform): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - import vllm.envs as envs - from vllm.utils import GiB_bytes model_config = vllm_config.model_config if model_config is not None: @@ -162,20 +171,8 @@ class CpuPlatform(Platform): " support fp16 for now, cast to bf16.") model_config.dtype = torch.bfloat16 - kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE - - if kv_cache_space >= 0: - if kv_cache_space == 0: - cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore - logger.warning( - "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) " - 
"for CPU backend is not set, using 4 by default.") - else: - cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa - else: - raise RuntimeError( - "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" - f" {kv_cache_space}, expect a positive integer value.") + cache_config.cpu_kvcache_space_bytes = \ + CpuPlatform.get_device_total_memory() parallel_config = vllm_config.parallel_config if (parallel_config.world_size > 1 @@ -216,8 +213,6 @@ class CpuPlatform(Platform): False, "nan_asserts": False, - "memory_planning": - True, "epilogue_fusion": True, }) From a0e827e07c3c6a22283b4de2e0072c09f62162fc Mon Sep 17 00:00:00 2001 From: simpx Date: Tue, 22 Jul 2025 00:07:36 +0800 Subject: [PATCH 12/16] [BugFix] make utils.current_stream thread-safety (#21252) (#21253) Signed-off-by: simpx --- tests/test_utils.py | 44 +++++++++++++++++++++++++++++++++++++++--- vllm/utils/__init__.py | 15 +++++++------- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 28acacd251903..53a34642e5baf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -23,9 +23,9 @@ from vllm.transformers_utils.detokenizer_utils import ( from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot, PlaceholderModule, StoreBoolean, bind_kv_cache, common_broadcastable_dtype, - deprecate_kwargs, get_open_port, get_tcp_uri, - is_lossless_cast, join_host_port, make_zmq_path, - make_zmq_socket, memory_profiling, + current_stream, deprecate_kwargs, get_open_port, + get_tcp_uri, is_lossless_cast, join_host_port, + make_zmq_path, make_zmq_socket, memory_profiling, merge_async_iterators, sha256, split_host_port, split_zmq_path, supports_kw, swap_dict_values) @@ -957,3 +957,41 @@ def test_convert_ids_list_to_tokens(): ] tokens = convert_ids_list_to_tokens(tokenizer, token_ids) assert tokens == ['Hello', ',', ' world', '!'] + + +def test_current_stream_multithread(): + import threading + if not 
torch.cuda.is_available(): + pytest.skip("CUDA not available") + + main_default_stream = torch.cuda.current_stream() + child_stream = torch.cuda.Stream() + + thread_stream_ready = threading.Event() + thread_can_exit = threading.Event() + + def child_thread_func(): + with torch.cuda.stream(child_stream): + thread_stream_ready.set() + thread_can_exit.wait(timeout=10) + + child_thread = threading.Thread(target=child_thread_func) + child_thread.start() + + try: + assert thread_stream_ready.wait( + timeout=5), "Child thread failed to enter stream context in time" + + main_current_stream = current_stream() + + assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread" + assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream" + + # Notify child thread it can exit + thread_can_exit.set() + + finally: + # Ensure child thread exits properly + child_thread.join(timeout=5) + if child_thread.is_alive(): + pytest.fail("Child thread failed to exit properly") diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index bbcc2a523dcb2..e4f495e22e291 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1383,12 +1383,11 @@ def find_nccl_library() -> str: prev_set_stream = torch.cuda.set_stream -_current_stream = None +_current_stream_tls = threading.local() def _patched_set_stream(stream: torch.cuda.Stream) -> None: - global _current_stream - _current_stream = stream + _current_stream_tls.value = stream prev_set_stream(stream) @@ -1407,16 +1406,16 @@ def current_stream() -> torch.cuda.Stream: from C/C++ code. """ from vllm.platforms import current_platform - global _current_stream - if _current_stream is None: + if not hasattr(_current_stream_tls, + "value") or _current_stream_tls.value is None: # when this function is called before any stream is set, # we return the default stream. 
# On ROCm using the default 0 stream in combination with RCCL # is hurting performance. Therefore creating a dedicated stream # per process - _current_stream = torch.cuda.Stream() if current_platform.is_rocm( - ) else torch.cuda.current_stream() - return _current_stream + _current_stream_tls.value = torch.cuda.Stream( + ) if current_platform.is_rocm() else torch.cuda.current_stream() + return _current_stream_tls.value def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None: From 6ece16c4fe8c6f8f49b66c95cd3dd06b1c75de35 Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Mon, 21 Jul 2025 09:08:09 -0700 Subject: [PATCH 13/16] [Misc] Add dummy maverick test (#21199) Signed-off-by: Ming Yang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../multimodal/generation/test_maverick.py | 649 ++++++++++++++++++ 1 file changed, 649 insertions(+) create mode 100644 tests/models/multimodal/generation/test_maverick.py diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py new file mode 100644 index 0000000000000..083dc66148e2e --- /dev/null +++ b/tests/models/multimodal/generation/test_maverick.py @@ -0,0 +1,649 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Create a reduced-layer version of the Maverick model for testing purposes. + +This script creates a new model with fewer layers by: +1. Loading the original Maverick model configuration +2. Creating a reduced configuration +3. Generating compatible safetensors files with appropriate weights +4. 
Creating the necessary index files for vLLM compatibility +""" + +import json +import shutil +from pathlib import Path +from typing import Any + +import pytest +import torch +from safetensors.torch import save_file +from transformers import (AutoConfig, AutoProcessor, AutoTokenizer, + GenerationConfig) + +from vllm import LLM, SamplingParams + +# Sample prompts for testing +PROMPTS: list[str] = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +def run_maverick_serving(model: str): + """Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent + options with reduced layers. + """ + + try: + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model=model, + max_model_len=2048, + enforce_eager=True, + tensor_parallel_size=8, + enable_expert_parallel=True, + trust_remote_code=True, + gpu_memory_utilization=0.4, + kv_cache_dtype="fp8", + ) + + outputs = llm.generate(PROMPTS, sampling_params) + + # Print the outputs + print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + except Exception as e: + print(f"Error initializing or running model: {e}") + raise + + +def create_reduced_maverick_model( + original_model_name: + str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + output_dir: str = "/tmp/reduced_maverick", + text_layers: int = 4, + num_experts: int = 4, + vision_layers: int = 2, + force_recreate: bool = False, +) -> str: + """ + Create a reduced-layer version of the Maverick model. 
+ + Args: + original_model_name: Name of the original Maverick model + output_dir: Directory to save the reduced model + text_layers: Number of text transformer layers + num_experts: Number of experts per layer + vision_layers: Number of vision transformer layers + force_recreate: Whether to recreate if output_dir already exists + + Returns: + Path to the created reduced model directory + """ + + print( + f"Creating reduced Maverick model with {text_layers} text layers and " + f"{vision_layers} vision layers...") + + # Create output directory + output_path = Path(output_dir) + if output_path.exists(): + if force_recreate: + shutil.rmtree(output_path) + else: + print(f"Output directory {output_dir} already exists. " + "Use --force-recreate to overwrite.") + return str(output_path) + + output_path.mkdir(parents=True, exist_ok=True) + + try: + print("Loading original model configuration...") + original_config = AutoConfig.from_pretrained(original_model_name, + trust_remote_code=True) + + print("Creating reduced configuration...") + reduced_config = create_reduced_config(original_config, text_layers, + num_experts, vision_layers) + + config_path = output_path / "config.json" + with open(config_path, "w") as f: + json.dump(reduced_config, f, indent=2) + print(f"Saved reduced config to {config_path}") + + print("Copying tokenizer files...") + copy_tokenizer_files(original_model_name, output_path) + + print("Creating reduced safetensors files...") + create_reduced_safetensors(original_config, reduced_config, + output_path) + + print("Creating preprocessor config...") + create_preprocessor_config(original_config, output_path) + + try: + gen_config = GenerationConfig.from_pretrained(original_model_name) + gen_config.save_pretrained(output_path) + print("Copied generation config") + except Exception as e: + print(f"Could not copy generation config: {e}") + + print(f"Successfully created reduced Maverick model at {output_path}") + return str(output_path) + + except Exception 
as e: + print(f"Error creating reduced model: {e}") + # Clean up on failure + if output_path.exists(): + shutil.rmtree(output_path) + raise + + +def create_reduced_config(original_config: Any, text_layers: int, + num_experts: int, + vision_layers: int) -> dict[str, Any]: + """Create a reduced configuration based on the original.""" + + # Convert config to dictionary + config_dict = original_config.to_dict() + + # Reduce text layers + if "text_config" in config_dict: + original_text_layers = config_dict["text_config"]["num_hidden_layers"] + config_dict["text_config"]["num_hidden_layers"] = text_layers + print( + f"Reduced text layers from {original_text_layers} to {text_layers}" + ) + + original_num_experts = config_dict["text_config"]["num_local_experts"] + config_dict["text_config"]["num_local_experts"] = num_experts + print( + f"Reduced num experts from {original_num_experts} to {num_experts}" + ) + + hidden_dim_divisor = 4 + + original_hidden_size = config_dict["text_config"]["hidden_size"] + new_hidden_size = original_hidden_size // hidden_dim_divisor + config_dict["text_config"]["hidden_size"] = new_hidden_size + print(f"Reduced hidden size from {original_hidden_size} to " + f"{new_hidden_size}") + + original_head_dim = config_dict["text_config"]["head_dim"] + new_head_dim = original_head_dim // hidden_dim_divisor + config_dict["text_config"]["head_dim"] = new_head_dim + print(f"Reduced head dim from {original_head_dim} to {new_head_dim}") + + # Reduce vision layers + if "vision_config" in config_dict: + original_vision_layers = config_dict["vision_config"][ + "num_hidden_layers"] + config_dict["vision_config"]["num_hidden_layers"] = vision_layers + print(f"Reduced vision layers from {original_vision_layers} " + f"to {vision_layers}") + + # Update model name to indicate it's a reduced version + config_dict["_name_or_path"] = ( + f"reduced_maverick_{text_layers}t_{vision_layers}v") + + return config_dict + + +def copy_tokenizer_files(original_model_name: str, 
output_path: Path) -> None: + """Copy tokenizer files from the original model.""" + + try: + tokenizer = AutoTokenizer.from_pretrained(original_model_name, + trust_remote_code=True) + tokenizer.save_pretrained(output_path) + print("Tokenizer files copied successfully") + except Exception as e: + print(f"Warning: Could not copy tokenizer files: {e}") + + +def create_preprocessor_config(original_config: Any, + output_path: Path) -> None: + """Create preprocessor_config.json for multimodal model.""" + + # Try to load the original preprocessor config + try: + processor = AutoProcessor.from_pretrained( + original_config._name_or_path + or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + trust_remote_code=True, + ) + processor.save_pretrained(output_path) + print("Copied original preprocessor config") + return + except Exception as e: + print(f"Could not copy original preprocessor config: {e}") + raise + + +def create_reduced_safetensors(original_config: Any, reduced_config: dict[str, + Any], + output_path: Path) -> None: + """Create safetensors files with weights for the reduced model.""" + + print("Generating synthetic weights for reduced model...") + + text_config = reduced_config["text_config"] + vision_config = reduced_config["vision_config"] + + weights = {} + + print("Creating text model weights...") + weights.update(create_text_model_weights(text_config)) + + print("Creating vision model weights...") + weights.update(create_vision_model_weights(vision_config)) + + print("Creating shared model weights...") + weights.update(create_shared_weights(text_config, vision_config)) + + print("Saving weights to safetensors files...") + save_weights_to_safetensors(weights, output_path) + + +def create_text_model_weights( + text_config: dict[str, Any]) -> dict[str, torch.Tensor]: + """Create synthetic weights for the text model with MoE structure.""" + + weights = {} + + vocab_size = text_config["vocab_size"] + hidden_size = text_config["hidden_size"] + 
intermediate_size = text_config["intermediate_size"] + intermediate_size_mlp = text_config["intermediate_size_mlp"] + num_layers = text_config["num_hidden_layers"] + num_attention_heads = text_config["num_attention_heads"] + num_key_value_heads = text_config.get("num_key_value_heads", + num_attention_heads) + + # MoE specific parameters + num_experts = text_config.get("num_local_experts") + assert (num_experts + is not None), "num_local_experts must be specified for MoE" + + head_dim = hidden_size // num_attention_heads + + # Embedding layers + weights["language_model.model.embed_tokens.weight"] = torch.randn( + vocab_size, hidden_size, dtype=torch.float16) + + # Transformer layers + for layer_idx in range(num_layers): + layer_prefix = f"language_model.model.layers.{layer_idx}" + print(f"Creating weights for layer {layer_prefix}...") + + # Self-attention weights (separate q, k, v projections) + weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn( + hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn( + hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn( + num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn( + hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16) + print("Self-attention weights created.") + + # Feed-forward weights - MoE pattern based on interleave_moe_layer_step + # For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers + # 0,2,4,... are dense + interleave_step = text_config.get("interleave_moe_layer_step", 1) + is_moe_layer = (interleave_step > 0 + and (layer_idx + 1) % interleave_step == 0) + + if is_moe_layer: + # MoE layer structure + # 1. 
Router weights + weights[ + f"{layer_prefix}.feed_forward.router.weight"] = torch.randn( + num_experts, hidden_size, dtype=torch.float16) + + # 2. Individual expert weights (not fused) + for expert_idx in range(num_experts): + expert_prefix = ( + f"{layer_prefix}.feed_forward.experts.{expert_idx}") + + weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{expert_prefix}.up_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{expert_prefix}.down_proj.weight"] = torch.randn( + hidden_size, intermediate_size, dtype=torch.bfloat16) + + # Expert weight scales (FP8 quantization) + weights[ + f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones( + intermediate_size, 1, dtype=torch.bfloat16) + weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones( + intermediate_size, 1, dtype=torch.bfloat16) + weights[ + f"{expert_prefix}.down_proj.weight_scale"] = torch.ones( + hidden_size, 1, dtype=torch.bfloat16) + + # 3. 
Shared expert weights + shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert" + weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn( + hidden_size, intermediate_size, dtype=torch.bfloat16) + print(f"MoE feed-forward weights created for layer {layer_idx}.") + else: + # Dense layer structure + weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = ( + torch.randn(intermediate_size_mlp, + hidden_size, + dtype=torch.bfloat16)) + weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = ( + torch.randn(intermediate_size_mlp, + hidden_size, + dtype=torch.bfloat16)) + weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = ( + torch.randn(hidden_size, + intermediate_size_mlp, + dtype=torch.bfloat16)) + print(f"Dense feed-forward weights created for layer {layer_idx}.") + + # Layer norms + weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights[ + f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + print("Layer norms created.") + + # Final layer norm and output projection + weights["language_model.model.norm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights["language_model.lm_head.weight"] = torch.randn( + vocab_size, hidden_size, dtype=torch.bfloat16) + + return weights + + +def create_vision_model_weights( + vision_config: dict[str, Any]) -> dict[str, torch.Tensor]: + """Create synthetic weights for the vision model.""" + + weights = {} + + hidden_size = vision_config["hidden_size"] + intermediate_size = vision_config["intermediate_size"] + num_layers = vision_config["num_hidden_layers"] + + # Vision transformer layers + for layer_idx in range(num_layers): + 
layer_prefix = f"vision_model.model.layers.{layer_idx}" + + weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + + weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros( + intermediate_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn( + hidden_size, intermediate_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + + weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + weights[ + f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + + return weights + + +def create_shared_weights( + text_config: dict[str, Any], + vision_config: dict[str, Any]) -> dict[str, torch.Tensor]: + """Create weights for shared components 
(vision-language connector)""" + + weights = {} + + text_hidden_size = text_config["hidden_size"] + projector_input_dim = vision_config["projector_input_dim"] + + # Vision-language connector (projects vision features to text space) + weights["multi_modal_projector.linear_1.weight"] = torch.randn( + text_hidden_size, projector_input_dim, dtype=torch.bfloat16) + + return weights + + +def save_weights_to_safetensors(weights: dict[str, torch.Tensor], + output_path: Path) -> None: + """Save weights to safetensors files and create index.""" + + # Determine how to shard the weights + max_shard_size = 5 * 1024 * 1024 * 1024 # 5GB per shard + + # Calculate sizes and create shards + shards = [] + current_shard: dict[str, torch.Tensor] = {} + current_size = 0 + + for name, tensor in weights.items(): + tensor_size = tensor.numel() * tensor.element_size() + + if current_size + tensor_size > max_shard_size and current_shard: + shards.append(current_shard) + current_shard = {} + current_size = 0 + + current_shard[name] = tensor + current_size += tensor_size + + if current_shard: + shards.append(current_shard) + + # Save shards and create index + weight_map = {} + + if len(shards) == 1: + # Single file + filename = "model.safetensors" + save_file(shards[0], output_path / filename) + weight_map = {name: filename for name in shards[0]} + print(f"Saved weights to single file: {filename}") + else: + # Multiple shards + for i, shard in enumerate(shards): + filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors" + save_file(shard, output_path / filename) + for name in shard: + weight_map[name] = filename + print(f"Saved shard {i+1}/{len(shards)}: {filename}") + + # Create index file + index_data = { + "metadata": { + "total_size": + sum(tensor.numel() * tensor.element_size() + for tensor in weights.values()) + }, + "weight_map": weight_map, + } + + index_path = output_path / "model.safetensors.index.json" + with open(index_path, "w") as f: + json.dump(index_data, f, indent=2) + + 
print(f"Created index file: {index_path}") + print(f"Total model size: " + f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB") + + +def run_reduced_model(model_path: str, + should_profile: bool = False, + **kwargs) -> None: + """Test the created reduced model with vLLM.""" + + print(f"\nTesting reduced model at {model_path}...") + + llm = LLM( + model=model_path, + trust_remote_code=True, + max_model_len=512, # Small context for testing + gpu_memory_utilization=0.3, # Conservative memory usage + **kwargs, + ) + + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=50) + + if should_profile: + llm.start_profile() + outputs = llm.generate(PROMPTS, sampling_params) + if should_profile: + llm.stop_profile() + + print("Test generation successful!") + for output in outputs: + print(f"Prompt: {output.prompt}") + print(f"Output: " + f"{output.outputs[0].text}") + print("-" * 40) + + +@pytest.mark.parametrize( + "original_model_name,text_layers,num_experts,vision_layers,", + [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)]) +@pytest.mark.parametrize("enforce_eager", [True, False]) +@pytest.mark.parametrize("tp,ep", [(2, True)]) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_dummy_maverick( + original_model_name: str, + text_layers: int, + num_experts: int, + vision_layers: int, + enforce_eager: bool, + tp: int, + ep: bool, + output_dir: str = "/tmp/reduced_maverick", + force_recreate: bool = True, + profile: bool = False, +) -> None: + model_path = create_reduced_maverick_model( + original_model_name=original_model_name, + output_dir=output_dir, + text_layers=text_layers, + num_experts=num_experts, + vision_layers=vision_layers, + force_recreate=force_recreate, + ) + + print(f"\nReduced model created successfully at: {model_path}") + + run_reduced_model(model_path=model_path, + should_profile=profile, + enforce_eager=enforce_eager, + tensor_parallel_size=tp, + 
enable_expert_parallel=ep) + + +def main(): + """Main function to create and test the reduced model.""" + + import argparse + + parser = argparse.ArgumentParser( + description="Create a reduced-layer Maverick model") + parser.add_argument( + "--output-dir", + default="/tmp/reduced_maverick", + help="Output directory for the reduced model", + ) + parser.add_argument( + "--text-layers", + type=int, + default=4, + help="Number of text transformer layers", + ) + parser.add_argument("--num-experts", + type=int, + default=4, + help="Number of experts") + parser.add_argument( + "--vision-layers", + type=int, + default=2, + help="Number of vision transformer layers", + ) + parser.add_argument( + "--force-recreate", + action="store_true", + help="Force recreation if output directory exists", + ) + parser.add_argument("--test", + action="store_true", + help="Test the created model with vLLM") + parser.add_argument("--profile", + action="store_true", + help="Profile the created model with vLLM") + parser.add_argument( + "--test-original", + action="store_true", + help="Test the original model with vLLM", + ) + parser.add_argument( + "--original-model", + default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + help="Original model name to base the reduction on", + ) + + args = parser.parse_args() + + if args.test: + test_dummy_maverick(original_model_name=args.original_model, + output_dir=args.output_dir, + text_layers=args.text_layers, + num_experts=args.num_experts, + vision_layers=args.vision_layers, + force_recreate=args.force_recreate, + tp=2, + ep=True, + enforce_eager=True, + profile=args.profile) + + if args.test_original: + run_maverick_serving(args.original_model) + + +if __name__ == "__main__": + exit(main()) From 304dce7ec02769ecea137091caa5413e1a4abf60 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 21 Jul 2025 12:10:30 -0400 Subject: [PATCH 14/16] [Attention] Clean up iRoPE in V1 (#21188) Signed-off-by: Lucas Wilkinson Co-authored-by: Michael 
Goin --- vllm/attention/layer.py | 7 +++++++ vllm/v1/attention/backends/cpu_attn.py | 5 ----- vllm/v1/attention/backends/flash_attn.py | 2 -- vllm/v1/attention/backends/flashinfer.py | 2 -- vllm/v1/attention/backends/pallas.py | 5 ----- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 -- vllm/v1/attention/backends/triton_attn.py | 6 ------ vllm/v1/worker/gpu_model_runner.py | 7 +++---- vllm/v1/worker/tpu_model_runner.py | 4 ++++ 9 files changed, 14 insertions(+), 26 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 5d8ffb8e82d3f..1b80fa19d54f3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -137,6 +137,13 @@ class Attention(nn.Module): self.num_kv_heads = num_kv_heads self.sliding_window = sliding_window + # For v1 we have backend agnostic iRoPE (local chunked attention) + # we have to store the flag on the layer so gpu model runner can + # set KVSpec appropriately (and pop it so it doesnt get passed to + # the backends) + if envs.VLLM_USE_V1: + self.use_irope = extra_impl_args.pop("use_irope", False) + quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None if quant_method is not None and not isinstance( diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 2efbe0de27255..3b6d753863d07 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -446,17 +446,12 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, ) -> None: if kv_sharing_target_layer_name is not None: raise NotImplementedError("KV sharing is not supported in V0.") if logits_soft_cap is not None: logger.warning_once("Torch SPDA does not support logits soft cap. 
" "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in Torch SPDA is not supported yet, it will fall" - " back to global attention for long context.") self.paged_attn_impl = _get_paged_attn_impl() self.num_heads = num_heads self.head_size = head_size diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ad414ee0a1fc9..5fe274f2c65b2 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -352,7 +352,6 @@ class FlashAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -381,7 +380,6 @@ class FlashAttentionImpl(AttentionImpl): "encoder/decoder cross-attention " "are not implemented for " "FlashAttentionImpl") - self.use_irope = use_irope self.vllm_flash_attn_version = get_flash_attn_version() if is_quantized_kv_cache(self.kv_cache_dtype) \ and not flash_attn_supports_fp8(): diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index e1ffa61a6005e..953ef26c81439 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -493,7 +493,6 @@ class FlashInferImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -509,7 +508,6 @@ class FlashInferImpl(AttentionImpl): self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - self.use_irope = use_irope self.num_queries_per_kv = self.num_heads // self.num_kv_heads diff --git a/vllm/v1/attention/backends/pallas.py 
b/vllm/v1/attention/backends/pallas.py index 9307cd937d5db..9b122136afb7f 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -148,12 +148,7 @@ class PallasAttentionBackendImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: - if use_irope: - logger.warning_once( - "Using irope in Pallas is not supported yet, it will fall back " - "to global attention for long context.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 8f75676394494..0739d2596676f 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -337,7 +337,6 @@ class AiterFlashAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -367,7 +366,6 @@ class AiterFlashAttentionImpl(AttentionImpl): "encoder/decoder cross-attention " "are not implemented for " "FlashAttentionImpl") - self.use_irope = use_irope if is_quantized_kv_cache(self.kv_cache_dtype): raise NotImplementedError( "AiterFlashAttention does not support fp8 kv-cache on this " diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index d65ff5ff74ece..83471ca51b73f 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -72,9 +72,6 @@ class TritonAttentionMetadataBuilder( vllm_config.parallel_config) self.headdim = model_config.get_head_size() - self.attention_chunk_size = getattr(vllm_config.scheduler_config, - 'attention_chunk_size', None) - def build_for_cudagraph_capture( 
self, common_attn_metadata: CommonAttentionMetadata ) -> TritonAttentionMetadata: @@ -208,7 +205,6 @@ class TritonAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -228,8 +224,6 @@ class TritonAttentionImpl(AttentionImpl): self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - self.use_irope = use_irope - self.num_queries_per_kv = self.num_heads // self.num_kv_heads TritonAttentionBackend.validate_head_size(head_size) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cd66d8bcd6342..4c14ac3be3c0e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2702,8 +2702,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # TODO: Support other attention modules, e.g., cross-attention if attn_module.attn_type == AttentionType.DECODER: use_local_attention = (self.attention_chunk_size is not None - and getattr(attn_module.impl, - "use_irope", False)) + and attn_module.use_irope) if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, @@ -2716,13 +2715,13 @@ class GPUModelRunner(LoRAModelRunnerMixin): "attention module can not be with ", "both local attention and sliding window") elif use_local_attention: - kv_cache_spec[layer_name] = (ChunkedLocalAttentionSpec( + kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, head_size=attn_module.head_size, dtype=self.kv_cache_dtype, attention_chunk_size=self.attention_chunk_size, - use_mla=use_mla)) + use_mla=use_mla) else: kv_cache_spec[layer_name] = FullAttentionSpec( block_size=block_size, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py 
index aad45b6abd128..31e9cff91247c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -519,6 +519,10 @@ class TPUModelRunner(LoRAModelRunnerMixin): continue if attn_module.attn_type == AttentionType.DECODER: + if attn_module.use_irope: + logger.warning_once( + "Using irope in Pallas is not supported yet, it " + "will fall back to global attention for long context.") if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, From 29d1ffc5b4c763ef76aff9e3f617fa60dd292418 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:11:35 -0400 Subject: [PATCH 15/16] [DP] Fix Prometheus Logging (#21257) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- tests/v1/engine/test_async_llm.py | 7 +- tests/v1/test_async_llm_dp.py | 6 +- vllm/v1/engine/async_llm.py | 69 ++-- vllm/v1/engine/core_client.py | 9 +- vllm/v1/metrics/loggers.py | 523 +++++++++++++++++++----------- vllm/v1/metrics/ray_wrappers.py | 4 - 6 files changed, 369 insertions(+), 249 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index e137452f2625f..412df3acff126 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -336,9 +336,10 @@ async def test_customize_loggers(monkeypatch): await engine.do_log_stats() - assert len(engine.stat_loggers) == 1 - assert len(engine.stat_loggers[0]) == 1 - engine.stat_loggers[0][0].log.assert_called_once() + stat_loggers = engine.logger_manager.per_engine_logger_dict + assert len(stat_loggers) == 1 + assert len(stat_loggers[0]) == 1 + stat_loggers[0][0].log.assert_called_once() @pytest.mark.asyncio(scope="module") diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 64a41bec37919..6716d27f571f9 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -90,8 +90,10 @@ async def 
test_load(output_kind: RequestOutputKind, def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): stats_loggers[engine_index] = self - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): if iteration_stats: self.finished_req_count += len( iteration_stats.finished_requests) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6395d2c1875b7..b8ba36f3502f7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,10 +36,9 @@ from vllm.v1.engine.output_processor import (OutputProcessor, from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory, - setup_default_loggers) +from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager from vllm.v1.metrics.prometheus import shutdown_prometheus -from vllm.v1.metrics.stats import IterationStats, SchedulerStats +from vllm.v1.metrics.stats import IterationStats logger = init_logger(__name__) @@ -95,14 +94,6 @@ class AsyncLLM(EngineClient): self.log_requests = log_requests self.log_stats = log_stats - # Set up stat loggers; independent set for each DP rank. - self.stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( - vllm_config=vllm_config, - log_stats=self.log_stats, - engine_num=vllm_config.parallel_config.data_parallel_size, - custom_stat_loggers=stat_loggers, - ) - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -121,7 +112,6 @@ class AsyncLLM(EngineClient): log_stats=self.log_stats) # EngineCore (starts the engine in background process). 
- self.engine_core = EngineCoreClient.make_async_mp_client( vllm_config=vllm_config, executor_class=executor_class, @@ -129,9 +119,17 @@ class AsyncLLM(EngineClient): client_addresses=client_addresses, client_index=client_index, ) - if self.stat_loggers: - for stat_logger in self.stat_loggers[0]: - stat_logger.log_engine_initialized() + + # Loggers. + self.logger_manager: Optional[StatLoggerManager] = None + if self.log_stats: + self.logger_manager = StatLoggerManager( + vllm_config=vllm_config, + engine_idxs=self.engine_core.engine_ranks, + custom_stat_loggers=stat_loggers, + ) + self.logger_manager.log_engine_initialized() + self.output_handler: Optional[asyncio.Task] = None try: # Start output handler eagerly if we are in the asyncio eventloop. @@ -370,7 +368,7 @@ class AsyncLLM(EngineClient): engine_core = self.engine_core output_processor = self.output_processor log_stats = self.log_stats - stat_loggers = self.stat_loggers if log_stats else None + logger_manager = self.logger_manager async def output_handler(): try: @@ -410,9 +408,9 @@ class AsyncLLM(EngineClient): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
- if stat_loggers: - AsyncLLM._record_stats( - stat_loggers[outputs.engine_index], + if logger_manager: + logger_manager.record( + engine_idx=outputs.engine_index, scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, ) @@ -431,18 +429,6 @@ class AsyncLLM(EngineClient): if self.log_requests: logger.info("Aborted request %s.", request_id) - @staticmethod - def _record_stats( - stat_loggers: list[StatLoggerBase], - scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats], - ): - """static so that it can be used from the output_handler task - without a circular ref to AsyncLLM.""" - for stat_logger in stat_loggers: - stat_logger.record(scheduler_stats=scheduler_stats, - iteration_stats=iteration_stats) - async def encode( self, prompt: PromptType, @@ -547,9 +533,8 @@ class AsyncLLM(EngineClient): scheduler_outputs=None, model_output=None, ) -> None: - for loggers in self.stat_loggers: - for stat_logger in loggers: - stat_logger.log() + if self.logger_manager: + self.logger_manager.log() async def check_health(self) -> None: logger.debug("Called check_health.") @@ -653,18 +638,16 @@ class AsyncLLM(EngineClient): new_data_parallel_size # recreate stat loggers - if new_data_parallel_size > old_data_parallel_size: - stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( + if new_data_parallel_size > old_data_parallel_size and self.log_stats: + # TODO(rob): fix this after talking with Ray team. + # This resets all the prometheus metrics since we + # unregister during initialization. Need to understand + # the intended behavior here better. 
+ self.logger_manager = StatLoggerManager( vllm_config=self.vllm_config, - log_stats=self.log_stats, - engine_num=new_data_parallel_size, + engine_idxs=list(range(new_data_parallel_size)), custom_stat_loggers=None, ) - num_new_engines = len(stat_loggers) - len(self.stat_loggers) - self.stat_loggers.extend(stat_loggers[-num_new_engines:]) - else: - for _ in range(old_data_parallel_size - new_data_parallel_size): - self.stat_loggers.pop() @property def is_running(self) -> bool: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 82fc1fa9937c5..2ebb76a97ebea 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -432,14 +432,15 @@ class MPClient(EngineCoreClient): external_dp_lb = parallel_config.data_parallel_external_lb offline_mode = parallel_config.data_parallel_rank_local is not None - engine_ranks = [dp_rank] if (offline_mode - or external_dp_lb) else range(dp_size) + self.engine_ranks = ([dp_rank] if + (offline_mode or external_dp_lb) else list( + range(dp_size))) assert parallel_config.data_parallel_size_local <= len( - engine_ranks) + self.engine_ranks) # ZMQ identity of each engine that this client will talk to. self.core_engines: list[EngineIdentity] = [ - index.to_bytes(2, "little") for index in engine_ranks + index.to_bytes(2, "little") for index in self.engine_ranks ] # Wait for ready messages from each engine on the input socket. diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index c720ca13e51b2..7f2556bab5a40 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -4,7 +4,7 @@ import logging import time from abc import ABC, abstractmethod -from typing import Callable, Optional +from typing import Callable, Optional, Union import numpy as np import prometheus_client @@ -35,8 +35,10 @@ class StatLoggerBase(ABC): ... 
@abstractmethod - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): ... @abstractmethod @@ -78,8 +80,10 @@ class LoggingStatLogger(StatLoggerBase): # Compute summary metrics for tracked stats return float(np.sum(tracked_stats) / (now - self.last_log_time)) - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): """Log Stats to standard output.""" if iteration_stats: @@ -146,233 +150,290 @@ class PrometheusStatLogger(StatLoggerBase): _histogram_cls = prometheus_client.Histogram _spec_decoding_cls = SpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): + def __init__(self, + vllm_config: VllmConfig, + engine_indexes: Optional[list[int]] = None): + if engine_indexes is None: + engine_indexes = [0] + self.engine_indexes = engine_indexes unregister_vllm_metrics() self.vllm_config = vllm_config - self.engine_index = engine_index # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future self.show_hidden_metrics = \ vllm_config.observability_config.show_hidden_metrics labelnames = ["model_name", "engine"] - labelvalues = [ - vllm_config.model_config.served_model_name, - str(engine_index) - ] - + model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len + if (len(self.engine_indexes) > 1 + and vllm_config.speculative_config is not None): + raise NotImplementedError("Prometheus metrics with Spec Decoding " + "with >1 EngineCore per AsyncLLM is not " + "supported yet.") + spec_decode_labelvalues = [ + vllm_config.model_config.served_model_name, + str(self.engine_indexes[0]) 
+ ] self.spec_decoding_prom = self._spec_decoding_cls( - vllm_config.speculative_config, labelnames, labelvalues) + vllm_config.speculative_config, labelnames, + spec_decode_labelvalues) # # Scheduler state # - self.gauge_scheduler_running = self._gauge_cls( + gauge_scheduler_running = self._gauge_cls( name="vllm:num_requests_running", documentation="Number of requests in model execution batches.", multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_scheduler_running = make_per_engine(gauge_scheduler_running, + engine_indexes, + model_name) - self.gauge_scheduler_waiting = self._gauge_cls( + gauge_scheduler_waiting = self._gauge_cls( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_scheduler_waiting = make_per_engine(gauge_scheduler_waiting, + engine_indexes, + model_name) # # GPU cache # # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc # TODO: in 0.10, only enable if show_hidden_metrics=True - self.gauge_gpu_cache_usage = self._gauge_cls( + gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation=( "GPU KV-cache usage. 1 means 100 percent usage." "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage, + engine_indexes, + model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_queries = self._counter_cls( + counter_gpu_prefix_cache_queries = self._counter_cls( name="vllm:gpu_prefix_cache_queries", - documentation= - ("GPU prefix cache queries, in terms of number of queried tokens." 
- "DEPRECATED: Use vllm:prefix_cache_queries instead."), - labelnames=labelnames).labels(*labelvalues) + documentation=( + "GPU prefix cache queries, in terms of number of queried" + "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_queries = make_per_engine( + counter_gpu_prefix_cache_queries, engine_indexes, model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_hits = self._counter_cls( + counter_gpu_prefix_cache_hits = self._counter_cls( name="vllm:gpu_prefix_cache_hits", documentation=( - "GPU prefix cache hits, in terms of number of cached tokens." - "DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames).labels(*labelvalues) + "GPU prefix cache hits, in terms of number of cached " + "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_hits = make_per_engine( + counter_gpu_prefix_cache_hits, engine_indexes, model_name) - self.gauge_kv_cache_usage = self._gauge_cls( + gauge_kv_cache_usage = self._gauge_cls( name="vllm:kv_cache_usage_perc", documentation="KV-cache usage. 
1 means 100 percent usage.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_kv_cache_usage = make_per_engine(gauge_kv_cache_usage, + engine_indexes, model_name) - self.counter_prefix_cache_queries = self._counter_cls( + counter_prefix_cache_queries = self._counter_cls( name="vllm:prefix_cache_queries", documentation=( "Prefix cache queries, in terms of number of queried tokens."), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_prefix_cache_queries = make_per_engine( + counter_prefix_cache_queries, engine_indexes, model_name) - self.counter_prefix_cache_hits = self._counter_cls( + counter_prefix_cache_hits = self._counter_cls( name="vllm:prefix_cache_hits", documentation=( "Prefix cache hits, in terms of number of cached tokens."), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_prefix_cache_hits = make_per_engine( + counter_prefix_cache_hits, engine_indexes, model_name) # # Counters # - self.counter_num_preempted_reqs = self._counter_cls( + counter_num_preempted_reqs = self._counter_cls( name="vllm:num_preemptions", documentation="Cumulative number of preemption from the engine.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_num_preempted_reqs = make_per_engine( + counter_num_preempted_reqs, engine_indexes, model_name) - self.counter_prompt_tokens = self._counter_cls( + counter_prompt_tokens = self._counter_cls( name="vllm:prompt_tokens", documentation="Number of prefill tokens processed.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_prompt_tokens = make_per_engine(counter_prompt_tokens, + engine_indexes, + model_name) - self.counter_generation_tokens = self._counter_cls( + counter_generation_tokens = self._counter_cls( name="vllm:generation_tokens", documentation="Number of generation tokens processed.", - labelnames=labelnames).labels(*labelvalues) + 
labelnames=labelnames) + self.counter_generation_tokens = make_per_engine( + counter_generation_tokens, engine_indexes, model_name) - self.counter_request_success: dict[FinishReason, - prometheus_client.Counter] = {} + self.counter_request_success: dict[FinishReason, dict[ + int, prometheus_client.Counter]] = {} counter_request_success_base = self._counter_cls( name="vllm:request_success", documentation="Count of successfully processed requests.", labelnames=labelnames + ["finished_reason"]) for reason in FinishReason: - self.counter_request_success[ - reason] = counter_request_success_base.labels(*(labelvalues + - [str(reason)])) + self.counter_request_success[reason] = { + idx: + counter_request_success_base.labels(model_name, str(idx), + str(reason)) + for idx in engine_indexes + } # # Histograms of counts # - self.histogram_num_prompt_tokens_request = \ - self._histogram_cls( - name="vllm:request_prompt_tokens", - documentation="Number of prefill tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + histogram_num_prompt_tokens_request = self._histogram_cls( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_num_prompt_tokens_request = make_per_engine( + histogram_num_prompt_tokens_request, engine_indexes, model_name) - self.histogram_num_generation_tokens_request = \ - self._histogram_cls( - name="vllm:request_generation_tokens", - documentation="Number of generation tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + histogram_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_num_generation_tokens_request = make_per_engine( + 
histogram_num_generation_tokens_request, engine_indexes, + model_name) # TODO: This metric might be incorrect in case of using multiple # api_server counts which uses prometheus mp. # See: https://github.com/vllm-project/vllm/pull/18053 - self.histogram_iteration_tokens = \ - self._histogram_cls( - name="vllm:iteration_tokens_total", - documentation="Histogram of number of tokens per engine_step.", - buckets=[ - 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, - 16384 - ], - labelnames=labelnames).labels(*labelvalues) + histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + buckets=[ + 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 + ], + labelnames=labelnames) + self.histogram_iteration_tokens = make_per_engine( + histogram_iteration_tokens, engine_indexes, model_name) - self.histogram_max_num_generation_tokens_request = \ - self._histogram_cls( - name="vllm:request_max_num_generation_tokens", - documentation= - "Histogram of maximum number of requested generation tokens.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation tokens.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_num_generation_tokens_request = make_per_engine( + histogram_max_num_generation_tokens_request, engine_indexes, + model_name) - self.histogram_n_request = \ - self._histogram_cls( - name="vllm:request_params_n", - documentation="Histogram of the n request parameter.", - buckets=[1, 2, 5, 10, 20], - labelnames=labelnames).labels(*labelvalues) + histogram_n_request = self._histogram_cls( + name="vllm:request_params_n", + documentation="Histogram of the n request parameter.", + buckets=[1, 2, 5, 
10, 20], + labelnames=labelnames) + self.histogram_n_request = make_per_engine(histogram_n_request, + engine_indexes, model_name) - self.histogram_max_tokens_request = \ - self._histogram_cls( - name="vllm:request_params_max_tokens", - documentation="Histogram of the max_tokens request parameter.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + histogram_max_tokens_request = self._histogram_cls( + name="vllm:request_params_max_tokens", + documentation="Histogram of the max_tokens request parameter.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_tokens_request = make_per_engine( + histogram_max_tokens_request, engine_indexes, model_name) # # Histogram of timing intervals # - self.histogram_time_to_first_token = \ - self._histogram_cls( - name="vllm:time_to_first_token_seconds", - documentation="Histogram of time to first token in seconds.", - buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, - 640.0, 2560.0 - ], - labelnames=labelnames).labels(*labelvalues) + histogram_time_to_first_token = self._histogram_cls( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, + 2560.0 + ], + labelnames=labelnames) + self.histogram_time_to_first_token = make_per_engine( + histogram_time_to_first_token, engine_indexes, model_name) - self.histogram_time_per_output_token = \ - self._histogram_cls( - name="vllm:time_per_output_token_seconds", - documentation="Histogram of time per output token in seconds.", - buckets=[ - 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 - ], - labelnames=labelnames).labels(*labelvalues) + histogram_time_per_output_token = 
self._histogram_cls( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ], + labelnames=labelnames) + self.histogram_time_per_output_token = make_per_engine( + histogram_time_per_output_token, engine_indexes, model_name) request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] - self.histogram_e2e_time_request = \ - self._histogram_cls( - name="vllm:e2e_request_latency_seconds", - documentation="Histogram of e2e request latency in seconds.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_queue_time_request = \ - self._histogram_cls( - name="vllm:request_queue_time_seconds", - documentation= - "Histogram of time spent in WAITING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_inference_time_request = \ - self._histogram_cls( - name="vllm:request_inference_time_seconds", - documentation= - "Histogram of time spent in RUNNING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_prefill_time_request = \ - self._histogram_cls( - name="vllm:request_prefill_time_seconds", - documentation= - "Histogram of time spent in PREFILL phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_decode_time_request = \ - self._histogram_cls( - name="vllm:request_decode_time_seconds", - documentation= - "Histogram of time spent in DECODE phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) + histogram_e2e_time_request = self._histogram_cls( + name="vllm:e2e_request_latency_seconds", + documentation="Histogram of e2e 
request latency in seconds.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_e2e_time_request = make_per_engine( + histogram_e2e_time_request, engine_indexes, model_name) + + histogram_queue_time_request = self._histogram_cls( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_queue_time_request = make_per_engine( + histogram_queue_time_request, engine_indexes, model_name) + + histogram_inference_time_request = self._histogram_cls( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_inference_time_request = make_per_engine( + histogram_inference_time_request, engine_indexes, model_name) + + histogram_prefill_time_request = self._histogram_cls( + name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_prefill_time_request = make_per_engine( + histogram_prefill_time_request, engine_indexes, model_name) + + histogram_decode_time_request = self._histogram_cls( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_decode_time_request = make_per_engine( + histogram_decode_time_request, engine_indexes, model_name) # # LoRA metrics @@ -382,6 +443,9 @@ class PrometheusStatLogger(StatLoggerBase): # api_server counts which uses prometheus mp. 
self.gauge_lora_info: Optional[prometheus_client.Gauge] = None if vllm_config.lora_config is not None: + if len(self.engine_indexes) > 1: + raise NotImplementedError( + "LoRA in DP mode is not supported yet.") self.labelname_max_lora = "max_lora" self.labelname_waiting_lora_adapters = "waiting_lora_adapters" self.labelname_running_lora_adapters = "running_lora_adapters" @@ -399,9 +463,8 @@ class PrometheusStatLogger(StatLoggerBase): ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): - metrics_info = config_obj.metrics_info() - metrics_info["engine"] = self.engine_index + metrics_info["engine"] = "" name, documentation = None, None if type == "cache_config": @@ -417,27 +480,36 @@ class PrometheusStatLogger(StatLoggerBase): documentation=documentation, multiprocess_mode="mostrecent", labelnames=metrics_info.keys(), - ).labels(**metrics_info) - info_gauge.set(1) + ) + for engine_index in self.engine_indexes: + metrics_info = config_obj.metrics_info() + metrics_info["engine"] = str(engine_index) + info_gauge.labels(**metrics_info).set(1) - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): """Log to prometheus.""" if scheduler_stats is not None: - self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) - self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.gauge_scheduler_running[engine_idx].set( + scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting[engine_idx].set( + scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage) - self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage) + self.gauge_gpu_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) + self.gauge_kv_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) - 
self.counter_gpu_prefix_cache_queries.inc( + self.counter_gpu_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries) - self.counter_gpu_prefix_cache_hits.inc( + self.counter_gpu_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.hits) - self.counter_prefix_cache_queries.inc( + self.counter_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries) - self.counter_prefix_cache_hits.inc( + self.counter_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.hits) if scheduler_stats.spec_decoding_stats is not None: @@ -447,42 +519,45 @@ class PrometheusStatLogger(StatLoggerBase): if iteration_stats is None: return - self.counter_num_preempted_reqs.inc(iteration_stats.num_preempted_reqs) - self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) - self.counter_generation_tokens.inc( + self.counter_num_preempted_reqs[engine_idx].inc( + iteration_stats.num_preempted_reqs) + self.counter_prompt_tokens[engine_idx].inc( + iteration_stats.num_prompt_tokens) + self.counter_generation_tokens[engine_idx].inc( iteration_stats.num_generation_tokens) - self.histogram_iteration_tokens.observe( + self.histogram_iteration_tokens[engine_idx].observe( iteration_stats.num_prompt_tokens + \ iteration_stats.num_generation_tokens) for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter: - self.histogram_max_num_generation_tokens_request.observe( - max_gen_tokens) + self.histogram_max_num_generation_tokens_request[ + engine_idx].observe(max_gen_tokens) for n_param in iteration_stats.n_params_iter: - self.histogram_n_request.observe(n_param) + self.histogram_n_request[engine_idx].observe(n_param) for ttft in iteration_stats.time_to_first_tokens_iter: - self.histogram_time_to_first_token.observe(ttft) + self.histogram_time_to_first_token[engine_idx].observe(ttft) for tpot in iteration_stats.time_per_output_tokens_iter: - self.histogram_time_per_output_token.observe(tpot) + 
self.histogram_time_per_output_token[engine_idx].observe(tpot) for finished_request in iteration_stats.finished_requests: - self.counter_request_success[finished_request.finish_reason].inc() - self.histogram_e2e_time_request.observe( + self.counter_request_success[ + finished_request.finish_reason][engine_idx].inc() + self.histogram_e2e_time_request[engine_idx].observe( finished_request.e2e_latency) - self.histogram_queue_time_request.observe( + self.histogram_queue_time_request[engine_idx].observe( finished_request.queued_time) - self.histogram_prefill_time_request.observe( + self.histogram_prefill_time_request[engine_idx].observe( finished_request.prefill_time) - self.histogram_inference_time_request.observe( + self.histogram_inference_time_request[engine_idx].observe( finished_request.inference_time) - self.histogram_decode_time_request.observe( + self.histogram_decode_time_request[engine_idx].observe( finished_request.decode_time) - self.histogram_num_prompt_tokens_request.observe( + self.histogram_num_prompt_tokens_request[engine_idx].observe( finished_request.num_prompt_tokens) - self.histogram_num_generation_tokens_request.observe( + self.histogram_num_generation_tokens_request[engine_idx].observe( finished_request.num_generation_tokens) if finished_request.max_tokens_param: - self.histogram_max_tokens_request.observe( + self.histogram_max_tokens_request[engine_idx].observe( finished_request.max_tokens_param) if self.gauge_lora_info is not None: @@ -502,6 +577,18 @@ class PrometheusStatLogger(StatLoggerBase): self.log_metrics_info("cache_config", self.vllm_config.cache_config) +PromMetric = Union[ + prometheus_client.Gauge, + prometheus_client.Counter, + prometheus_client.Histogram, +] + + +def make_per_engine(metric: PromMetric, engine_idxs: list[int], + model_name: str) -> dict[int, PromMetric]: + return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs} + + def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds 
a list of buckets with increasing powers of 10 multiplied by @@ -529,29 +616,79 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: return build_buckets([1, 2, 5], max_value) -def setup_default_loggers( - vllm_config: VllmConfig, - log_stats: bool, - engine_num: int, - custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, -) -> list[list[StatLoggerBase]]: - """Setup logging and prometheus metrics.""" - if not log_stats: - return [] +class StatLoggerManager: + """ + StatLoggerManager: + Logging happens at the level of the EngineCore (per scheduler). + * DP: >1 EngineCore per AsyncLLM - loggers for each EngineCore. + * With Local Logger, just make N copies for N EngineCores. + * With Prometheus, we need a single logger with N "labels" - factories: list[StatLoggerFactory] - if custom_stat_loggers is not None: - factories = custom_stat_loggers - else: - factories = [PrometheusStatLogger] - if logger.isEnabledFor(logging.INFO): - factories.append(LoggingStatLogger) + This class abstracts away this implementation detail from + the AsyncLLM, allowing the AsyncLLM to just call .record() + and .log() to a simple interface. 
+ """ - stat_loggers: list[list[StatLoggerBase]] = [] - for i in range(engine_num): - per_engine_stat_loggers: list[StatLoggerBase] = [] - for logger_factory in factories: - per_engine_stat_loggers.append(logger_factory(vllm_config, i)) - stat_loggers.append(per_engine_stat_loggers) + def __init__( + self, + vllm_config: VllmConfig, + engine_idxs: Optional[list[int]] = None, + custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, + ): + self.engine_idxs = engine_idxs if engine_idxs else [0] - return stat_loggers + factories: list[StatLoggerFactory] + if custom_stat_loggers is not None: + factories = custom_stat_loggers + else: + factories = [] + if logger.isEnabledFor(logging.INFO): + factories.append(LoggingStatLogger) + + # engine_idx: StatLogger + self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {} + prometheus_factory = PrometheusStatLogger + for engine_idx in self.engine_idxs: + loggers: list[StatLoggerBase] = [] + for logger_factory in factories: + # If we get a custom prometheus logger, use that + # instead. This is typically used for the ray case. + if (isinstance(logger_factory, type) + and issubclass(logger_factory, PrometheusStatLogger)): + prometheus_factory = logger_factory + continue + loggers.append(logger_factory(vllm_config, + engine_idx)) # type: ignore + self.per_engine_logger_dict[engine_idx] = loggers + + # For Prometheus, need to share the metrics between EngineCores. + # Each EngineCore's metrics are expressed as a unique label. 
+ self.prometheus_logger = prometheus_factory(vllm_config, engine_idxs) + + def record( + self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: Optional[int] = None, + ): + if engine_idx is None: + engine_idx = 0 + + per_engine_loggers = self.per_engine_logger_dict[engine_idx] + for logger in per_engine_loggers: + logger.record(scheduler_stats, iteration_stats, engine_idx) + + self.prometheus_logger.record(scheduler_stats, iteration_stats, + engine_idx) + + def log(self): + for per_engine_loggers in self.per_engine_logger_dict.values(): + for logger in per_engine_loggers: + logger.log() + + def log_engine_initialized(self): + self.prometheus_logger.log_engine_initialized() + + for per_engine_loggers in self.per_engine_logger_dict.values(): + for logger in per_engine_loggers: + logger.log_engine_initialized() diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index 8384310062dda..ae8f9447e9c8b 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -3,7 +3,6 @@ import time from typing import Optional, Union -from vllm.config import VllmConfig from vllm.v1.metrics.loggers import PrometheusStatLogger from vllm.v1.spec_decode.metrics import SpecDecodingProm @@ -128,9 +127,6 @@ class RayPrometheusStatLogger(PrometheusStatLogger): _histogram_cls = RayHistogramWrapper _spec_decoding_cls = RaySpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): - super().__init__(vllm_config, engine_index) - @staticmethod def _unregister_vllm_metrics(): # No-op on purpose From 005ae9be6c22dfa2c2c5580b50b41e67faee4a87 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 21 Jul 2025 13:47:51 -0400 Subject: [PATCH 16/16] Fix bad lm-eval fork (#21318) --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 114c48dba531f..c476f71c6637d 
100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -273,7 +273,7 @@ steps: # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - pytest -v -s v1/e2e # Integration test for streaming correctness (requires special branch). - - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: Examples Test # 25min