Merge branch 'main' into woosuk/model-runner-v2

This commit is contained in:
Woosuk Kwon 2025-09-19 18:53:18 +00:00
commit 396bbe67d3
86 changed files with 3802 additions and 2757 deletions

13
.github/CODEOWNERS vendored
View File

@ -66,18 +66,25 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/models/test_transformers.py @hmellor /tests/models/test_transformers.py @hmellor
# Docs # Docs
/docs @hmellor /docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
/requirements/docs.txt @hmellor
.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor mkdocs.yaml @hmellor
# Linting
.markdownlint.yaml @hmellor
.pre-commit-config.yaml @hmellor
# CPU # CPU
/vllm/v1/worker/^cpu @bigPYJ1151 /vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151 /csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151 /vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151 /cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151 /docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU # Intel GPU
/vllm/v1/worker/^xpu @jikunshang /vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang /vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang /docker/Dockerfile.xpu @jikunshang

View File

@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any from typing import Any
import torch import torch
import triton
from tqdm import tqdm from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul, _w8a8_block_fp8_matmul,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True) mp.set_start_method("spawn", force=True)

View File

@ -59,7 +59,7 @@ enabling the corresponding APIs:
#### Predefined models #### Predefined models
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`, If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
you can override some of its attributes via the `--override-pooler-config` option. you can override some of its attributes via the `--pooler-config` option.
#### Converted models #### Converted models
@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default:
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--override-pooler-config` option, You can further customize this via the `--pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults. which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference ## Offline Inference

View File

@ -17,9 +17,24 @@ These models are what we list in [supported-text-models][supported-text-models]
### Transformers ### Transformers
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases. vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
To check if the modeling backend is Transformers, you can simply do this: Currently, the Transformers backend works for the following:
- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only
- Attention types: full attention and/or sliding attention
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
- Pipeline parallel
- Tensor parallel
Checking if the modeling backend is Transformers is as simple as:
```python ```python
from vllm import LLM from vllm import LLM
@ -27,16 +42,12 @@ llm = LLM(model=...) # Name or path of your model
llm.apply_model(lambda model: print(type(model))) llm.apply_model(lambda model: print(type(model)))
``` ```
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers! If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
!!! tip If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
!!! note !!! note
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
!!! note
In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
#### Custom models #### Custom models
@ -66,10 +77,11 @@ This section details the necessary modifications to make to a Transformers compa
To make your model compatible with the Transformers backend, it needs: To make your model compatible with the Transformers backend, it needs:
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. 1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`.
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
3. `MyModel` must contain `_supports_attention_backend = True`. 3. `MyModel` must contain `_supports_attention_backend = True`.
<details> <details class="code">
<summary>modeling_my_model.py</summary> <summary>modeling_my_model.py</summary>
```python ```python
@ -78,6 +90,7 @@ from transformers import PreTrainedModel
from torch import nn from torch import nn
class MyAttention(nn.Module): class MyAttention(nn.Module):
is_causal = False # Only do this for encoder-only models
def forward(self, hidden_states, **kwargs): def forward(self, hidden_states, **kwargs):
... ...
@ -101,13 +114,13 @@ Here is what happens in the background when this model is loaded:
1. The config is loaded. 1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. 3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
That's it! That's it!
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
<details> <details class="code">
<summary>configuration_my_model.py</summary> <summary>configuration_my_model.py</summary>
```python ```python
@ -457,7 +470,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
!!! note !!! note
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
!!! note !!! note
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
@ -552,7 +565,7 @@ If your model is not in the above list, we will try to automatically convert the
!!! important !!! important
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
#### Token Classification #### Token Classification

View File

@ -42,7 +42,7 @@ python client.py
### Server Configuration ### Server Configuration
The key parameters for chunked processing are in the `--override-pooler-config`: The key parameters for chunked processing are in the `--pooler-config`:
```json ```json
{ {

View File

@ -13,7 +13,7 @@ Prerequisites:
# MEAN pooling (processes all chunks, recommended for complete coverage) # MEAN pooling (processes all chunks, recommended for complete coverage)
vllm serve intfloat/multilingual-e5-large \ vllm serve intfloat/multilingual-e5-large \
--override-pooler-config \ --pooler-config \
'{"pooling_type": "MEAN", "normalize": true, ' \ '{"pooling_type": "MEAN", "normalize": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \ '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
--served-model-name multilingual-e5-large \ --served-model-name multilingual-e5-large \
@ -23,7 +23,7 @@ Prerequisites:
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
vllm serve BAAI/bge-large-en-v1.5 \ vllm serve BAAI/bge-large-en-v1.5 \
--override-pooler-config \ --pooler-config \
'{"pooling_type": "CLS", "normalize": true, ' \ '{"pooling_type": "CLS", "normalize": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \ '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
--served-model-name bge-large-en-v1.5 \ --served-model-name bge-large-en-v1.5 \

View File

@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab
vllm serve "$MODEL_NAME" \ vllm serve "$MODEL_NAME" \
--tensor-parallel-size "$GPU_COUNT" \ --tensor-parallel-size "$GPU_COUNT" \
--enforce-eager \ --enforce-eager \
--override-pooler-config "$POOLER_CONFIG" \ --pooler-config "$POOLER_CONFIG" \
--served-model-name ${MODEL_CODE} \ --served-model-name ${MODEL_CODE} \
--api-key "$API_KEY" \ --api-key "$API_KEY" \
--trust-remote-code \ --trust-remote-code \

View File

@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import dataclasses
import json import json
import logging import logging
import os import os
@ -327,12 +325,7 @@ def main():
if args.command == "serialize": if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in engine_args = EngineArgs.from_cli_args(args)
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(
argparse.Namespace(**eng_args_dict)
)
input_dir = tensorizer_dir.rstrip('/') input_dir = tensorizer_dir.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex suffix = args.suffix if args.suffix else uuid.uuid4().hex

View File

@ -39,7 +39,8 @@ from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype from vllm.config.model import (ConvertOption, RunnerOption,
_get_and_verify_dtype)
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment, init_distributed_environment,
@ -244,39 +245,6 @@ class DecoderPromptType(Enum):
EMPTY_STR = 3 EMPTY_STR = 3
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
'''
Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt,
decoder prompt) tuple.
Returns:
* Encoder prompt list
* Decoder prompt list (reverse of encoder prompt list)
'''
encoder_prompts = []
for filename in _TEST_PROMPTS:
encoder_prompts += _read_prompts(filename)
custom_decoder_prompts = encoder_prompts[::-1]
empty_str_decoder_prompts = [""] * len(encoder_prompts)
none_decoder_prompts = [None] * len(encoder_prompts)
# NONE decoder prompt type
return {
DecoderPromptType.NONE:
zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
DecoderPromptType.EMPTY_STR:
zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
DecoderPromptType.CUSTOM:
zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
}
@pytest.fixture @pytest.fixture
def example_long_prompts() -> list[str]: def example_long_prompts() -> list[str]:
prompts = [] prompts = []
@ -690,68 +658,6 @@ class HfRunner:
return [(output_ids, output_str, output_logprobs) return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs] for output_ids, output_str, output_logprobs in outputs]
def generate_encoder_decoder_greedy_logprobs_limit(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: Optional[int],
images: Optional[PromptImageInput] = None,
**kwargs: Any,
) -> list[TokensTextLogprobs]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs: list[list[dict[int, float]]] = []
all_output_ids: list[list[int]] = []
all_output_strs: list[str] = []
for i, (encoder_prompt, decoder_prompt) in enumerate(
to_enc_dec_tuple_list(encoder_decoder_prompts)):
processor_kwargs: dict[str, Any] = {
"text": encoder_prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
encoder_inputs = self.processor(**processor_kwargs)
encoder_inputs = self.wrap_device(encoder_inputs)
if decoder_prompt is None:
decoder_input_ids = None
else:
decoder_inputs = self.tokenizer(decoder_prompt,
return_tensors="pt")
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
output = self.model.generate(
decoder_input_ids=decoder_input_ids,
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**encoder_inputs,
**kwargs,
)
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
def encode(self, prompts: list[str], *args, def encode(self, prompts: list[str], *args,
**kwargs) -> list[list[torch.Tensor]]: **kwargs) -> list[list[torch.Tensor]]:
return self.model.encode(prompts, *args, **kwargs) return self.model.encode(prompts, *args, **kwargs)
@ -940,26 +846,6 @@ class VllmRunner:
if sampling_params.prompt_logprobs is None else if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs) toks_str_logsprobs_prompt_logprobs)
def generate_encoder_decoder_w_logprobs(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
sampling_params: SamplingParams,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
'''
Logprobs generation for vLLM encoder/decoder models
'''
assert sampling_params.logprobs is not None
req_outputs = self.llm.generate(encoder_decoder_prompts,
sampling_params=sampling_params)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)
def generate_greedy( def generate_greedy(
self, self,
prompts: Union[list[str], list[torch.Tensor]], prompts: Union[list[str], list[torch.Tensor]],
@ -1037,29 +923,6 @@ class VllmRunner:
return perplexities return perplexities
def generate_encoder_decoder_greedy_logprobs(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None,
skip_special_tokens: bool = True,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
logprobs=num_logprobs,
prompt_logprobs=(num_prompt_logprobs),
skip_special_tokens=skip_special_tokens,
)
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
return self.generate_encoder_decoder_w_logprobs(
encoder_decoder_prompts, greedy_logprobs_params)
def generate_beam_search( def generate_beam_search(
self, self,
prompts: list[str], prompts: list[str],

View File

@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
import pytest import pytest
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config from vllm.transformers_utils.config import get_config

View File

@ -515,6 +515,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
model=model_name, model=model_name,
input="What's the weather like in Paris today?", input="What's the weather like in Paris today?",
tools=tools, tools=tools,
temperature=0.0,
) )
assert response is not None assert response is not None
assert response.status == "completed" assert response.status == "completed"

View File

@ -216,7 +216,7 @@ def server_with_chunked_processing():
"--enforce-eager", "--enforce-eager",
"--max-model-len", "--max-model-len",
"512", # Set smaller max_model_len to trigger chunking mechanism "512", # Set smaller max_model_len to trigger chunking mechanism
'--override-pooler-config', '--pooler-config',
('{"pooling_type": "MEAN", "normalize": true, ' ('{"pooling_type": "MEAN", "normalize": true, '
'"enable_chunked_processing": true, "max_embed_len": 10000}'), '"enable_chunked_processing": true, "max_embed_len": 10000}'),
"--gpu-memory-utilization", "--gpu-memory-utilization",

View File

@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
global WORKER_RUNTIME_SECONDS global WORKER_RUNTIME_SECONDS
WORKER_RUNTIME_SECONDS = 0.5 WORKER_RUNTIME_SECONDS = 0.5
# Copy the args to avoid mutating the # Copy the args to avoid mutating them
args = api_server_args.copy() args = api_server_args.copy()
if not with_stats_update: if not with_stats_update:

View File

@ -1,9 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import NamedTuple
import pytest import pytest
import torch import torch
from packaging.version import Version
from transformers import AutoConfig from transformers import AutoConfig
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -15,6 +18,7 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
head_size: int, max_position_embeddings: int, head_size: int, max_position_embeddings: int,
dtype: torch.dtype, device: torch.device): dtype: torch.dtype, device: torch.device):
"""Generate test data for given configuration.""" """Generate test data for given configuration."""
current_platform.seed_everything(42)
# Create 2D positions (3, num_tokens) for multimodal case # Create 2D positions (3, num_tokens) for multimodal case
positions = torch.randint(0, positions = torch.randint(0,
max_position_embeddings // 4, (3, num_tokens), max_position_embeddings // 4, (3, num_tokens),
@ -33,22 +37,37 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
return positions, query, key return positions, query, key
def unroll_model_tp_dict(model_tp_dict): class MRoPETestInfo(NamedTuple):
return [(model_name, tp_size) model_name: str
for model_name, tp_sizes in model_tp_dict.items() # https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
for tp_size in tp_sizes] atol: float = 1e-2
rtol: float = 1.6e-2
marks: list[pytest.MarkDecorator] = []
model_tp_dict = { TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
"Qwen/Qwen2-VL-72B-Instruct": [1, 2],
"Qwen/Qwen2.5-VL-72B-Instruct": [1, 2],
"zai-org/GLM-4.1V-9B-Thinking": [1, 2],
}
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 MODELS_TO_TEST = [
dtype_atol_rtol_list = [ MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"),
[torch.bfloat16, 1e-2, 1.6e-2], MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"),
MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"),
MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"),
MRoPETestInfo(
model_name="Qwen/Qwen3-VL-4B-Instruct",
marks=[
pytest.mark.skipif(
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
reason="Qwen3-VL only available after Transformers v4.57",
)
]),
MRoPETestInfo(
model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
marks=[
pytest.mark.skipif(
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
reason="Qwen3-VL only available after Transformers v4.57",
)
]),
] ]
num_tokens_list = [11, 8192] num_tokens_list = [11, 8192]
@ -56,20 +75,29 @@ num_tokens_list = [11, 8192]
@pytest.mark.skipif(not current_platform.is_cuda_alike(), @pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Skipping CUDA/ROCm only tests.") reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize("model_name, tp_size", @pytest.mark.parametrize("model_info, model_name", [
unroll_model_tp_dict(model_tp_dict)) pytest.param(test_config, test_config.model_name, marks=test_config.marks)
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) for test_config in MODELS_TO_TEST
])
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list) @pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
dtype: torch.dtype, num_tokens: int):
atol = model_info.atol
rtol = model_info.rtol
config = AutoConfig.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name)
config = config.get_text_config()
# get the model config # get the model config
total_num_kv_heads = config.num_key_value_heads total_num_kv_heads = config.num_key_value_heads
total_num_heads = config.num_attention_heads total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size) num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = config.hidden_size // total_num_heads head_dim = (config.head_dim if hasattr(config, "head_dim") else
config.hidden_size // total_num_heads)
is_neox_style = True is_neox_style = True
rope_theta = config.rope_theta rope_theta = config.rope_theta
@ -111,24 +139,30 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
@pytest.mark.skipif(not current_platform.is_cuda_alike(), @pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Skipping CUDA/ROCm only tests.") reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize( @pytest.mark.parametrize("model_info, model_name", [
"model_name, tp_size", pytest.param(test_config, test_config.model_name, marks=test_config.marks)
unroll_model_tp_dict({ for test_config in MODELS_TO_TEST
"Qwen/Qwen2-VL-7B-Instruct": [1, 2], ])
"zai-org/GLM-4.1V-9B-Thinking": [1, 2] @pytest.mark.parametrize("tp_size", [1, 2])
})) @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) @pytest.mark.parametrize("num_tokens", num_tokens_list)
@pytest.mark.parametrize("num_tokens", [4]) def test_mrope_torch_compile_tracing(model_name: str,
def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, model_info: MRoPETestInfo, tp_size: int,
num_tokens): dtype: torch.dtype, num_tokens: int):
atol = model_info.atol
rtol = model_info.rtol
config = AutoConfig.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name)
config = config.get_text_config()
# get the model config # get the model config
total_num_kv_heads = config.num_key_value_heads total_num_kv_heads = config.num_key_value_heads
total_num_heads = config.num_attention_heads total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size) num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = config.hidden_size // total_num_heads head_dim = (config.head_dim if hasattr(config, "head_dim") else
config.hidden_size // total_num_heads)
is_neox_style = True is_neox_style = True
rope_theta = config.rope_theta rope_theta = config.rope_theta
max_position = config.max_position_embeddings max_position = config.max_position_embeddings

View File

@ -8,11 +8,12 @@ import torch
from safetensors.torch import load_file from safetensors.torch import load_file
from torch import nn from torch import nn
from vllm.config import ModelConfig, VllmConfig
from vllm.config.lora import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA, from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA) RowParallelLinearWithLoRA)
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager) LRUCacheLoRAModelManager)
from vllm.lora.peft_helper import PEFTHelper from vllm.lora.peft_helper import PEFTHelper
@ -435,10 +436,19 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
target_modules=["layer1.dense1", "dense2"], target_modules=["layer1.dense1", "dense2"],
lora_dtype=DEFAULT_DTYPE, lora_dtype=DEFAULT_DTYPE,
) )
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(model_config=model_config,
lora_config=lora_config)
vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2
worker_adapter_manager = LRUCacheWorkerLoRAManager( worker_adapter_manager = LRUCacheWorkerLoRAManager(
4, 2, vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size,
lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.max_num_seqs = 4
worker_adapter_manager.max_num_batched_tokens = 2
worker_adapter_manager.create_lora_manager(dummy_model) worker_adapter_manager.create_lora_manager(dummy_model)
mapping = LoRAMapping([], []) mapping = LoRAMapping([], [])
@ -517,10 +527,20 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
max_cpu_loras=4, max_cpu_loras=4,
max_loras=4, max_loras=4,
lora_dtype=DEFAULT_DTYPE) lora_dtype=DEFAULT_DTYPE)
worker_adapter_manager = WorkerLoRAManager(
4, 2, dummy_model_gate_up.unpadded_vocab_size - model_config = ModelConfig(max_model_len=16)
lora_config.lora_extra_vocab_size, lora_config, device, vllm_config = VllmConfig(model_config=model_config,
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) lora_config=lora_config)
vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2
worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
EMBEDDING_MODULES,
EMBEDDING_PADDING_MODULES)
worker_adapter_manager.vocab_size = (
dummy_model_gate_up.unpadded_vocab_size -
lora_config.lora_extra_vocab_size)
worker_adapter_manager.create_lora_manager(dummy_model_gate_up) worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
dummy_lora_files = f"{tmp_path}/lora_adapter" dummy_lora_files = f"{tmp_path}/lora_adapter"

View File

@ -9,7 +9,7 @@ from typing import Optional, Union
import torch import torch
from safetensors.torch import save_file from safetensors.torch import save_file
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
class DummyLoRAManager: class DummyLoRAManager:

View File

@ -58,7 +58,7 @@ def test_models(
vllm_extra_kwargs = {} vllm_extra_kwargs = {}
if model == "ssmits/Qwen2-7B-Instruct-embed-base": if model == "ssmits/Qwen2-7B-Instruct-embed-base":
vllm_extra_kwargs["override_pooler_config"] = \ vllm_extra_kwargs["pooler_config"] = \
PoolerConfig(pooling_type="MEAN", normalize=False) PoolerConfig(pooling_type="MEAN", normalize=False)
max_model_len: Optional[int] = 512 max_model_len: Optional[int] = 512

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.config.pooler import PoolerConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -99,7 +100,7 @@ def test_gemma_multimodal(
convert="classify", convert="classify",
load_format="auto", load_format="auto",
hf_overrides=update_config, hf_overrides=update_config,
override_pooler_config={"pooling_type": "LAST"}, pooler_config=PoolerConfig(pooling_type="LAST"),
max_model_len=512, max_model_len=512,
enforce_eager=True, enforce_eager=True,
tensor_parallel_size=1, tensor_parallel_size=1,

View File

@ -24,18 +24,18 @@ def test_classify_models_using_activation(
dtype: str, dtype: str,
) -> None: ) -> None:
with vllm_runner(model, with vllm_runner(
max_model_len=512, model,
dtype=dtype, max_model_len=512,
override_pooler_config=PoolerConfig( dtype=dtype,
activation=False)) as vllm_model: pooler_config=PoolerConfig(activation=False)) as vllm_model:
wo_activation_out = vllm_model.classify(example_prompts) wo_activation_out = vllm_model.classify(example_prompts)
with vllm_runner(model, with vllm_runner(
max_model_len=512, model,
dtype=dtype, max_model_len=512,
override_pooler_config=PoolerConfig( dtype=dtype,
activation=True)) as vllm_model: pooler_config=PoolerConfig(activation=True)) as vllm_model:
w_activation_out = vllm_model.classify(example_prompts) w_activation_out = vllm_model.classify(example_prompts)
for wo_activation, w_activation in zip(wo_activation_out, for wo_activation, w_activation in zip(wo_activation_out,
@ -43,9 +43,8 @@ def test_classify_models_using_activation(
wo_activation = torch.tensor(wo_activation) wo_activation = torch.tensor(wo_activation)
w_activation = torch.tensor(w_activation) w_activation = torch.tensor(w_activation)
assert not torch.allclose( assert not torch.allclose(wo_activation, w_activation,
wo_activation, w_activation, atol=1e-2), "pooler_config is not working"
atol=1e-2), "override_pooler_config is not working"
assert torch.allclose(softmax(wo_activation), w_activation, assert torch.allclose(softmax(wo_activation), w_activation,
1e-3 if dtype == "float" else 1e-2) 1e-3 if dtype == "float" else 1e-2)
@ -65,23 +64,22 @@ def test_embed_models_using_normalize(
dtype: str, dtype: str,
) -> None: ) -> None:
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
override_pooler_config=PoolerConfig(
normalize=False)) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
with vllm_runner( with vllm_runner(
model, model,
max_model_len=512, max_model_len=512,
dtype=dtype, dtype=dtype,
override_pooler_config=PoolerConfig(normalize=True)) as vllm_model: pooler_config=PoolerConfig(normalize=False)) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
w_normalize = torch.tensor(vllm_model.embed(example_prompts)) w_normalize = torch.tensor(vllm_model.embed(example_prompts))
assert not torch.allclose( assert not torch.allclose(
wo_normalize, w_normalize, wo_normalize, w_normalize,
atol=1e-2), "override_pooler_config normalize is not working" atol=1e-2), "pooler_config normalize is not working"
assert torch.allclose( assert torch.allclose(
F.normalize(wo_normalize, p=2, dim=-1), w_normalize, F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
atol=1e-2), "w_normal should be close to normal(wo_normal)." atol=1e-2), "w_normal should be close to normal(wo_normal)."
@ -102,18 +100,16 @@ def test_reward_models_using_softmax(
dtype: str, dtype: str,
) -> None: ) -> None:
with vllm_runner( with vllm_runner(model,
model, max_model_len=1024,
max_model_len=1024, dtype=dtype,
dtype=dtype, pooler_config=PoolerConfig(softmax=False)) as vllm_model:
override_pooler_config=PoolerConfig(softmax=False)) as vllm_model:
wo_softmax = vllm_model.encode(example_prompts) wo_softmax = vllm_model.encode(example_prompts)
with vllm_runner( with vllm_runner(model,
model, max_model_len=1024,
max_model_len=1024, dtype=dtype,
dtype=dtype, pooler_config=PoolerConfig(softmax=True)) as vllm_model:
override_pooler_config=PoolerConfig(softmax=True)) as vllm_model:
w_softmax = vllm_model.encode(example_prompts) w_softmax = vllm_model.encode(example_prompts)
for wo, w in zip(wo_softmax, w_softmax): for wo, w in zip(wo_softmax, w_softmax):
@ -121,7 +117,7 @@ def test_reward_models_using_softmax(
w = torch.tensor(w) w = torch.tensor(w)
assert not torch.allclose( assert not torch.allclose(
wo, w, atol=1e-2), "override_pooler_config softmax is not working" wo, w, atol=1e-2), "pooler_config softmax is not working"
assert torch.allclose( assert torch.allclose(
softmax(wo), w, softmax(wo), w,
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)." atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."

View File

@ -7,7 +7,6 @@ from unittest.mock import patch
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.config import ModelImpl
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
@ -111,8 +110,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
# these tests seem to produce leftover memory # these tests seem to produce leftover memory
gpu_memory_utilization=0.80, gpu_memory_utilization=0.80,
load_format="dummy", load_format="dummy",
model_impl=ModelImpl.TRANSFORMERS model_impl="transformers"
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM, if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
hf_overrides=hf_overrides_fn, hf_overrides=hf_overrides_fn,
max_num_seqs=model_info.max_num_seqs) max_num_seqs=model_info.max_num_seqs)

View File

@ -9,7 +9,7 @@ from vllm.platforms import current_platform
from ..conftest import HfRunner, VllmRunner from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test, prep_prompts from ..utils import multi_gpu_test, prep_prompts
from .utils import check_logprobs_close from .utils import check_embeddings_close, check_logprobs_close
def check_implementation( def check_implementation(
@ -165,6 +165,40 @@ def test_embed_loading(vllm_runner, model):
assert model_config.using_transformers_backend() assert model_config.using_transformers_backend()
@pytest.mark.parametrize(
"model",
[
# Encoder model
"BAAI/bge-base-en-v1.5",
])
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
import transformers
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("4.57.0.dev0")
if installed < required:
pytest.skip("Encoder models with the Transformers backend require "
f"transformers>={required}, but got {installed}")
with vllm_runner(model, max_model_len=512,
model_impl="transformers") as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
vllm_outputs = vllm_model.embed(example_prompts)
with hf_runner(model, is_sentence_transformer=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", "model",
["jason9693/Qwen2.5-1.5B-apeach"], ["jason9693/Qwen2.5-1.5B-apeach"],

View File

@ -207,25 +207,19 @@ def test_get_pooling_config():
model_id = "sentence-transformers/all-MiniLM-L12-v2" model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_config = ModelConfig(model_id) model_config = ModelConfig(model_id)
pooling_config = model_config._init_pooler_config() assert model_config.pooler_config is not None
assert pooling_config is not None assert model_config.pooler_config.normalize
assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
assert pooling_config.normalize
assert pooling_config.pooling_type == PoolingType.MEAN.name
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config_from_args(): def test_get_pooling_config_from_args():
model_id = "sentence-transformers/all-MiniLM-L12-v2" model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_config = ModelConfig(model_id) pooler_config = PoolerConfig(pooling_type="CLS", normalize=True)
model_config = ModelConfig(model_id, pooler_config=pooler_config)
override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True) assert asdict(model_config.pooler_config) == asdict(pooler_config)
model_config.override_pooler_config = override_pooler_config
pooling_config = model_config._init_pooler_config()
assert pooling_config is not None
assert asdict(pooling_config) == asdict(override_pooler_config)
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -513,27 +513,27 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn):
assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None)) assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None))
def _stats(requests: int, queries: int, hits: int) -> PrefixCacheStats:
return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
def test_metrics(): def test_metrics():
""" """
Test the prefix caching metrics. Test the prefix caching metrics.
""" """
def stats(requests, queries, hits):
return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
metrics = PrefixCachingMetrics(max_recent_requests=5) metrics = PrefixCachingMetrics(max_recent_requests=5)
assert metrics.hit_rate == 0.0 assert metrics.hit_rate == 0.0
metrics.observe(stats(1, 20, 9)) metrics.observe(_stats(1, 20, 9))
# 9 / 20 = 0.45 # 9 / 20 = 0.45
assert metrics.hit_rate == 0.45 assert metrics.hit_rate == 0.45
metrics.observe(stats(4, 80, 16)) metrics.observe(_stats(4, 80, 16))
# 25 / 100 = 0.25 # 25 / 100 = 0.25
assert metrics.hit_rate == 0.25 assert metrics.hit_rate == 0.25
metrics.observe(stats(1, 10, 2)) metrics.observe(_stats(1, 10, 2))
# Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
assert metrics.aggregated_requests == 5 assert metrics.aggregated_requests == 5
@ -549,6 +549,38 @@ def test_metrics():
assert not metrics.query_queue assert not metrics.query_queue
def test_metrics_empty_stats():
"""
Test the prefix caching metrics with empty stats.
"""
metrics = PrefixCachingMetrics(max_recent_requests=5)
metrics.observe(_stats(0, 0, 0))
metrics.observe(_stats(1, 20, 9))
metrics.observe(_stats(0, 0, 0))
metrics.observe(_stats(4, 80, 16))
metrics.observe(_stats(0, 0, 0))
metrics.observe(_stats(1, 10, 2))
# Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
assert metrics.aggregated_requests == 5
assert metrics.aggregated_query_total == 90
assert metrics.aggregated_query_hit == 18
assert metrics.hit_rate == 0.2
# Only the latest added stats preserved 10 / 20 = 0.5
metrics.observe(_stats(11, 20, 10))
assert metrics.aggregated_requests == 11
assert metrics.aggregated_query_total == 20
assert metrics.aggregated_query_hit == 10
assert metrics.hit_rate == 0.5
# Only the latest added stats preserved 30 / 40 = 0.75
metrics.observe(_stats(22, 40, 30))
assert metrics.aggregated_requests == 22
assert metrics.aggregated_query_total == 40
assert metrics.aggregated_query_hit == 30
assert metrics.hit_rate == 0.75
def test_get_kv_cache_configs_multiple_workers(): def test_get_kv_cache_configs_multiple_workers():
model_config = ModelConfig(max_model_len=16) model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(model_config=model_config) vllm_config = VllmConfig(model_config=model_config)

View File

@ -18,12 +18,18 @@ import torch
from vllm import LLM from vllm import LLM
from vllm.config import KVTransferConfig from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
MultiKVConnectorStats)
from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
NixlConnectorWorker) NixlConnectorWorker, NixlKVConnectorStats)
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
from .utils import create_request, create_scheduler, create_vllm_config from .utils import create_request, create_scheduler, create_vllm_config
@ -475,6 +481,209 @@ class TestNixlHandshake:
# NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which
# we put here is important. First run ray, it will clean up the resources, then # we put here is important. First run ray, it will clean up the resources, then
# the rest of the tests. # the rest of the tests.
@patch(
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
FakeNixlWrapper)
def test_kv_connector_stats(dist_init):
"""Test that KV transfer stats are properly recorded and retrieved."""
vllm_config = create_vllm_config()
# Test worker role in decode server.
connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
connector.connector_worker = FakeNixlConnectorWorker(vllm_config,
connector.engine_id,
hand_shake_latency=0)
# Verify that xfer_stats starts empty
initial_stats = connector.get_kv_connector_stats()
assert initial_stats is None
# Create transfer metadata
request_id = "test_req_for_stats"
metadata = NixlConnectorMetadata()
metadata.add_new_req(request_id=request_id,
local_block_ids=[1, 2, 3],
kv_transfer_params={
"remote_block_ids": [4, 5, 6],
"remote_engine_id":
FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
"remote_host": "localhost",
"remote_port": 1234,
"remote_tp_size": 1,
})
connector.bind_connector_metadata(metadata)
# Start the transfer
dummy_ctx = ForwardContext(
no_compile_layers={},
attn_metadata={},
virtual_engine=0,
)
connector.start_load_kv(dummy_ctx)
# Verify stats are recorded after transfer is complete
max_iterations = 2
# Clear metadata before start_load_kv to prevent reprocessing same request
connector.bind_connector_metadata(NixlConnectorMetadata())
for _ in range(max_iterations):
# Need to call start_load_kv to process completed handshakes
connector.start_load_kv(dummy_ctx)
_, done_recving = connector.get_finished(finished_req_ids=set())
if len(done_recving) > 0 and request_id in done_recving:
break
time.sleep(
0.1) # Small delay to allow background handshake to complete
else:
assert "Transfer did not complete within expected iterations"
# Now check that stats were recorded
stats_after_transfer = connector.get_kv_connector_stats()
assert isinstance(stats_after_transfer, NixlKVConnectorStats)
# Verify stats values are recorded
assert not stats_after_transfer.is_empty()
assert stats_after_transfer.data["num_successful_transfers"] == 1
# Verify stats are reset after retrieval
stats_after_reset = connector.get_kv_connector_stats()
assert stats_after_reset is None
def test_kv_connector_stats_aggregation():
"""
Test KV transfer stats aggregation across TP ranks using
KVOutputAggregator (used by MultiprocExecutor).
"""
# Create KVOutputAggregator for 3 workers (simulating TP=3), same thing
# done in MultiprocExecutor.execute_model
aggregator = KVOutputAggregator(world_size=3)
# Create stats for multiple workers with different transfer patterns
worker1_stats = NixlKVConnectorStats()
worker2_stats = NixlKVConnectorStats()
worker3_stats = NixlKVConnectorStats()
# Record different transfers on each worker
# Worker 1: 2 transfers
worker1_stats.record_transfer()
worker1_stats.record_transfer()
# Worker 2: 1 transfer
worker2_stats.record_transfer()
# Worker 3: 3 transfers
worker3_stats.record_transfer()
worker3_stats.record_transfer()
worker3_stats.record_transfer()
# Create ModelRunnerOutput instances for each worker
worker_outputs = []
for i, worker_stats in enumerate(
[worker1_stats, worker2_stats, worker3_stats]):
output = ModelRunnerOutput(
req_ids=[f"req_{i}"],
req_id_to_index={f"req_{i}": 0},
sampled_token_ids=[[123]], # dummy token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[None],
kv_connector_output=KVConnectorOutput(
finished_sending=set([f"req_{i}_send"])
if i < 2 else None, # Workers 0,1 finished sending
finished_recving=set([f"req_{i}_recv"])
if i > 0 else None, # Workers 1,2 finished receiving
kv_connector_stats=worker_stats,
))
worker_outputs.append(output)
# Use the real aggregation mechanism (like MultiprocExecutor.execute_model)
aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
kv_connector_stats = \
aggregated_output.kv_connector_output.kv_connector_stats
assert isinstance(kv_connector_stats, NixlKVConnectorStats)
# Number of total transfers across all workers.
assert kv_connector_stats.data["num_successful_transfers"] == 6
def test_multi_kv_connector_stats_aggregation():
"""
Test MultiKVConnectorStats aggregation across TP ranks using
KVOutputAggregator (used by MultiprocExecutor).
"""
aggregator = KVOutputAggregator(world_size=3)
from dataclasses import dataclass
@dataclass
class FooKVConnectorStats(KVConnectorStats):
def reset(self):
self.data = {"num_foo_transfers": 0}
def record_transfer(self):
if "num_foo_transfers" not in self.data:
self.data["num_foo_transfers"] = 0
self.data["num_foo_transfers"] += 1
def is_empty(self) -> bool:
return self.data["num_foo_transfers"] == 0
def aggregate(self,
other: "FooKVConnectorStats") -> "FooKVConnectorStats":
if not other.is_empty():
self.data["num_foo_transfers"] += other.data[
"num_foo_transfers"]
return self
def make_multi_stats(nixl_count: int,
foo_count: int) -> MultiKVConnectorStats:
data: dict[str, KVConnectorStats] = {}
if nixl_count > 0:
nixl_stats = NixlKVConnectorStats()
for _ in range(nixl_count):
nixl_stats.record_transfer()
data["NixlConnector"] = nixl_stats
if foo_count > 0:
foo_stats = FooKVConnectorStats()
for _ in range(foo_count):
foo_stats.record_transfer()
data["FooConnector"] = foo_stats
return MultiKVConnectorStats(data=data)
# Create heterogeneous stats across 3 workers
worker_patterns = [(2, 1), (3, 0), (0, 5)] # (Nixl, Foo)
worker_outputs: list[ModelRunnerOutput] = []
for i, (nixl, foo) in enumerate(worker_patterns):
stats = make_multi_stats(nixl, foo)
output = ModelRunnerOutput(
req_ids=[f"req_{i}"],
req_id_to_index={f"req_{i}": 0},
sampled_token_ids=[[123]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[None],
kv_connector_output=KVConnectorOutput(
finished_sending=set([f"req_{i}_send"]) if i < 2 else None,
finished_recving=set([f"req_{i}_recv"]) if i > 0 else None,
kv_connector_stats=stats,
),
)
worker_outputs.append(output)
aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
kv_connector_stats = \
aggregated_output.kv_connector_output.kv_connector_stats
assert isinstance(kv_connector_stats, MultiKVConnectorStats)
# Validate per-connector totals across workers
assert kv_connector_stats["NixlConnector"].data[
"num_successful_transfers"] == 5
assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6
@pytest.mark.parametrize("distributed_executor_backend", ["ray", None]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
@patch( @patch(
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",

View File

@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time
import pytest
import torch
from vllm.platforms import current_platform
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.attention.backends.flashinfer import FlashInferBackend
from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
NUM_GPU_BLOCKS = [64]
NUM_CPU_BLOCKS = [256]
GPU_BLOCK_SIZES = [16]
GPU_BLOCKS_PER_CPU_BLOCK = [1, 3]
HEAD_SIZES = [64]
NUM_HEADS = [8]
NUM_LAYERS = [4]
DTYPES = [torch.bfloat16]
SEEDS = [0]
CUDA_DEVICES = ['cuda:0']
NUM_MAPPINGS = [3]
@pytest.mark.parametrize("gpu_to_cpu", [True, False])
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES)
@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK)
@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS)
@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_transfer(
gpu_to_cpu: bool,
num_mappings: int,
head_size: int,
num_heads: int,
gpu_block_size: int,
gpu_blocks_per_cpu_block: int,
num_gpu_blocks: int,
num_cpu_blocks: int,
num_layers: int,
dtype: torch.dtype,
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
# create per-layer GPU KV caches
attn_backends_list = [
FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend
]
gpu_caches = {}
attn_backends = {}
for i in range(num_layers):
layer_name = f'layer {i}'
attn_backend = attn_backends_list[i % len(attn_backends_list)]
attn_backends[layer_name] = attn_backend
gpu_cache_shape = attn_backend.get_kv_cache_shape(
num_gpu_blocks, gpu_block_size, num_heads, head_size)
gpu_caches[layer_name] = torch.rand(gpu_cache_shape,
dtype=dtype,
device=device)
# create handler
cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
handler = CpuGpuOffloadingHandler(attn_backends=attn_backends,
gpu_block_size=gpu_block_size,
cpu_block_size=cpu_block_size,
num_cpu_blocks=num_cpu_blocks,
gpu_caches=gpu_caches)
# select block mappings
gpu_blocks = random.sample(range(num_gpu_blocks),
num_mappings * gpu_blocks_per_cpu_block)
cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)
# convert cpu blocks to gpu block size
cpu_blocks_in_gpu_block_size = []
for cpu_block in cpu_blocks:
base_block_id = cpu_block * gpu_blocks_per_cpu_block
for i in range(gpu_blocks_per_cpu_block):
cpu_blocks_in_gpu_block_size.append(i + base_block_id)
# maybe skip a GPU block to test writing to the middle of a CPU block
if gpu_to_cpu:
gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1:]
cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[
gpu_blocks_per_cpu_block - 1:]
# set transfer direction
if gpu_to_cpu:
src_kv_caches = handler.gpu_tensors
dst_kv_caches = handler.cpu_tensors
src_spec_class = GPULoadStoreSpec
dst_spec_class = CPULoadStoreSpec
src_blocks = gpu_blocks
dst_blocks = cpu_blocks
src_blocks_in_gpu_block_size = gpu_blocks
dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block
else:
src_kv_caches = handler.cpu_tensors
dst_kv_caches = handler.gpu_tensors
src_spec_class = CPULoadStoreSpec
dst_spec_class = GPULoadStoreSpec
src_blocks = cpu_blocks
dst_blocks = gpu_blocks
src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
dst_blocks_in_gpu_block_size = gpu_blocks
dst_size_in_gpu_blocks = num_gpu_blocks
# build dst -> src mapping
dst_to_src = {}
for src_block, dst_block in zip(src_blocks_in_gpu_block_size,
dst_blocks_in_gpu_block_size):
dst_to_src[dst_block] = src_block
# build transfer specs
src_spec = src_spec_class(src_blocks)
dst_spec = dst_spec_class(dst_blocks)
# clone src and dst tensors before transfer
orig_src_caches = [x.clone() for x in src_kv_caches]
orig_dst_caches = [x.clone() for x in dst_kv_caches]
# call transfer function
assert handler.transfer_async(1, (src_spec, dst_spec))
assert set(handler.transfer_events.keys()) == {1}
# wait for transfer to complete
end_time = time.time() + 10
while time.time() < end_time:
finished = handler.get_finished()
if finished:
assert finished == [(1, True)]
break
time.sleep(0.1)
# verify src tensors did not change
for orig_tensor, tensor in zip(orig_src_caches, src_kv_caches):
assert torch.equal(orig_tensor, tensor)
# verify dst tensors
for dst_block in range(dst_size_in_gpu_blocks):
src_block_candidate = dst_to_src.get(dst_block)
for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
src_kv_caches, dst_kv_caches, orig_dst_caches,
handler.kv_dim_before_num_blocks):
if kv_dim:
# iterate over key, value
for i in range(2):
if src_block_candidate is not None:
expected_value = src_cache[i][src_block_candidate]
else:
expected_value = orig_dst_cache[i][dst_block]
torch.testing.assert_close(dst_cache[i][dst_block].cpu(),
expected_value.cpu())
else:
if src_block_candidate is not None:
expected_value = src_cache[src_block_candidate]
else:
expected_value = orig_dst_cache[dst_block]
torch.testing.assert_close(dst_cache[dst_block].cpu(),
expected_value.cpu())

View File

@ -3,6 +3,7 @@
import itertools import itertools
from collections.abc import Generator from collections.abc import Generator
from typing import get_args
import pytest import pytest
import torch import torch
@ -464,7 +465,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
assert len(prompt_logprob) == vocab_size assert len(prompt_logprob) == vocab_size
@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
def test_logprobs_mode(logprobs_mode: LogprobsMode, def test_logprobs_mode(logprobs_mode: LogprobsMode,
monkeypatch: pytest.MonkeyPatch): monkeypatch: pytest.MonkeyPatch):
"""Test with LLM engine with different logprobs_mode. """Test with LLM engine with different logprobs_mode.
@ -493,14 +494,12 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode,
for logprobs in output.logprobs: for logprobs in output.logprobs:
for token_id in logprobs: for token_id in logprobs:
logprob = logprobs[token_id] logprob = logprobs[token_id]
if logprobs_mode in (LogprobsMode.RAW_LOGPROBS, if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
LogprobsMode.PROCESSED_LOGPROBS):
assert logprob.logprob <= 0 assert logprob.logprob <= 0
if logprob.logprob > 0: if logprob.logprob > 0:
positive_values = positive_values + 1 positive_values = positive_values + 1
total_token_with_logprobs = total_token_with_logprobs + 1 total_token_with_logprobs = total_token_with_logprobs + 1
assert total_token_with_logprobs >= len(results[0].outputs) assert total_token_with_logprobs >= len(results[0].outputs)
if logprobs_mode in (LogprobsMode.RAW_LOGITS, if logprobs_mode in ("raw_logits", "processed_logits"):
LogprobsMode.PROCESSED_LOGITS):
assert positive_values > 0 assert positive_values > 0
del llm del llm

View File

@ -9,6 +9,7 @@ from contextlib import AsyncExitStack
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -70,6 +71,8 @@ class ExternalLBServerManager:
sargs, sargs,
auto_port=False, auto_port=False,
env_dict={ env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var: current_platform.device_control_env_var:
",".join( ",".join(
str( str(
@ -127,11 +130,19 @@ def default_server_args():
@pytest.fixture(scope="module", params=[1, 4]) @pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args): def server_manager(request, default_server_args):
api_server_count = request.param api_server_count = request.param
with ExternalLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE,
default_server_args) as server_list: api_server_count,
yield server_list default_server_args)
with server_manager:
yield server_manager
@pytest.fixture
def servers(server_manager):
return server_manager.servers
@pytest_asyncio.fixture @pytest_asyncio.fixture
@ -144,6 +155,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
] ]
def _get_parallel_config(server: RemoteOpenAIServer):
response = requests.get(server.url_for("server_info?config_format=json"))
response.raise_for_status()
vllm_config = response.json()["vllm_config"]
return vllm_config["parallel_config"]
def test_external_lb_server_info(server_manager):
servers = server_manager.servers
api_server_count = server_manager.api_server_count
for i, (server, _) in enumerate(servers):
print(f"Testing {i=}")
# Each request will hit one of the API servers
# `n_reqs` is set so that there is a good chance each server
# receives at least one request
n_reqs = 2 * api_server_count * api_server_count
parallel_configs = [
_get_parallel_config(server) for _ in range(n_reqs)
]
api_process_counts = [
c["_api_process_count"] for c in parallel_configs
]
api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
assert all(c == api_server_count
for c in api_process_counts), api_process_counts
assert all(0 <= r < api_server_count
for r in api_process_ranks), api_process_ranks
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",

View File

@ -9,6 +9,7 @@ from contextlib import AsyncExitStack
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing from tests.v1.test_utils import check_request_balancing
@ -92,6 +93,8 @@ class HybridLBServerManager:
sargs, sargs,
auto_port=False, auto_port=False,
env_dict={ env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var: current_platform.device_control_env_var:
",".join( ",".join(
str( str(
@ -150,12 +153,20 @@ def default_server_args():
@pytest.fixture(scope="module", params=[1, 4]) @pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args): def server_manager(request, default_server_args):
api_server_count = request.param api_server_count = request.param
with HybridLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE,
default_server_args, DP_SIZE_LOCAL, api_server_count,
TP_SIZE) as server_list: default_server_args, DP_SIZE_LOCAL,
yield server_list TP_SIZE)
with server_manager:
yield server_manager
@pytest.fixture
def servers(server_manager):
return server_manager.servers
@pytest_asyncio.fixture @pytest_asyncio.fixture
@ -168,6 +179,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
] ]
def _get_parallel_config(server: RemoteOpenAIServer):
    """Fetch `parallel_config` from the server's `/server_info` endpoint.

    Requires the server to run with `VLLM_SERVER_DEV_MODE=1` so that the
    endpoint is exposed (set in the server manager fixtures).
    """
    response = requests.get(server.url_for("server_info?config_format=json"))
    response.raise_for_status()
    vllm_config = response.json()["vllm_config"]
    return vllm_config["parallel_config"]
def test_hybrid_dp_server_info(server_manager):
    """Check that each API server reports a consistent `_api_process_count`
    and an in-range `_api_process_rank`."""
    servers = server_manager.servers
    api_server_count = server_manager.api_server_count

    for i, (server, _) in enumerate(servers):
        print(f"Testing {i=}")

        # Each request will hit one of the API servers
        # `n_reqs` is set so that there is a good chance each server
        # receives at least one request
        n_reqs = 2 * api_server_count * api_server_count
        parallel_configs = [
            _get_parallel_config(server) for _ in range(n_reqs)
        ]
        api_process_counts = [
            c["_api_process_count"] for c in parallel_configs
        ]
        api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]

        assert all(c == api_server_count
                   for c in api_process_counts), api_process_counts
        assert all(0 <= r < api_server_count
                   for r in api_process_ranks), api_process_ranks
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",

View File

@ -10,6 +10,7 @@ from typing import Optional, cast
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing from tests.v1.test_utils import check_request_balancing
@ -101,6 +102,8 @@ class MultinodeInternalLBServerManager:
sargs, sargs,
auto_port=False, auto_port=False,
env_dict={ env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var: current_platform.device_control_env_var:
",".join( ",".join(
str( str(
@ -214,7 +217,10 @@ class APIOnlyServerManager:
self.model_name, self.model_name,
api_server_args, api_server_args,
auto_port=False, auto_port=False,
env_dict={}) # No GPUs needed for API-only server env_dict={
"VLLM_SERVER_DEV_MODE": "1",
# No GPUs needed for API-only server
})
server.__enter__() server.__enter__()
print(f"API-only server started successfully with " print(f"API-only server started successfully with "
f"{self.api_server_count} API servers") f"{self.api_server_count} API servers")
@ -293,14 +299,21 @@ def default_server_args():
@pytest.fixture(scope="module", params=[1, 4]) @pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args): def server_manager(request, default_server_args):
api_server_count = request.param api_server_count = request.param
with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count, api_server_count,
default_server_args, default_server_args,
DP_SIZE // NUM_NODES, DP_SIZE // NUM_NODES,
TP_SIZE) as server_list: TP_SIZE)
yield server_list
with server_manager:
yield server_manager
@pytest.fixture
def servers(server_manager):
return server_manager.servers
@pytest.fixture(scope="module", params=[1, 4]) @pytest.fixture(scope="module", params=[1, 4])
@ -331,6 +344,34 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
yield client yield client
def _get_parallel_config(server: RemoteOpenAIServer):
    """Fetch `parallel_config` from the server's `/server_info` endpoint.

    Requires the server to run with `VLLM_SERVER_DEV_MODE=1` so that the
    endpoint is exposed (set in the server manager fixtures).
    """
    response = requests.get(server.url_for("server_info?config_format=json"))
    response.raise_for_status()
    vllm_config = response.json()["vllm_config"]
    return vllm_config["parallel_config"]
def test_multinode_dp_server_info(server_manager):
    """All API processes behind the head node must agree on the process
    count and report valid ranks."""
    head_server = server_manager.servers[0][0]
    api_server_count = server_manager.api_server_count

    # Issue enough requests that each API server very likely handles
    # at least one of them.
    n_reqs = 2 * api_server_count * api_server_count
    configs = [_get_parallel_config(head_server) for _ in range(n_reqs)]

    counts = [cfg["_api_process_count"] for cfg in configs]
    ranks = [cfg["_api_process_rank"] for cfg in configs]

    assert all(count == api_server_count for count in counts), counts
    assert all(0 <= rank < api_server_count for rank in ranks), ranks
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",

View File

@ -23,14 +23,14 @@ class AttentionType:
Attention type. Attention type.
Use string to be compatible with `torch.compile`. Use string to be compatible with `torch.compile`.
""" """
# Decoder attention between previous layer Q/K/V
DECODER = "decoder" DECODER = "decoder"
# Encoder attention between previous layer Q/K/V for encoder-decoder """Decoder attention between previous layer Q/K/V."""
ENCODER = "encoder" ENCODER = "encoder"
# Encoder attention between previous layer Q/K/V """Encoder attention between previous layer Q/K/V for encoder-decoder."""
ENCODER_ONLY = "encoder_only" ENCODER_ONLY = "encoder_only"
# Attention between dec. Q and enc. K/V for encoder-decoder """Encoder attention between previous layer Q/K/V."""
ENCODER_DECODER = "encoder_decoder" ENCODER_DECODER = "encoder_decoder"
"""Attention between dec. Q and enc. K/V for encoder-decoder."""
class AttentionBackend(ABC): class AttentionBackend(ABC):

View File

@ -430,9 +430,11 @@ class MultiHeadAttention(nn.Module):
key: torch.Tensor, key: torch.Tensor,
value: torch.Tensor, value: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
"""Input shape: batch_size x seq_len x hidden_size""" """Input shape:
# TODO(Isotr0py): Use existing backend implementations and support FA3 (batch_size x seq_len x hidden_size) or
bsz, q_len, _ = query.size() (batch_size x seq_len x num_heads x head_size)
"""
bsz, q_len = query.size()[:2]
kv_len = key.size(1) kv_len = key.size(1)
query = query.view(bsz, q_len, self.num_heads, self.head_size) query = query.view(bsz, q_len, self.num_heads, self.head_size)

View File

@ -8,8 +8,9 @@ import os
import sys import sys
import time import time
import traceback import traceback
from collections.abc import Awaitable
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional, Union from typing import Optional, Protocol, Union
import aiohttp import aiohttp
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
@ -92,6 +93,16 @@ class RequestFuncOutput:
start_time: float = 0.0 start_time: float = 0.0
class RequestFunc(Protocol):
def __call__(
self,
request_func_input: RequestFuncInput,
session: aiohttp.ClientSession,
pbar: Optional[tqdm] = None,
) -> Awaitable[RequestFuncOutput]:
...
async def async_request_openai_completions( async def async_request_openai_completions(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
session: aiohttp.ClientSession, session: aiohttp.ClientSession,
@ -507,7 +518,7 @@ async def async_request_openai_embeddings(
# TODO: Add more request functions for different API protocols. # TODO: Add more request functions for different API protocols.
ASYNC_REQUEST_FUNCS = { ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
"vllm": async_request_openai_completions, "vllm": async_request_openai_completions,
"openai": async_request_openai_completions, "openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions, "openai-chat": async_request_openai_chat_completions,

View File

@ -8,11 +8,12 @@ import time
import aiohttp import aiohttp
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from .endpoint_request_func import RequestFuncInput, RequestFuncOutput from .endpoint_request_func import (RequestFunc, RequestFuncInput,
RequestFuncOutput)
async def wait_for_endpoint( async def wait_for_endpoint(
request_func, request_func: RequestFunc,
test_input: RequestFuncInput, test_input: RequestFuncInput,
session: aiohttp.ClientSession, session: aiohttp.ClientSession,
timeout_seconds: int = 600, timeout_seconds: int = 600,

File diff suppressed because it is too large Load Diff

2006
vllm/config/model.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -193,6 +193,25 @@ class ParallelConfig:
not change by dcp, it simply reuse the GPUs of TP group, and tp_size not change by dcp, it simply reuse the GPUs of TP group, and tp_size
needs to be divisible by dcp_size.""" needs to be divisible by dcp_size."""
_api_process_count: int = 1
"""
The number of API processes initialized.
Note:
This is an internal config that is only valid for and
should only be set by API server scale-out.
"""
_api_process_rank: int = 0
"""
The rank of this API process, or `-1` for engine core processes
under API server scale-out.
Note:
This is an internal config that is only valid for and
should only be set by API server scale-out.
"""
@property @property
def world_size_across_dp(self) -> int: def world_size_across_dp(self) -> int:
"""world_size_across_dp is TPxPPxDP, it is the size of the world """world_size_across_dp is TPxPPxDP, it is the size of the world
@ -428,6 +447,12 @@ class ParallelConfig:
if self.distributed_executor_backend is None and self.world_size == 1: if self.distributed_executor_backend is None and self.world_size == 1:
self.distributed_executor_backend = "uni" self.distributed_executor_backend = "uni"
if not -1 <= self._api_process_rank < self._api_process_count:
raise ValueError(
"Invalid value of `_api_process_rank`. "
f"Expected to be `-1` or `[0, {self._api_process_count})`, "
f"but found: {self._api_process_rank}")
@property @property
def use_ray(self) -> bool: def use_ray(self) -> bool:
return self.distributed_executor_backend == "ray" or ( return self.distributed_executor_backend == "ray" or (

97
vllm/config/pooler.py Normal file
View File

@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import Any, Optional
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
@config
@dataclass
class PoolerConfig:
    """Controls the behavior of output pooling in pooling models."""

    # NOTE(review): the bare-string docstrings after each field below appear
    # to be harvested at runtime (e.g. for generated CLI/docs help) — keep
    # them in place and directly after their field; verify against
    # `get_attr_docs` before restructuring.

    pooling_type: Optional[str] = None
    """
    The pooling method of the pooling model. This should be a key in
    [`vllm.model_executor.layers.pooler.PoolingType`][].
    """

    ## for embeddings models
    normalize: Optional[bool] = None
    """
    Whether to normalize the embeddings outputs. Defaults to True.
    """
    dimensions: Optional[int] = None
    """
    Reduce the dimensions of embeddings if model
    support matryoshka representation. Defaults to None.
    """
    enable_chunked_processing: Optional[bool] = None
    """
    Whether to enable chunked processing for long inputs that exceed the model's
    maximum position embeddings. When enabled, long inputs will be split into
    chunks, processed separately, and then aggregated using weighted averaging.
    This allows embedding models to handle arbitrarily long text without CUDA
    errors. Defaults to False.
    """
    max_embed_len: Optional[int] = None
    """
    Maximum input length allowed for embedding generation. When set, allows
    inputs longer than max_embed_len to be accepted for embedding models.
    When an input exceeds max_embed_len, it will be handled according to
    the original max_model_len validation logic.
    Defaults to None (i.e. set to max_model_len).
    """

    ## for classification models
    activation: Optional[bool] = None
    """
    Whether to apply activation function to the classification outputs.
    Defaults to True.
    """
    logit_bias: Optional[float] = None
    """
    If provided, apply classification logit biases. Defaults to None.
    """

    ## for reward models
    softmax: Optional[bool] = None
    """
    Whether to apply softmax to the reward outputs.
    Defaults to True.
    """
    step_tag_id: Optional[int] = None
    """
    If set, only the score corresponding to the ``step_tag_id`` in the
    generated sentence should be returned. Otherwise, the scores for all tokens
    are returned.
    """
    returned_token_ids: Optional[list[int]] = None
    """
    A list of indices for the vocabulary dimensions to be extracted,
    such as the token IDs of ``good_token`` and ``bad_token`` in the
    ``math-shepherd-mistral-7b-prm`` model.
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        # usedforsecurity=False: md5 is used only as a cheap fingerprint,
        # not for any security purpose.
        hash_str = hashlib.md5(str(factors).encode(),
                               usedforsecurity=False).hexdigest()
        return hash_str

View File

@ -3,7 +3,7 @@
import hashlib import hashlib
from dataclasses import field from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal, Optional, Union from typing import Any, Literal, Optional, Union
from pydantic import SkipValidation, model_validator from pydantic import SkipValidation, model_validator
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
@ -15,13 +15,9 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS) POOLING_MODEL_MAX_NUM_BATCHED_TOKENS)
if TYPE_CHECKING:
from vllm.config import RunnerType
else:
RunnerType = Any
logger = init_logger(__name__) logger = init_logger(__name__)
RunnerType = Literal["generate", "pooling", "draft"]
PreemptionMode = Literal["swap", "recompute"] PreemptionMode = Literal["swap", "recompute"]
SchedulerPolicy = Literal["fcfs", "priority"] SchedulerPolicy = Literal["fcfs", "priority"]

View File

@ -1,8 +1,13 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import inspect
import textwrap
from dataclasses import MISSING, Field, field, fields, is_dataclass from dataclasses import MISSING, Field, field, fields, is_dataclass
from typing import TYPE_CHECKING, TypeVar from typing import TYPE_CHECKING, Any, TypeVar
import regex as re
if TYPE_CHECKING: if TYPE_CHECKING:
from _typeshed import DataclassInstance from _typeshed import DataclassInstance
@ -45,3 +50,96 @@ def get_field(cls: ConfigType, name: str) -> Field:
return field(default=default) return field(default=default)
raise ValueError( raise ValueError(
f"{cls.__name__}.{name} must have a default value or default factory.") f"{cls.__name__}.{name} must have a default value or default factory.")
def contains_object_print(text: str) -> bool:
    """
    Check whether *text* looks like it includes a printed Python object,
    i.e. contains a substring such as "at 0x7f8a2b3c>".

    We match "0x" followed by 2-16 hex characters (16 is the maximum on
    a 64-bit system).

    Args:
        text (str): The text to check

    Returns:
        result (bool): `True` if a match is found, `False` otherwise.
    """
    return re.search(r'at 0x[a-fA-F0-9]{2,16}>', text) is not None
def assert_hashable(text: str) -> bool:
    """Raise AssertionError if *text* appears to embed a printed object id
    (which would make config hashes unstable); otherwise return True."""
    if contains_object_print(text):
        raise AssertionError(
            f"vLLM tried to hash some configs that may have Python objects ids "
            f"in them. This is a bug, please file an issue. "
            f"Text being hashed: {text}")
    return True
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
    """
    Get any docstrings placed after attribute assignments in a class body.

    Returns a mapping from attribute name to the cleaned docstring that
    immediately follows the attribute's assignment in the class source.

    https://davidism.com/mit-license/
    """

    def pairwise(iterable):
        """
        Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise

        Can be removed when Python 3.9 support is dropped.
        """
        iterator = iter(iterable)
        a = next(iterator, None)

        for b in iterator:
            yield a, b
            a = b

    try:
        # Parse the class source and take the (single) top-level ClassDef.
        cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
    except (OSError, KeyError, TypeError):
        # HACK: Python 3.13+ workaround - set missing __firstlineno__
        # Workaround can be removed after we upgrade to pydantic==2.12.0
        # Scan the defining file for the class header to recover the line
        # number that inspect.getsource needs.
        with open(inspect.getfile(cls)) as f:
            for i, line in enumerate(f):
                if f"class {cls.__name__}" in line and ":" in line:
                    cls.__firstlineno__ = i + 1
                    break
        cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]

    if not isinstance(cls_node, ast.ClassDef):
        raise TypeError("Given object was not a class.")

    out = {}

    # Consider each pair of nodes.
    for a, b in pairwise(cls_node.body):
        # Must be an assignment then a constant string.
        if (not isinstance(a, (ast.Assign, ast.AnnAssign))
                or not isinstance(b, ast.Expr)
                or not isinstance(b.value, ast.Constant)
                or not isinstance(b.value.value, str)):
            continue

        doc = inspect.cleandoc(b.value.value)

        # An assignment can have multiple targets (a = b = v), but an
        # annotated assignment only has one target.
        targets = a.targets if isinstance(a, ast.Assign) else [a.target]

        for target in targets:
            # Must be assigning to a plain name.
            if not isinstance(target, ast.Name):
                continue

            out[target.id] = doc

    return out
def is_init_field(cls: ConfigType, name: str) -> bool:
    """Return whether the dataclass field called ``name`` is an
    ``__init__`` parameter of ``cls``."""
    matching = (f for f in fields(cls) if f.name == name)
    return next(matching).init

View File

@ -129,7 +129,7 @@ class KVOutputAggregator:
def aggregate(self, def aggregate(self,
outputs: list[ModelRunnerOutput], outputs: list[ModelRunnerOutput],
output_rank: int = 0) -> ModelRunnerOutput: output_rank: int = 0) -> ModelRunnerOutput:
# aggregate kv_connector_output from all workers # Aggregate kv_connector_output from all workers
def update_finished_set(req_ids: Optional[set[str]], def update_finished_set(req_ids: Optional[set[str]],
remaining_count_dict: dict[str, int], remaining_count_dict: dict[str, int],
@ -142,8 +142,9 @@ class KVOutputAggregator:
finished_sending = set[str]() finished_sending = set[str]()
finished_recving = set[str]() finished_recving = set[str]()
for output in outputs: aggregated_kv_connector_stats = None
output = output.kv_connector_output for model_runner_output in outputs:
output = model_runner_output.kv_connector_output
if not output: if not output:
continue continue
update_finished_set(output.finished_sending, update_finished_set(output.finished_sending,
@ -151,12 +152,26 @@ class KVOutputAggregator:
update_finished_set(output.finished_recving, update_finished_set(output.finished_recving,
self._recv_remaining_count, finished_recving) self._recv_remaining_count, finished_recving)
# Aggregate kv_connector_stats from all workers.
if aggregated_kv_connector_stats is None:
# Use the first worker's kv_connector_stats as accumulator.
aggregated_kv_connector_stats = output.kv_connector_stats
elif kv_connector_stats := output.kv_connector_stats:
if aggregated_kv_connector_stats is None:
aggregated_kv_connector_stats = kv_connector_stats
else:
assert isinstance(aggregated_kv_connector_stats,
type(kv_connector_stats))
aggregated_kv_connector_stats = \
aggregated_kv_connector_stats.aggregate(kv_connector_stats)
# select output of the worker specified by output_rank # select output of the worker specified by output_rank
output = outputs[output_rank] output = outputs[output_rank]
output.kv_connector_output = KVConnectorOutput( output.kv_connector_output = KVConnectorOutput(
finished_sending=finished_sending or None, finished_sending=finished_sending or None,
finished_recving=finished_recving or None, finished_recving=finished_recving or None,
kv_connector_stats=aggregated_kv_connector_stats or None,
) )
return output return output

View File

@ -49,6 +49,8 @@ if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_events import KVCacheEvent
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.request import Request from vllm.v1.request import Request
@ -235,6 +237,12 @@ class KVConnectorBase_V1(ABC):
""" """
return None return None
def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]:
"""
Get the KV connector stats collected during the last interval.
"""
return None
# ============================== # ==============================
# Scheduler-side methods # Scheduler-side methods
# ============================== # ==============================
@ -366,3 +374,15 @@ class KVConnectorBase_V1(ABC):
""" """
return None return None
@classmethod
def build_kv_connector_stats(
cls,
data: Optional[dict[str,
Any]] = None) -> Optional["KVConnectorStats"]:
"""
KVConnectorStats resolution method. This method allows dynamically
registered connectors to return their own KVConnectorStats object,
which can implement custom aggregation logic on the data dict.
"""
return None

View File

@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from typing import Any, Optional, Union
from vllm.config.kv_transfer import KVTransferConfig
from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory)
from vllm.distributed.kv_transfer.kv_transfer_state import (
has_kv_transfer_group)
from vllm.logger import init_logger
logger = init_logger(__name__)
@dataclass
class KVConnectorStats:
    """
    Base container for KV connector transfer performance metrics and
    other connector telemetry.

    Every subclass must remain serializable: stats objects are shipped
    from the worker process to the logger process.
    """
    data: dict[str, Any] = field(default_factory=dict)

    def reset(self):
        """Clear all recorded state for the next interval."""
        raise NotImplementedError

    def aggregate(self, other: "KVConnectorStats") -> "KVConnectorStats":
        """Fold another `KVConnectorStats` into this one and return it."""
        raise NotImplementedError

    def reduce(self) -> dict[str, Union[int, float]]:
        """
        Collapse the interval's observations into one or more summary
        values (e.g. avg/median/sum of the series). Called by the logger
        to produce a report for the last time interval.
        """
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Return True if no observations have been recorded."""
        raise NotImplementedError
class KVConnectorLogging:
    """
    Frontend-side accumulator for KV connector transfer stats.

    Receives per-interval stats dicts (already aggregated across workers)
    via `observe` and periodically reduces them into a single log line
    via `log`.
    """

    def __init__(self, kv_tranfer_config: KVTransferConfig):
        # NOTE(review): parameter name keeps the historical "tranfer" typo
        # to remain backward-compatible with keyword callers.
        # This should be called on frontend process.
        assert not has_kv_transfer_group()
        # Default to None so that a misuse of `observe` without a configured
        # connector fails the explicit assertion below instead of raising
        # AttributeError (the original only assigned inside the `if`).
        self.connector_cls = None
        # Instantiate the connector's stats class.
        if kv_tranfer_config and kv_tranfer_config.kv_connector:
            self.connector_cls = KVConnectorFactory.get_connector_class(
                kv_tranfer_config)
        self.reset()

    def reset(self):
        """Drop accumulated stats; called at init and after each log emit."""
        self.transfer_stats_accumulator: Optional[KVConnectorStats] = None

    def observe(self, transfer_stats_data: dict[str, Any]):
        """Accumulate one interval's worth of aggregated worker stats."""
        # Should not be called when a KVConnector is not configured.
        assert self.connector_cls is not None
        # Called periodically when connector syncs with the scheduler.
        # Note that this is not the same as the logging interval.
        # We expect transfer_stats_data to be aggregated across all workers
        # and consist of observations from a single connector or a
        # MultiConnector.
        transfer_stats = self.connector_cls.build_kv_connector_stats(
            transfer_stats_data)
        if transfer_stats is None:
            logger.warning_once(
                "The connector %s is collecting stats but "
                "does not implement the "
                "`build_kv_connector_stats` method. "
                "Stats will not be logged.", self.connector_cls)
            return

        if self.transfer_stats_accumulator is None:
            self.transfer_stats_accumulator = transfer_stats
        else:
            # Accumulate last interval stats.
            self.transfer_stats_accumulator = \
                self.transfer_stats_accumulator.aggregate(transfer_stats)

    def log(self, log_fn=logger.info):
        """Log transfer metrics periodically, similar to throughput logging"""
        if (self.transfer_stats_accumulator
                and not self.transfer_stats_accumulator.is_empty()):
            # Produce a single cumulative stats object for the last time
            # interval from the recorded observations.
            xfer_metrics = self.transfer_stats_accumulator.reduce()
            xfer_metrics_str = ", ".join(f"{k}={v}"
                                         for k, v in xfer_metrics.items())
            log_fn("KV Transfer metrics: %s", xfer_metrics_str)
            # Reset metrics for next interval
            self.reset()

View File

@ -9,19 +9,21 @@ import torch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.kv_transfer import KVTransferConfig from vllm.config.kv_transfer import KVTransferConfig
from vllm.distributed.kv_events import KVCacheEvent
from vllm.distributed.kv_transfer.kv_connector.factory import ( from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory) KVConnectorFactory)
from vllm.distributed.kv_transfer.kv_connector.v1.base import ( from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import KVConnectorOutput from vllm.v1.outputs import KVConnectorOutput
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.abstract import AttentionMetadata
from vllm.distributed.kv_events import KVCacheEvent
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.request import Request from vllm.v1.request import Request
logger = init_logger(__name__) logger = init_logger(__name__)
@ -33,6 +35,43 @@ class MultiKVConnectorMetadata(KVConnectorMetadata):
extra_async_saves: Optional[dict[str, int]] = None extra_async_saves: Optional[dict[str, int]] = None
@dataclass
class MultiKVConnectorStats(KVConnectorStats):
    """
    Maintain a dict of KVConnectorStats objects, one per wrapped
    connector, so that each connector's telemetry is aggregated
    independently of the others.
    """

    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
        """Merge per-connector entries from `other` into this object."""
        for connector_id, stats in other.data.items():
            if connector_id not in self.data:
                self[connector_id] = stats
            else:
                current = self[connector_id]
                assert isinstance(stats, type(current))
                self[connector_id] = current.aggregate(stats)
        return self

    def reset(self):
        """Reset every per-connector stats object in place."""
        for stats in self.data.values():
            stats.reset()

    def reduce(self) -> dict[str, Any]:
        # TODO (NickLucche) Adjust for logging on separate lines
        reduced: dict[str, Any] = {}
        for connector_id, stats in self.data.items():
            reduced[connector_id] = stats.reduce()
        return reduced

    def is_empty(self) -> bool:
        """True only when every connector reports empty stats."""
        for stats in self.data.values():
            if not stats.is_empty():
                return False
        return True

    def __getitem__(self, connector_id: str) -> KVConnectorStats:
        return self.data[connector_id]

    def __setitem__(self, connector_id: str, stats: KVConnectorStats):
        self.data[connector_id] = stats
class MultiConnector(KVConnectorBase_V1): class MultiConnector(KVConnectorBase_V1):
""" """
A wrapper for using multiple KVConnectors at the same time. A wrapper for using multiple KVConnectors at the same time.
@ -46,6 +85,7 @@ class MultiConnector(KVConnectorBase_V1):
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
super().__init__(vllm_config=vllm_config, role=role) super().__init__(vllm_config=vllm_config, role=role)
self._connectors: list[KVConnectorBase_V1] = [] self._connectors: list[KVConnectorBase_V1] = []
self._ktc_kv_transfer_config = []
ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
"connectors") "connectors")
assert ktcs is not None assert ktcs is not None
@ -57,6 +97,7 @@ class MultiConnector(KVConnectorBase_V1):
**ktc, engine_id=engine_id) **ktc, engine_id=engine_id)
self._connectors.append( self._connectors.append(
KVConnectorFactory.create_connector(temp_config, role)) KVConnectorFactory.create_connector(temp_config, role))
self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config)
# A mapping from request id to the index of the connector chosen to # A mapping from request id to the index of the connector chosen to
# load the request from (if any). # load the request from (if any).
@ -227,7 +268,7 @@ class MultiConnector(KVConnectorBase_V1):
return async_saves > 0, kv_txfer_params return async_saves > 0, kv_txfer_params
def take_events(self) -> Iterable[KVCacheEvent]: def take_events(self) -> Iterable["KVCacheEvent"]:
for c in self._connectors: for c in self._connectors:
yield from c.take_events() yield from c.take_events()
@ -264,3 +305,24 @@ class MultiConnector(KVConnectorBase_V1):
f"({', '.join(layouts) })." f"({', '.join(layouts) })."
f"All connectors must use the same layout.") f"All connectors must use the same layout.")
return next(iter(layouts), None) return next(iter(layouts), None)
@classmethod
def build_kv_connector_stats(
cls,
data: Optional[dict[str,
Any]] = None) -> Optional[KVConnectorStats]:
return MultiKVConnectorStats(data=data) if data is not None \
else MultiKVConnectorStats()
def get_kv_connector_stats(self) -> Optional[MultiKVConnectorStats]:
# Group connector stats by connector type.
stats_by_connector: Optional[MultiKVConnectorStats] = None
for c in self._connectors:
stats = c.get_kv_connector_stats()
if stats is None:
continue
if stats_by_connector is None:
# Lazy init to allow optional return value.
stats_by_connector = MultiKVConnectorStats()
stats_by_connector[c.__class__.__name__] = stats
return stats_by_connector

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib import contextlib
import copy
import logging import logging
import math import math
import queue import queue
@ -11,7 +12,7 @@ from collections import defaultdict
from collections.abc import Iterator from collections.abc import Iterator
from concurrent.futures import Future, ThreadPoolExecutor from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional from typing import TYPE_CHECKING, Any, Optional, Union
import msgspec import msgspec
import numpy as np import numpy as np
@ -23,6 +24,8 @@ from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import ( from vllm.distributed.kv_transfer.kv_connector.v1.base import (
CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
from vllm.distributed.parallel_state import ( from vllm.distributed.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
get_tp_group) get_tp_group)
@ -33,7 +36,6 @@ from vllm.platforms import _Backend, current_platform
from vllm.utils import make_zmq_path, make_zmq_socket from vllm.utils import make_zmq_path, make_zmq_socket
from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.attention.backends.utils import get_kv_cache_layout
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import RequestStatus
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.abstract import AttentionMetadata
@ -206,6 +208,18 @@ class NixlConnector(KVConnectorBase_V1):
assert self.connector_worker is not None assert self.connector_worker is not None
return self.connector_worker.get_finished() return self.connector_worker.get_finished()
def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
assert self.connector_worker is not None
return self.connector_worker.get_kv_connector_stats()
@classmethod
def build_kv_connector_stats(
cls,
data: Optional[dict[str,
Any]] = None) -> Optional[KVConnectorStats]:
return NixlKVConnectorStats(data=data) if data is not None \
else NixlKVConnectorStats()
def start_load_kv(self, forward_context: "ForwardContext", def start_load_kv(self, forward_context: "ForwardContext",
**kwargs) -> None: **kwargs) -> None:
assert self.connector_worker is not None assert self.connector_worker is not None
@ -377,6 +391,7 @@ class NixlConnectorScheduler:
Once a request is finished, determine whether request blocks Once a request is finished, determine whether request blocks
should be freed now or will be sent asynchronously and freed later. should be freed now or will be sent asynchronously and freed later.
""" """
from vllm.v1.request import RequestStatus
params = request.kv_transfer_params params = request.kv_transfer_params
logger.debug( logger.debug(
@ -550,6 +565,7 @@ class NixlConnectorWorker:
# With heterogeneous TP, P must wait for all assigned D TP workers to # With heterogeneous TP, P must wait for all assigned D TP workers to
# finish reading before safely freeing the blocks. # finish reading before safely freeing the blocks.
self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
self.xfer_stats = NixlKVConnectorStats()
def __del__(self): def __del__(self):
"""Cleanup background threads on destruction.""" """Cleanup background threads on destruction."""
@ -1097,6 +1113,8 @@ class NixlConnectorWorker:
xfer_state = self.nixl_wrapper.check_xfer_state(handle) xfer_state = self.nixl_wrapper.check_xfer_state(handle)
if xfer_state == "DONE": if xfer_state == "DONE":
self.nixl_wrapper.release_xfer_handle(handle) self.nixl_wrapper.release_xfer_handle(handle)
# TODO (NickLucche) Get from NIXL telemetry once integrated
self.xfer_stats.record_transfer()
elif xfer_state == "PROC": elif xfer_state == "PROC":
in_progress = True in_progress = True
continue continue
@ -1248,7 +1266,6 @@ class NixlConnectorWorker:
self.nixl_wrapper.transfer(handle) self.nixl_wrapper.transfer(handle)
# Use handle to check completion in future step(). # Use handle to check completion in future step().
# TODO (NickLucche) surface xfer elapsed time
self._recving_transfers[request_id].append( self._recving_transfers[request_id].append(
(handle, time.perf_counter())) (handle, time.perf_counter()))
@ -1300,6 +1317,15 @@ class NixlConnectorWorker:
block_len = self.block_len block_len = self.block_len
return block_len return block_len
def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
"""
Get the KV transfer stats for the connector.
"""
# Clear stats for next iteration
if not self.xfer_stats.is_empty():
return self.xfer_stats.clone_and_reset()
return None
@contextlib.contextmanager @contextlib.contextmanager
def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
@ -1318,3 +1344,39 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
finally: finally:
if ctx is not None: if ctx is not None:
ctx.destroy(linger=0) ctx.destroy(linger=0)
@dataclass
class NixlKVConnectorStats(KVConnectorStats):
"""Container for transfer performance metrics"""
def __post_init__(self):
if "num_successful_transfers" not in self.data:
self.data["num_successful_transfers"] = 0
def reset(self):
self.data = {"num_successful_transfers": 0}
def record_transfer(self):
# TODO: record actual transfer stats when available
self.data["num_successful_transfers"] += 1
def clone_and_reset(self) -> "NixlKVConnectorStats":
old = copy.copy(self)
self.reset()
return old
def is_empty(self) -> bool:
return self.data["num_successful_transfers"] == 0
def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
if not other.is_empty():
self.data["num_successful_transfers"] += other.data[
"num_successful_transfers"]
return self
def reduce(self) -> dict[str, Union[int, float]]:
# TODO: reduce stats to a single value, calculate latency/throughput
return {
"num_successful_transfers": self.data["num_successful_transfers"]
}

View File

@ -27,11 +27,11 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
EPLBConfig, HfOverrides, KVEventsConfig, EPLBConfig, HfOverrides, KVEventsConfig,
KVTransferConfig, LoadConfig, LogprobsMode, KVTransferConfig, LoadConfig, LogprobsMode,
LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
ModelDType, ModelImpl, ObservabilityConfig, ModelDType, ObservabilityConfig, ParallelConfig,
ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, PoolerConfig, PrefixCachingHashAlgo, RunnerOption,
RunnerOption, SchedulerConfig, SchedulerPolicy, SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
SpeculativeConfig, StructuredOutputsConfig, StructuredOutputsConfig, TaskOption, TokenizerMode,
TaskOption, TokenizerMode, VllmConfig, get_attr_docs) VllmConfig, get_attr_docs)
from vllm.config.multimodal import MMCacheType, MultiModalConfig from vllm.config.multimodal import MMCacheType, MultiModalConfig
from vllm.config.parallel import ExpertPlacementStrategy from vllm.config.parallel import ExpertPlacementStrategy
from vllm.config.utils import get_field from vllm.config.utils import get_field
@ -333,6 +333,8 @@ class EngineArgs:
enable_eplb: bool = ParallelConfig.enable_eplb enable_eplb: bool = ParallelConfig.enable_eplb
expert_placement_strategy: ExpertPlacementStrategy = \ expert_placement_strategy: ExpertPlacementStrategy = \
ParallelConfig.expert_placement_strategy ParallelConfig.expert_placement_strategy
_api_process_count: int = ParallelConfig._api_process_count
_api_process_rank: int = ParallelConfig._api_process_rank
num_redundant_experts: int = EPLBConfig.num_redundant_experts num_redundant_experts: int = EPLBConfig.num_redundant_experts
eplb_window_size: int = EPLBConfig.window_size eplb_window_size: int = EPLBConfig.window_size
eplb_step_interval: int = EPLBConfig.step_interval eplb_step_interval: int = EPLBConfig.step_interval
@ -441,6 +443,7 @@ class EngineArgs:
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config
override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
ModelConfig.override_pooler_config ModelConfig.override_pooler_config
compilation_config: CompilationConfig = \ compilation_config: CompilationConfig = \
@ -547,7 +550,6 @@ class EngineArgs:
model_group.add_argument("--max-logprobs", model_group.add_argument("--max-logprobs",
**model_kwargs["max_logprobs"]) **model_kwargs["max_logprobs"])
model_group.add_argument("--logprobs-mode", model_group.add_argument("--logprobs-mode",
choices=[f.value for f in LogprobsMode],
**model_kwargs["logprobs_mode"]) **model_kwargs["logprobs_mode"])
model_group.add_argument("--disable-sliding-window", model_group.add_argument("--disable-sliding-window",
**model_kwargs["disable_sliding_window"]) **model_kwargs["disable_sliding_window"])
@ -579,8 +581,11 @@ class EngineArgs:
help=model_kwargs["hf_token"]["help"]) help=model_kwargs["hf_token"]["help"])
model_group.add_argument("--hf-overrides", model_group.add_argument("--hf-overrides",
**model_kwargs["hf_overrides"]) **model_kwargs["hf_overrides"])
model_group.add_argument("--pooler-config",
**model_kwargs["pooler_config"])
model_group.add_argument("--override-pooler-config", model_group.add_argument("--override-pooler-config",
**model_kwargs["override_pooler_config"]) **model_kwargs["override_pooler_config"],
deprecated=True)
model_group.add_argument("--logits-processor-pattern", model_group.add_argument("--logits-processor-pattern",
**model_kwargs["logits_processor_pattern"]) **model_kwargs["logits_processor_pattern"])
model_group.add_argument("--generation-config", model_group.add_argument("--generation-config",
@ -589,9 +594,7 @@ class EngineArgs:
**model_kwargs["override_generation_config"]) **model_kwargs["override_generation_config"])
model_group.add_argument("--enable-sleep-mode", model_group.add_argument("--enable-sleep-mode",
**model_kwargs["enable_sleep_mode"]) **model_kwargs["enable_sleep_mode"])
model_group.add_argument("--model-impl", model_group.add_argument("--model-impl", **model_kwargs["model_impl"])
choices=[f.value for f in ModelImpl],
**model_kwargs["model_impl"])
model_group.add_argument("--override-attention-dtype", model_group.add_argument("--override-attention-dtype",
**model_kwargs["override_attention_dtype"]) **model_kwargs["override_attention_dtype"])
model_group.add_argument("--logits-processors", model_group.add_argument("--logits-processors",
@ -951,7 +954,10 @@ class EngineArgs:
# Get the list of attributes of this dataclass. # Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)] attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments. # Set the attributes from the parsed arguments.
engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) engine_args = cls(**{
attr: getattr(args, attr)
for attr in attrs if hasattr(args, attr)
})
return engine_args return engine_args
def create_model_config(self) -> ModelConfig: def create_model_config(self) -> ModelConfig:
@ -1031,6 +1037,7 @@ class EngineArgs:
mm_shm_cache_max_object_size_mb=self. mm_shm_cache_max_object_size_mb=self.
mm_shm_cache_max_object_size_mb, mm_shm_cache_max_object_size_mb,
mm_encoder_tp_mode=self.mm_encoder_tp_mode, mm_encoder_tp_mode=self.mm_encoder_tp_mode,
pooler_config=self.pooler_config,
override_pooler_config=self.override_pooler_config, override_pooler_config=self.override_pooler_config,
logits_processor_pattern=self.logits_processor_pattern, logits_processor_pattern=self.logits_processor_pattern,
generation_config=self.generation_config, generation_config=self.generation_config,
@ -1364,6 +1371,8 @@ class EngineArgs:
worker_cls=self.worker_cls, worker_cls=self.worker_cls,
worker_extension_cls=self.worker_extension_cls, worker_extension_cls=self.worker_extension_cls,
decode_context_parallel_size=self.decode_context_parallel_size, decode_context_parallel_size=self.decode_context_parallel_size,
_api_process_count=self._api_process_count,
_api_process_rank=self._api_process_rank,
) )
speculative_config = self.create_speculative_config( speculative_config = self.create_speculative_config(

View File

@ -135,23 +135,20 @@ def run_headless(args: argparse.Namespace):
def run_multi_api_server(args: argparse.Namespace): def run_multi_api_server(args: argparse.Namespace):
assert not args.headless assert not args.headless
num_api_servers = args.api_server_count num_api_servers: int = args.api_server_count
assert num_api_servers > 0 assert num_api_servers > 0
orig_mm_processor_cache_gb = args.mm_processor_cache_gb
if num_api_servers > 1: if num_api_servers > 1:
setup_multiprocess_prometheus() setup_multiprocess_prometheus()
# Not compatible with API server scale-out
args.mm_processor_cache_gb = 0
listen_address, sock = setup_server(args) listen_address, sock = setup_server(args)
engine_args = vllm.AsyncEngineArgs.from_cli_args(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
engine_args._api_process_count = num_api_servers
engine_args._api_process_rank = -1
usage_context = UsageContext.OPENAI_API_SERVER usage_context = UsageContext.OPENAI_API_SERVER
vllm_config = engine_args.create_engine_config(usage_context=usage_context) vllm_config = engine_args.create_engine_config(usage_context=usage_context)
model_config = vllm_config.model_config
if num_api_servers > 1: if num_api_servers > 1:
if not envs.VLLM_USE_V1: if not envs.VLLM_USE_V1:
@ -161,10 +158,6 @@ def run_multi_api_server(args: argparse.Namespace):
raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
"with api_server_count > 1") "with api_server_count > 1")
if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0:
logger.warning("Multi-modal processor cache is disabled because "
"it is not compatible with `api_server_count > 1`.")
executor_class = Executor.get_class(vllm_config) executor_class = Executor.get_class(vllm_config)
log_stats = not engine_args.disable_log_stats log_stats = not engine_args.disable_log_stats
@ -221,9 +214,10 @@ def run_api_server_worker_proc(listen_address,
client_config=None, client_config=None,
**uvicorn_kwargs) -> None: **uvicorn_kwargs) -> None:
"""Entrypoint for individual API server worker processes.""" """Entrypoint for individual API server worker processes."""
client_config = client_config or {}
server_index = client_config.get("client_index", 0)
# Set process title and add process-specific prefix to stdout and stderr. # Set process title and add process-specific prefix to stdout and stderr.
server_index = client_config.get("client_index", 0) if client_config else 0
set_process_title("APIServer", str(server_index)) set_process_title("APIServer", str(server_index))
decorate_logs() decorate_logs()

View File

@ -151,9 +151,11 @@ class LLM:
multi-modal processor obtained from `AutoProcessor.from_pretrained`. multi-modal processor obtained from `AutoProcessor.from_pretrained`.
The available overrides depend on the model that is being run. The available overrides depend on the model that is being run.
For example, for Phi-3-Vision: `{"num_crops": 4}`. For example, for Phi-3-Vision: `{"num_crops": 4}`.
override_pooler_config: Initialize non-default pooling config or pooler_config: Initialize non-default pooling config for the pooling
override default pooling config for the pooling model. model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This
argument is deprecated and will be removed in v0.12.0 or v1.0.0,
whichever is sooner.
compilation_config: Either an integer or a dictionary. If it is an compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration. is a dictionary, it can specify the full compilation configuration.
@ -191,6 +193,7 @@ class LLM:
hf_token: Optional[Union[bool, str]] = None, hf_token: Optional[Union[bool, str]] = None,
hf_overrides: Optional[HfOverrides] = None, hf_overrides: Optional[HfOverrides] = None,
mm_processor_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None,
pooler_config: Optional[PoolerConfig] = None,
override_pooler_config: Optional[PoolerConfig] = None, override_pooler_config: Optional[PoolerConfig] = None,
structured_outputs_config: Optional[Union[dict[ structured_outputs_config: Optional[Union[dict[
str, Any], StructuredOutputsConfig]] = None, str, Any], StructuredOutputsConfig]] = None,
@ -288,6 +291,7 @@ class LLM:
hf_token=hf_token, hf_token=hf_token,
hf_overrides=hf_overrides, hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
pooler_config=pooler_config,
override_pooler_config=override_pooler_config, override_pooler_config=override_pooler_config,
structured_outputs_config=structured_outputs_instance, structured_outputs_config=structured_outputs_instance,
compilation_config=compilation_config_instance, compilation_config=compilation_config_instance,

View File

@ -17,13 +17,14 @@ from argparse import Namespace
from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from http import HTTPStatus from http import HTTPStatus
from typing import Annotated, Any, Callable, Optional from typing import Annotated, Any, Callable, Literal, Optional
import prometheus_client import prometheus_client
import pydantic import pydantic
import regex as re import regex as re
import uvloop import uvloop
from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi import (APIRouter, Depends, FastAPI, Form, HTTPException, Query,
Request)
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse from fastapi.responses import JSONResponse, Response, StreamingResponse
@ -166,6 +167,9 @@ async def build_async_engine_client(
# Context manager to handle engine_client lifecycle # Context manager to handle engine_client lifecycle
# Ensures everything is shutdown and cleaned up on error/exit # Ensures everything is shutdown and cleaned up on error/exit
engine_args = AsyncEngineArgs.from_cli_args(args) engine_args = AsyncEngineArgs.from_cli_args(args)
if client_config:
engine_args._api_process_count = client_config.get("client_count", 1)
engine_args._api_process_rank = client_config.get("client_index", 0)
if disable_frontend_multiprocessing is None: if disable_frontend_multiprocessing is None:
disable_frontend_multiprocessing = bool( disable_frontend_multiprocessing = bool(
@ -209,8 +213,12 @@ async def build_async_engine_client_from_engine_args(
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
async_llm: Optional[AsyncLLM] = None async_llm: Optional[AsyncLLM] = None
client_count = client_config.pop("client_count") if client_config else 1
client_index = client_config.pop("client_index") if client_config else 0 # Don't mutate the input client_config
client_config = dict(client_config) if client_config else {}
client_count = client_config.pop("client_count", 1)
client_index = client_config.pop("client_index", 0)
try: try:
async_llm = AsyncLLM.from_vllm_config( async_llm = AsyncLLM.from_vllm_config(
vllm_config=vllm_config, vllm_config=vllm_config,
@ -956,9 +964,22 @@ if envs.VLLM_SERVER_DEV_MODE:
logger.warning("SECURITY WARNING: Development endpoints are enabled! " logger.warning("SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!") "This should NOT be used in production!")
PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
@router.get("/server_info") @router.get("/server_info")
async def show_server_info(raw_request: Request): async def show_server_info(
server_info = {"vllm_config": str(raw_request.app.state.vllm_config)} raw_request: Request,
config_format: Annotated[Literal["text", "json"],
Query()] = "text",
):
vllm_config: VllmConfig = raw_request.app.state.vllm_config
server_info = {
"vllm_config":
str(vllm_config)
if config_format == "text" else PydanticVllmConfig.dump_python(
vllm_config, mode="json", fallback=str)
# fallback=str is needed to handle e.g. torch.dtype
}
return JSONResponse(content=server_info) return JSONResponse(content=server_info)
@router.post("/reset_prefix_cache") @router.post("/reset_prefix_cache")
@ -1856,8 +1877,6 @@ async def run_server_worker(listen_address,
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
ToolParserManager.import_tool_parser(args.tool_parser_plugin) ToolParserManager.import_tool_parser(args.tool_parser_plugin)
server_index = client_config.get("client_index", 0) if client_config else 0
# Load logging config for uvicorn if specified # Load logging config for uvicorn if specified
log_config = load_log_config(args.log_config_file) log_config = load_log_config(args.log_config_file)
if log_config is not None: if log_config is not None:
@ -1873,7 +1892,8 @@ async def run_server_worker(listen_address,
vllm_config = await engine_client.get_vllm_config() vllm_config = await engine_client.get_vllm_config()
await init_app_state(engine_client, vllm_config, app.state, args) await init_app_state(engine_client, vllm_config, app.state, args)
logger.info("Starting vLLM API server %d on %s", server_index, logger.info("Starting vLLM API server %d on %s",
vllm_config.parallel_config._api_process_rank,
listen_address) listen_address)
shutdown_task = await serve_http( shutdown_task = await serve_http(
app, app,

View File

@ -14,7 +14,7 @@ from torch import nn
from vllm.config.lora import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.peft_helper import PEFTHelper from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.punica_wrapper import get_punica_wrapper
from vllm.lora.utils import (from_layer, from_layer_logits_processor, from vllm.lora.utils import (from_layer, from_layer_logits_processor,

View File

@ -6,7 +6,7 @@ from typing import Any, Literal, Optional, Union
import torch import torch
from vllm.config.lora import LoRAConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.models import (LoRAModel, LoRAModelManager, from vllm.lora.models import (LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager, create_lora_manager) LRUCacheLoRAModelManager, create_lora_manager)
@ -27,25 +27,26 @@ class WorkerLoRAManager:
def __init__( def __init__(
self, self,
max_num_seqs: int, vllm_config: VllmConfig,
max_num_batched_tokens: int,
vocab_size: int,
lora_config: LoRAConfig,
device: torch.device, device: torch.device,
embedding_modules: dict[str, str], embedding_modules: dict[str, str],
embedding_padding_modules: list[str], embedding_padding_modules: list[str],
lora_model_cls: type[LoRAModel] = LoRAModel, lora_model_cls: type[LoRAModel] = LoRAModel,
max_position_embeddings: Optional[int] = None,
): ):
self._lora_model_cls = lora_model_cls self._lora_model_cls = lora_model_cls
self.embedding_modules = embedding_modules self.embedding_modules = embedding_modules
self.embedding_padding_modules = embedding_padding_modules self.embedding_padding_modules = embedding_padding_modules
self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
self.max_num_seqs = max_num_seqs self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
self.max_num_batched_tokens = max_num_batched_tokens self.max_num_batched_tokens = (
self.vocab_size = vocab_size vllm_config.scheduler_config.max_num_batched_tokens)
self.lora_config = lora_config self.vocab_size = vllm_config.model_config.get_vocab_size()
self.max_position_embeddings = max_position_embeddings self.lora_config = vllm_config.lora_config
# Use get_text_config() in case of multimodal models
text_config = vllm_config.model_config.hf_config.get_text_config()
self.max_position_embeddings = text_config.max_position_embeddings
self.device = device self.device = device
# Lazily initialized by create_lora_manager. # Lazily initialized by create_lora_manager.
self._adapter_manager: LoRAModelManager self._adapter_manager: LoRAModelManager

View File

@ -78,3 +78,12 @@ if HAS_TRITON:
"TritonOrDeepGemmExperts", "TritonOrDeepGemmExperts",
"BatchedTritonOrDeepGemmExperts", "BatchedTritonOrDeepGemmExperts",
] ]
else:
# Some model classes directly use the custom ops. Add placeholders
# to avoid import errors.
def _raise_exception(method: str):
raise NotImplementedError(
f"{method} is not implemented as lack of triton.")
fused_topk = lambda *args, **kwargs: _raise_exception("fused_topk")
fused_experts = lambda *args, **kwargs: _raise_exception("fused_experts")

View File

@ -15,7 +15,7 @@ from .common import apply_rotary_emb_dispatch
@triton.jit @triton.jit
def _triton_qwen2vl_mrope_forward( def _triton_mrope_forward(
q_ptr, q_ptr,
k_ptr, k_ptr,
cos, cos,
@ -30,12 +30,14 @@ def _triton_qwen2vl_mrope_forward(
pad_hd: tl.constexpr, pad_hd: tl.constexpr,
mrope_section_t: tl.constexpr, mrope_section_t: tl.constexpr,
mrope_section_h: tl.constexpr, mrope_section_h: tl.constexpr,
mrope_section_w: tl.constexpr,
is_interleaved: tl.constexpr,
): ):
# Adapted from # Adapted from
# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py
# This version supports flatten input tensors from vllm # This version supports flatten input tensors from vllm
# and supports cos and sin cache with shape (3, num_tokens, head_dim // 2) # and supports cos and sin cache with shape (3, num_tokens, head_dim // 2)
# instead of (3, bsz, seq_len, head_dim) # instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary
pid = tl.program_id(0) pid = tl.program_id(0)
# locate start address # locate start address
q_ptr = q_ptr + pid * (n_qh * hd) q_ptr = q_ptr + pid * (n_qh * hd)
@ -47,9 +49,6 @@ def _triton_qwen2vl_mrope_forward(
# #################################################################### # ####################################################################
# Note: cos and sin now have shape (3, num_tokens, head_dim // 2) # Note: cos and sin now have shape (3, num_tokens, head_dim // 2)
t_end = mrope_section_t
h_end = t_end + mrope_section_h
# Updated stride calculation for half head_dim # Updated stride calculation for half head_dim
half_rd = rd // 2 half_rd = rd // 2
t_cos = cos + pid * half_rd t_cos = cos + pid * half_rd
@ -61,9 +60,18 @@ def _triton_qwen2vl_mrope_forward(
# Updated offsets for half head_dim # Updated offsets for half head_dim
cos_offsets = tl.arange(0, pad_hd // 2) cos_offsets = tl.arange(0, pad_hd // 2)
t_mask = cos_offsets < t_end if is_interleaved:
h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) h_mask = (((cos_offsets % 3) == 1) &
w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) (cos_offsets <= 3 * mrope_section_h))
w_mask = (((cos_offsets % 3) == 2) &
(cos_offsets <= 3 * mrope_section_w))
t_mask = ~(h_mask | w_mask)
else:
t_end = mrope_section_t
h_end = t_end + mrope_section_h
t_mask = cos_offsets < mrope_section_t
h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd)
t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0)
h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0)
@ -131,6 +139,7 @@ def triton_mrope(
mrope_section: list[int], mrope_section: list[int],
head_size: int, head_size: int,
rotary_dim: int, rotary_dim: int,
mrope_interleaved: bool,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
"""Qwen2VL mrope kernel. """Qwen2VL mrope kernel.
@ -158,7 +167,7 @@ def triton_mrope(
cos = cos.contiguous() cos = cos.contiguous()
sin = sin.contiguous() sin = sin.contiguous()
_triton_qwen2vl_mrope_forward[(n_row, )]( _triton_mrope_forward[(n_row, )](
q, q,
k, k,
cos, cos,
@ -173,6 +182,8 @@ def triton_mrope(
pad_hd, pad_hd,
mrope_section[0], mrope_section[0],
mrope_section[1], mrope_section[1],
mrope_section[2],
mrope_interleaved,
) )
return q, k return q, k
@ -201,7 +212,7 @@ class MRotaryEmbedding(RotaryEmbedding):
is_neox_style: bool, is_neox_style: bool,
dtype: torch.dtype, dtype: torch.dtype,
mrope_section: Optional[list[int]] = None, mrope_section: Optional[list[int]] = None,
mrope_interleaved: Optional[bool] = False, mrope_interleaved: bool = False,
) -> None: ) -> None:
# In Qwen2.5-VL, the maximum index value is related to the duration of # In Qwen2.5-VL, the maximum index value is related to the duration of
# the input video. We enlarge max_position_embeddings to 4 times to get # the input video. We enlarge max_position_embeddings to 4 times to get
@ -282,10 +293,6 @@ class MRotaryEmbedding(RotaryEmbedding):
assert positions.ndim == 1 or positions.ndim == 2 assert positions.ndim == 1 or positions.ndim == 2
assert key is not None assert key is not None
if self.mrope_interleaved:
# TODO: add triton implementation to support mrope-interleaved
return self.forward_native(positions, query, key)
num_tokens = positions.shape[-1] num_tokens = positions.shape[-1]
cos_sin = self.cos_sin_cache[positions] cos_sin = self.cos_sin_cache[positions]
cos, sin = cos_sin.chunk(2, dim=-1) cos, sin = cos_sin.chunk(2, dim=-1)
@ -302,6 +309,7 @@ class MRotaryEmbedding(RotaryEmbedding):
self.mrope_section, self.mrope_section,
self.head_size, self.head_size,
self.rotary_dim, self.rotary_dim,
self.mrope_interleaved,
) )
return q.reshape(query_shape), k.reshape(key_shape) return q.reshape(query_shape), k.reshape(key_shape)

View File

@ -7,7 +7,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm import envs from vllm import envs
from vllm.platforms import current_platform from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils import direct_register_custom_op from vllm.utils import direct_register_custom_op
@ -167,7 +167,8 @@ def dispatch_cpu_unquantized_gemm(
if remove_weight: if remove_weight:
layer.weight = torch.nn.Parameter(torch.empty(0), layer.weight = torch.nn.Parameter(torch.empty(0),
requires_grad=False) requires_grad=False)
elif ops._supports_onednn: elif (ops._supports_onednn
and current_platform.get_cpu_architecture() == CpuArchEnum.X86):
origin_weight = layer.weight origin_weight = layer.weight
if remove_weight: if remove_weight:
layer.weight = torch.nn.Parameter(torch.empty(0), layer.weight = torch.nn.Parameter(torch.empty(0),

View File

@ -13,8 +13,7 @@ from torch import nn
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.attention import Attention from vllm.attention import Attention
from vllm.config import (ModelConfig, ModelImpl, VllmConfig, from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
set_current_vllm_config)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.linear import QKVCrossParallelLinear from vllm.model_executor.layers.linear import QKVCrossParallelLinear
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.base_config import (
@ -176,8 +175,8 @@ def get_model_architecture(
) )
if arch == model_config._get_transformers_backend_cls(): if arch == model_config._get_transformers_backend_cls():
assert model_config.model_impl != ModelImpl.VLLM assert model_config.model_impl != "vllm"
if model_config.model_impl == ModelImpl.AUTO: if model_config.model_impl == "auto":
logger.warning_once( logger.warning_once(
"%s has no vLLM implementation, falling back to Transformers " "%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and " "implementation. Some features may not be supported and "

View File

@ -680,7 +680,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
batch. batch.
Info: Info:
[Blip2ImageInputs][] [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs]
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:

View File

@ -46,7 +46,8 @@ from vllm.sequence import IntermediateTensors
from .clip import CLIPVisionModel from .clip import CLIPVisionModel
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .siglip import SiglipVisionModel from .siglip import SiglipVisionModel
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .utils import (AutoWeightsLoader, init_vllm_registered_model,
maybe_prefix, merge_multimodal_embeddings)
from .vision import get_vision_encoder_info from .vision import get_vision_encoder_info
EOT = "<|endofturn|>" EOT = "<|endofturn|>"
@ -740,33 +741,20 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None, multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
**kwargs,
) -> torch.Tensor: ) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids) inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if (kwargs.get("pixel_values_images") is not None if multimodal_embeddings is not None \
or kwargs.get("pixel_values_videos") and len(multimodal_embeddings) != 0:
is not None): # v0 compatibility inputs_embeds = merge_multimodal_embeddings(
multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) input_ids,
if multimodal_embeddings is not None: inputs_embeds,
multimodal_embeddings = torch.cat(multimodal_embeddings, dim=0) multimodal_embeddings,
_mask_image = input_ids == self.config.image_token_id placeholder_token_id=[
_mask_video = input_ids == self.config.video_token_id self.config.image_token_id,
assert _mask_image.sum() + _mask_video.sum() == len( self.config.video_token_id,
multimodal_embeddings) ],
)
if multimodal_embeddings.dtype != inputs_embeds.dtype:
multimodal_embeddings = multimodal_embeddings.to(
dtype=inputs_embeds.dtype)
if multimodal_embeddings.device != inputs_embeds.device:
multimodal_embeddings = multimodal_embeddings.to(
device=inputs_embeds.device)
if _mask_image.sum() > 0:
inputs_embeds[
_mask_image] = multimodal_embeddings[:sum(_mask_image)]
if _mask_video.sum() > 0:
inputs_embeds[_mask_video] = multimodal_embeddings[
-sum(_mask_video):]
return inputs_embeds return inputs_embeds
def forward( def forward(
@ -783,8 +771,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
# NOTE: In v1, inputs_embeds is always generated at model runner, this # NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility. # condition is for v0 compatibility.
elif inputs_embeds is None: elif inputs_embeds is None:
inputs_embeds = self.get_input_embeddings(input_ids=input_ids, multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
**kwargs) inputs_embeds = self.get_input_embeddings(input_ids,
multimodal_embeddings)
input_ids = None input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,

View File

@ -23,7 +23,6 @@ from vllm.utils import supports_kw
from .interfaces_base import is_pooling_model from .interfaces_base import is_pooling_model
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.models.utils import WeightsMapper
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
@ -97,33 +96,10 @@ class SupportsMultiModal(Protocol):
""" """
... ...
# Only for models that support v0 chunked prefill
# TODO(ywang96): Remove this overload once v0 is deprecated
@overload
def get_input_embeddings( def get_input_embeddings(
self, self,
input_ids: Tensor, input_ids: Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None, multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
attn_metadata: Optional["AttentionMetadata"] = None,
) -> Tensor:
...
# TODO: Remove this overload once v0 is deprecated
@overload
def get_input_embeddings(
self,
input_ids: Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> Tensor:
...
def get_input_embeddings(
self,
input_ids: Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
# Only necessary so that the v0 overload is valid
# TODO: Remove attn_metadata once v0 is deprecated
attn_metadata: Optional["AttentionMetadata"] = None,
) -> Tensor: ) -> Tensor:
""" """
Returns the input embeddings merged from the text embeddings from Returns the input embeddings merged from the text embeddings from

View File

@ -737,7 +737,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
inputs_embeds: Optional tensor of input embeddings. inputs_embeds: Optional tensor of input embeddings.
Info: Info:
[LlavaImageInputs][] [`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs]
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None

View File

@ -527,7 +527,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
Unlike in LLaVA-1.5, the number of image tokens inputted to the language Unlike in LLaVA-1.5, the number of image tokens inputted to the language
model depends on the original size of the input image. Including the model depends on the original size of the input image. Including the
original image token in the input, the required number of image tokens original image token in the input, the required number of image tokens
is given by [get_llava_next_image_feature_size][]. is given by [`LlavaNextProcessingInfo.get_num_image_tokens`][vllm.\
model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
This way, the `positions` and `attn_metadata` are consistent This way, the `positions` and `attn_metadata` are consistent
with the `input_ids`. with the `input_ids`.
@ -540,7 +541,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
inputs_embeds: Optional tensor of input embeddings. inputs_embeds: Optional tensor of input embeddings.
Info: Info:
[LlavaNextImageInputs][] [`LlavaNextImageInputs`][vllm.model_executor.models.llava_next.LlavaNextImageInputs]
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None

View File

@ -306,7 +306,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
eps=self.layer_norm_epsilon, eps=self.layer_norm_epsilon,
group_size=None, group_size=None,
norm_before_gate=True, norm_before_gate=True,
device=torch.cuda.current_device(), device=current_platform.current_device(),
dtype=config.torch_dtype, dtype=config.torch_dtype,
) )

View File

@ -223,9 +223,7 @@ class Qwen3_VisionPatchMerger(nn.Module):
if norm_layer is None: if norm_layer is None:
norm_layer = partial(nn.LayerNorm, eps=1e-6) norm_layer = partial(nn.LayerNorm, eps=1e-6)
self.use_postshuffle_norm = use_postshuffle_norm self.norm = norm_layer(context_dim)
self.norm = norm_layer(
self.hidden_size if use_postshuffle_norm else context_dim)
self.linear_fc1 = ColumnParallelLinear(self.hidden_size, self.linear_fc1 = ColumnParallelLinear(self.hidden_size,
self.hidden_size, self.hidden_size,
bias=True, bias=True,
@ -1075,6 +1073,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
config.text_config.hidden_size) config.text_config.hidden_size)
for _ in range(self.deepstack_num_level) for _ in range(self.deepstack_num_level)
] if self.use_deepstack else None ] if self.use_deepstack else None
self.visual_dim = config.vision_config.out_hidden_size
self.multiscale_dim = self.visual_dim * self.deepstack_num_level
def _get_deepstack_input_embeds(self, def _get_deepstack_input_embeds(self,
num_tokens: int) -> IntermediateTensors: num_tokens: int) -> IntermediateTensors:
@ -1313,12 +1313,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
] ]
multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0) multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0)
visual_dim = multimodal_embeddings_cat.shape[-1] // (
self.deepstack_num_level + 1)
main_dim, multi_dim = visual_dim, visual_dim * self.deepstack_num_level
multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501 multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501
multimodal_embeddings_cat, [main_dim, multi_dim], multimodal_embeddings_cat, [self.visual_dim, self.multiscale_dim],
dim=-1) dim=-1)
multimodal_embeddings = torch.split(multimodal_embeddings_main, multimodal_embeddings = torch.split(multimodal_embeddings_main,
@ -1340,10 +1336,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
], ],
) )
deepstack_input_embeds = deepstack_input_embeds.view( deepstack_input_embeds = deepstack_input_embeds.view(
inputs_embeds.shape[0], self.deepstack_num_level, inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim)
visual_dim).contiguous() deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2)
deepstack_input_embeds = deepstack_input_embeds.permute(
1, 0, 2).contiguous()
return deepstack_input_embeds, multimodal_embeddings return deepstack_input_embeds, multimodal_embeddings
def get_input_embeddings( def get_input_embeddings(
@ -1353,9 +1347,10 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
) -> torch.Tensor: ) -> torch.Tensor:
deepstack_input_embeds = None deepstack_input_embeds = None
inputs_embeds = self.language_model.get_input_embeddings(input_ids) inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None and self.use_deepstack: if multimodal_embeddings is not None:
deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501 if self.use_deepstack:
input_ids, inputs_embeds, multimodal_embeddings) deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501
input_ids, inputs_embeds, multimodal_embeddings)
inputs_embeds = merge_multimodal_embeddings( inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings, input_ids, inputs_embeds, multimodal_embeddings,
[self.config.image_token_id, self.config.video_token_id]) [self.config.image_token_id, self.config.video_token_id])

View File

@ -344,3 +344,5 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
config.text_config.hidden_size) config.text_config.hidden_size)
for _ in range(self.deepstack_num_level) for _ in range(self.deepstack_num_level)
] if self.use_deepstack else None ] if self.use_deepstack else None
self.visual_dim = config.vision_config.out_hidden_size
self.multiscale_dim = self.visual_dim * self.deepstack_num_level

View File

@ -19,7 +19,7 @@ from typing import Callable, Optional, TypeVar, Union
import torch.nn as nn import torch.nn as nn
import transformers import transformers
from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults, from vllm.config import (ModelConfig, iter_architecture_defaults,
try_match_architecture_defaults) try_match_architecture_defaults)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.dynamic_module import ( from vllm.transformers_utils.dynamic_module import (
@ -587,7 +587,7 @@ class _ModelRegistry:
if model_module is not None: if model_module is not None:
break break
else: else:
if model_config.model_impl != ModelImpl.TRANSFORMERS: if model_config.model_impl != "transformers":
return None return None
raise ValueError( raise ValueError(
@ -598,7 +598,7 @@ class _ModelRegistry:
"'auto_map' (relevant if the model is custom).") "'auto_map' (relevant if the model is custom).")
if not model_module.is_backend_compatible(): if not model_module.is_backend_compatible():
if model_config.model_impl != ModelImpl.TRANSFORMERS: if model_config.model_impl != "transformers":
return None return None
raise ValueError( raise ValueError(
@ -644,20 +644,20 @@ class _ModelRegistry:
raise ValueError("No model architectures are specified") raise ValueError("No model architectures are specified")
# Require transformers impl # Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS: if model_config.model_impl == "transformers":
arch = self._try_resolve_transformers(architectures[0], arch = self._try_resolve_transformers(architectures[0],
model_config) model_config)
if arch is not None: if arch is not None:
model_info = self._try_inspect_model_cls(arch) model_info = self._try_inspect_model_cls(arch)
if model_info is not None: if model_info is not None:
return (model_info, arch) return (model_info, arch)
elif model_config.model_impl == ModelImpl.TERRATORCH: elif model_config.model_impl == "terratorch":
model_info = self._try_inspect_model_cls("Terratorch") model_info = self._try_inspect_model_cls("Terratorch")
return (model_info, "Terratorch") return (model_info, "Terratorch")
# Fallback to transformers impl (after resolving convert_type) # Fallback to transformers impl (after resolving convert_type)
if (all(arch not in self.models for arch in architectures) if (all(arch not in self.models for arch in architectures)
and model_config.model_impl == ModelImpl.AUTO and model_config.model_impl == "auto"
and getattr(model_config, "convert_type", "none") == "none"): and getattr(model_config, "convert_type", "none") == "none"):
arch = self._try_resolve_transformers(architectures[0], arch = self._try_resolve_transformers(architectures[0],
model_config) model_config)
@ -674,7 +674,7 @@ class _ModelRegistry:
# Fallback to transformers impl (before resolving runner_type) # Fallback to transformers impl (before resolving runner_type)
if (all(arch not in self.models for arch in architectures) if (all(arch not in self.models for arch in architectures)
and model_config.model_impl == ModelImpl.AUTO): and model_config.model_impl == "auto"):
arch = self._try_resolve_transformers(architectures[0], arch = self._try_resolve_transformers(architectures[0],
model_config) model_config)
if arch is not None: if arch is not None:
@ -695,14 +695,14 @@ class _ModelRegistry:
raise ValueError("No model architectures are specified") raise ValueError("No model architectures are specified")
# Require transformers impl # Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS: if model_config.model_impl == "transformers":
arch = self._try_resolve_transformers(architectures[0], arch = self._try_resolve_transformers(architectures[0],
model_config) model_config)
if arch is not None: if arch is not None:
model_cls = self._try_load_model_cls(arch) model_cls = self._try_load_model_cls(arch)
if model_cls is not None: if model_cls is not None:
return (model_cls, arch) return (model_cls, arch)
elif model_config.model_impl == ModelImpl.TERRATORCH: elif model_config.model_impl == "terratorch":
arch = "Terratorch" arch = "Terratorch"
model_cls = self._try_load_model_cls(arch) model_cls = self._try_load_model_cls(arch)
if model_cls is not None: if model_cls is not None:
@ -710,7 +710,7 @@ class _ModelRegistry:
# Fallback to transformers impl (after resolving convert_type) # Fallback to transformers impl (after resolving convert_type)
if (all(arch not in self.models for arch in architectures) if (all(arch not in self.models for arch in architectures)
and model_config.model_impl == ModelImpl.AUTO and model_config.model_impl == "auto"
and getattr(model_config, "convert_type", "none") == "none"): and getattr(model_config, "convert_type", "none") == "none"):
arch = self._try_resolve_transformers(architectures[0], arch = self._try_resolve_transformers(architectures[0],
model_config) model_config)
@ -727,7 +727,7 @@ class _ModelRegistry:
# Fallback to transformers impl (before resolving runner_type) # Fallback to transformers impl (before resolving runner_type)
if (all(arch not in self.models for arch in architectures) if (all(arch not in self.models for arch in architectures)
and model_config.model_impl == ModelImpl.AUTO): and model_config.model_impl == "auto"):
arch = self._try_resolve_transformers(architectures[0], arch = self._try_resolve_transformers(architectures[0],
model_config) model_config)
if arch is not None: if arch is not None:

View File

@ -27,7 +27,7 @@ from transformers import (AutoModel, BatchFeature, PretrainedConfig,
PreTrainedModel) PreTrainedModel)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from vllm.attention import Attention from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, VllmConfig) ParallelConfig, VllmConfig)
@ -452,8 +452,9 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
self.pp_rank = self.pp_group.rank_in_group self.pp_rank = self.pp_group.rank_in_group
self.tp_size = get_tensor_model_parallel_world_size() self.tp_size = get_tensor_model_parallel_world_size()
# To be updated in child classes for use in `load_weights` # Weights to skip in `self.load_weights`
self.skip_prefixes: Optional[list[str]] = None self.skip_prefixes: list[str] = []
self.skip_substrs: list[str] = []
# Set correct attn and init on "meta" to delay allocating GPU tensors # Set correct attn and init on "meta" to delay allocating GPU tensors
# TODO: @raushan, use the public `model.set_attn_implementation()` # TODO: @raushan, use the public `model.set_attn_implementation()`
@ -596,7 +597,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
_tensor_parallel(self.model) _tensor_parallel(self.model)
def create_attention_instances(self) -> dict[int, Attention]: def create_attention_instances(
self,
attn_type: AttentionType = AttentionType.DECODER
) -> dict[int, Attention]:
""" """
Create `Attention` instances to inform KV cache allocation. Create `Attention` instances to inform KV cache allocation.
""" """
@ -625,7 +629,8 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
cache_config=self.cache_config, cache_config=self.cache_config,
quant_config=self.quant_config, quant_config=self.quant_config,
per_layer_sliding_window=per_layer_sliding_window, per_layer_sliding_window=per_layer_sliding_window,
prefix=f"{i}.attn") prefix=f"{i}.attn",
attn_type=attn_type)
return attention_instances return attention_instances
def init_parameters(self, module: nn.Module): def init_parameters(self, module: nn.Module):
@ -685,7 +690,11 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
def load_weights(self, weights: Iterable[tuple[str, def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self, skip_prefixes=self.skip_prefixes) loader = AutoWeightsLoader(
self,
skip_prefixes=self.skip_prefixes,
skip_substrs=self.skip_substrs,
)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
@ -700,6 +709,37 @@ class TransformersModel(TransformersBase):
"model.score": "score", "model.score": "score",
}) })
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
# Some encoder models have the position_ids buffer in the checkpoint
# vLLM will always pass position_ids as an argument, so we skip loading
# the buffer if it exists
self.skip_substrs.append("position_ids")
def create_attention_instances(
self, attn_type: AttentionType = AttentionType.DECODER):
# TODO(hmellor): Better way to detect encoder models
# In encoder models, the attention layers will have `is_causal=False`
is_encoder = lambda m: not getattr(m, "is_causal", True)
# vLLM does not support encoder-decoder models, so if any encoder layer
# is found, we assume the whole model is an encoder model
if any(is_encoder(m) for m in self.model.modules()):
attn_type = AttentionType.ENCODER_ONLY
# Check minimum transformers version for encoder models support
if attn_type == AttentionType.ENCODER_ONLY:
import transformers
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("4.57.0.dev0")
if installed < required:
raise ValueError(
"Encoder models with the Transformers backend require "
f"transformers>={required}, but got {installed}")
return super().create_attention_instances(attn_type)
@support_torch_compile(enable_if=can_enable_torch_compile) @support_torch_compile(enable_if=can_enable_torch_compile)
class TransformersForCausalLM(TransformersBase): class TransformersForCausalLM(TransformersBase):
@ -710,7 +750,7 @@ class TransformersForCausalLM(TransformersBase):
# Tell `TransformersBase.load_weights` to skip # Tell `TransformersBase.load_weights` to skip
# `lm_head` if the model has tied word embeddings # `lm_head` if the model has tied word embeddings
if self.text_config.tie_word_embeddings: if self.text_config.tie_word_embeddings:
self.skip_prefixes = ["lm_head."] self.skip_prefixes.append("lm_head.")
if get_pp_group().is_last_rank: if get_pp_group().is_last_rank:
self.unpadded_vocab_size = self.text_config.vocab_size self.unpadded_vocab_size = self.text_config.vocab_size

View File

@ -13,9 +13,7 @@ from transformers import BatchFeature, ProcessorMixin
from transformers.models.whisper import WhisperFeatureExtractor from transformers.models.whisper import WhisperFeatureExtractor
from transformers.models.whisper.modeling_whisper import WhisperEncoder from transformers.models.whisper.modeling_whisper import WhisperEncoder
from vllm import envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.model_loader import DefaultModelLoader from vllm.model_executor.model_loader import DefaultModelLoader
@ -37,8 +35,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP) SupportsMultiModal, SupportsPP)
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
init_vllm_registered_model, maybe_prefix, init_vllm_registered_model, maybe_prefix,
merge_multimodal_embeddings, merge_multimodal_embeddings)
merge_multimodal_embeddings_from_map)
_AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>" _AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>"
_MAX_ENCODER_BATCH_SIZE = 16 _MAX_ENCODER_BATCH_SIZE = 16
@ -568,17 +565,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
safe_input_ids) safe_input_ids)
if multimodal_embeddings is not None and len( if multimodal_embeddings is not None and len(
multimodal_embeddings) > 0: multimodal_embeddings) > 0:
inputs_embeds = merge_multimodal_embeddings(
# TODO(ywang96): remove this block after v0 is deprecated. input_ids, inputs_embeds, multimodal_embeddings,
if not envs.VLLM_USE_V1: self.config.audio_token_index)
attn_metadata = get_forward_context().attn_metadata
merge_multimodal_embeddings_from_map(
inputs_embeds, multimodal_embeddings,
attn_metadata.multi_modal_placeholder_index_maps["audio"])
else:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.config.audio_token_index)
return inputs_embeds return inputs_embeds
def forward(self, def forward(self,

View File

@ -15,7 +15,7 @@ import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.multimodal import NestedTensors
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import (get_cuda_view_from_cpu_tensor, is_pin_memory_available, from vllm.utils import (get_cuda_view_from_cpu_tensor, is_pin_memory_available,
is_uva_available) is_uva_available)
@ -389,22 +389,6 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
_embedding_count_expression(inner) for inner in embeddings) _embedding_count_expression(inner) for inner in embeddings)
def merge_multimodal_embeddings_from_map(
inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors,
placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor:
"""
Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided
placeholder map .
Note:
This updates ``inputs_embeds`` in place.
"""
flattened_embeddings = _flatten_embeddings(multimodal_embeddings)
inputs_embeds[placeholder_map.dest] = flattened_embeddings[
placeholder_map.src].to(dtype=inputs_embeds.dtype)
return inputs_embeds
def _merge_multimodal_embeddings( def _merge_multimodal_embeddings(
inputs_embeds: torch.Tensor, inputs_embeds: torch.Tensor,
is_multimodal: torch.Tensor, is_multimodal: torch.Tensor,

View File

@ -494,7 +494,8 @@ def _enable_processor_cache(
def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool:
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
supports_ipc_cache = (parallel_config.data_parallel_size == 1 supports_ipc_cache = ((parallel_config._api_process_count == 1
and parallel_config.data_parallel_size == 1)
or parallel_config.data_parallel_external_lb) or parallel_config.data_parallel_external_lb)
return supports_ipc_cache return supports_ipc_cache

View File

@ -127,14 +127,23 @@ class PrefixCachingMetrics:
if stats.reset: if stats.reset:
self.reset() self.reset()
# DO NOT appending empty stats to avoid helpful info get kicked out
# due to sliding window.
if stats.requests == 0:
return
# Update the metrics. # Update the metrics.
self.query_queue.append((stats.requests, stats.queries, stats.hits)) self.query_queue.append((stats.requests, stats.queries, stats.hits))
self.aggregated_requests += stats.requests self.aggregated_requests += stats.requests
self.aggregated_query_total += stats.queries self.aggregated_query_total += stats.queries
self.aggregated_query_hit += stats.hits self.aggregated_query_hit += stats.hits
# Remove the oldest stats if the number of requests exceeds. # Remove the oldest stats until number of requests does not exceed
if self.aggregated_requests > self.max_recent_requests: # the limit.
# NOTE: We preserve the latest added stats regardless.
while len(
self.query_queue
) > 1 and self.aggregated_requests > self.max_recent_requests:
old_requests, old_queries, old_hits = self.query_queue.popleft() old_requests, old_queries, old_hits = self.query_queue.popleft()
self.aggregated_requests -= old_requests self.aggregated_requests -= old_requests
self.aggregated_query_total -= old_queries self.aggregated_query_total -= old_queries

View File

@ -15,6 +15,8 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory) KVConnectorFactory)
from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
KVConnectorRole) KVConnectorRole)
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
@ -576,8 +578,10 @@ class Scheduler(SchedulerInterface):
scheduled_spec_decode_tokens, scheduled_spec_decode_tokens,
req_to_new_blocks, req_to_new_blocks,
) )
scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
scheduled_resumed_reqs)
structured_output_request_ids, grammar_bitmask = ( structured_output_request_ids, grammar_bitmask = (
self.get_grammar_bitmask(self.running, self.get_grammar_bitmask(scheduled_requests,
scheduled_spec_decode_tokens)) scheduled_spec_decode_tokens))
scheduler_output = SchedulerOutput( scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data, scheduled_new_reqs=new_reqs_data,
@ -870,9 +874,12 @@ class Scheduler(SchedulerInterface):
num_scheduled_tokens = scheduler_output.num_scheduled_tokens num_scheduled_tokens = scheduler_output.num_scheduled_tokens
pooler_outputs = model_runner_output.pooler_output pooler_outputs = model_runner_output.pooler_output
num_nans_in_logits = model_runner_output.num_nans_in_logits num_nans_in_logits = model_runner_output.num_nans_in_logits
kv_connector_output = model_runner_output.kv_connector_output
outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
spec_decoding_stats: Optional[SpecDecodingStats] = None spec_decoding_stats: Optional[SpecDecodingStats] = None
kv_connector_stats = (kv_connector_output.kv_connector_stats
if kv_connector_output else None)
# NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more, # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more,
# the below loop can be a performance bottleneck. We should do our best # the below loop can be a performance bottleneck. We should do our best
@ -1013,7 +1020,8 @@ class Scheduler(SchedulerInterface):
finished_requests=finished_set) finished_requests=finished_set)
finished_req_ids.clear() finished_req_ids.clear()
if (stats := self.make_stats(spec_decoding_stats)) is not None: if (stats := self.make_stats(spec_decoding_stats,
kv_connector_stats)) is not None:
# Return stats to only one of the front-ends. # Return stats to only one of the front-ends.
if (eco := next(iter(engine_core_outputs.values()), None)) is None: if (eco := next(iter(engine_core_outputs.values()), None)) is None:
# We must return the stats even if there are no request # We must return the stats even if there are no request
@ -1178,20 +1186,21 @@ class Scheduler(SchedulerInterface):
def make_stats( def make_stats(
self, self,
spec_decoding_stats: Optional[SpecDecodingStats] = None, spec_decoding_stats: Optional[SpecDecodingStats] = None,
kv_connector_stats: Optional[KVConnectorStats] = None,
) -> Optional[SchedulerStats]: ) -> Optional[SchedulerStats]:
if not self.log_stats: if not self.log_stats:
return None return None
prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats() prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
assert prefix_cache_stats is not None assert prefix_cache_stats is not None
return SchedulerStats( return SchedulerStats(num_running_reqs=len(self.running),
num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting),
num_waiting_reqs=len(self.waiting), kv_cache_usage=self.kv_cache_manager.usage,
kv_cache_usage=self.kv_cache_manager.usage, prefix_cache_stats=prefix_cache_stats,
prefix_cache_stats=prefix_cache_stats, spec_decoding_stats=spec_decoding_stats,
spec_decoding_stats=spec_decoding_stats, num_corrupted_reqs=sum(req.is_output_corrupted
num_corrupted_reqs=sum(req.is_output_corrupted for req in self.running),
for req in self.running), kv_connector_stats=kv_connector_stats.data
) if kv_connector_stats else None)
def make_spec_decoding_stats( def make_spec_decoding_stats(
self, self,

View File

@ -437,7 +437,7 @@ class MPClient(EngineCoreClient):
self.engines_running = False self.engines_running = False
self.stats_update_address: Optional[str] = None self.stats_update_address: Optional[str] = None
if client_addresses is not None: if client_addresses:
# Engines are managed externally to this client. # Engines are managed externally to this client.
input_address = client_addresses["input_address"] input_address = client_addresses["input_address"]
output_address = client_addresses["output_address"] output_address = client_addresses["output_address"]
@ -774,6 +774,7 @@ class AsyncMPClient(MPClient):
client_addresses=client_addresses, client_addresses=client_addresses,
) )
self.client_count = client_count
self.client_index = client_index self.client_index = client_index
self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs,
Exception]]() Exception]]()

View File

@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch
from vllm import _custom_ops as ops
from vllm.attention import AttentionBackend
from vllm.logger import init_logger
from vllm.utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
TransferResult, TransferSpec)
logger = init_logger(__name__)
def expand_block_ids(block_ids: np.ndarray,
                     block_size_factor: int,
                     output: np.ndarray,
                     skip_count: int = 0):
    """
    Expand each block ID into the IDs of its constituent sub-blocks.

    Each block with ID b is composed of block_size_factor consecutive
    sub-blocks [b * block_size_factor, ..., (b + 1) * block_size_factor - 1].
    The expanded IDs are written into the beginning of ``output``; entries
    of ``output`` beyond the expanded length are left untouched.
    The first skip_count sub-blocks of the *first* block are skipped.
    Note that skip_count must be less than block_size_factor.

    For example, if block_ids = [0, 1, 3] and block_size_factor = 4,
    then it yields [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]
    since 0 maps to [0, 1, 2, 3]
    1 maps to [4, 5, 6, 7]
    and 3 maps to [12, 13, 14, 15]

    Args:
        block_ids: 1D array of block IDs to expand.
        block_size_factor: number of sub-blocks composing each block.
        output: pre-allocated 1D array receiving the expanded IDs; must
            hold at least len(block_ids) * block_size_factor - skip_count
            elements.
        skip_count: number of leading sub-blocks of the first block to
            omit from the output.
    """
    assert skip_count < block_size_factor

    # Vectorized expansion: build the (num_blocks, block_size_factor) grid
    # of sub-block IDs via broadcasting, flatten it, and drop the first
    # skip_count entries. This replaces a per-block Python loop with a
    # single C-level computation.
    base_ids = np.asarray(block_ids, dtype=np.int64).reshape(-1, 1)
    sub_block_ids = base_ids * block_size_factor + np.arange(
        block_size_factor, dtype=np.int64)
    expanded = sub_block_ids.reshape(-1)[skip_count:]
    # Only write the expanded prefix, matching the original contract that
    # trailing entries of `output` are not modified.
    output[:expanded.size] = expanded
class CpuGpuOffloadingHandler(OffloadingHandler):
    """
    OffloadingHandler that copies KV-cache blocks between GPU and CPU.

    It allocates one CPU tensor per GPU KV-cache tensor (optionally
    pin-memory backed), and performs asynchronous block copies via
    ``ops.swap_blocks`` on two dedicated CUDA streams (one per direction).
    Completion of each transfer job is tracked with a pooled CUDA event,
    polled by ``get_finished()``.

    A CPU block aggregates ``cpu_block_size // gpu_block_size`` GPU
    blocks; CPU tensors are therefore indexed in GPU-block-sized
    sub-block units.
    """

    def __init__(self, gpu_block_size: int, cpu_block_size: int,
                 num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor],
                 attn_backends: dict[str, type[AttentionBackend]]):
        """
        Args:
            gpu_block_size: number of tokens per GPU KV-cache block.
            cpu_block_size: number of tokens per CPU block; must be a
                multiple of gpu_block_size.
            num_cpu_blocks: number of CPU blocks to allocate per layer.
            gpu_caches: layer name -> GPU KV-cache tensor.
            attn_backends: layer name -> attention backend class, used only
                to discover the KV-cache tensor layout per layer.
        """
        # Each CPU block aggregates a whole number of GPU blocks.
        assert cpu_block_size % gpu_block_size == 0
        self.block_size_factor = cpu_block_size // gpu_block_size
        # cuda streams for gpu->cpu and cpu->gpu
        self.d2h_stream = torch.cuda.Stream()
        self.h2d_stream = torch.cuda.Stream()
        # job_id -> transfer cuda event
        self.transfer_events: dict[int, torch.cuda.Event] = {}
        # list of cuda events available for re-use
        self.events_pool: list[torch.cuda.Event] = []
        pin_memory = is_pin_memory_available()
        # allocate cpu tensors
        logger.info("Allocating %d CPU tensors...", len(gpu_caches))
        self.gpu_tensors: list[torch.Tensor] = []
        self.cpu_tensors: list[torch.Tensor] = []
        # Per-layer flag: True when the cache layout is (2, num_blocks, ...)
        # (separate K/V leading dim), False when it is (num_blocks, ...).
        self.kv_dim_before_num_blocks: list[bool] = []
        for layer_name, gpu_tensor in gpu_caches.items():
            self.gpu_tensors.append(gpu_tensor)
            gpu_shape = gpu_tensor.shape
            # Probe the backend's KV-cache layout with a sentinel
            # num_blocks value to find which dimension indexes blocks.
            test_shape = attn_backends[layer_name].get_kv_cache_shape(
                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256)
            if test_shape[0] == 1234:
                # shape is (num_blocks, ...)
                num_blocks_idx = 0
                self.kv_dim_before_num_blocks.append(False)
            else:
                # shape should be (2, num_blocks, ...)
                assert test_shape[0] == 2
                assert test_shape[1] == 1234
                assert gpu_shape[0] == 2
                num_blocks_idx = 1
                self.kv_dim_before_num_blocks.append(True)
            # The CPU tensor mirrors the GPU layout but is indexed in
            # GPU-block-sized sub-blocks, hence the block_size_factor scale.
            cpu_shape = list(gpu_shape)
            cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor
            logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
            self.cpu_tensors.append(
                torch.zeros(cpu_shape,
                            dtype=gpu_tensor.dtype,
                            device="cpu",
                            pin_memory=pin_memory))

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        """
        Launch an asynchronous block copy for ``job_id`` as described by
        ``spec`` (a (src, dst) pair of load/store specs), on the CUDA
        stream matching the transfer direction. A CUDA event is recorded
        on the stream and stored under ``job_id`` for later polling by
        ``get_finished()``. Returns True (the launch itself cannot fail
        here).
        """
        src_spec, dst_spec = spec
        # Select direction-specific stream, tensors and per-side
        # sub-block granularity (CPU side is indexed in sub-blocks).
        if isinstance(src_spec, CPULoadStoreSpec):
            assert isinstance(dst_spec, GPULoadStoreSpec)
            stream = self.h2d_stream
            src_tensors = self.cpu_tensors
            dst_tensors = self.gpu_tensors
            src_block_size_factor = self.block_size_factor
            dst_block_size_factor = 1
        else:
            assert isinstance(src_spec, GPULoadStoreSpec)
            assert isinstance(dst_spec, CPULoadStoreSpec)
            stream = self.d2h_stream
            src_tensors = self.gpu_tensors
            dst_tensors = self.cpu_tensors
            src_block_size_factor = 1
            dst_block_size_factor = self.block_size_factor
        src_blocks = src_spec.block_ids
        dst_blocks = dst_spec.block_ids
        assert src_blocks.ndim == 1
        assert dst_blocks.ndim == 1
        # Number of leading destination sub-blocks with no source
        # counterpart, chosen so that the remaining destination sub-blocks
        # exactly match the source sub-block count (asserted below).
        # NOTE(review): nonzero only on GPU->CPU when the GPU blocks do
        # not fill a whole CPU block — presumably the partially-filled CPU
        # block keeps its data at the end; confirm against the scheduler.
        dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor)
        src_sub_block_count = src_blocks.size * src_block_size_factor
        assert (
            src_sub_block_count == dst_blocks.size * dst_block_size_factor -
            dst_sub_blocks_to_skip)
        # Build the (src_sub_block, dst_sub_block) index pairs consumed by
        # ops.swap_blocks.
        src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64)
        expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0])
        expand_block_ids(dst_blocks,
                         dst_block_size_factor,
                         src_to_dst[:, 1],
                         skip_count=dst_sub_blocks_to_skip)
        src_to_dst_tensor = torch.from_numpy(src_to_dst)
        # Reuse a pooled event if available to avoid re-allocating CUDA
        # events on every transfer.
        event = self.events_pool.pop() if self.events_pool \
            else torch.cuda.Event()
        with torch.cuda.stream(stream):
            for src_tensor, dst_tensor, kv_dim in zip(
                    src_tensors, dst_tensors, self.kv_dim_before_num_blocks):
                if kv_dim:
                    # (2, num_blocks, ...) layout: swap K and V planes
                    # separately.
                    src_key_cache = src_tensor[0]
                    dst_key_cache = dst_tensor[0]
                    ops.swap_blocks(src_key_cache, dst_key_cache,
                                    src_to_dst_tensor)
                    src_value_cache = src_tensor[1]
                    dst_value_cache = dst_tensor[1]
                    ops.swap_blocks(src_value_cache, dst_value_cache,
                                    src_to_dst_tensor)
                else:
                    ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
            # Completion marker for this job on the transfer stream.
            event.record(stream)
        self.transfer_events[job_id] = event
        # success
        return True

    def get_finished(self) -> list[TransferResult]:
        """
        Poll pending transfer events and return (job_id, success) for each
        completed job. Completed events are returned to the pool for
        reuse, and their entries removed from the pending map.
        """
        results: list[TransferResult] = []
        for job_id, event in self.transfer_events.items():
            if event.query():
                results.append((job_id, True))
                self.events_pool.append(event)
        # Delete after iteration: mutating the dict inside the loop above
        # would invalidate the iterator.
        for job_id, _ in results:
            del self.transfer_events[job_id]
        return results

View File

@ -9,6 +9,8 @@ from typing import Callable, Optional, Union
import prometheus_client import prometheus_client
from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.config import SupportsMetricsInfo, VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorLogging)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
from vllm.v1.engine import FinishReason from vllm.v1.engine import FinishReason
@ -59,6 +61,8 @@ class LoggingStatLogger(StatLoggerBase):
# TODO: Make the interval configurable. # TODO: Make the interval configurable.
self.prefix_caching_metrics = PrefixCachingMetrics() self.prefix_caching_metrics = PrefixCachingMetrics()
self.spec_decoding_logging = SpecDecodingLogging() self.spec_decoding_logging = SpecDecodingLogging()
kv_tranfer_config = self.vllm_config.kv_transfer_config
self.kv_transfer_logging = KVConnectorLogging(kv_tranfer_config)
self.last_prompt_throughput: float = 0.0 self.last_prompt_throughput: float = 0.0
self.last_generation_throughput: float = 0.0 self.last_generation_throughput: float = 0.0
@ -97,7 +101,8 @@ class LoggingStatLogger(StatLoggerBase):
if scheduler_stats.spec_decoding_stats is not None: if scheduler_stats.spec_decoding_stats is not None:
self.spec_decoding_logging.observe( self.spec_decoding_logging.observe(
scheduler_stats.spec_decoding_stats) scheduler_stats.spec_decoding_stats)
if kv_connector_stats := scheduler_stats.kv_connector_stats:
self.kv_transfer_logging.observe(kv_connector_stats)
self.last_scheduler_stats = scheduler_stats self.last_scheduler_stats = scheduler_stats
def log(self): def log(self):
@ -136,6 +141,7 @@ class LoggingStatLogger(StatLoggerBase):
self.prefix_caching_metrics.hit_rate * 100, self.prefix_caching_metrics.hit_rate * 100,
) )
self.spec_decoding_logging.log(log_fn=log_fn) self.spec_decoding_logging.log(log_fn=log_fn)
self.kv_transfer_logging.log(log_fn=log_fn)
def log_engine_initialized(self): def log_engine_initialized(self):
if self.vllm_config.cache_config.num_gpu_blocks: if self.vllm_config.cache_config.num_gpu_blocks:

View File

@ -3,7 +3,7 @@
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Optional from typing import TYPE_CHECKING, Any, Optional
from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.spec_decode.metrics import SpecDecodingStats
@ -43,6 +43,7 @@ class SchedulerStats:
default_factory=PrefixCacheStats) default_factory=PrefixCacheStats)
spec_decoding_stats: Optional[SpecDecodingStats] = None spec_decoding_stats: Optional[SpecDecodingStats] = None
kv_connector_stats: Optional[dict[str, Any]] = None
num_corrupted_reqs: int = 0 num_corrupted_reqs: int = 0

View File

@ -3,11 +3,15 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from typing import NamedTuple, Optional from typing import TYPE_CHECKING, NamedTuple, Optional
import numpy as np import numpy as np
import torch import torch
if TYPE_CHECKING:
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
class LogprobsLists(NamedTuple): class LogprobsLists(NamedTuple):
@ -78,6 +82,11 @@ class KVConnectorOutput:
# [req_ids] # [req_ids]
finished_sending: Optional[set[str]] = None finished_sending: Optional[set[str]] = None
finished_recving: Optional[set[str]] = None finished_recving: Optional[set[str]] = None
kv_connector_stats: Optional["KVConnectorStats"] = None
def is_empty(self):
return (not self.finished_sending and not self.finished_recving
and not self.kv_connector_stats)
# ModelRunnerOutput is serialized and sent to the scheduler process. # ModelRunnerOutput is serialized and sent to the scheduler process.

View File

@ -29,15 +29,12 @@ class TopKTopPSampler(nn.Module):
Implementations may update the logits tensor in-place. Implementations may update the logits tensor in-place.
""" """
def __init__( def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None:
self,
logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None:
super().__init__() super().__init__()
self.logprobs_mode = logprobs_mode self.logprobs_mode = logprobs_mode
# flashinfer optimization does not apply if intermediate # flashinfer optimization does not apply if intermediate
# logprobs/logits after top_k/top_p need to be returned # logprobs/logits after top_k/top_p need to be returned
if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS, if logprobs_mode not in ("processed_logits", "processed_logprobs"
LogprobsMode.PROCESSED_LOGPROBS
) and current_platform.is_cuda(): ) and current_platform.is_cuda():
if is_flashinfer_available: if is_flashinfer_available:
flashinfer_version = flashinfer.__version__ flashinfer_version = flashinfer.__version__
@ -90,9 +87,9 @@ class TopKTopPSampler(nn.Module):
""" """
logits = self.apply_top_k_top_p(logits, k, p) logits = self.apply_top_k_top_p(logits, k, p)
logits_to_return = None logits_to_return = None
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: if self.logprobs_mode == "processed_logits":
logits_to_return = logits logits_to_return = logits
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: elif self.logprobs_mode == "processed_logprobs":
logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
probs = logits.softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32)
return random_sample(probs, generators), logits_to_return return random_sample(probs, generators), logits_to_return
@ -115,7 +112,7 @@ class TopKTopPSampler(nn.Module):
"PyTorch-native implementation.") "PyTorch-native implementation.")
return self.forward_native(logits, generators, k, p) return self.forward_native(logits, generators, k, p)
assert self.logprobs_mode not in ( assert self.logprobs_mode not in (
LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS "processed_logits", "processed_logprobs"
), "FlashInfer does not support returning logits/logprobs" ), "FlashInfer does not support returning logits/logprobs"
# flashinfer sampling functions expect contiguous logits. # flashinfer sampling functions expect contiguous logits.
# In flex_attn/triton_attn fp32 inference, logits can be non-contiguous # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous

View File

@ -60,8 +60,7 @@ class Sampler(nn.Module):
9. Return the final `SamplerOutput`. 9. Return the final `SamplerOutput`.
""" """
def __init__(self, def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"):
logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS):
super().__init__() super().__init__()
self.topk_topp_sampler = TopKTopPSampler(logprobs_mode) self.topk_topp_sampler = TopKTopPSampler(logprobs_mode)
self.pin_memory = is_pin_memory_available() self.pin_memory = is_pin_memory_available()
@ -78,9 +77,9 @@ class Sampler(nn.Module):
# is used for sampling (after penalties and temperature scaling). # is used for sampling (after penalties and temperature scaling).
num_logprobs = sampling_metadata.max_num_logprobs num_logprobs = sampling_metadata.max_num_logprobs
if num_logprobs is not None: if num_logprobs is not None:
if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS: if self.logprobs_mode == "raw_logprobs":
raw_logprobs = self.compute_logprobs(logits) raw_logprobs = self.compute_logprobs(logits)
elif self.logprobs_mode == LogprobsMode.RAW_LOGITS: elif self.logprobs_mode == "raw_logits":
raw_logprobs = logits.clone() raw_logprobs = logits.clone()
# Use float32 for the logits. # Use float32 for the logits.
@ -156,9 +155,9 @@ class Sampler(nn.Module):
if sampling_metadata.all_greedy: if sampling_metadata.all_greedy:
processed_logprobs = None processed_logprobs = None
if sampling_metadata.max_num_logprobs is not None: if sampling_metadata.max_num_logprobs is not None:
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: if self.logprobs_mode == "processed_logits":
processed_logprobs = logits processed_logprobs = logits
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: elif self.logprobs_mode == "processed_logprobs":
processed_logprobs = self.compute_logprobs(logits) processed_logprobs = self.compute_logprobs(logits)
return greedy_sampled, processed_logprobs return greedy_sampled, processed_logprobs

View File

@ -90,13 +90,14 @@ def apply_grammar_bitmask(
seq = sorted(scheduler_output.structured_output_request_ids.items(), seq = sorted(scheduler_output.structured_output_request_ids.items(),
key=lambda x: x[1]) key=lambda x: x[1])
for req_id, _ in seq: for req_id, _ in seq:
logit_index = struct_out_req_batch_indices[req_id]
num_spec_tokens = len( num_spec_tokens = len(
scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
for i in range(1 + num_spec_tokens): if req_id in struct_out_req_batch_indices:
sorted_bitmask[logit_index + i] = \ logit_index = struct_out_req_batch_indices[req_id]
grammar_bitmask[cumulative_index + i] for i in range(1 + num_spec_tokens):
out_indices.append(logit_index + i) sorted_bitmask[logit_index + i] = \
grammar_bitmask[cumulative_index + i]
out_indices.append(logit_index + i)
cumulative_index += 1 + num_spec_tokens cumulative_index += 1 + num_spec_tokens
grammar_bitmask = sorted_bitmask grammar_bitmask = sorted_bitmask

View File

@ -107,9 +107,8 @@ class CPUModelRunner(GPUModelRunner):
self.model = get_model(vllm_config=self.vllm_config) self.model = get_model(vllm_config=self.vllm_config)
if self.lora_config: if self.lora_config:
self.model = self.load_lora_model(self.model, self.model_config, self.model = self.load_lora_model(self.model, self.vllm_config,
self.scheduler_config, self.device)
self.lora_config, self.device)
def get_model(self) -> nn.Module: def get_model(self) -> nn.Module:
return self.model return self.model

View File

@ -2552,10 +2552,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.model = model_loader.load_model( self.model = model_loader.load_model(
vllm_config=self.vllm_config, model_config=self.model_config) vllm_config=self.vllm_config, model_config=self.model_config)
if self.lora_config: if self.lora_config:
self.model = self.load_lora_model(self.model, self.model = self.load_lora_model(self.model, self.vllm_config,
self.model_config,
self.scheduler_config,
self.lora_config,
self.device) self.device)
if hasattr(self, "drafter"): if hasattr(self, "drafter"):
logger.info("Loading drafter model...") logger.info("Loading drafter model...")

View File

@ -13,6 +13,8 @@ from vllm.distributed.kv_transfer import (ensure_kv_transfer_shutdown,
get_kv_transfer_group, get_kv_transfer_group,
has_kv_transfer_group) has_kv_transfer_group)
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
from vllm.forward_context import get_forward_context, set_forward_context from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput,
@ -119,4 +121,11 @@ class KVConnectorModelRunnerMixin:
output.finished_sending, output.finished_recving = ( output.finished_sending, output.finished_recving = (
kv_connector.get_finished(scheduler_output.finished_req_ids)) kv_connector.get_finished(scheduler_output.finished_req_ids))
kv_connector.clear_connector_metadata() output.kv_connector_stats = KVConnectorModelRunnerMixin.\
get_kv_connector_stats()
@staticmethod
def get_kv_connector_stats() -> Optional[KVConnectorStats]:
if has_kv_transfer_group():
return get_kv_transfer_group().get_kv_connector_stats()
return None

View File

@ -11,7 +11,7 @@ import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from vllm.config import ModelConfig, SchedulerConfig from vllm.config import VllmConfig
from vllm.config.lora import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.layers import LoRAMapping from vllm.lora.layers import LoRAMapping
@ -31,9 +31,7 @@ class LoRAModelRunnerMixin:
LORA_WARMUP_RANK = 8 LORA_WARMUP_RANK = 8
def load_lora_model(self, model: nn.Module, model_config: ModelConfig, def load_lora_model(self, model: nn.Module, vllm_config: VllmConfig,
scheduler_config: SchedulerConfig,
lora_config: LoRAConfig,
device: torch.device) -> nn.Module: device: torch.device) -> nn.Module:
if not supports_lora(model): if not supports_lora(model):
@ -44,19 +42,12 @@ class LoRAModelRunnerMixin:
logger.warning("Regarding multimodal models, vLLM currently " logger.warning("Regarding multimodal models, vLLM currently "
"only supports adding LoRA to language model.") "only supports adding LoRA to language model.")
# Use get_text_config() in case of multimodal models
text_config = model_config.hf_config.get_text_config()
# Add LoRA Manager to the Model Runner # Add LoRA Manager to the Model Runner
self.lora_manager = LRUCacheWorkerLoRAManager( self.lora_manager = LRUCacheWorkerLoRAManager(
scheduler_config.max_num_seqs, vllm_config,
scheduler_config.max_num_batched_tokens,
model_config.get_vocab_size(),
lora_config,
device, device,
model.embedding_modules, model.embedding_modules,
model.embedding_padding_modules, model.embedding_padding_modules,
max_position_embeddings=text_config.max_position_embeddings,
) )
return self.lora_manager.create_lora_manager(model) return self.lora_manager.create_lora_manager(model)

View File

@ -1178,9 +1178,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
"or sharding the weights on more chips. " "or sharding the weights on more chips. "
f"See the detailed error: {e}") from e f"See the detailed error: {e}") from e
if self.lora_config is not None: if self.lora_config is not None:
model = self.load_lora_model(model, self.model_config, model = self.load_lora_model(model, self.vllm_config, self.device)
self.scheduler_config,
self.lora_config, self.device)
replace_set_lora(model) replace_set_lora(model)
# Sync all pending XLA execution during model initialization and weight # Sync all pending XLA execution during model initialization and weight

View File

@ -1078,20 +1078,13 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"Regarding multimodal models, vLLM currently " "Regarding multimodal models, vLLM currently "
"only supports adding LoRA to language model.") "only supports adding LoRA to language model.")
# Use get_text_config() in case of multimodal models
text_config = self.model_config.hf_config.get_text_config()
self.lora_manager = LRUCacheWorkerLoRAManager( self.lora_manager = LRUCacheWorkerLoRAManager(
self.scheduler_config.max_num_seqs, self.vllm_config,
self.scheduler_config.max_num_batched_tokens,
self.vocab_size,
self.lora_config,
self.device, self.device,
self.model.embedding_modules, self.model.embedding_modules,
self.model.embedding_padding_modules, self.model.embedding_padding_modules,
max_position_embeddings=text_config.
max_position_embeddings,
) )
self.model = self.lora_manager.create_lora_manager(self.model) self.model = self.lora_manager.create_lora_manager(self.model)
time_after_load = time.perf_counter() time_after_load = time.perf_counter()