mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-08 05:17:03 +08:00
Merge branch 'main' into woosuk/model-runner-v2
This commit is contained in:
commit
396bbe67d3
13
.github/CODEOWNERS
vendored
13
.github/CODEOWNERS
vendored
@ -66,18 +66,25 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/models/test_transformers.py @hmellor
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
/docs/mkdocs @hmellor
|
||||
/docs/**/*.yml @hmellor
|
||||
/requirements/docs.txt @hmellor
|
||||
.readthedocs.yaml @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
|
||||
# Linting
|
||||
.markdownlint.yaml @hmellor
|
||||
.pre-commit-config.yaml @hmellor
|
||||
|
||||
# CPU
|
||||
/vllm/v1/worker/^cpu @bigPYJ1151
|
||||
/vllm/v1/worker/cpu* @bigPYJ1151
|
||||
/csrc/cpu @bigPYJ1151
|
||||
/vllm/platforms/cpu.py @bigPYJ1151
|
||||
/cmake/cpu_extension.cmake @bigPYJ1151
|
||||
/docker/Dockerfile.cpu @bigPYJ1151
|
||||
|
||||
# Intel GPU
|
||||
/vllm/v1/worker/^xpu @jikunshang
|
||||
/vllm/v1/worker/xpu* @jikunshang
|
||||
/vllm/platforms/xpu.py @jikunshang
|
||||
/docker/Dockerfile.xpu @jikunshang
|
||||
|
||||
|
||||
@ -11,13 +11,13 @@ from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import triton
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_w8a8_block_fp8_matmul,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
@ -59,7 +59,7 @@ enabling the corresponding APIs:
|
||||
#### Predefined models
|
||||
|
||||
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
|
||||
you can override some of its attributes via the `--override-pooler-config` option.
|
||||
you can override some of its attributes via the `--pooler-config` option.
|
||||
|
||||
#### Converted models
|
||||
|
||||
@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default:
|
||||
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
|
||||
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
|
||||
|
||||
You can further customize this via the `--override-pooler-config` option,
|
||||
You can further customize this via the `--pooler-config` option,
|
||||
which takes priority over both the model's and Sentence Transformers's defaults.
|
||||
|
||||
## Offline Inference
|
||||
|
||||
@ -17,9 +17,24 @@ These models are what we list in [supported-text-models][supported-text-models]
|
||||
|
||||
### Transformers
|
||||
|
||||
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases.
|
||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within 1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
|
||||
|
||||
To check if the modeling backend is Transformers, you can simply do this:
|
||||
Currently, the Transformers backend works for the following:
|
||||
|
||||
- Modalities: embedding models, language models and vision-language models*
|
||||
- Architectures: encoder-only, decoder-only
|
||||
- Attention types: full attention and/or sliding attention
|
||||
|
||||
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
|
||||
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
|
||||
|
||||
- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
|
||||
- Any combination of the following vLLM parallelisation schemes:
|
||||
- Pipeline parallel
|
||||
- Tensor parallel
|
||||
|
||||
Checking if the modeling backend is Transformers is as simple as:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
@ -27,16 +42,12 @@ llm = LLM(model=...) # Name or path of your model
|
||||
llm.apply_model(lambda model: print(type(model)))
|
||||
```
|
||||
|
||||
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!
|
||||
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
|
||||
|
||||
!!! tip
|
||||
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
|
||||
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for [online serving](../serving/openai_compatible_server.md).
|
||||
|
||||
!!! note
|
||||
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
|
||||
|
||||
!!! note
|
||||
In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
|
||||
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with the config's `dtype` if it exists. In contrast, native Transformers respects the `dtype` attribute of each backbone in the model. This might cause a slight difference in performance.
|
||||
|
||||
#### Custom models
|
||||
|
||||
@ -66,10 +77,11 @@ This section details the necessary modifications to make to a Transformers compa
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`.
|
||||
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
||||
3. `MyModel` must contain `_supports_attention_backend = True`.
|
||||
|
||||
<details>
|
||||
<details class="code">
|
||||
<summary>modeling_my_model.py</summary>
|
||||
|
||||
```python
|
||||
@ -78,6 +90,7 @@ from transformers import PreTrainedModel
|
||||
from torch import nn
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
is_causal = False # Only do this for encoder-only models
|
||||
|
||||
def forward(self, hidden_states, **kwargs):
|
||||
...
|
||||
@ -101,13 +114,13 @@ Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded.
|
||||
2. The `MyModel` Python class is loaded from the `auto_map` in the config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
|
||||
|
||||
<details>
|
||||
<details class="code">
|
||||
<summary>configuration_my_model.py</summary>
|
||||
|
||||
```python
|
||||
@ -457,7 +470,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
|
||||
|
||||
!!! note
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
|
||||
!!! note
|
||||
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
|
||||
@ -552,7 +565,7 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
!!! important
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
|
||||
#### Token Classification
|
||||
|
||||
|
||||
@ -42,7 +42,7 @@ python client.py
|
||||
|
||||
### Server Configuration
|
||||
|
||||
The key parameters for chunked processing are in the `--override-pooler-config`:
|
||||
The key parameters for chunked processing are set in the `--pooler-config` option:
|
||||
|
||||
```json
|
||||
{
|
||||
|
||||
@ -13,7 +13,7 @@ Prerequisites:
|
||||
|
||||
# MEAN pooling (processes all chunks, recommended for complete coverage)
|
||||
vllm serve intfloat/multilingual-e5-large \
|
||||
--override-pooler-config \
|
||||
--pooler-config \
|
||||
'{"pooling_type": "MEAN", "normalize": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \
|
||||
--served-model-name multilingual-e5-large \
|
||||
@ -23,7 +23,7 @@ Prerequisites:
|
||||
|
||||
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
|
||||
vllm serve BAAI/bge-large-en-v1.5 \
|
||||
--override-pooler-config \
|
||||
--pooler-config \
|
||||
'{"pooling_type": "CLS", "normalize": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \
|
||||
--served-model-name bge-large-en-v1.5 \
|
||||
|
||||
@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab
|
||||
vllm serve "$MODEL_NAME" \
|
||||
--tensor-parallel-size "$GPU_COUNT" \
|
||||
--enforce-eager \
|
||||
--override-pooler-config "$POOLER_CONFIG" \
|
||||
--pooler-config "$POOLER_CONFIG" \
|
||||
--served-model-name ${MODEL_CODE} \
|
||||
--api-key "$API_KEY" \
|
||||
--trust-remote-code \
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@ -327,12 +325,7 @@ def main():
|
||||
|
||||
|
||||
if args.command == "serialize":
|
||||
eng_args_dict = {f.name: getattr(args, f.name) for f in
|
||||
dataclasses.fields(EngineArgs)}
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(
|
||||
argparse.Namespace(**eng_args_dict)
|
||||
)
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
input_dir = tensorizer_dir.rstrip('/')
|
||||
suffix = args.suffix if args.suffix else uuid.uuid4().hex
|
||||
|
||||
@ -39,7 +39,8 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
|
||||
from vllm.config.model import (ConvertOption, RunnerOption,
|
||||
_get_and_verify_dtype)
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
@ -244,39 +245,6 @@ class DecoderPromptType(Enum):
|
||||
EMPTY_STR = 3
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example_encoder_decoder_prompts(
|
||||
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
|
||||
'''
|
||||
Returns an encoder prompt list and a decoder prompt list, wherein each pair
|
||||
of same-index entries in both lists corresponds to an (encoder prompt,
|
||||
decoder prompt) tuple.
|
||||
|
||||
Returns:
|
||||
|
||||
* Encoder prompt list
|
||||
* Decoder prompt list (reverse of encoder prompt list)
|
||||
'''
|
||||
|
||||
encoder_prompts = []
|
||||
for filename in _TEST_PROMPTS:
|
||||
encoder_prompts += _read_prompts(filename)
|
||||
|
||||
custom_decoder_prompts = encoder_prompts[::-1]
|
||||
empty_str_decoder_prompts = [""] * len(encoder_prompts)
|
||||
none_decoder_prompts = [None] * len(encoder_prompts)
|
||||
|
||||
# NONE decoder prompt type
|
||||
return {
|
||||
DecoderPromptType.NONE:
|
||||
zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
|
||||
DecoderPromptType.EMPTY_STR:
|
||||
zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
|
||||
DecoderPromptType.CUSTOM:
|
||||
zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example_long_prompts() -> list[str]:
|
||||
prompts = []
|
||||
@ -690,68 +658,6 @@ class HfRunner:
|
||||
return [(output_ids, output_str, output_logprobs)
|
||||
for output_ids, output_str, output_logprobs in outputs]
|
||||
|
||||
def generate_encoder_decoder_greedy_logprobs_limit(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[TokensTextLogprobs]:
|
||||
'''
|
||||
Greedy logprobs generation for vLLM encoder/decoder models
|
||||
'''
|
||||
|
||||
all_logprobs: list[list[dict[int, float]]] = []
|
||||
all_output_ids: list[list[int]] = []
|
||||
all_output_strs: list[str] = []
|
||||
|
||||
for i, (encoder_prompt, decoder_prompt) in enumerate(
|
||||
to_enc_dec_tuple_list(encoder_decoder_prompts)):
|
||||
processor_kwargs: dict[str, Any] = {
|
||||
"text": encoder_prompt,
|
||||
"return_tensors": "pt",
|
||||
}
|
||||
if images is not None and images[i] is not None:
|
||||
processor_kwargs["images"] = images[i]
|
||||
|
||||
encoder_inputs = self.processor(**processor_kwargs)
|
||||
encoder_inputs = self.wrap_device(encoder_inputs)
|
||||
|
||||
if decoder_prompt is None:
|
||||
decoder_input_ids = None
|
||||
else:
|
||||
decoder_inputs = self.tokenizer(decoder_prompt,
|
||||
return_tensors="pt")
|
||||
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
|
||||
|
||||
output = self.model.generate(
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_tokens,
|
||||
output_hidden_states=True,
|
||||
return_dict_in_generate=True,
|
||||
**encoder_inputs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
(
|
||||
seq_logprobs_lst,
|
||||
output_len,
|
||||
) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
|
||||
num_logprobs)
|
||||
|
||||
all_logprobs.append(seq_logprobs_lst)
|
||||
seq_ids = output.sequences[0]
|
||||
output_ids = seq_ids[-output_len:]
|
||||
all_output_ids.append(output_ids.tolist())
|
||||
all_output_strs.append(self.tokenizer.decode(output_ids))
|
||||
|
||||
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
|
||||
return [(output_ids, output_str, output_logprobs)
|
||||
for output_ids, output_str, output_logprobs in outputs]
|
||||
|
||||
def encode(self, prompts: list[str], *args,
|
||||
**kwargs) -> list[list[torch.Tensor]]:
|
||||
return self.model.encode(prompts, *args, **kwargs)
|
||||
@ -940,26 +846,6 @@ class VllmRunner:
|
||||
if sampling_params.prompt_logprobs is None else
|
||||
toks_str_logsprobs_prompt_logprobs)
|
||||
|
||||
def generate_encoder_decoder_w_logprobs(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
sampling_params: SamplingParams,
|
||||
) -> Union[list[TokensTextLogprobs],
|
||||
list[TokensTextLogprobsPromptLogprobs]]:
|
||||
'''
|
||||
Logprobs generation for vLLM encoder/decoder models
|
||||
'''
|
||||
|
||||
assert sampling_params.logprobs is not None
|
||||
req_outputs = self.llm.generate(encoder_decoder_prompts,
|
||||
sampling_params=sampling_params)
|
||||
toks_str_logsprobs_prompt_logprobs = (
|
||||
self._final_steps_generate_w_logprobs(req_outputs))
|
||||
# Omit prompt logprobs if not required by sampling params
|
||||
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
|
||||
if sampling_params.prompt_logprobs is None else
|
||||
toks_str_logsprobs_prompt_logprobs)
|
||||
|
||||
def generate_greedy(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor]],
|
||||
@ -1037,29 +923,6 @@ class VllmRunner:
|
||||
|
||||
return perplexities
|
||||
|
||||
def generate_encoder_decoder_greedy_logprobs(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
num_prompt_logprobs: Optional[int] = None,
|
||||
skip_special_tokens: bool = True,
|
||||
) -> Union[list[TokensTextLogprobs],
|
||||
list[TokensTextLogprobsPromptLogprobs]]:
|
||||
greedy_logprobs_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=max_tokens,
|
||||
logprobs=num_logprobs,
|
||||
prompt_logprobs=(num_prompt_logprobs),
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
)
|
||||
'''
|
||||
Greedy logprobs generation for vLLM encoder/decoder models
|
||||
'''
|
||||
|
||||
return self.generate_encoder_decoder_w_logprobs(
|
||||
encoder_decoder_prompts, greedy_logprobs_params)
|
||||
|
||||
def generate_beam_search(
|
||||
self,
|
||||
prompts: list[str],
|
||||
|
||||
@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
|
||||
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
|
||||
@ -515,6 +515,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
|
||||
model=model_name,
|
||||
input="What's the weather like in Paris today?",
|
||||
tools=tools,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
@ -216,7 +216,7 @@ def server_with_chunked_processing():
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"512", # Set smaller max_model_len to trigger chunking mechanism
|
||||
'--override-pooler-config',
|
||||
'--pooler-config',
|
||||
('{"pooling_type": "MEAN", "normalize": true, '
|
||||
'"enable_chunked_processing": true, "max_embed_len": 10000}'),
|
||||
"--gpu-memory-utilization",
|
||||
|
||||
@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
|
||||
global WORKER_RUNTIME_SECONDS
|
||||
WORKER_RUNTIME_SECONDS = 0.5
|
||||
|
||||
# Copy the args to avoid mutating the
|
||||
# Copy the args to avoid mutating them
|
||||
args = api_server_args.copy()
|
||||
|
||||
if not with_stats_update:
|
||||
|
||||
@ -1,9 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import AutoConfig
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
@ -15,6 +18,7 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
|
||||
head_size: int, max_position_embeddings: int,
|
||||
dtype: torch.dtype, device: torch.device):
|
||||
"""Generate test data for given configuration."""
|
||||
current_platform.seed_everything(42)
|
||||
# Create 2D positions (3, num_tokens) for multimodal case
|
||||
positions = torch.randint(0,
|
||||
max_position_embeddings // 4, (3, num_tokens),
|
||||
@ -33,22 +37,37 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
|
||||
return positions, query, key
|
||||
|
||||
|
||||
def unroll_model_tp_dict(model_tp_dict):
|
||||
return [(model_name, tp_size)
|
||||
for model_name, tp_sizes in model_tp_dict.items()
|
||||
for tp_size in tp_sizes]
|
||||
class MRoPETestInfo(NamedTuple):
|
||||
model_name: str
|
||||
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
|
||||
atol: float = 1e-2
|
||||
rtol: float = 1.6e-2
|
||||
marks: list[pytest.MarkDecorator] = []
|
||||
|
||||
|
||||
model_tp_dict = {
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2-VL-72B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2.5-VL-72B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2],
|
||||
}
|
||||
TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version
|
||||
|
||||
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
|
||||
dtype_atol_rtol_list = [
|
||||
[torch.bfloat16, 1e-2, 1.6e-2],
|
||||
MODELS_TO_TEST = [
|
||||
MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"),
|
||||
MRoPETestInfo(
|
||||
model_name="Qwen/Qwen3-VL-4B-Instruct",
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
|
||||
reason="Qwen3-VL only available after Transformers v4.57",
|
||||
)
|
||||
]),
|
||||
MRoPETestInfo(
|
||||
model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
|
||||
reason="Qwen3-VL only available after Transformers v4.57",
|
||||
)
|
||||
]),
|
||||
]
|
||||
|
||||
num_tokens_list = [11, 8192]
|
||||
@ -56,20 +75,29 @@ num_tokens_list = [11, 8192]
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize("model_name, tp_size",
|
||||
unroll_model_tp_dict(model_tp_dict))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("model_info, model_name", [
|
||||
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
|
||||
for test_config in MODELS_TO_TEST
|
||||
])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("num_tokens", num_tokens_list)
|
||||
def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
|
||||
def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
|
||||
dtype: torch.dtype, num_tokens: int):
|
||||
|
||||
atol = model_info.atol
|
||||
rtol = model_info.rtol
|
||||
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
config = config.get_text_config()
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
head_dim = (config.head_dim if hasattr(config, "head_dim") else
|
||||
config.hidden_size // total_num_heads)
|
||||
is_neox_style = True
|
||||
|
||||
rope_theta = config.rope_theta
|
||||
@ -111,24 +139,30 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, tp_size",
|
||||
unroll_model_tp_dict({
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2]
|
||||
}))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("num_tokens", [4])
|
||||
def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol,
|
||||
num_tokens):
|
||||
@pytest.mark.parametrize("model_info, model_name", [
|
||||
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
|
||||
for test_config in MODELS_TO_TEST
|
||||
])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("num_tokens", num_tokens_list)
|
||||
def test_mrope_torch_compile_tracing(model_name: str,
|
||||
model_info: MRoPETestInfo, tp_size: int,
|
||||
dtype: torch.dtype, num_tokens: int):
|
||||
|
||||
atol = model_info.atol
|
||||
rtol = model_info.rtol
|
||||
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
config = config.get_text_config()
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
head_dim = (config.head_dim if hasattr(config, "head_dim") else
|
||||
config.hidden_size // total_num_heads)
|
||||
is_neox_style = True
|
||||
rope_theta = config.rope_theta
|
||||
max_position = config.max_position_embeddings
|
||||
|
||||
@ -8,11 +8,12 @@ import torch
|
||||
from safetensors.torch import load_file
|
||||
from torch import nn
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA)
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager)
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
@ -435,10 +436,19 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
|
||||
target_modules=["layer1.dense1", "dense2"],
|
||||
lora_dtype=DEFAULT_DTYPE,
|
||||
)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
worker_adapter_manager = LRUCacheWorkerLoRAManager(
|
||||
4, 2,
|
||||
dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size,
|
||||
lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
|
||||
worker_adapter_manager.max_num_seqs = 4
|
||||
worker_adapter_manager.max_num_batched_tokens = 2
|
||||
|
||||
worker_adapter_manager.create_lora_manager(dummy_model)
|
||||
|
||||
mapping = LoRAMapping([], [])
|
||||
@ -517,10 +527,20 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
|
||||
max_cpu_loras=4,
|
||||
max_loras=4,
|
||||
lora_dtype=DEFAULT_DTYPE)
|
||||
worker_adapter_manager = WorkerLoRAManager(
|
||||
4, 2, dummy_model_gate_up.unpadded_vocab_size -
|
||||
lora_config.lora_extra_vocab_size, lora_config, device,
|
||||
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
|
||||
worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
|
||||
EMBEDDING_MODULES,
|
||||
EMBEDDING_PADDING_MODULES)
|
||||
worker_adapter_manager.vocab_size = (
|
||||
dummy_model_gate_up.unpadded_vocab_size -
|
||||
lora_config.lora_extra_vocab_size)
|
||||
worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
|
||||
|
||||
dummy_lora_files = f"{tmp_path}/lora_adapter"
|
||||
|
||||
@ -9,7 +9,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
|
||||
|
||||
class DummyLoRAManager:
|
||||
|
||||
@ -58,7 +58,7 @@ def test_models(
|
||||
|
||||
vllm_extra_kwargs = {}
|
||||
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
|
||||
vllm_extra_kwargs["override_pooler_config"] = \
|
||||
vllm_extra_kwargs["pooler_config"] = \
|
||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||
|
||||
max_model_len: Optional[int] = 512
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@ -99,7 +100,7 @@ def test_gemma_multimodal(
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
override_pooler_config={"pooling_type": "LAST"},
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
|
||||
@ -24,18 +24,18 @@ def test_classify_models_using_activation(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
activation=False)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=False)) as vllm_model:
|
||||
wo_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
activation=True)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True)) as vllm_model:
|
||||
w_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
for wo_activation, w_activation in zip(wo_activation_out,
|
||||
@ -43,9 +43,8 @@ def test_classify_models_using_activation(
|
||||
wo_activation = torch.tensor(wo_activation)
|
||||
w_activation = torch.tensor(w_activation)
|
||||
|
||||
assert not torch.allclose(
|
||||
wo_activation, w_activation,
|
||||
atol=1e-2), "override_pooler_config is not working"
|
||||
assert not torch.allclose(wo_activation, w_activation,
|
||||
atol=1e-2), "pooler_config is not working"
|
||||
assert torch.allclose(softmax(wo_activation), w_activation,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
|
||||
@ -65,23 +64,22 @@ def test_embed_models_using_normalize(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
normalize=False)) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
assert not torch.allclose(
|
||||
wo_normalize, w_normalize,
|
||||
atol=1e-2), "override_pooler_config normalize is not working"
|
||||
atol=1e-2), "pooler_config normalize is not working"
|
||||
assert torch.allclose(
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
|
||||
atol=1e-2), "w_normal should be close to normal(wo_normal)."
|
||||
@ -102,18 +100,16 @@ def test_reward_models_using_softmax(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
wo_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
w_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_softmax, w_softmax):
|
||||
@ -121,7 +117,7 @@ def test_reward_models_using_softmax(
|
||||
w = torch.tensor(w)
|
||||
|
||||
assert not torch.allclose(
|
||||
wo, w, atol=1e-2), "override_pooler_config softmax is not working"
|
||||
wo, w, atol=1e-2), "pooler_config softmax is not working"
|
||||
assert torch.allclose(
|
||||
softmax(wo), w,
|
||||
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
|
||||
@ -7,7 +7,6 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import ModelImpl
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
@ -111,8 +110,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# these tests seem to produce leftover memory
|
||||
gpu_memory_utilization=0.80,
|
||||
load_format="dummy",
|
||||
model_impl=ModelImpl.TRANSFORMERS
|
||||
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
|
||||
model_impl="transformers"
|
||||
if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
|
||||
hf_overrides=hf_overrides_fn,
|
||||
max_num_seqs=model_info.max_num_seqs)
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ from vllm.platforms import current_platform
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
from ..utils import multi_gpu_test, prep_prompts
|
||||
from .utils import check_logprobs_close
|
||||
from .utils import check_embeddings_close, check_logprobs_close
|
||||
|
||||
|
||||
def check_implementation(
|
||||
@ -165,6 +165,40 @@ def test_embed_loading(vllm_runner, model):
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        # Encoder model
        "BAAI/bge-base-en-v1.5",
    ])
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
    """Compare embeddings from the Transformers backend with the HF runner.

    The test is skipped when the installed ``transformers`` is older than
    ``4.57.0.dev0``.
    """
    import transformers
    from packaging.version import Version

    current_version = Version(transformers.__version__)
    min_version = Version("4.57.0.dev0")
    if current_version < min_version:
        pytest.skip("Encoder models with the Transformers backend require "
                    f"transformers>={min_version}, but got {current_version}")

    # Embed the prompts with vLLM, forcing the Transformers implementation.
    with vllm_runner(model, max_model_len=512,
                     model_impl="transformers") as vllm_model:
        engine_model_config = vllm_model.llm.llm_engine.model_config
        assert engine_model_config.using_transformers_backend()

        vllm_outputs = vllm_model.embed(example_prompts)

    # Reference embeddings from the HF runner in sentence-transformer mode.
    with hf_runner(model, is_sentence_transformer=True) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

    check_embeddings_close(embeddings_0_lst=hf_outputs,
                           embeddings_1_lst=vllm_outputs,
                           name_0="hf",
                           name_1="vllm",
                           tol=1e-2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
|
||||
@ -207,25 +207,19 @@ def test_get_pooling_config():
|
||||
model_id = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
pooling_config = model_config._init_pooler_config()
|
||||
assert pooling_config is not None
|
||||
|
||||
assert pooling_config.normalize
|
||||
assert pooling_config.pooling_type == PoolingType.MEAN.name
|
||||
assert model_config.pooler_config is not None
|
||||
assert model_config.pooler_config.normalize
|
||||
assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_get_pooling_config_from_args():
|
||||
model_id = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
model_config = ModelConfig(model_id)
|
||||
pooler_config = PoolerConfig(pooling_type="CLS", normalize=True)
|
||||
model_config = ModelConfig(model_id, pooler_config=pooler_config)
|
||||
|
||||
override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
|
||||
model_config.override_pooler_config = override_pooler_config
|
||||
|
||||
pooling_config = model_config._init_pooler_config()
|
||||
assert pooling_config is not None
|
||||
assert asdict(pooling_config) == asdict(override_pooler_config)
|
||||
assert asdict(model_config.pooler_config) == asdict(pooler_config)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -513,27 +513,27 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn):
|
||||
assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None))
|
||||
|
||||
|
||||
def _stats(requests: int, queries: int, hits: int) -> PrefixCacheStats:
    """Build a ``PrefixCacheStats`` from the three given counters."""
    counters = {"requests": requests, "queries": queries, "hits": hits}
    return PrefixCacheStats(**counters)
|
||||
|
||||
|
||||
def test_metrics():
|
||||
"""
|
||||
Test the prefix caching metrics.
|
||||
"""
|
||||
|
||||
def stats(requests, queries, hits):
|
||||
return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
|
||||
|
||||
metrics = PrefixCachingMetrics(max_recent_requests=5)
|
||||
assert metrics.hit_rate == 0.0
|
||||
|
||||
metrics.observe(stats(1, 20, 9))
|
||||
metrics.observe(_stats(1, 20, 9))
|
||||
# 9 / 20 = 0.45
|
||||
assert metrics.hit_rate == 0.45
|
||||
|
||||
metrics.observe(stats(4, 80, 16))
|
||||
metrics.observe(_stats(4, 80, 16))
|
||||
|
||||
# 25 / 100 = 0.25
|
||||
assert metrics.hit_rate == 0.25
|
||||
|
||||
metrics.observe(stats(1, 10, 2))
|
||||
metrics.observe(_stats(1, 10, 2))
|
||||
|
||||
# Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
|
||||
assert metrics.aggregated_requests == 5
|
||||
@ -549,6 +549,38 @@ def test_metrics():
|
||||
assert not metrics.query_queue
|
||||
|
||||
|
||||
def test_metrics_empty_stats():
    """
    Exercise the prefix caching metrics when all-zero stats are interleaved
    with non-empty observations.
    """
    metrics = PrefixCachingMetrics(max_recent_requests=5)

    # (requests, queries, hits) for each observation, in order.
    observations = (
        (0, 0, 0),
        (1, 20, 9),
        (0, 0, 0),
        (4, 80, 16),
        (0, 0, 0),
        (1, 10, 2),
    )
    for requests, queries, hits in observations:
        metrics.observe(_stats(requests, queries, hits))

    # (20, 9) is dropped and (10, 2) is added: 18 / 90 = 0.2
    assert metrics.aggregated_requests == 5
    assert metrics.aggregated_query_total == 90
    assert metrics.aggregated_query_hit == 18
    assert metrics.hit_rate == 0.2

    # Each of these observations alone exceeds max_recent_requests, so only
    # the latest one is expected to remain: 10 / 20 = 0.5, then 30 / 40 = 0.75
    oversized_cases = (
        ((11, 20, 10), 0.5),
        ((22, 40, 30), 0.75),
    )
    for (requests, queries, hits), expected_rate in oversized_cases:
        metrics.observe(_stats(requests, queries, hits))
        assert metrics.aggregated_requests == requests
        assert metrics.aggregated_query_total == queries
        assert metrics.aggregated_query_hit == hits
        assert metrics.hit_rate == expected_rate
|
||||
|
||||
|
||||
def test_get_kv_cache_configs_multiple_workers():
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
@ -18,12 +18,18 @@ import torch
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import KVTransferConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
|
||||
MultiKVConnectorStats)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
|
||||
KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
|
||||
NixlConnectorWorker)
|
||||
NixlConnectorWorker, NixlKVConnectorStats)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
|
||||
|
||||
from .utils import create_request, create_scheduler, create_vllm_config
|
||||
|
||||
@ -475,6 +481,209 @@ class TestNixlHandshake:
|
||||
# NOTE: resource cleanup in mp backend is a bit finicky, so the order in which
|
||||
# we put here is important. First run ray, it will clean up the resources, then
|
||||
# the rest of the tests.
|
||||
@patch(
    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
    FakeNixlWrapper)
def test_kv_connector_stats(dist_init):
    """Test that KV transfer stats are properly recorded and retrieved."""
    vllm_config = create_vllm_config()

    # Test worker role in decode server.
    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
    connector.connector_worker = FakeNixlConnectorWorker(vllm_config,
                                                         connector.engine_id,
                                                         hand_shake_latency=0)

    # Verify that xfer_stats starts empty
    initial_stats = connector.get_kv_connector_stats()
    assert initial_stats is None

    # Create transfer metadata
    request_id = "test_req_for_stats"
    metadata = NixlConnectorMetadata()
    metadata.add_new_req(request_id=request_id,
                         local_block_ids=[1, 2, 3],
                         kv_transfer_params={
                             "remote_block_ids": [4, 5, 6],
                             "remote_engine_id":
                             FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
                             "remote_host": "localhost",
                             "remote_port": 1234,
                             "remote_tp_size": 1,
                         })
    connector.bind_connector_metadata(metadata)

    # Start the transfer
    dummy_ctx = ForwardContext(
        no_compile_layers={},
        attn_metadata={},
        virtual_engine=0,
    )
    connector.start_load_kv(dummy_ctx)

    # Verify stats are recorded after transfer is complete
    max_iterations = 2
    # Clear metadata before start_load_kv to prevent reprocessing same request
    connector.bind_connector_metadata(NixlConnectorMetadata())
    for _ in range(max_iterations):
        # Need to call start_load_kv to process completed handshakes
        connector.start_load_kv(dummy_ctx)
        _, done_recving = connector.get_finished(finished_req_ids=set())
        # Truthiness check also covers a None result, where len() would raise.
        if done_recving and request_id in done_recving:
            break
        # Small delay to allow background handshake to complete
        time.sleep(0.1)
    else:
        # `assert "<message>"` is always true (non-empty string), so fail
        # explicitly when the loop ran out without seeing the request.
        pytest.fail("Transfer did not complete within expected iterations")

    # Now check that stats were recorded
    stats_after_transfer = connector.get_kv_connector_stats()
    assert isinstance(stats_after_transfer, NixlKVConnectorStats)

    # Verify stats values are recorded
    assert not stats_after_transfer.is_empty()
    assert stats_after_transfer.data["num_successful_transfers"] == 1

    # Verify stats are reset after retrieval
    stats_after_reset = connector.get_kv_connector_stats()
    assert stats_after_reset is None
|
||||
|
||||
|
||||
def test_kv_connector_stats_aggregation():
    """
    Check that KVOutputAggregator (used by MultiprocExecutor) sums the KV
    transfer stats reported by each TP rank.
    """

    # Aggregator for 3 workers (simulating TP=3), mirroring what
    # MultiprocExecutor.execute_model sets up.
    aggregator = KVOutputAggregator(world_size=3)

    # Number of transfers recorded on worker 1, 2 and 3 respectively.
    transfers_per_worker = (2, 1, 3)
    per_worker_stats = []
    for num_transfers in transfers_per_worker:
        stats = NixlKVConnectorStats()
        for _ in range(num_transfers):
            stats.record_transfer()
        per_worker_stats.append(stats)

    # One ModelRunnerOutput per worker, each carrying that worker's stats.
    worker_outputs = []
    for i, worker_stats in enumerate(per_worker_stats):
        # Workers 0,1 finished sending; workers 1,2 finished receiving.
        finished_sending = {f"req_{i}_send"} if i < 2 else None
        finished_recving = {f"req_{i}_recv"} if i > 0 else None
        kv_output = KVConnectorOutput(
            finished_sending=finished_sending,
            finished_recving=finished_recving,
            kv_connector_stats=worker_stats,
        )
        worker_outputs.append(
            ModelRunnerOutput(
                req_ids=[f"req_{i}"],
                req_id_to_index={f"req_{i}": 0},
                sampled_token_ids=[[123]],  # dummy token
                logprobs=None,
                prompt_logprobs_dict={},
                pooler_output=[None],
                kv_connector_output=kv_output,
            ))

    # Run the real aggregation path (like MultiprocExecutor.execute_model).
    aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
    merged_kv_output = aggregated_output.kv_connector_output
    kv_connector_stats = merged_kv_output.kv_connector_stats
    assert isinstance(kv_connector_stats, NixlKVConnectorStats)
    # Total transfers across all workers: 2 + 1 + 3.
    assert kv_connector_stats.data["num_successful_transfers"] == 6
|
||||
|
||||
|
||||
def test_multi_kv_connector_stats_aggregation():
    """
    Test MultiKVConnectorStats aggregation across TP ranks using
    KVOutputAggregator (used by MultiprocExecutor).
    """

    aggregator = KVOutputAggregator(world_size=3)

    from dataclasses import dataclass

    @dataclass
    class FooKVConnectorStats(KVConnectorStats):
        """Minimal second stats type that counts "foo" transfers."""

        def reset(self):
            self.data = {"num_foo_transfers": 0}

        def record_transfer(self):
            # The counter may be absent until the first transfer is recorded.
            self.data["num_foo_transfers"] = (
                self.data.get("num_foo_transfers", 0) + 1)

        def is_empty(self) -> bool:
            # A missing counter counts as empty instead of raising KeyError,
            # matching the tolerance in record_transfer.
            return self.data.get("num_foo_transfers", 0) == 0

        def aggregate(self,
                      other: "FooKVConnectorStats") -> "FooKVConnectorStats":
            if not other.is_empty():
                self.data["num_foo_transfers"] = (
                    self.data.get("num_foo_transfers", 0) +
                    other.data["num_foo_transfers"])
            return self

    def make_multi_stats(nixl_count: int,
                         foo_count: int) -> MultiKVConnectorStats:
        """Build multi-connector stats with the given transfer counts.

        A connector entry is only added when its count is positive.
        """
        data: dict[str, KVConnectorStats] = {}
        if nixl_count > 0:
            nixl_stats = NixlKVConnectorStats()
            for _ in range(nixl_count):
                nixl_stats.record_transfer()
            data["NixlConnector"] = nixl_stats
        if foo_count > 0:
            foo_stats = FooKVConnectorStats()
            for _ in range(foo_count):
                foo_stats.record_transfer()
            data["FooConnector"] = foo_stats
        return MultiKVConnectorStats(data=data)

    # Create heterogeneous stats across 3 workers
    worker_patterns = [(2, 1), (3, 0), (0, 5)]  # (Nixl, Foo)

    worker_outputs: list[ModelRunnerOutput] = []
    for i, (nixl, foo) in enumerate(worker_patterns):
        stats = make_multi_stats(nixl, foo)
        output = ModelRunnerOutput(
            req_ids=[f"req_{i}"],
            req_id_to_index={f"req_{i}": 0},
            sampled_token_ids=[[123]],
            logprobs=None,
            prompt_logprobs_dict={},
            pooler_output=[None],
            kv_connector_output=KVConnectorOutput(
                finished_sending={f"req_{i}_send"} if i < 2 else None,
                finished_recving={f"req_{i}_recv"} if i > 0 else None,
                kv_connector_stats=stats,
            ),
        )
        worker_outputs.append(output)

    aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
    kv_connector_stats = \
        aggregated_output.kv_connector_output.kv_connector_stats
    assert isinstance(kv_connector_stats, MultiKVConnectorStats)

    # Validate per-connector totals across workers:
    # Nixl 2 + 3 + 0 = 5, Foo 1 + 0 + 5 = 6.
    assert kv_connector_stats["NixlConnector"].data[
        "num_successful_transfers"] == 5
    assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6
|
||||
|
||||
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
|
||||
@patch(
|
||||
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
|
||||
|
||||
177
tests/v1/kv_offload/test_cpu_gpu.py
Normal file
177
tests/v1/kv_offload/test_cpu_gpu.py
Normal file
@ -0,0 +1,177 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
from vllm.v1.attention.backends.flashinfer import FlashInferBackend
|
||||
from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
|
||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
|
||||
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
|
||||
|
||||
NUM_GPU_BLOCKS = [64]
|
||||
NUM_CPU_BLOCKS = [256]
|
||||
GPU_BLOCK_SIZES = [16]
|
||||
GPU_BLOCKS_PER_CPU_BLOCK = [1, 3]
|
||||
HEAD_SIZES = [64]
|
||||
NUM_HEADS = [8]
|
||||
NUM_LAYERS = [4]
|
||||
DTYPES = [torch.bfloat16]
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = ['cuda:0']
|
||||
NUM_MAPPINGS = [3]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("gpu_to_cpu", [True, False])
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES)
@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK)
@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS)
@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_transfer(
    gpu_to_cpu: bool,
    num_mappings: int,
    head_size: int,
    num_heads: int,
    gpu_block_size: int,
    gpu_blocks_per_cpu_block: int,
    num_gpu_blocks: int,
    num_cpu_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """Transfer blocks through CpuGpuOffloadingHandler and verify the result.

    Random block mappings are transferred in the direction selected by
    ``gpu_to_cpu``. Afterwards the source tensors must be unchanged, mapped
    destination blocks must equal their source blocks, and all other
    destination blocks must keep their original contents.
    """
    current_platform.seed_everything(seed)

    # create per-layer GPU KV caches, cycling through the attention backends
    attn_backends_list = [
        FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend
    ]

    gpu_caches = {}
    attn_backends = {}
    for i in range(num_layers):
        layer_name = f'layer {i}'

        attn_backend = attn_backends_list[i % len(attn_backends_list)]
        attn_backends[layer_name] = attn_backend

        gpu_cache_shape = attn_backend.get_kv_cache_shape(
            num_gpu_blocks, gpu_block_size, num_heads, head_size)
        gpu_caches[layer_name] = torch.rand(gpu_cache_shape,
                                            dtype=dtype,
                                            device=device)

    # create handler
    cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
    handler = CpuGpuOffloadingHandler(attn_backends=attn_backends,
                                      gpu_block_size=gpu_block_size,
                                      cpu_block_size=cpu_block_size,
                                      num_cpu_blocks=num_cpu_blocks,
                                      gpu_caches=gpu_caches)

    # select block mappings
    gpu_blocks = random.sample(range(num_gpu_blocks),
                               num_mappings * gpu_blocks_per_cpu_block)
    cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)

    # convert cpu blocks to gpu block size
    cpu_blocks_in_gpu_block_size = [
        cpu_block * gpu_blocks_per_cpu_block + offset
        for cpu_block in cpu_blocks
        for offset in range(gpu_blocks_per_cpu_block)
    ]

    # maybe skip a GPU block to test writing to the middle of a CPU block
    if gpu_to_cpu:
        gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1:]
        cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[
            gpu_blocks_per_cpu_block - 1:]

    # set transfer direction
    if gpu_to_cpu:
        src_kv_caches = handler.gpu_tensors
        dst_kv_caches = handler.cpu_tensors
        src_spec_class = GPULoadStoreSpec
        dst_spec_class = CPULoadStoreSpec
        src_blocks = gpu_blocks
        dst_blocks = cpu_blocks
        src_blocks_in_gpu_block_size = gpu_blocks
        dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block
    else:
        src_kv_caches = handler.cpu_tensors
        dst_kv_caches = handler.gpu_tensors
        src_spec_class = CPULoadStoreSpec
        dst_spec_class = GPULoadStoreSpec
        src_blocks = cpu_blocks
        dst_blocks = gpu_blocks
        src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_blocks_in_gpu_block_size = gpu_blocks
        dst_size_in_gpu_blocks = num_gpu_blocks

    # build dst -> src mapping
    dst_to_src = dict(
        zip(dst_blocks_in_gpu_block_size, src_blocks_in_gpu_block_size))

    # build transfer specs
    src_spec = src_spec_class(src_blocks)
    dst_spec = dst_spec_class(dst_blocks)

    # clone src and dst tensors before transfer
    orig_src_caches = [x.clone() for x in src_kv_caches]
    orig_dst_caches = [x.clone() for x in dst_kv_caches]

    # call transfer function
    assert handler.transfer_async(1, (src_spec, dst_spec))
    assert set(handler.transfer_events.keys()) == {1}

    # wait for transfer to complete (monotonic clock, unaffected by
    # wall-clock adjustments)
    deadline = time.monotonic() + 10
    while time.monotonic() < deadline:
        finished = handler.get_finished()
        if finished:
            assert finished == [(1, True)]
            break
        time.sleep(0.1)
    else:
        # Without this, a transfer that never finishes would surface below
        # as a confusing tensor mismatch instead of a timeout.
        pytest.fail("transfer did not complete within 10 seconds")

    # verify src tensors did not change
    for orig_tensor, tensor in zip(orig_src_caches, src_kv_caches):
        assert torch.equal(orig_tensor, tensor)

    # verify dst tensors
    for dst_block in range(dst_size_in_gpu_blocks):
        # None means this destination block was not part of the transfer
        src_block_candidate = dst_to_src.get(dst_block)
        for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
                src_kv_caches, dst_kv_caches, orig_dst_caches,
                handler.kv_dim_before_num_blocks):
            if kv_dim:
                # iterate over key, value
                for i in range(2):
                    if src_block_candidate is not None:
                        expected_value = src_cache[i][src_block_candidate]
                    else:
                        expected_value = orig_dst_cache[i][dst_block]
                    torch.testing.assert_close(dst_cache[i][dst_block].cpu(),
                                               expected_value.cpu())
            else:
                if src_block_candidate is not None:
                    expected_value = src_cache[src_block_candidate]
                else:
                    expected_value = orig_dst_cache[dst_block]
                torch.testing.assert_close(dst_cache[dst_block].cpu(),
                                           expected_value.cpu())
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import itertools
|
||||
from collections.abc import Generator
|
||||
from typing import get_args
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -464,7 +465,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
|
||||
assert len(prompt_logprob) == vocab_size
|
||||
|
||||
|
||||
@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode))
|
||||
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
|
||||
def test_logprobs_mode(logprobs_mode: LogprobsMode,
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test with LLM engine with different logprobs_mode.
|
||||
@ -493,14 +494,12 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode,
|
||||
for logprobs in output.logprobs:
|
||||
for token_id in logprobs:
|
||||
logprob = logprobs[token_id]
|
||||
if logprobs_mode in (LogprobsMode.RAW_LOGPROBS,
|
||||
LogprobsMode.PROCESSED_LOGPROBS):
|
||||
if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
|
||||
assert logprob.logprob <= 0
|
||||
if logprob.logprob > 0:
|
||||
positive_values = positive_values + 1
|
||||
total_token_with_logprobs = total_token_with_logprobs + 1
|
||||
assert total_token_with_logprobs >= len(results[0].outputs)
|
||||
if logprobs_mode in (LogprobsMode.RAW_LOGITS,
|
||||
LogprobsMode.PROCESSED_LOGITS):
|
||||
if logprobs_mode in ("raw_logits", "processed_logits"):
|
||||
assert positive_values > 0
|
||||
del llm
|
||||
|
||||
@ -9,6 +9,7 @@ from contextlib import AsyncExitStack
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
@ -70,6 +71,8 @@ class ExternalLBServerManager:
|
||||
sargs,
|
||||
auto_port=False,
|
||||
env_dict={
|
||||
"VLLM_SERVER_DEV_MODE":
|
||||
"1",
|
||||
current_platform.device_control_env_var:
|
||||
",".join(
|
||||
str(
|
||||
@ -127,11 +130,19 @@ def default_server_args():
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[1, 4])
|
||||
def servers(request, default_server_args):
|
||||
def server_manager(request, default_server_args):
|
||||
api_server_count = request.param
|
||||
with ExternalLBServerManager(MODEL_NAME, DP_SIZE, api_server_count,
|
||||
default_server_args) as server_list:
|
||||
yield server_list
|
||||
server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE,
|
||||
api_server_count,
|
||||
default_server_args)
|
||||
|
||||
with server_manager:
|
||||
yield server_manager
|
||||
|
||||
|
||||
@pytest.fixture
def servers(server_manager):
    """Expose the ``servers`` attribute of the ``server_manager`` fixture."""
    server_list = server_manager.servers
    return server_list
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
@ -144,6 +155,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
|
||||
]
|
||||
|
||||
|
||||
def _get_parallel_config(server: RemoteOpenAIServer):
    """Return the ``parallel_config`` section of the server's ``server_info``.

    Issues a GET to ``server_info?config_format=json``; an HTTP error status
    raises through ``raise_for_status``.
    """
    info_url = server.url_for("server_info?config_format=json")
    reply = requests.get(info_url)
    reply.raise_for_status()

    payload = reply.json()
    return payload["vllm_config"]["parallel_config"]
|
||||
|
||||
|
||||
def test_external_lb_server_info(server_manager):
    """Check the API process count and rank reported via ``server_info``.

    Every response must report ``_api_process_count`` equal to the manager's
    ``api_server_count`` and an ``_api_process_rank`` in
    ``[0, api_server_count)``.
    """
    api_server_count = server_manager.api_server_count

    # Each request will hit one of the API servers
    # `n_reqs` is set so that there is a good chance each server
    # receives at least one request
    n_reqs = 2 * api_server_count * api_server_count

    for i, (server, _) in enumerate(server_manager.servers):
        print(f"Testing {i=}")

        api_process_counts = []
        api_process_ranks = []
        for _attempt in range(n_reqs):
            parallel_config = _get_parallel_config(server)
            api_process_counts.append(parallel_config["_api_process_count"])
            api_process_ranks.append(parallel_config["_api_process_rank"])

        counts_ok = all(c == api_server_count for c in api_process_counts)
        assert counts_ok, api_process_counts
        ranks_ok = all(0 <= r < api_server_count for r in api_process_ranks)
        assert ranks_ok, api_process_ranks
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
|
||||
@ -9,6 +9,7 @@ from contextlib import AsyncExitStack
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from tests.v1.test_utils import check_request_balancing
|
||||
@ -92,6 +93,8 @@ class HybridLBServerManager:
|
||||
sargs,
|
||||
auto_port=False,
|
||||
env_dict={
|
||||
"VLLM_SERVER_DEV_MODE":
|
||||
"1",
|
||||
current_platform.device_control_env_var:
|
||||
",".join(
|
||||
str(
|
||||
@ -150,12 +153,20 @@ def default_server_args():
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[1, 4])
|
||||
def servers(request, default_server_args):
|
||||
def server_manager(request, default_server_args):
|
||||
api_server_count = request.param
|
||||
with HybridLBServerManager(MODEL_NAME, DP_SIZE, api_server_count,
|
||||
default_server_args, DP_SIZE_LOCAL,
|
||||
TP_SIZE) as server_list:
|
||||
yield server_list
|
||||
server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE,
|
||||
api_server_count,
|
||||
default_server_args, DP_SIZE_LOCAL,
|
||||
TP_SIZE)
|
||||
|
||||
with server_manager:
|
||||
yield server_manager
|
||||
|
||||
|
||||
@pytest.fixture
def servers(server_manager):
    """Expose the ``servers`` attribute of the ``server_manager`` fixture."""
    server_list = server_manager.servers
    return server_list
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
@ -168,6 +179,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
|
||||
]
|
||||
|
||||
|
||||
def _get_parallel_config(server: RemoteOpenAIServer):
    """Return the ``parallel_config`` section of the server's ``server_info``.

    Issues a GET to ``server_info?config_format=json``; an HTTP error status
    raises through ``raise_for_status``.
    """
    info_url = server.url_for("server_info?config_format=json")
    reply = requests.get(info_url)
    reply.raise_for_status()

    payload = reply.json()
    return payload["vllm_config"]["parallel_config"]
|
||||
|
||||
|
||||
def test_hybrid_dp_server_info(server_manager):
    """Check the API process count and rank reported via ``server_info``.

    Every response must report ``_api_process_count`` equal to the manager's
    ``api_server_count`` and an ``_api_process_rank`` in
    ``[0, api_server_count)``.
    """
    api_server_count = server_manager.api_server_count

    # Each request will hit one of the API servers
    # `n_reqs` is set so that there is a good chance each server
    # receives at least one request
    n_reqs = 2 * api_server_count * api_server_count

    for i, (server, _) in enumerate(server_manager.servers):
        print(f"Testing {i=}")

        api_process_counts = []
        api_process_ranks = []
        for _attempt in range(n_reqs):
            parallel_config = _get_parallel_config(server)
            api_process_counts.append(parallel_config["_api_process_count"])
            api_process_ranks.append(parallel_config["_api_process_rank"])

        counts_ok = all(c == api_server_count for c in api_process_counts)
        assert counts_ok, api_process_counts
        ranks_ok = all(0 <= r < api_server_count for r in api_process_ranks)
        assert ranks_ok, api_process_ranks
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
|
||||
@ -10,6 +10,7 @@ from typing import Optional, cast
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from tests.v1.test_utils import check_request_balancing
|
||||
@ -101,6 +102,8 @@ class MultinodeInternalLBServerManager:
|
||||
sargs,
|
||||
auto_port=False,
|
||||
env_dict={
|
||||
"VLLM_SERVER_DEV_MODE":
|
||||
"1",
|
||||
current_platform.device_control_env_var:
|
||||
",".join(
|
||||
str(
|
||||
@ -214,7 +217,10 @@ class APIOnlyServerManager:
|
||||
self.model_name,
|
||||
api_server_args,
|
||||
auto_port=False,
|
||||
env_dict={}) # No GPUs needed for API-only server
|
||||
env_dict={
|
||||
"VLLM_SERVER_DEV_MODE": "1",
|
||||
# No GPUs needed for API-only server
|
||||
})
|
||||
server.__enter__()
|
||||
print(f"API-only server started successfully with "
|
||||
f"{self.api_server_count} API servers")
|
||||
@ -293,14 +299,21 @@ def default_server_args():
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[1, 4])
|
||||
def servers(request, default_server_args):
|
||||
def server_manager(request, default_server_args):
|
||||
api_server_count = request.param
|
||||
with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
|
||||
api_server_count,
|
||||
default_server_args,
|
||||
DP_SIZE // NUM_NODES,
|
||||
TP_SIZE) as server_list:
|
||||
yield server_list
|
||||
server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
|
||||
api_server_count,
|
||||
default_server_args,
|
||||
DP_SIZE // NUM_NODES,
|
||||
TP_SIZE)
|
||||
|
||||
with server_manager:
|
||||
yield server_manager
|
||||
|
||||
|
||||
@pytest.fixture
def servers(server_manager):
    """Return the ``servers`` attribute of the ``server_manager`` fixture."""
    return server_manager.servers
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[1, 4])
|
||||
@ -331,6 +344,34 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
|
||||
yield client
|
||||
|
||||
|
||||
def _get_parallel_config(server: RemoteOpenAIServer, timeout: float = 60.0):
    """Fetch the parallel config reported by a running server.

    Queries the ``server_info`` endpoint with ``config_format=json`` and
    returns the ``parallel_config`` entry of the reported ``vllm_config``.

    Args:
        server: Server handle used to build the endpoint URL.
        timeout: Seconds to wait for the HTTP response. ``requests`` never
            times out on its own, so without this a stalled server would
            hang the whole test run.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    response = requests.get(server.url_for("server_info?config_format=json"),
                            timeout=timeout)
    response.raise_for_status()

    return response.json()["vllm_config"]["parallel_config"]
|
||||
|
||||
|
||||
def test_multinode_dp_server_info(server_manager):
    """Check the API process count and rank seen through the first server."""
    first_entry = server_manager.servers[0]
    head_server = first_entry[0]
    expected_count = server_manager.api_server_count

    # Each request will hit one of the API servers, so the number of probes
    # is chosen so that there is a good chance each server receives at
    # least one request.
    num_probes = 2 * expected_count * expected_count
    configs = [_get_parallel_config(head_server) for _ in range(num_probes)]
    reported_counts = [cfg["_api_process_count"] for cfg in configs]
    reported_ranks = [cfg["_api_process_rank"] for cfg in configs]

    assert all(count == expected_count
               for count in reported_counts), reported_counts
    assert all(0 <= rank < expected_count
               for rank in reported_ranks), reported_ranks
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
|
||||
@ -23,14 +23,14 @@ class AttentionType:
|
||||
Attention type.
|
||||
Use string to be compatible with `torch.compile`.
|
||||
"""
|
||||
# Decoder attention between previous layer Q/K/V
|
||||
DECODER = "decoder"
|
||||
# Encoder attention between previous layer Q/K/V for encoder-decoder
|
||||
"""Decoder attention between previous layer Q/K/V."""
|
||||
ENCODER = "encoder"
|
||||
# Encoder attention between previous layer Q/K/V
|
||||
"""Encoder attention between previous layer Q/K/V for encoder-decoder."""
|
||||
ENCODER_ONLY = "encoder_only"
|
||||
# Attention between dec. Q and enc. K/V for encoder-decoder
|
||||
"""Encoder attention between previous layer Q/K/V."""
|
||||
ENCODER_DECODER = "encoder_decoder"
|
||||
"""Attention between dec. Q and enc. K/V for encoder-decoder."""
|
||||
|
||||
|
||||
class AttentionBackend(ABC):
|
||||
|
||||
@ -430,9 +430,11 @@ class MultiHeadAttention(nn.Module):
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""Input shape: batch_size x seq_len x hidden_size"""
|
||||
# TODO(Isotr0py): Use existing backend implementations and support FA3
|
||||
bsz, q_len, _ = query.size()
|
||||
"""Input shape:
|
||||
(batch_size x seq_len x hidden_size) or
|
||||
(batch_size x seq_len x num_heads x head_size)
|
||||
"""
|
||||
bsz, q_len = query.size()[:2]
|
||||
kv_len = key.size(1)
|
||||
|
||||
query = query.view(bsz, q_len, self.num_heads, self.head_size)
|
||||
|
||||
@ -8,8 +8,9 @@ import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from collections.abc import Awaitable
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
from typing import Optional, Protocol, Union
|
||||
|
||||
import aiohttp
|
||||
from tqdm.asyncio import tqdm
|
||||
@ -92,6 +93,16 @@ class RequestFuncOutput:
|
||||
start_time: float = 0.0
|
||||
|
||||
|
||||
class RequestFunc(Protocol):
    """Structural type of a request function.

    A conforming callable takes the request input, an ``aiohttp`` client
    session and an optional progress bar, and returns an awaitable that
    resolves to a ``RequestFuncOutput``.
    """

    def __call__(
        self,
        request_func_input: RequestFuncInput,
        session: aiohttp.ClientSession,
        pbar: Optional[tqdm] = None,
    ) -> Awaitable[RequestFuncOutput]:
        ...
|
||||
|
||||
|
||||
async def async_request_openai_completions(
|
||||
request_func_input: RequestFuncInput,
|
||||
session: aiohttp.ClientSession,
|
||||
@ -507,7 +518,7 @@ async def async_request_openai_embeddings(
|
||||
|
||||
|
||||
# TODO: Add more request functions for different API protocols.
|
||||
ASYNC_REQUEST_FUNCS = {
|
||||
ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
|
||||
"vllm": async_request_openai_completions,
|
||||
"openai": async_request_openai_completions,
|
||||
"openai-chat": async_request_openai_chat_completions,
|
||||
|
||||
@ -8,11 +8,12 @@ import time
|
||||
import aiohttp
|
||||
from tqdm.asyncio import tqdm
|
||||
|
||||
from .endpoint_request_func import RequestFuncInput, RequestFuncOutput
|
||||
from .endpoint_request_func import (RequestFunc, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
|
||||
|
||||
async def wait_for_endpoint(
|
||||
request_func,
|
||||
request_func: RequestFunc,
|
||||
test_input: RequestFuncInput,
|
||||
session: aiohttp.ClientSession,
|
||||
timeout_seconds: int = 600,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
2006
vllm/config/model.py
Normal file
2006
vllm/config/model.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -193,6 +193,25 @@ class ParallelConfig:
|
||||
not change by dcp, it simply reuse the GPUs of TP group, and tp_size
|
||||
needs to be divisible by dcp_size."""
|
||||
|
||||
_api_process_count: int = 1
|
||||
"""
|
||||
The number of API processes initialized.
|
||||
|
||||
Note:
|
||||
This is an internal config that is only valid for and
|
||||
should only be set by API server scale-out.
|
||||
"""
|
||||
|
||||
_api_process_rank: int = 0
|
||||
"""
|
||||
The rank of this API process, or `-1` for engine core processes
|
||||
under API server scale-out.
|
||||
|
||||
Note:
|
||||
This is an internal config that is only valid for and
|
||||
should only be set by API server scale-out.
|
||||
"""
|
||||
|
||||
@property
|
||||
def world_size_across_dp(self) -> int:
|
||||
"""world_size_across_dp is TPxPPxDP, it is the size of the world
|
||||
@ -428,6 +447,12 @@ class ParallelConfig:
|
||||
if self.distributed_executor_backend is None and self.world_size == 1:
|
||||
self.distributed_executor_backend = "uni"
|
||||
|
||||
if not -1 <= self._api_process_rank < self._api_process_count:
|
||||
raise ValueError(
|
||||
"Invalid value of `_api_process_rank`. "
|
||||
f"Expected to be `-1` or `[0, {self._api_process_count})`, "
|
||||
f"but found: {self._api_process_rank}")
|
||||
|
||||
@property
|
||||
def use_ray(self) -> bool:
|
||||
return self.distributed_executor_backend == "ray" or (
|
||||
|
||||
97
vllm/config/pooler.py
Normal file
97
vllm/config/pooler.py
Normal file
@ -0,0 +1,97 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import Any, Optional
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
|
||||
|
||||
@config
@dataclass
class PoolerConfig:
    """Controls the behavior of output pooling in pooling models."""

    pooling_type: Optional[str] = None
    """
    The pooling method of the pooling model. This should be a key in
    [`vllm.model_executor.layers.pooler.PoolingType`][].
    """

    ## for embeddings models
    normalize: Optional[bool] = None
    """
    Whether to normalize the embeddings outputs. Defaults to True.
    """
    dimensions: Optional[int] = None
    """
    Reduce the dimensions of embeddings if model
    support matryoshka representation. Defaults to None.
    """
    enable_chunked_processing: Optional[bool] = None
    """
    Whether to enable chunked processing for long inputs that exceed the model's
    maximum position embeddings. When enabled, long inputs will be split into
    chunks, processed separately, and then aggregated using weighted averaging.
    This allows embedding models to handle arbitrarily long text without CUDA
    errors. Defaults to False.
    """
    max_embed_len: Optional[int] = None
    """
    Maximum input length allowed for embedding generation. When set, allows
    inputs longer than max_embed_len to be accepted for embedding models.
    When an input exceeds max_embed_len, it will be handled according to
    the original max_model_len validation logic.
    Defaults to None (i.e. set to max_model_len).
    """

    ## for classification models
    activation: Optional[bool] = None
    """
    Whether to apply activation function to the classification outputs.
    Defaults to True.
    """
    logit_bias: Optional[float] = None
    """
    If provided, apply classification logit biases. Defaults to None.
    """

    ## for reward models
    softmax: Optional[bool] = None
    """
    Whether to apply softmax to the reward outputs.
    Defaults to True.
    """
    step_tag_id: Optional[int] = None
    """
    If set, only the score corresponding to the ``step_tag_id`` in the
    generated sentence should be returned. Otherwise, the scores for all tokens
    are returned.
    """
    returned_token_ids: Optional[list[int]] = None
    """
    A list of indices for the vocabulary dimensions to be extracted,
    such as the token IDs of ``good_token`` and ``bad_token`` in the
    ``math-shepherd-mistral-7b-prm`` model.
    """

    def compute_hash(self) -> str:
        """
        Return a hash of the factors that shape the computation graph.

        WARNING: Whenever a new field is added to this config, include it
        in the factors list if it affects the computation graph.

        The hash is meant to uniquely identify all the configs that affect
        the structure of the computation graph from input ids/embeddings to
        the final hidden states, excluding anything before input
        ids/embeddings and after the final hidden states.
        """
        # This config does not affect the computation graph, so there are
        # no factors to consider and the hash is that of an empty list.
        factors: list[Any] = []
        digest = hashlib.md5(str(factors).encode(), usedforsecurity=False)
        return digest.hexdigest()
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
import hashlib
|
||||
from dataclasses import field
|
||||
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
||||
from typing import Any, Literal, Optional, Union
|
||||
|
||||
from pydantic import SkipValidation, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@ -15,13 +15,9 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
||||
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import RunnerType
|
||||
else:
|
||||
RunnerType = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
RunnerType = Literal["generate", "pooling", "draft"]
|
||||
PreemptionMode = Literal["swap", "recompute"]
|
||||
SchedulerPolicy = Literal["fcfs", "priority"]
|
||||
|
||||
|
||||
@ -1,8 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import ast
|
||||
import inspect
|
||||
import textwrap
|
||||
from dataclasses import MISSING, Field, field, fields, is_dataclass
|
||||
from typing import TYPE_CHECKING, TypeVar
|
||||
from typing import TYPE_CHECKING, Any, TypeVar
|
||||
|
||||
import regex as re
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import DataclassInstance
|
||||
@ -45,3 +50,96 @@ def get_field(cls: ConfigType, name: str) -> Field:
|
||||
return field(default=default)
|
||||
raise ValueError(
|
||||
f"{cls.__name__}.{name} must have a default value or default factory.")
|
||||
|
||||
|
||||
def contains_object_print(text: str) -> bool:
    """
    Tell whether the text looks like it embeds a printed Python object,
    i.e. whether it has a substring of the form "at 0xFFFFFFF>".

    The address part is matched as 0x followed by 2 to 16 hex characters
    (16 being the maximum on a 64-bit system).

    Args:
        text (str): The text to check

    Returns:
        result (bool): `True` if a match is found, `False` otherwise.
    """
    return re.search(r'at 0x[a-fA-F0-9]{2,16}>', text) is not None
|
||||
|
||||
|
||||
def assert_hashable(text: str) -> bool:
    """
    Guard against hashing text that contains a printed Python object.

    Args:
        text (str): The text about to be hashed

    Returns:
        result (bool): `True` when `contains_object_print` finds nothing.

    Raises:
        AssertionError: If `contains_object_print` reports a match.
    """
    if contains_object_print(text):
        raise AssertionError(
            f"vLLM tried to hash some configs that may have Python objects ids "
            f"in them. This is a bug, please file an issue. "
            f"Text being hashed: {text}")
    return True
|
||||
|
||||
|
||||
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
    """
    Collect the docstrings written directly below attribute assignments in
    the body of a class, keyed by attribute name.

    https://davidism.com/mit-license/
    """

    def parse_class_node():
        # The source is dedented first so that nested classes parse too.
        return ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]

    try:
        cls_node = parse_class_node()
    except (OSError, KeyError, TypeError):
        # HACK: Python 3.13+ workaround - set missing __firstlineno__
        # Workaround can be removed after we upgrade to pydantic==2.12.0
        header = f"class {cls.__name__}"
        with open(inspect.getfile(cls)) as f:
            for lineno, line in enumerate(f, start=1):
                if header in line and ":" in line:
                    cls.__firstlineno__ = lineno
                    break
        cls_node = parse_class_node()

    if not isinstance(cls_node, ast.ClassDef):
        raise TypeError("Given object was not a class.")

    docs = {}
    statements = cls_node.body

    # Walk every pair of consecutive statements. Zipping the list with
    # itself shifted by one stands in for `itertools.pairwise`, which is
    # not available on Python 3.9.
    for stmt, following in zip(statements, statements[1:]):
        is_assignment = isinstance(stmt, (ast.Assign, ast.AnnAssign))
        is_doc = (isinstance(following, ast.Expr)
                  and isinstance(following.value, ast.Constant)
                  and isinstance(following.value.value, str))
        # Only an assignment followed by a constant string qualifies.
        if not (is_assignment and is_doc):
            continue

        doc = inspect.cleandoc(following.value.value)

        # A plain assignment may have several targets (a = b = v), while an
        # annotated assignment has exactly one.
        if isinstance(stmt, ast.Assign):
            targets = stmt.targets
        else:
            targets = [stmt.target]

        for target in targets:
            # Only plain names are recorded.
            if isinstance(target, ast.Name):
                docs[target.id] = doc

    return docs
|
||||
|
||||
|
||||
def is_init_field(cls: ConfigType, name: str) -> bool:
    """Return the `init` flag of the dataclass field called `name`.

    Raises:
        StopIteration: If `cls` has no field with that name.
    """
    matching = (f for f in fields(cls) if f.name == name)
    return next(matching).init
|
||||
|
||||
@ -129,7 +129,7 @@ class KVOutputAggregator:
|
||||
def aggregate(self,
|
||||
outputs: list[ModelRunnerOutput],
|
||||
output_rank: int = 0) -> ModelRunnerOutput:
|
||||
# aggregate kv_connector_output from all workers
|
||||
# Aggregate kv_connector_output from all workers
|
||||
|
||||
def update_finished_set(req_ids: Optional[set[str]],
|
||||
remaining_count_dict: dict[str, int],
|
||||
@ -142,8 +142,9 @@ class KVOutputAggregator:
|
||||
|
||||
finished_sending = set[str]()
|
||||
finished_recving = set[str]()
|
||||
for output in outputs:
|
||||
output = output.kv_connector_output
|
||||
aggregated_kv_connector_stats = None
|
||||
for model_runner_output in outputs:
|
||||
output = model_runner_output.kv_connector_output
|
||||
if not output:
|
||||
continue
|
||||
update_finished_set(output.finished_sending,
|
||||
@ -151,12 +152,26 @@ class KVOutputAggregator:
|
||||
update_finished_set(output.finished_recving,
|
||||
self._recv_remaining_count, finished_recving)
|
||||
|
||||
# Aggregate kv_connector_stats from all workers.
|
||||
if aggregated_kv_connector_stats is None:
|
||||
# Use the first worker's kv_connector_stats as accumulator.
|
||||
aggregated_kv_connector_stats = output.kv_connector_stats
|
||||
elif kv_connector_stats := output.kv_connector_stats:
|
||||
if aggregated_kv_connector_stats is None:
|
||||
aggregated_kv_connector_stats = kv_connector_stats
|
||||
else:
|
||||
assert isinstance(aggregated_kv_connector_stats,
|
||||
type(kv_connector_stats))
|
||||
aggregated_kv_connector_stats = \
|
||||
aggregated_kv_connector_stats.aggregate(kv_connector_stats)
|
||||
|
||||
# select output of the worker specified by output_rank
|
||||
output = outputs[output_rank]
|
||||
|
||||
output.kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=finished_sending or None,
|
||||
finished_recving=finished_recving or None,
|
||||
kv_connector_stats=aggregated_kv_connector_stats or None,
|
||||
)
|
||||
|
||||
return output
|
||||
|
||||
@ -49,6 +49,8 @@ if TYPE_CHECKING:
|
||||
from vllm.attention.backends.abstract import AttentionMetadata
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_events import KVCacheEvent
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.request import Request
|
||||
@ -235,6 +237,12 @@ class KVConnectorBase_V1(ABC):
|
||||
"""
|
||||
return None
|
||||
|
||||
def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]:
|
||||
"""
|
||||
Get the KV connector stats collected during the last interval.
|
||||
"""
|
||||
return None
|
||||
|
||||
# ==============================
|
||||
# Scheduler-side methods
|
||||
# ==============================
|
||||
@ -365,4 +373,16 @@ class KVConnectorBase_V1(ABC):
|
||||
int: expected sending or receiving completion count.
|
||||
"""
|
||||
|
||||
return None
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def build_kv_connector_stats(
|
||||
cls,
|
||||
data: Optional[dict[str,
|
||||
Any]] = None) -> Optional["KVConnectorStats"]:
|
||||
"""
|
||||
KVConnectorStats resolution method. This method allows dynamically
|
||||
registered connectors to return their own KVConnectorStats object,
|
||||
which can implement custom aggregation logic on the data dict.
|
||||
"""
|
||||
return None
|
||||
|
||||
100
vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
Normal file
100
vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
Normal file
@ -0,0 +1,100 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from vllm.config.kv_transfer import KVTransferConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.factory import (
|
||||
KVConnectorFactory)
|
||||
from vllm.distributed.kv_transfer.kv_transfer_state import (
|
||||
has_kv_transfer_group)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class KVConnectorStats:
    """
    Base container for KV connector stats: transfer performance metrics or
    other telemetry a connector considers important.

    Stats travel from the worker to the logger process, so every sub-class
    has to be serializable.
    """
    data: dict[str, Any] = field(default_factory=dict)

    def reset(self):
        """Clear the recorded state. Must be implemented by sub-classes."""
        raise NotImplementedError

    def aggregate(self, other: "KVConnectorStats") -> "KVConnectorStats":
        """
        Combine these stats with those of another `KVConnectorStats`
        object. Must be implemented by sub-classes.
        """
        raise NotImplementedError

    def reduce(self) -> dict[str, Union[int, float]]:
        """
        Condense the observations of a time interval into one or more
        representative values (eg avg/median/sum of the series).

        The logger is meant to call this to summarize the stats of the
        last time interval. Must be implemented by sub-classes.
        """
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Tell whether the stats are empty. Sub-classes must implement."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class KVConnectorLogging:
    """
    Accumulate KV connector transfer stats between log calls and emit a
    one-line summary of them.
    """

    def __init__(self, kv_tranfer_config: KVTransferConfig):
        """
        Args:
            kv_tranfer_config: KV transfer config. The connector class is
                only resolved when it is set and names a connector.
        """
        # This should be called on frontend process.
        assert not has_kv_transfer_group()
        # Define the attribute unconditionally: `observe` compares it with
        # `None`, which would otherwise fail with an AttributeError instead
        # of the intended assertion when no connector is configured.
        self.connector_cls: Optional[type[Any]] = None
        # Resolve the connector class used to build the stats objects.
        if kv_tranfer_config and kv_tranfer_config.kv_connector:
            self.connector_cls = KVConnectorFactory.get_connector_class(
                kv_tranfer_config)
        self.reset()

    def reset(self):
        """Drop the stats accumulated so far."""
        self.transfer_stats_accumulator: Optional[KVConnectorStats] = None

    def observe(self, transfer_stats_data: dict[str, Any]):
        """
        Fold one batch of raw stats data into the accumulator.

        Args:
            transfer_stats_data: Raw stats dict handed to the connector's
                `build_kv_connector_stats`.
        """
        # Should not be called when a KVConnector is not configured.
        assert self.connector_cls is not None
        # Called periodically when connector syncs with the scheduler.
        # Note that this is not the same as the logging interval.
        # We expect transfer_stats_data to be aggregated across all workers and
        # consist of observations from a single connector or a MultiConnector.
        transfer_stats = self.connector_cls.build_kv_connector_stats(
            transfer_stats_data)
        if transfer_stats is None:
            logger.warning_once(
                "The connector %s is collecting stats but "
                "does not implement the "
                "`build_kv_connector_stats` method. "
                "Stats will not be logged.", self.connector_cls)
            return

        if self.transfer_stats_accumulator is None:
            self.transfer_stats_accumulator = transfer_stats
        else:
            # Accumulate last interval stats.
            self.transfer_stats_accumulator = \
                self.transfer_stats_accumulator.aggregate(transfer_stats)

    def log(self, log_fn=logger.info):
        """Log transfer metrics periodically, similar to throughput logging"""
        if (self.transfer_stats_accumulator
                and not self.transfer_stats_accumulator.is_empty()):
            # Produce a single cumulative stats object for the last time
            # interval from the recorded observations.
            xfer_metrics = self.transfer_stats_accumulator.reduce()
            xfer_metrics_str = ", ".join(f"{k}={v}"
                                         for k, v in xfer_metrics.items())
            log_fn("KV Transfer metrics: %s", xfer_metrics_str)

            # Reset metrics for next interval
            self.reset()
|
||||
@ -9,19 +9,21 @@ import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.kv_transfer import KVTransferConfig
|
||||
from vllm.distributed.kv_events import KVCacheEvent
|
||||
from vllm.distributed.kv_transfer.kv_connector.factory import (
|
||||
KVConnectorFactory)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import KVConnectorOutput
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.attention.backends.abstract import AttentionMetadata
|
||||
from vllm.distributed.kv_events import KVCacheEvent
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.request import Request
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -33,6 +35,43 @@ class MultiKVConnectorMetadata(KVConnectorMetadata):
|
||||
extra_async_saves: Optional[dict[str, int]] = None
|
||||
|
||||
|
||||
@dataclass
class MultiKVConnectorStats(KVConnectorStats):
    """
    Hold one KVConnectorStats object per connector in `data`, so that the
    stats of every connector are aggregated separately.
    """

    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
        """Merge the per-connector stats of `other` in place; return self."""
        for connector_id, incoming in other.data.items():
            if connector_id in self.data:
                current = self.data[connector_id]
                assert isinstance(incoming, type(current))
                self[connector_id] = self[connector_id].aggregate(incoming)
            else:
                # First stats seen for this connector: adopt them as is.
                self[connector_id] = incoming
        return self

    def reset(self):
        """Reset the stats of every connector."""
        for connector_stats in self.data.values():
            connector_stats.reset()

    def reduce(self) -> dict[str, Any]:
        """Map every connector id to its reduced stats."""
        # TODO (NickLucche) Adjust for logging on separate lines
        reduced = {}
        for connector_id, connector_stats in self.data.items():
            reduced[connector_id] = connector_stats.reduce()
        return reduced

    def is_empty(self) -> bool:
        """Tell whether the stats of all connectors are empty."""
        return all(connector_stats.is_empty()
                   for connector_stats in self.data.values())

    def __getitem__(self, connector_id: str) -> KVConnectorStats:
        return self.data[connector_id]

    def __setitem__(self, connector_id: str, stats: KVConnectorStats):
        self.data[connector_id] = stats
|
||||
|
||||
|
||||
class MultiConnector(KVConnectorBase_V1):
|
||||
"""
|
||||
A wrapper for using multiple KVConnectors at the same time.
|
||||
@ -46,6 +85,7 @@ class MultiConnector(KVConnectorBase_V1):
|
||||
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
|
||||
super().__init__(vllm_config=vllm_config, role=role)
|
||||
self._connectors: list[KVConnectorBase_V1] = []
|
||||
self._ktc_kv_transfer_config = []
|
||||
ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
|
||||
"connectors")
|
||||
assert ktcs is not None
|
||||
@ -57,6 +97,7 @@ class MultiConnector(KVConnectorBase_V1):
|
||||
**ktc, engine_id=engine_id)
|
||||
self._connectors.append(
|
||||
KVConnectorFactory.create_connector(temp_config, role))
|
||||
self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config)
|
||||
|
||||
# A mapping from request id to the index of the connector chosen to
|
||||
# load the request from (if any).
|
||||
@ -227,7 +268,7 @@ class MultiConnector(KVConnectorBase_V1):
|
||||
|
||||
return async_saves > 0, kv_txfer_params
|
||||
|
||||
def take_events(self) -> Iterable[KVCacheEvent]:
|
||||
def take_events(self) -> Iterable["KVCacheEvent"]:
|
||||
for c in self._connectors:
|
||||
yield from c.take_events()
|
||||
|
||||
@ -264,3 +305,24 @@ class MultiConnector(KVConnectorBase_V1):
|
||||
f"({', '.join(layouts) })."
|
||||
f"All connectors must use the same layout.")
|
||||
return next(iter(layouts), None)
|
||||
|
||||
@classmethod
def build_kv_connector_stats(
        cls,
        data: Optional[dict[str,
                            Any]] = None) -> Optional[KVConnectorStats]:
    """Build a `MultiKVConnectorStats`, wrapping `data` when it is given."""
    if data is None:
        return MultiKVConnectorStats()
    return MultiKVConnectorStats(data=data)
|
||||
|
||||
def get_kv_connector_stats(self) -> Optional[MultiKVConnectorStats]:
    """
    Collect the stats of all wrapped connectors, grouped by connector type.

    Stats are keyed by the connector's class name. When several wrapped
    connectors share a class, their stats are aggregated under that key
    instead of the later one silently replacing the earlier one.

    Returns:
        The grouped stats, or `None` if no connector reported any.
    """
    stats_by_connector: Optional[MultiKVConnectorStats] = None
    for c in self._connectors:
        stats = c.get_kv_connector_stats()
        if stats is None:
            continue
        if stats_by_connector is None:
            # Lazy init to allow optional return value.
            stats_by_connector = MultiKVConnectorStats()
        connector_id = c.__class__.__name__
        if connector_id in stats_by_connector.data:
            # Same connector type seen before: merge rather than overwrite.
            stats = stats_by_connector[connector_id].aggregate(stats)
        stats_by_connector[connector_id] = stats
    return stats_by_connector
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import contextlib
|
||||
import copy
|
||||
import logging
|
||||
import math
|
||||
import queue
|
||||
@ -11,7 +12,7 @@ from collections import defaultdict
|
||||
from collections.abc import Iterator
|
||||
from concurrent.futures import Future, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
import msgspec
|
||||
import numpy as np
|
||||
@ -23,6 +24,8 @@ from vllm.attention.selector import backend_name_to_enum, get_attn_backend
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
|
||||
get_tp_group)
|
||||
@ -33,7 +36,6 @@ from vllm.platforms import _Backend, current_platform
|
||||
from vllm.utils import make_zmq_path, make_zmq_socket
|
||||
from vllm.v1.attention.backends.utils import get_kv_cache_layout
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.request import RequestStatus
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.attention.backends.abstract import AttentionMetadata
|
||||
@ -206,6 +208,18 @@ class NixlConnector(KVConnectorBase_V1):
|
||||
assert self.connector_worker is not None
|
||||
return self.connector_worker.get_finished()
|
||||
|
||||
def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
    """Return the stats reported by the connector worker."""
    worker = self.connector_worker
    assert worker is not None
    return worker.get_kv_connector_stats()
|
||||
|
||||
@classmethod
def build_kv_connector_stats(
        cls,
        data: Optional[dict[str,
                            Any]] = None) -> Optional[KVConnectorStats]:
    """Build a `NixlKVConnectorStats`, wrapping `data` when it is given."""
    if data is None:
        return NixlKVConnectorStats()
    return NixlKVConnectorStats(data=data)
|
||||
|
||||
def start_load_kv(self, forward_context: "ForwardContext",
|
||||
**kwargs) -> None:
|
||||
assert self.connector_worker is not None
|
||||
@ -377,6 +391,7 @@ class NixlConnectorScheduler:
|
||||
Once a request is finished, determine whether request blocks
|
||||
should be freed now or will be sent asynchronously and freed later.
|
||||
"""
|
||||
from vllm.v1.request import RequestStatus
|
||||
|
||||
params = request.kv_transfer_params
|
||||
logger.debug(
|
||||
@ -550,6 +565,7 @@ class NixlConnectorWorker:
|
||||
# With heterogeneous TP, P must wait for all assigned D TP workers to
|
||||
# finish reading before safely freeing the blocks.
|
||||
self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
|
||||
self.xfer_stats = NixlKVConnectorStats()
|
||||
|
||||
def __del__(self):
|
||||
"""Cleanup background threads on destruction."""
|
||||
@ -1097,6 +1113,8 @@ class NixlConnectorWorker:
|
||||
xfer_state = self.nixl_wrapper.check_xfer_state(handle)
|
||||
if xfer_state == "DONE":
|
||||
self.nixl_wrapper.release_xfer_handle(handle)
|
||||
# TODO (NickLucche) Get from NIXL telemetry once integrated
|
||||
self.xfer_stats.record_transfer()
|
||||
elif xfer_state == "PROC":
|
||||
in_progress = True
|
||||
continue
|
||||
@ -1248,7 +1266,6 @@ class NixlConnectorWorker:
|
||||
self.nixl_wrapper.transfer(handle)
|
||||
|
||||
# Use handle to check completion in future step().
|
||||
# TODO (NickLucche) surface xfer elapsed time
|
||||
self._recving_transfers[request_id].append(
|
||||
(handle, time.perf_counter()))
|
||||
|
||||
@ -1300,6 +1317,15 @@ class NixlConnectorWorker:
|
||||
block_len = self.block_len
|
||||
return block_len
|
||||
|
||||
def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
    """
    Get the KV transfer stats for the connector.

    Returns a snapshot of the accumulated stats and resets the live
    counters for the next iteration, or ``None`` when nothing has been
    recorded since the last call.
    """
    stats = self.xfer_stats
    if stats.is_empty():
        return None
    # Hand back the current values and clear them for the next iteration.
    return stats.clone_and_reset()
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
|
||||
@ -1318,3 +1344,39 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
|
||||
finally:
|
||||
if ctx is not None:
|
||||
ctx.destroy(linger=0)
|
||||
|
||||
|
||||
@dataclass
class NixlKVConnectorStats(KVConnectorStats):
    """Container for transfer performance metrics.

    The only metric currently tracked is ``data["num_successful_transfers"]``,
    a counter incremented once per ``record_transfer()`` call.
    """

    def __post_init__(self):
        # Guarantee the counter exists even when ``data`` was supplied
        # without it.
        self.data.setdefault("num_successful_transfers", 0)

    def reset(self):
        """Replace ``data`` with a fresh dict whose counter is zero."""
        self.data = dict(num_successful_transfers=0)

    def record_transfer(self):
        """Count one more successful transfer."""
        # TODO: record actual transfer stats when available
        self.data["num_successful_transfers"] += 1

    def clone_and_reset(self) -> "NixlKVConnectorStats":
        """Return a copy holding the current values, then reset ``self``.

        The copy is shallow; because ``reset()`` rebinds ``self.data`` to a
        new dict, the returned object keeps the pre-reset values.
        """
        snapshot = copy.copy(self)
        self.reset()
        return snapshot

    def is_empty(self) -> bool:
        """Return True when no successful transfer has been recorded."""
        transfers = self.data["num_successful_transfers"]
        return transfers == 0

    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
        """Add ``other``'s transfer count into ``self`` and return ``self``."""
        if other.is_empty():
            return self
        self.data["num_successful_transfers"] += other.data[
            "num_successful_transfers"]
        return self

    def reduce(self) -> dict[str, Union[int, float]]:
        """Return the stats as a flat ``name -> number`` mapping."""
        # TODO: reduce stats to a single value, calculate latency/throughput
        transfers = self.data["num_successful_transfers"]
        return {"num_successful_transfers": transfers}
|
||||
@ -27,11 +27,11 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
EPLBConfig, HfOverrides, KVEventsConfig,
|
||||
KVTransferConfig, LoadConfig, LogprobsMode,
|
||||
LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
|
||||
ModelDType, ModelImpl, ObservabilityConfig,
|
||||
ParallelConfig, PoolerConfig, PrefixCachingHashAlgo,
|
||||
RunnerOption, SchedulerConfig, SchedulerPolicy,
|
||||
SpeculativeConfig, StructuredOutputsConfig,
|
||||
TaskOption, TokenizerMode, VllmConfig, get_attr_docs)
|
||||
ModelDType, ObservabilityConfig, ParallelConfig,
|
||||
PoolerConfig, PrefixCachingHashAlgo, RunnerOption,
|
||||
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
|
||||
StructuredOutputsConfig, TaskOption, TokenizerMode,
|
||||
VllmConfig, get_attr_docs)
|
||||
from vllm.config.multimodal import MMCacheType, MultiModalConfig
|
||||
from vllm.config.parallel import ExpertPlacementStrategy
|
||||
from vllm.config.utils import get_field
|
||||
@ -333,6 +333,8 @@ class EngineArgs:
|
||||
enable_eplb: bool = ParallelConfig.enable_eplb
|
||||
expert_placement_strategy: ExpertPlacementStrategy = \
|
||||
ParallelConfig.expert_placement_strategy
|
||||
_api_process_count: int = ParallelConfig._api_process_count
|
||||
_api_process_rank: int = ParallelConfig._api_process_rank
|
||||
num_redundant_experts: int = EPLBConfig.num_redundant_experts
|
||||
eplb_window_size: int = EPLBConfig.window_size
|
||||
eplb_step_interval: int = EPLBConfig.step_interval
|
||||
@ -441,6 +443,7 @@ class EngineArgs:
|
||||
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
|
||||
scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
|
||||
|
||||
pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config
|
||||
override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
|
||||
ModelConfig.override_pooler_config
|
||||
compilation_config: CompilationConfig = \
|
||||
@ -547,7 +550,6 @@ class EngineArgs:
|
||||
model_group.add_argument("--max-logprobs",
|
||||
**model_kwargs["max_logprobs"])
|
||||
model_group.add_argument("--logprobs-mode",
|
||||
choices=[f.value for f in LogprobsMode],
|
||||
**model_kwargs["logprobs_mode"])
|
||||
model_group.add_argument("--disable-sliding-window",
|
||||
**model_kwargs["disable_sliding_window"])
|
||||
@ -579,8 +581,11 @@ class EngineArgs:
|
||||
help=model_kwargs["hf_token"]["help"])
|
||||
model_group.add_argument("--hf-overrides",
|
||||
**model_kwargs["hf_overrides"])
|
||||
model_group.add_argument("--pooler-config",
|
||||
**model_kwargs["pooler_config"])
|
||||
model_group.add_argument("--override-pooler-config",
|
||||
**model_kwargs["override_pooler_config"])
|
||||
**model_kwargs["override_pooler_config"],
|
||||
deprecated=True)
|
||||
model_group.add_argument("--logits-processor-pattern",
|
||||
**model_kwargs["logits_processor_pattern"])
|
||||
model_group.add_argument("--generation-config",
|
||||
@ -589,9 +594,7 @@ class EngineArgs:
|
||||
**model_kwargs["override_generation_config"])
|
||||
model_group.add_argument("--enable-sleep-mode",
|
||||
**model_kwargs["enable_sleep_mode"])
|
||||
model_group.add_argument("--model-impl",
|
||||
choices=[f.value for f in ModelImpl],
|
||||
**model_kwargs["model_impl"])
|
||||
model_group.add_argument("--model-impl", **model_kwargs["model_impl"])
|
||||
model_group.add_argument("--override-attention-dtype",
|
||||
**model_kwargs["override_attention_dtype"])
|
||||
model_group.add_argument("--logits-processors",
|
||||
@ -951,7 +954,10 @@ class EngineArgs:
|
||||
# Get the list of attributes of this dataclass.
|
||||
attrs = [attr.name for attr in dataclasses.fields(cls)]
|
||||
# Set the attributes from the parsed arguments.
|
||||
engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
|
||||
engine_args = cls(**{
|
||||
attr: getattr(args, attr)
|
||||
for attr in attrs if hasattr(args, attr)
|
||||
})
|
||||
return engine_args
|
||||
|
||||
def create_model_config(self) -> ModelConfig:
|
||||
@ -1031,6 +1037,7 @@ class EngineArgs:
|
||||
mm_shm_cache_max_object_size_mb=self.
|
||||
mm_shm_cache_max_object_size_mb,
|
||||
mm_encoder_tp_mode=self.mm_encoder_tp_mode,
|
||||
pooler_config=self.pooler_config,
|
||||
override_pooler_config=self.override_pooler_config,
|
||||
logits_processor_pattern=self.logits_processor_pattern,
|
||||
generation_config=self.generation_config,
|
||||
@ -1364,6 +1371,8 @@ class EngineArgs:
|
||||
worker_cls=self.worker_cls,
|
||||
worker_extension_cls=self.worker_extension_cls,
|
||||
decode_context_parallel_size=self.decode_context_parallel_size,
|
||||
_api_process_count=self._api_process_count,
|
||||
_api_process_rank=self._api_process_rank,
|
||||
)
|
||||
|
||||
speculative_config = self.create_speculative_config(
|
||||
|
||||
@ -135,23 +135,20 @@ def run_headless(args: argparse.Namespace):
|
||||
def run_multi_api_server(args: argparse.Namespace):
|
||||
|
||||
assert not args.headless
|
||||
num_api_servers = args.api_server_count
|
||||
num_api_servers: int = args.api_server_count
|
||||
assert num_api_servers > 0
|
||||
|
||||
orig_mm_processor_cache_gb = args.mm_processor_cache_gb
|
||||
|
||||
if num_api_servers > 1:
|
||||
setup_multiprocess_prometheus()
|
||||
|
||||
# Not compatible with API server scale-out
|
||||
args.mm_processor_cache_gb = 0
|
||||
|
||||
listen_address, sock = setup_server(args)
|
||||
|
||||
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
|
||||
engine_args._api_process_count = num_api_servers
|
||||
engine_args._api_process_rank = -1
|
||||
|
||||
usage_context = UsageContext.OPENAI_API_SERVER
|
||||
vllm_config = engine_args.create_engine_config(usage_context=usage_context)
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if num_api_servers > 1:
|
||||
if not envs.VLLM_USE_V1:
|
||||
@ -161,10 +158,6 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
|
||||
"with api_server_count > 1")
|
||||
|
||||
if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0:
|
||||
logger.warning("Multi-modal processor cache is disabled because "
|
||||
"it is not compatible with `api_server_count > 1`.")
|
||||
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
log_stats = not engine_args.disable_log_stats
|
||||
|
||||
@ -221,9 +214,10 @@ def run_api_server_worker_proc(listen_address,
|
||||
client_config=None,
|
||||
**uvicorn_kwargs) -> None:
|
||||
"""Entrypoint for individual API server worker processes."""
|
||||
client_config = client_config or {}
|
||||
server_index = client_config.get("client_index", 0)
|
||||
|
||||
# Set process title and add process-specific prefix to stdout and stderr.
|
||||
server_index = client_config.get("client_index", 0) if client_config else 0
|
||||
set_process_title("APIServer", str(server_index))
|
||||
decorate_logs()
|
||||
|
||||
|
||||
@ -151,9 +151,11 @@ class LLM:
|
||||
multi-modal processor obtained from `AutoProcessor.from_pretrained`.
|
||||
The available overrides depend on the model that is being run.
|
||||
For example, for Phi-3-Vision: `{"num_crops": 4}`.
|
||||
override_pooler_config: Initialize non-default pooling config or
|
||||
override default pooling config for the pooling model.
|
||||
e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
|
||||
pooler_config: Initialize non-default pooling config for the pooling
|
||||
model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
|
||||
override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This
|
||||
argument is deprecated and will be removed in v0.12.0 or v1.0.0,
|
||||
whichever is sooner.
|
||||
compilation_config: Either an integer or a dictionary. If it is an
|
||||
integer, it is used as the level of compilation optimization. If it
|
||||
is a dictionary, it can specify the full compilation configuration.
|
||||
@ -191,6 +193,7 @@ class LLM:
|
||||
hf_token: Optional[Union[bool, str]] = None,
|
||||
hf_overrides: Optional[HfOverrides] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
pooler_config: Optional[PoolerConfig] = None,
|
||||
override_pooler_config: Optional[PoolerConfig] = None,
|
||||
structured_outputs_config: Optional[Union[dict[
|
||||
str, Any], StructuredOutputsConfig]] = None,
|
||||
@ -288,6 +291,7 @@ class LLM:
|
||||
hf_token=hf_token,
|
||||
hf_overrides=hf_overrides,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
pooler_config=pooler_config,
|
||||
override_pooler_config=override_pooler_config,
|
||||
structured_outputs_config=structured_outputs_instance,
|
||||
compilation_config=compilation_config_instance,
|
||||
|
||||
@ -17,13 +17,14 @@ from argparse import Namespace
|
||||
from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
|
||||
from contextlib import asynccontextmanager
|
||||
from http import HTTPStatus
|
||||
from typing import Annotated, Any, Callable, Optional
|
||||
from typing import Annotated, Any, Callable, Literal, Optional
|
||||
|
||||
import prometheus_client
|
||||
import pydantic
|
||||
import regex as re
|
||||
import uvloop
|
||||
from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
|
||||
from fastapi import (APIRouter, Depends, FastAPI, Form, HTTPException, Query,
|
||||
Request)
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, Response, StreamingResponse
|
||||
@ -166,6 +167,9 @@ async def build_async_engine_client(
|
||||
# Context manager to handle engine_client lifecycle
|
||||
# Ensures everything is shutdown and cleaned up on error/exit
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
if client_config:
|
||||
engine_args._api_process_count = client_config.get("client_count", 1)
|
||||
engine_args._api_process_rank = client_config.get("client_index", 0)
|
||||
|
||||
if disable_frontend_multiprocessing is None:
|
||||
disable_frontend_multiprocessing = bool(
|
||||
@ -209,8 +213,12 @@ async def build_async_engine_client_from_engine_args(
|
||||
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
async_llm: Optional[AsyncLLM] = None
|
||||
client_count = client_config.pop("client_count") if client_config else 1
|
||||
client_index = client_config.pop("client_index") if client_config else 0
|
||||
|
||||
# Don't mutate the input client_config
|
||||
client_config = dict(client_config) if client_config else {}
|
||||
client_count = client_config.pop("client_count", 1)
|
||||
client_index = client_config.pop("client_index", 0)
|
||||
|
||||
try:
|
||||
async_llm = AsyncLLM.from_vllm_config(
|
||||
vllm_config=vllm_config,
|
||||
@ -956,9 +964,22 @@ if envs.VLLM_SERVER_DEV_MODE:
|
||||
logger.warning("SECURITY WARNING: Development endpoints are enabled! "
|
||||
"This should NOT be used in production!")
|
||||
|
||||
PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
|
||||
|
||||
@router.get("/server_info")
|
||||
async def show_server_info(raw_request: Request):
|
||||
server_info = {"vllm_config": str(raw_request.app.state.vllm_config)}
|
||||
async def show_server_info(
|
||||
raw_request: Request,
|
||||
config_format: Annotated[Literal["text", "json"],
|
||||
Query()] = "text",
|
||||
):
|
||||
vllm_config: VllmConfig = raw_request.app.state.vllm_config
|
||||
server_info = {
|
||||
"vllm_config":
|
||||
str(vllm_config)
|
||||
if config_format == "text" else PydanticVllmConfig.dump_python(
|
||||
vllm_config, mode="json", fallback=str)
|
||||
# fallback=str is needed to handle e.g. torch.dtype
|
||||
}
|
||||
return JSONResponse(content=server_info)
|
||||
|
||||
@router.post("/reset_prefix_cache")
|
||||
@ -1856,8 +1877,6 @@ async def run_server_worker(listen_address,
|
||||
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
|
||||
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
|
||||
|
||||
server_index = client_config.get("client_index", 0) if client_config else 0
|
||||
|
||||
# Load logging config for uvicorn if specified
|
||||
log_config = load_log_config(args.log_config_file)
|
||||
if log_config is not None:
|
||||
@ -1873,7 +1892,8 @@ async def run_server_worker(listen_address,
|
||||
vllm_config = await engine_client.get_vllm_config()
|
||||
await init_app_state(engine_client, vllm_config, app.state, args)
|
||||
|
||||
logger.info("Starting vLLM API server %d on %s", server_index,
|
||||
logger.info("Starting vLLM API server %d on %s",
|
||||
vllm_config.parallel_config._api_process_rank,
|
||||
listen_address)
|
||||
shutdown_task = await serve_http(
|
||||
app,
|
||||
|
||||
@ -14,7 +14,7 @@ from torch import nn
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
from vllm.lora.punica_wrapper import get_punica_wrapper
|
||||
from vllm.lora.utils import (from_layer, from_layer_logits_processor,
|
||||
|
||||
@ -6,7 +6,7 @@ from typing import Any, Literal, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.models import (LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager, create_lora_manager)
|
||||
@ -27,25 +27,26 @@ class WorkerLoRAManager:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_num_seqs: int,
|
||||
max_num_batched_tokens: int,
|
||||
vocab_size: int,
|
||||
lora_config: LoRAConfig,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
embedding_modules: dict[str, str],
|
||||
embedding_padding_modules: list[str],
|
||||
lora_model_cls: type[LoRAModel] = LoRAModel,
|
||||
max_position_embeddings: Optional[int] = None,
|
||||
):
|
||||
self._lora_model_cls = lora_model_cls
|
||||
self.embedding_modules = embedding_modules
|
||||
self.embedding_padding_modules = embedding_padding_modules
|
||||
self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
|
||||
self.max_num_seqs = max_num_seqs
|
||||
self.max_num_batched_tokens = max_num_batched_tokens
|
||||
self.vocab_size = vocab_size
|
||||
self.lora_config = lora_config
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
|
||||
self.max_num_batched_tokens = (
|
||||
vllm_config.scheduler_config.max_num_batched_tokens)
|
||||
self.vocab_size = vllm_config.model_config.get_vocab_size()
|
||||
self.lora_config = vllm_config.lora_config
|
||||
|
||||
# Use get_text_config() in case of multimodal models
|
||||
text_config = vllm_config.model_config.hf_config.get_text_config()
|
||||
|
||||
self.max_position_embeddings = text_config.max_position_embeddings
|
||||
self.device = device
|
||||
# Lazily initialized by create_lora_manager.
|
||||
self._adapter_manager: LoRAModelManager
|
||||
|
||||
@ -78,3 +78,12 @@ if HAS_TRITON:
|
||||
"TritonOrDeepGemmExperts",
|
||||
"BatchedTritonOrDeepGemmExperts",
|
||||
]
|
||||
else:
|
||||
# Some model classes directly use the custom ops. Add placeholders
|
||||
# to avoid import errors.
|
||||
def _raise_exception(method: str):
|
||||
raise NotImplementedError(
|
||||
f"{method} is not implemented as lack of triton.")
|
||||
|
||||
fused_topk = lambda *args, **kwargs: _raise_exception("fused_topk")
|
||||
fused_experts = lambda *args, **kwargs: _raise_exception("fused_experts")
|
||||
|
||||
@ -15,7 +15,7 @@ from .common import apply_rotary_emb_dispatch
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _triton_qwen2vl_mrope_forward(
|
||||
def _triton_mrope_forward(
|
||||
q_ptr,
|
||||
k_ptr,
|
||||
cos,
|
||||
@ -30,12 +30,14 @@ def _triton_qwen2vl_mrope_forward(
|
||||
pad_hd: tl.constexpr,
|
||||
mrope_section_t: tl.constexpr,
|
||||
mrope_section_h: tl.constexpr,
|
||||
mrope_section_w: tl.constexpr,
|
||||
is_interleaved: tl.constexpr,
|
||||
):
|
||||
# Adapted from
|
||||
# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py
|
||||
# This version supports flatten input tensors from vllm
|
||||
# and supports cos and sin cache with shape (3, num_tokens, head_dim // 2)
|
||||
# instead of (3, bsz, seq_len, head_dim)
|
||||
# instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary
|
||||
pid = tl.program_id(0)
|
||||
# locate start address
|
||||
q_ptr = q_ptr + pid * (n_qh * hd)
|
||||
@ -47,9 +49,6 @@ def _triton_qwen2vl_mrope_forward(
|
||||
# ####################################################################
|
||||
# Note: cos and sin now have shape (3, num_tokens, head_dim // 2)
|
||||
|
||||
t_end = mrope_section_t
|
||||
h_end = t_end + mrope_section_h
|
||||
|
||||
# Updated stride calculation for half head_dim
|
||||
half_rd = rd // 2
|
||||
t_cos = cos + pid * half_rd
|
||||
@ -61,9 +60,18 @@ def _triton_qwen2vl_mrope_forward(
|
||||
|
||||
# Updated offsets for half head_dim
|
||||
cos_offsets = tl.arange(0, pad_hd // 2)
|
||||
t_mask = cos_offsets < t_end
|
||||
h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
|
||||
w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd)
|
||||
if is_interleaved:
|
||||
h_mask = (((cos_offsets % 3) == 1) &
|
||||
(cos_offsets <= 3 * mrope_section_h))
|
||||
w_mask = (((cos_offsets % 3) == 2) &
|
||||
(cos_offsets <= 3 * mrope_section_w))
|
||||
t_mask = ~(h_mask | w_mask)
|
||||
else:
|
||||
t_end = mrope_section_t
|
||||
h_end = t_end + mrope_section_h
|
||||
t_mask = cos_offsets < mrope_section_t
|
||||
h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
|
||||
w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd)
|
||||
|
||||
t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0)
|
||||
h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0)
|
||||
@ -131,6 +139,7 @@ def triton_mrope(
|
||||
mrope_section: list[int],
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
mrope_interleaved: bool,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Qwen2VL mrope kernel.
|
||||
|
||||
@ -158,7 +167,7 @@ def triton_mrope(
|
||||
cos = cos.contiguous()
|
||||
sin = sin.contiguous()
|
||||
|
||||
_triton_qwen2vl_mrope_forward[(n_row, )](
|
||||
_triton_mrope_forward[(n_row, )](
|
||||
q,
|
||||
k,
|
||||
cos,
|
||||
@ -173,6 +182,8 @@ def triton_mrope(
|
||||
pad_hd,
|
||||
mrope_section[0],
|
||||
mrope_section[1],
|
||||
mrope_section[2],
|
||||
mrope_interleaved,
|
||||
)
|
||||
return q, k
|
||||
|
||||
@ -201,7 +212,7 @@ class MRotaryEmbedding(RotaryEmbedding):
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
mrope_section: Optional[list[int]] = None,
|
||||
mrope_interleaved: Optional[bool] = False,
|
||||
mrope_interleaved: bool = False,
|
||||
) -> None:
|
||||
# In Qwen2.5-VL, the maximum index value is related to the duration of
|
||||
# the input video. We enlarge max_position_embeddings to 4 times to get
|
||||
@ -282,10 +293,6 @@ class MRotaryEmbedding(RotaryEmbedding):
|
||||
assert positions.ndim == 1 or positions.ndim == 2
|
||||
assert key is not None
|
||||
|
||||
if self.mrope_interleaved:
|
||||
# TODO: add triton implementation to support mrope-interleaved
|
||||
return self.forward_native(positions, query, key)
|
||||
|
||||
num_tokens = positions.shape[-1]
|
||||
cos_sin = self.cos_sin_cache[positions]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
@ -302,6 +309,7 @@ class MRotaryEmbedding(RotaryEmbedding):
|
||||
self.mrope_section,
|
||||
self.head_size,
|
||||
self.rotary_dim,
|
||||
self.mrope_interleaved,
|
||||
)
|
||||
|
||||
return q.reshape(query_shape), k.reshape(key_shape)
|
||||
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm import envs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.utils import direct_register_custom_op
|
||||
|
||||
|
||||
@ -167,7 +167,8 @@ def dispatch_cpu_unquantized_gemm(
|
||||
if remove_weight:
|
||||
layer.weight = torch.nn.Parameter(torch.empty(0),
|
||||
requires_grad=False)
|
||||
elif ops._supports_onednn:
|
||||
elif (ops._supports_onednn
|
||||
and current_platform.get_cpu_architecture() == CpuArchEnum.X86):
|
||||
origin_weight = layer.weight
|
||||
if remove_weight:
|
||||
layer.weight = torch.nn.Parameter(torch.empty(0),
|
||||
|
||||
@ -13,8 +13,7 @@ from torch import nn
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.attention import Attention
|
||||
from vllm.config import (ModelConfig, ModelImpl, VllmConfig,
|
||||
set_current_vllm_config)
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.linear import QKVCrossParallelLinear
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
@ -176,8 +175,8 @@ def get_model_architecture(
|
||||
)
|
||||
|
||||
if arch == model_config._get_transformers_backend_cls():
|
||||
assert model_config.model_impl != ModelImpl.VLLM
|
||||
if model_config.model_impl == ModelImpl.AUTO:
|
||||
assert model_config.model_impl != "vllm"
|
||||
if model_config.model_impl == "auto":
|
||||
logger.warning_once(
|
||||
"%s has no vLLM implementation, falling back to Transformers "
|
||||
"implementation. Some features may not be supported and "
|
||||
|
||||
@ -680,7 +680,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
batch.
|
||||
|
||||
Info:
|
||||
[Blip2ImageInputs][]
|
||||
[`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs]
|
||||
"""
|
||||
|
||||
if intermediate_tensors is not None:
|
||||
|
||||
@ -46,7 +46,8 @@ from vllm.sequence import IntermediateTensors
|
||||
from .clip import CLIPVisionModel
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||
from .siglip import SiglipVisionModel
|
||||
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
|
||||
from .utils import (AutoWeightsLoader, init_vllm_registered_model,
|
||||
maybe_prefix, merge_multimodal_embeddings)
|
||||
from .vision import get_vision_encoder_info
|
||||
|
||||
EOT = "<|endofturn|>"
|
||||
@ -740,33 +741,20 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||
if (kwargs.get("pixel_values_images") is not None
|
||||
or kwargs.get("pixel_values_videos")
|
||||
is not None): # v0 compatibility
|
||||
multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
|
||||
if multimodal_embeddings is not None:
|
||||
multimodal_embeddings = torch.cat(multimodal_embeddings, dim=0)
|
||||
_mask_image = input_ids == self.config.image_token_id
|
||||
_mask_video = input_ids == self.config.video_token_id
|
||||
assert _mask_image.sum() + _mask_video.sum() == len(
|
||||
multimodal_embeddings)
|
||||
if multimodal_embeddings is not None \
|
||||
and len(multimodal_embeddings) != 0:
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids,
|
||||
inputs_embeds,
|
||||
multimodal_embeddings,
|
||||
placeholder_token_id=[
|
||||
self.config.image_token_id,
|
||||
self.config.video_token_id,
|
||||
],
|
||||
)
|
||||
|
||||
if multimodal_embeddings.dtype != inputs_embeds.dtype:
|
||||
multimodal_embeddings = multimodal_embeddings.to(
|
||||
dtype=inputs_embeds.dtype)
|
||||
if multimodal_embeddings.device != inputs_embeds.device:
|
||||
multimodal_embeddings = multimodal_embeddings.to(
|
||||
device=inputs_embeds.device)
|
||||
|
||||
if _mask_image.sum() > 0:
|
||||
inputs_embeds[
|
||||
_mask_image] = multimodal_embeddings[:sum(_mask_image)]
|
||||
if _mask_video.sum() > 0:
|
||||
inputs_embeds[_mask_video] = multimodal_embeddings[
|
||||
-sum(_mask_video):]
|
||||
return inputs_embeds
|
||||
|
||||
def forward(
|
||||
@ -783,8 +771,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
# NOTE: In v1, inputs_embeds is always generated at model runner, this
|
||||
# condition is for v0 compatibility.
|
||||
elif inputs_embeds is None:
|
||||
inputs_embeds = self.get_input_embeddings(input_ids=input_ids,
|
||||
**kwargs)
|
||||
multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
|
||||
inputs_embeds = self.get_input_embeddings(input_ids,
|
||||
multimodal_embeddings)
|
||||
input_ids = None
|
||||
hidden_states = self.language_model.model(input_ids,
|
||||
positions,
|
||||
|
||||
@ -23,7 +23,6 @@ from vllm.utils import supports_kw
|
||||
from .interfaces_base import is_pooling_model
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.attention import AttentionMetadata
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
from vllm.sequence import IntermediateTensors
|
||||
@ -97,33 +96,10 @@ class SupportsMultiModal(Protocol):
|
||||
"""
|
||||
...
|
||||
|
||||
# Only for models that support v0 chunked prefill
|
||||
# TODO(ywang96): Remove this overload once v0 is deprecated
|
||||
@overload
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
attn_metadata: Optional["AttentionMetadata"] = None,
|
||||
) -> Tensor:
|
||||
...
|
||||
|
||||
# TODO: Remove this overload once v0 is deprecated
|
||||
@overload
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
) -> Tensor:
|
||||
...
|
||||
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
# Only necessary so that the v0 overload is valid
|
||||
# TODO: Remove attn_metadata once v0 is deprecated
|
||||
attn_metadata: Optional["AttentionMetadata"] = None,
|
||||
) -> Tensor:
|
||||
"""
|
||||
Returns the input embeddings merged from the text embeddings from
|
||||
|
||||
@ -737,7 +737,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
inputs_embeds: Optional tensor of input embeddings.
|
||||
|
||||
Info:
|
||||
[LlavaImageInputs][]
|
||||
[`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs]
|
||||
"""
|
||||
if intermediate_tensors is not None:
|
||||
inputs_embeds = None
|
||||
|
||||
@ -527,7 +527,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
Unlike in LLaVA-1.5, the number of image tokens inputted to the language
|
||||
model depends on the original size of the input image. Including the
|
||||
original image token in the input, the required number of image tokens
|
||||
is given by [get_llava_next_image_feature_size][].
|
||||
is given by [`LlavaNextProcessingInfo.get_num_image_tokens`][vllm.\
|
||||
model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
|
||||
|
||||
This way, the `positions` and `attn_metadata` are consistent
|
||||
with the `input_ids`.
|
||||
@ -540,7 +541,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
inputs_embeds: Optional tensor of input embeddings.
|
||||
|
||||
Info:
|
||||
[LlavaNextImageInputs][]
|
||||
[`LlavaNextImageInputs`][vllm.model_executor.models.llava_next.LlavaNextImageInputs]
|
||||
"""
|
||||
if intermediate_tensors is not None:
|
||||
inputs_embeds = None
|
||||
|
||||
@ -306,7 +306,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
|
||||
eps=self.layer_norm_epsilon,
|
||||
group_size=None,
|
||||
norm_before_gate=True,
|
||||
device=torch.cuda.current_device(),
|
||||
device=current_platform.current_device(),
|
||||
dtype=config.torch_dtype,
|
||||
)
|
||||
|
||||
|
||||
@ -223,9 +223,7 @@ class Qwen3_VisionPatchMerger(nn.Module):
|
||||
|
||||
if norm_layer is None:
|
||||
norm_layer = partial(nn.LayerNorm, eps=1e-6)
|
||||
self.use_postshuffle_norm = use_postshuffle_norm
|
||||
self.norm = norm_layer(
|
||||
self.hidden_size if use_postshuffle_norm else context_dim)
|
||||
self.norm = norm_layer(context_dim)
|
||||
self.linear_fc1 = ColumnParallelLinear(self.hidden_size,
|
||||
self.hidden_size,
|
||||
bias=True,
|
||||
@ -1075,6 +1073,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
config.text_config.hidden_size)
|
||||
for _ in range(self.deepstack_num_level)
|
||||
] if self.use_deepstack else None
|
||||
self.visual_dim = config.vision_config.out_hidden_size
|
||||
self.multiscale_dim = self.visual_dim * self.deepstack_num_level
|
||||
|
||||
def _get_deepstack_input_embeds(self,
|
||||
num_tokens: int) -> IntermediateTensors:
|
||||
@ -1313,12 +1313,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
]
|
||||
multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0)
|
||||
|
||||
visual_dim = multimodal_embeddings_cat.shape[-1] // (
|
||||
self.deepstack_num_level + 1)
|
||||
|
||||
main_dim, multi_dim = visual_dim, visual_dim * self.deepstack_num_level
|
||||
multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501
|
||||
multimodal_embeddings_cat, [main_dim, multi_dim],
|
||||
multimodal_embeddings_cat, [self.visual_dim, self.multiscale_dim],
|
||||
dim=-1)
|
||||
|
||||
multimodal_embeddings = torch.split(multimodal_embeddings_main,
|
||||
@ -1340,10 +1336,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
],
|
||||
)
|
||||
deepstack_input_embeds = deepstack_input_embeds.view(
|
||||
inputs_embeds.shape[0], self.deepstack_num_level,
|
||||
visual_dim).contiguous()
|
||||
deepstack_input_embeds = deepstack_input_embeds.permute(
|
||||
1, 0, 2).contiguous()
|
||||
inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim)
|
||||
deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2)
|
||||
return deepstack_input_embeds, multimodal_embeddings
|
||||
|
||||
def get_input_embeddings(
|
||||
@ -1353,9 +1347,10 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
) -> torch.Tensor:
|
||||
deepstack_input_embeds = None
|
||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||
if multimodal_embeddings is not None and self.use_deepstack:
|
||||
deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501
|
||||
input_ids, inputs_embeds, multimodal_embeddings)
|
||||
if multimodal_embeddings is not None:
|
||||
if self.use_deepstack:
|
||||
deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501
|
||||
input_ids, inputs_embeds, multimodal_embeddings)
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids, inputs_embeds, multimodal_embeddings,
|
||||
[self.config.image_token_id, self.config.video_token_id])
|
||||
@ -1531,4 +1526,4 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
language_model="language_model",
|
||||
connector="model.visual.merger",
|
||||
tower_model="model.visual.",
|
||||
)
|
||||
)
|
||||
@ -344,3 +344,5 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
|
||||
config.text_config.hidden_size)
|
||||
for _ in range(self.deepstack_num_level)
|
||||
] if self.use_deepstack else None
|
||||
self.visual_dim = config.vision_config.out_hidden_size
|
||||
self.multiscale_dim = self.visual_dim * self.deepstack_num_level
|
||||
@ -19,7 +19,7 @@ from typing import Callable, Optional, TypeVar, Union
|
||||
import torch.nn as nn
|
||||
import transformers
|
||||
|
||||
from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults,
|
||||
from vllm.config import (ModelConfig, iter_architecture_defaults,
|
||||
try_match_architecture_defaults)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.dynamic_module import (
|
||||
@ -587,7 +587,7 @@ class _ModelRegistry:
|
||||
if model_module is not None:
|
||||
break
|
||||
else:
|
||||
if model_config.model_impl != ModelImpl.TRANSFORMERS:
|
||||
if model_config.model_impl != "transformers":
|
||||
return None
|
||||
|
||||
raise ValueError(
|
||||
@ -598,7 +598,7 @@ class _ModelRegistry:
|
||||
"'auto_map' (relevant if the model is custom).")
|
||||
|
||||
if not model_module.is_backend_compatible():
|
||||
if model_config.model_impl != ModelImpl.TRANSFORMERS:
|
||||
if model_config.model_impl != "transformers":
|
||||
return None
|
||||
|
||||
raise ValueError(
|
||||
@ -644,20 +644,20 @@ class _ModelRegistry:
|
||||
raise ValueError("No model architectures are specified")
|
||||
|
||||
# Require transformers impl
|
||||
if model_config.model_impl == ModelImpl.TRANSFORMERS:
|
||||
if model_config.model_impl == "transformers":
|
||||
arch = self._try_resolve_transformers(architectures[0],
|
||||
model_config)
|
||||
if arch is not None:
|
||||
model_info = self._try_inspect_model_cls(arch)
|
||||
if model_info is not None:
|
||||
return (model_info, arch)
|
||||
elif model_config.model_impl == ModelImpl.TERRATORCH:
|
||||
elif model_config.model_impl == "terratorch":
|
||||
model_info = self._try_inspect_model_cls("Terratorch")
|
||||
return (model_info, "Terratorch")
|
||||
|
||||
# Fallback to transformers impl (after resolving convert_type)
|
||||
if (all(arch not in self.models for arch in architectures)
|
||||
and model_config.model_impl == ModelImpl.AUTO
|
||||
and model_config.model_impl == "auto"
|
||||
and getattr(model_config, "convert_type", "none") == "none"):
|
||||
arch = self._try_resolve_transformers(architectures[0],
|
||||
model_config)
|
||||
@ -674,7 +674,7 @@ class _ModelRegistry:
|
||||
|
||||
# Fallback to transformers impl (before resolving runner_type)
|
||||
if (all(arch not in self.models for arch in architectures)
|
||||
and model_config.model_impl == ModelImpl.AUTO):
|
||||
and model_config.model_impl == "auto"):
|
||||
arch = self._try_resolve_transformers(architectures[0],
|
||||
model_config)
|
||||
if arch is not None:
|
||||
@ -695,14 +695,14 @@ class _ModelRegistry:
|
||||
raise ValueError("No model architectures are specified")
|
||||
|
||||
# Require transformers impl
|
||||
if model_config.model_impl == ModelImpl.TRANSFORMERS:
|
||||
if model_config.model_impl == "transformers":
|
||||
arch = self._try_resolve_transformers(architectures[0],
|
||||
model_config)
|
||||
if arch is not None:
|
||||
model_cls = self._try_load_model_cls(arch)
|
||||
if model_cls is not None:
|
||||
return (model_cls, arch)
|
||||
elif model_config.model_impl == ModelImpl.TERRATORCH:
|
||||
elif model_config.model_impl == "terratorch":
|
||||
arch = "Terratorch"
|
||||
model_cls = self._try_load_model_cls(arch)
|
||||
if model_cls is not None:
|
||||
@ -710,7 +710,7 @@ class _ModelRegistry:
|
||||
|
||||
# Fallback to transformers impl (after resolving convert_type)
|
||||
if (all(arch not in self.models for arch in architectures)
|
||||
and model_config.model_impl == ModelImpl.AUTO
|
||||
and model_config.model_impl == "auto"
|
||||
and getattr(model_config, "convert_type", "none") == "none"):
|
||||
arch = self._try_resolve_transformers(architectures[0],
|
||||
model_config)
|
||||
@ -727,7 +727,7 @@ class _ModelRegistry:
|
||||
|
||||
# Fallback to transformers impl (before resolving runner_type)
|
||||
if (all(arch not in self.models for arch in architectures)
|
||||
and model_config.model_impl == ModelImpl.AUTO):
|
||||
and model_config.model_impl == "auto"):
|
||||
arch = self._try_resolve_transformers(architectures[0],
|
||||
model_config)
|
||||
if arch is not None:
|
||||
|
||||
@ -27,7 +27,7 @@ from transformers import (AutoModel, BatchFeature, PretrainedConfig,
|
||||
PreTrainedModel)
|
||||
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
|
||||
from vllm.attention import Attention
|
||||
from vllm.attention import Attention, AttentionType
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, VllmConfig)
|
||||
@ -452,8 +452,9 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
self.pp_rank = self.pp_group.rank_in_group
|
||||
self.tp_size = get_tensor_model_parallel_world_size()
|
||||
|
||||
# To be updated in child classes for use in `load_weights`
|
||||
self.skip_prefixes: Optional[list[str]] = None
|
||||
# Weights to skip in `self.load_weights`
|
||||
self.skip_prefixes: list[str] = []
|
||||
self.skip_substrs: list[str] = []
|
||||
|
||||
# Set correct attn and init on "meta" to delay allocating GPU tensors
|
||||
# TODO: @raushan, use the public `model.set_attn_implementation()`
|
||||
@ -596,7 +597,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
|
||||
_tensor_parallel(self.model)
|
||||
|
||||
def create_attention_instances(self) -> dict[int, Attention]:
|
||||
def create_attention_instances(
|
||||
self,
|
||||
attn_type: AttentionType = AttentionType.DECODER
|
||||
) -> dict[int, Attention]:
|
||||
"""
|
||||
Create `Attention` instances to inform KV cache allocation.
|
||||
"""
|
||||
@ -625,7 +629,8 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
cache_config=self.cache_config,
|
||||
quant_config=self.quant_config,
|
||||
per_layer_sliding_window=per_layer_sliding_window,
|
||||
prefix=f"{i}.attn")
|
||||
prefix=f"{i}.attn",
|
||||
attn_type=attn_type)
|
||||
return attention_instances
|
||||
|
||||
def init_parameters(self, module: nn.Module):
|
||||
@ -685,7 +690,11 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self, skip_prefixes=self.skip_prefixes)
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=self.skip_prefixes,
|
||||
skip_substrs=self.skip_substrs,
|
||||
)
|
||||
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
|
||||
|
||||
|
||||
@ -700,6 +709,37 @@ class TransformersModel(TransformersBase):
|
||||
"model.score": "score",
|
||||
})
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    """Initialise via the parent class, then register `position_ids` as a
    weight-name substring to be skipped."""
    super().__init__(vllm_config=vllm_config, prefix=prefix)

    # Checkpoints of some encoder models contain a `position_ids` buffer.
    # vLLM always passes position_ids as an argument, so any weight whose
    # name contains that substring is added to the skip list.
    self.skip_substrs.extend(["position_ids"])
|
||||
|
||||
def create_attention_instances(
        self, attn_type: AttentionType = AttentionType.DECODER):
    """Pick the attention type for this model and delegate to the parent.

    If any submodule of `self.model` exposes `is_causal=False`, the whole
    model is treated as encoder-only and `attn_type` is overridden with
    `AttentionType.ENCODER_ONLY`. Encoder-only attention additionally
    requires a minimum installed `transformers` version.

    Raises:
        ValueError: if encoder-only attention is selected and the installed
            `transformers` version is older than the required one.
    """
    # TODO(hmellor): Better way to detect encoder models
    # Encoder attention layers have `is_causal=False`; modules without the
    # attribute are treated as causal. vLLM does not support
    # encoder-decoder models, so a single encoder layer marks the whole
    # model as an encoder model.
    has_encoder_layer = any(not getattr(module, "is_causal", True)
                            for module in self.model.modules())
    if has_encoder_layer:
        attn_type = AttentionType.ENCODER_ONLY

    # Encoder models need a recent enough transformers release.
    if attn_type == AttentionType.ENCODER_ONLY:
        import transformers
        from packaging.version import Version

        required = Version("4.57.0.dev0")
        installed = Version(transformers.__version__)
        if installed < required:
            raise ValueError(
                "Encoder models with the Transformers backend require "
                f"transformers>={required}, but got {installed}")

    return super().create_attention_instances(attn_type)
|
||||
|
||||
|
||||
@support_torch_compile(enable_if=can_enable_torch_compile)
|
||||
class TransformersForCausalLM(TransformersBase):
|
||||
@ -710,7 +750,7 @@ class TransformersForCausalLM(TransformersBase):
|
||||
# Tell `TransformersBase.load_weights` to skip
|
||||
# `lm_head` if the model has tied word embeddings
|
||||
if self.text_config.tie_word_embeddings:
|
||||
self.skip_prefixes = ["lm_head."]
|
||||
self.skip_prefixes.append("lm_head.")
|
||||
|
||||
if get_pp_group().is_last_rank:
|
||||
self.unpadded_vocab_size = self.text_config.vocab_size
|
||||
|
||||
@ -13,9 +13,7 @@ from transformers import BatchFeature, ProcessorMixin
|
||||
from transformers.models.whisper import WhisperFeatureExtractor
|
||||
from transformers.models.whisper.modeling_whisper import WhisperEncoder
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.model_loader import DefaultModelLoader
|
||||
@ -37,8 +35,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||
SupportsMultiModal, SupportsPP)
|
||||
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
|
||||
init_vllm_registered_model, maybe_prefix,
|
||||
merge_multimodal_embeddings,
|
||||
merge_multimodal_embeddings_from_map)
|
||||
merge_multimodal_embeddings)
|
||||
|
||||
_AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>"
|
||||
_MAX_ENCODER_BATCH_SIZE = 16
|
||||
@ -568,17 +565,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
|
||||
safe_input_ids)
|
||||
if multimodal_embeddings is not None and len(
|
||||
multimodal_embeddings) > 0:
|
||||
|
||||
# TODO(ywang96): remove this block after v0 is deprecated.
|
||||
if not envs.VLLM_USE_V1:
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
merge_multimodal_embeddings_from_map(
|
||||
inputs_embeds, multimodal_embeddings,
|
||||
attn_metadata.multi_modal_placeholder_index_maps["audio"])
|
||||
else:
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids, inputs_embeds, multimodal_embeddings,
|
||||
self.config.audio_token_index)
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids, inputs_embeds, multimodal_embeddings,
|
||||
self.config.audio_token_index)
|
||||
return inputs_embeds
|
||||
|
||||
def forward(self,
|
||||
|
||||
@ -15,7 +15,7 @@ import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors
|
||||
from vllm.multimodal import NestedTensors
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import (get_cuda_view_from_cpu_tensor, is_pin_memory_available,
|
||||
is_uva_available)
|
||||
@ -389,22 +389,6 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
|
||||
_embedding_count_expression(inner) for inner in embeddings)
|
||||
|
||||
|
||||
def merge_multimodal_embeddings_from_map(
|
||||
inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors,
|
||||
placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor:
|
||||
"""
|
||||
Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided
|
||||
placeholder map .
|
||||
|
||||
Note:
|
||||
This updates ``inputs_embeds`` in place.
|
||||
"""
|
||||
flattened_embeddings = _flatten_embeddings(multimodal_embeddings)
|
||||
inputs_embeds[placeholder_map.dest] = flattened_embeddings[
|
||||
placeholder_map.src].to(dtype=inputs_embeds.dtype)
|
||||
return inputs_embeds
|
||||
|
||||
|
||||
def _merge_multimodal_embeddings(
|
||||
inputs_embeds: torch.Tensor,
|
||||
is_multimodal: torch.Tensor,
|
||||
|
||||
@ -494,7 +494,8 @@ def _enable_processor_cache(
|
||||
|
||||
def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool:
|
||||
parallel_config = vllm_config.parallel_config
|
||||
supports_ipc_cache = (parallel_config.data_parallel_size == 1
|
||||
supports_ipc_cache = ((parallel_config._api_process_count == 1
|
||||
and parallel_config.data_parallel_size == 1)
|
||||
or parallel_config.data_parallel_external_lb)
|
||||
|
||||
return supports_ipc_cache
|
||||
|
||||
@ -127,14 +127,23 @@ class PrefixCachingMetrics:
|
||||
if stats.reset:
|
||||
self.reset()
|
||||
|
||||
# Do NOT append empty stats, so that useful info does not get evicted
|
||||
# by the sliding window.
|
||||
if stats.requests == 0:
|
||||
return
|
||||
|
||||
# Update the metrics.
|
||||
self.query_queue.append((stats.requests, stats.queries, stats.hits))
|
||||
self.aggregated_requests += stats.requests
|
||||
self.aggregated_query_total += stats.queries
|
||||
self.aggregated_query_hit += stats.hits
|
||||
|
||||
# Remove the oldest stats if the number of requests exceeds.
|
||||
if self.aggregated_requests > self.max_recent_requests:
|
||||
# Remove the oldest stats until number of requests does not exceed
|
||||
# the limit.
|
||||
# NOTE: We preserve the latest added stats regardless.
|
||||
while len(
|
||||
self.query_queue
|
||||
) > 1 and self.aggregated_requests > self.max_recent_requests:
|
||||
old_requests, old_queries, old_hits = self.query_queue.popleft()
|
||||
self.aggregated_requests -= old_requests
|
||||
self.aggregated_query_total -= old_queries
|
||||
|
||||
@ -15,6 +15,8 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
|
||||
KVConnectorFactory)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
|
||||
KVConnectorRole)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
|
||||
@ -576,8 +578,10 @@ class Scheduler(SchedulerInterface):
|
||||
scheduled_spec_decode_tokens,
|
||||
req_to_new_blocks,
|
||||
)
|
||||
scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
|
||||
scheduled_resumed_reqs)
|
||||
structured_output_request_ids, grammar_bitmask = (
|
||||
self.get_grammar_bitmask(self.running,
|
||||
self.get_grammar_bitmask(scheduled_requests,
|
||||
scheduled_spec_decode_tokens))
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=new_reqs_data,
|
||||
@ -870,9 +874,12 @@ class Scheduler(SchedulerInterface):
|
||||
num_scheduled_tokens = scheduler_output.num_scheduled_tokens
|
||||
pooler_outputs = model_runner_output.pooler_output
|
||||
num_nans_in_logits = model_runner_output.num_nans_in_logits
|
||||
kv_connector_output = model_runner_output.kv_connector_output
|
||||
|
||||
outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
|
||||
spec_decoding_stats: Optional[SpecDecodingStats] = None
|
||||
kv_connector_stats = (kv_connector_output.kv_connector_stats
|
||||
if kv_connector_output else None)
|
||||
|
||||
# NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more,
|
||||
# the below loop can be a performance bottleneck. We should do our best
|
||||
@ -1013,7 +1020,8 @@ class Scheduler(SchedulerInterface):
|
||||
finished_requests=finished_set)
|
||||
finished_req_ids.clear()
|
||||
|
||||
if (stats := self.make_stats(spec_decoding_stats)) is not None:
|
||||
if (stats := self.make_stats(spec_decoding_stats,
|
||||
kv_connector_stats)) is not None:
|
||||
# Return stats to only one of the front-ends.
|
||||
if (eco := next(iter(engine_core_outputs.values()), None)) is None:
|
||||
# We must return the stats even if there are no request
|
||||
@ -1178,20 +1186,21 @@ class Scheduler(SchedulerInterface):
|
||||
def make_stats(
|
||||
self,
|
||||
spec_decoding_stats: Optional[SpecDecodingStats] = None,
|
||||
kv_connector_stats: Optional[KVConnectorStats] = None,
|
||||
) -> Optional[SchedulerStats]:
|
||||
if not self.log_stats:
|
||||
return None
|
||||
prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
|
||||
assert prefix_cache_stats is not None
|
||||
return SchedulerStats(
|
||||
num_running_reqs=len(self.running),
|
||||
num_waiting_reqs=len(self.waiting),
|
||||
kv_cache_usage=self.kv_cache_manager.usage,
|
||||
prefix_cache_stats=prefix_cache_stats,
|
||||
spec_decoding_stats=spec_decoding_stats,
|
||||
num_corrupted_reqs=sum(req.is_output_corrupted
|
||||
for req in self.running),
|
||||
)
|
||||
return SchedulerStats(num_running_reqs=len(self.running),
|
||||
num_waiting_reqs=len(self.waiting),
|
||||
kv_cache_usage=self.kv_cache_manager.usage,
|
||||
prefix_cache_stats=prefix_cache_stats,
|
||||
spec_decoding_stats=spec_decoding_stats,
|
||||
num_corrupted_reqs=sum(req.is_output_corrupted
|
||||
for req in self.running),
|
||||
kv_connector_stats=kv_connector_stats.data
|
||||
if kv_connector_stats else None)
|
||||
|
||||
def make_spec_decoding_stats(
|
||||
self,
|
||||
|
||||
@ -437,7 +437,7 @@ class MPClient(EngineCoreClient):
|
||||
self.engines_running = False
|
||||
|
||||
self.stats_update_address: Optional[str] = None
|
||||
if client_addresses is not None:
|
||||
if client_addresses:
|
||||
# Engines are managed externally to this client.
|
||||
input_address = client_addresses["input_address"]
|
||||
output_address = client_addresses["output_address"]
|
||||
@ -774,6 +774,7 @@ class AsyncMPClient(MPClient):
|
||||
client_addresses=client_addresses,
|
||||
)
|
||||
|
||||
self.client_count = client_count
|
||||
self.client_index = client_index
|
||||
self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs,
|
||||
Exception]]()
|
||||
|
||||
171
vllm/v1/kv_offload/worker/cpu_gpu.py
Normal file
171
vllm/v1/kv_offload/worker/cpu_gpu.py
Normal file
@ -0,0 +1,171 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.attention import AttentionBackend
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
|
||||
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
|
||||
TransferResult, TransferSpec)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def expand_block_ids(block_ids: np.ndarray,
                     block_size_factor: int,
                     output: np.ndarray,
                     skip_count: int = 0):
    """
    Expand block IDs into the IDs of the sub-blocks they are made of.

    Every entry ``b`` of ``block_ids`` stands for ``block_size_factor``
    consecutive sub-blocks, ``b * block_size_factor`` up to
    ``(b + 1) * block_size_factor - 1``. The expanded IDs are written into
    ``output`` starting at index 0; nothing is returned.

    ``skip_count`` drops that many leading sub-blocks of the FIRST block
    only, and must be less than ``block_size_factor``.

    For example, block_ids = [0, 1, 3] with block_size_factor = 4 writes
    [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]:
        0 -> [0, 1, 2, 3]
        1 -> [4, 5, 6, 7]
        3 -> [12, 13, 14, 15]
    With block_ids = [2, 5], block_size_factor = 4 and skip_count = 2 the
    result is [10, 11, 20, 21, 22, 23].
    """
    assert skip_count < block_size_factor

    # Offsets used for the first block (possibly truncated) ...
    offsets = np.arange(skip_count, block_size_factor)
    # ... and for every block after it.
    all_offsets = np.arange(0, block_size_factor)

    write_pos = 0
    for block_id in block_ids:
        next_pos = write_pos + len(offsets)
        output[write_pos:next_pos] = block_id * block_size_factor + offsets
        write_pos = next_pos
        # Only the first block is affected by skip_count.
        offsets = all_offsets
|
||||
|
||||
|
||||
class CpuGpuOffloadingHandler(OffloadingHandler):
    """Copies KV-cache blocks between GPU cache tensors and CPU tensors.

    One CPU tensor is allocated per GPU cache tensor at construction time.
    Transfers are issued with `ops.swap_blocks` on a dedicated CUDA stream
    per direction, and a CUDA event recorded after the copies is polled in
    `get_finished` to detect completion.

    A CPU block is `block_size_factor` times the size of a GPU block, so
    block IDs on both sides are expanded to GPU-block-sized sub-blocks
    before being paired up.
    """

    def __init__(self, gpu_block_size: int, cpu_block_size: int,
                 num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor],
                 attn_backends: dict[str, type[AttentionBackend]]):
        """Set up streams, event bookkeeping and the CPU-side tensors.

        Args:
            gpu_block_size: block size of the GPU cache; must divide
                cpu_block_size.
            cpu_block_size: block size of the CPU cache.
            num_cpu_blocks: number of CPU blocks to allocate room for.
            gpu_caches: layer name -> GPU KV-cache tensor.
            attn_backends: layer name -> attention backend class, queried
                for its KV-cache shape layout.
        """
        assert cpu_block_size % gpu_block_size == 0
        self.block_size_factor = cpu_block_size // gpu_block_size

        # One CUDA stream per direction: gpu->cpu and cpu->gpu.
        self.d2h_stream = torch.cuda.Stream()
        self.h2d_stream = torch.cuda.Stream()

        # In-flight transfers: job_id -> CUDA event recorded after the copies.
        self.transfer_events: dict[int, torch.cuda.Event] = {}
        # Events of finished transfers, kept for re-use.
        self.events_pool: list[torch.cuda.Event] = []

        pin_memory = is_pin_memory_available()

        # Allocate the CPU tensors, one per GPU cache tensor.
        logger.info("Allocating %d CPU tensors...", len(gpu_caches))
        self.gpu_tensors: list[torch.Tensor] = []
        self.cpu_tensors: list[torch.Tensor] = []
        self.kv_dim_before_num_blocks: list[bool] = []
        for layer_name, gpu_tensor in gpu_caches.items():
            self.gpu_tensors.append(gpu_tensor)
            gpu_shape = gpu_tensor.shape

            # Ask the backend for a shape using a recognisable block count
            # (1234) to find out which axis holds the number of blocks.
            probe_shape = attn_backends[layer_name].get_kv_cache_shape(
                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256)
            kv_dim_first = not (probe_shape[0] == 1234)
            if kv_dim_first:
                # Layout should be (2, num_blocks, ...).
                assert probe_shape[0] == 2
                assert probe_shape[1] == 1234
                assert gpu_shape[0] == 2
            # Otherwise the layout is (num_blocks, ...).
            self.kv_dim_before_num_blocks.append(kv_dim_first)
            num_blocks_idx = 1 if kv_dim_first else 0

            # Same shape as the GPU tensor, except along the blocks axis.
            cpu_shape = list(gpu_shape)
            cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor

            logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
            cpu_tensor = torch.zeros(cpu_shape,
                                     dtype=gpu_tensor.dtype,
                                     device="cpu",
                                     pin_memory=pin_memory)
            self.cpu_tensors.append(cpu_tensor)

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        """Issue the block copies for `spec` and register them under `job_id`.

        `spec` is a (source, destination) pair: either CPU -> GPU or
        GPU -> CPU. The copies are enqueued on the stream of the matching
        direction and an event is recorded behind them. Always returns True.
        """
        src_spec, dst_spec = spec
        if isinstance(src_spec, CPULoadStoreSpec):
            # CPU -> GPU
            assert isinstance(dst_spec, GPULoadStoreSpec)
            stream = self.h2d_stream
            src_tensors, dst_tensors = self.cpu_tensors, self.gpu_tensors
            src_factor, dst_factor = self.block_size_factor, 1
        else:
            # GPU -> CPU
            assert isinstance(src_spec, GPULoadStoreSpec)
            assert isinstance(dst_spec, CPULoadStoreSpec)
            stream = self.d2h_stream
            src_tensors, dst_tensors = self.gpu_tensors, self.cpu_tensors
            src_factor, dst_factor = 1, self.block_size_factor

        src_blocks = src_spec.block_ids
        dst_blocks = dst_spec.block_ids
        assert src_blocks.ndim == 1
        assert dst_blocks.ndim == 1

        # Number of leading sub-blocks of the first destination block that
        # receive no data, so that both sides expand to the same count.
        skipped_dst_sub_blocks = (-src_blocks.size % dst_factor)
        num_src_sub_blocks = src_blocks.size * src_factor

        assert (num_src_sub_blocks == dst_blocks.size * dst_factor -
                skipped_dst_sub_blocks)

        # Column 0: source sub-block IDs, column 1: destination sub-block IDs.
        mapping = np.empty((num_src_sub_blocks, 2), dtype=np.int64)
        expand_block_ids(src_blocks, src_factor, mapping[:, 0])
        expand_block_ids(dst_blocks,
                         dst_factor,
                         mapping[:, 1],
                         skip_count=skipped_dst_sub_blocks)
        mapping_tensor = torch.from_numpy(mapping)

        # Re-use a pooled event when one is available.
        if self.events_pool:
            event = self.events_pool.pop()
        else:
            event = torch.cuda.Event()

        with torch.cuda.stream(stream):
            for src_tensor, dst_tensor, kv_dim_first in zip(
                    src_tensors, dst_tensors, self.kv_dim_before_num_blocks):
                if not kv_dim_first:
                    ops.swap_blocks(src_tensor, dst_tensor, mapping_tensor)
                    continue
                # Leading axis of size 2: index 0 (key cache) first,
                # then index 1 (value cache).
                for kv_idx in (0, 1):
                    ops.swap_blocks(src_tensor[kv_idx], dst_tensor[kv_idx],
                                    mapping_tensor)
            event.record(stream)

        self.transfer_events[job_id] = event

        # success
        return True

    def get_finished(self) -> list[TransferResult]:
        """Return `(job_id, True)` for every transfer whose event has completed.

        Completed events go back to the pool and their jobs are dropped from
        the in-flight table.
        """
        results: list[TransferResult] = []
        for job_id, event in self.transfer_events.items():
            if not event.query():
                continue
            results.append((job_id, True))
            self.events_pool.append(event)
        # Delete after the scan: the dict must not change while iterating it.
        for finished_id, _ in results:
            del self.transfer_events[finished_id]
        return results
|
||||
@ -9,6 +9,8 @@ from typing import Callable, Optional, Union
|
||||
import prometheus_client
|
||||
|
||||
from vllm.config import SupportsMetricsInfo, VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorLogging)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
|
||||
from vllm.v1.engine import FinishReason
|
||||
@ -59,6 +61,8 @@ class LoggingStatLogger(StatLoggerBase):
|
||||
# TODO: Make the interval configurable.
|
||||
self.prefix_caching_metrics = PrefixCachingMetrics()
|
||||
self.spec_decoding_logging = SpecDecodingLogging()
|
||||
kv_tranfer_config = self.vllm_config.kv_transfer_config
|
||||
self.kv_transfer_logging = KVConnectorLogging(kv_tranfer_config)
|
||||
self.last_prompt_throughput: float = 0.0
|
||||
self.last_generation_throughput: float = 0.0
|
||||
|
||||
@ -97,7 +101,8 @@ class LoggingStatLogger(StatLoggerBase):
|
||||
if scheduler_stats.spec_decoding_stats is not None:
|
||||
self.spec_decoding_logging.observe(
|
||||
scheduler_stats.spec_decoding_stats)
|
||||
|
||||
if kv_connector_stats := scheduler_stats.kv_connector_stats:
|
||||
self.kv_transfer_logging.observe(kv_connector_stats)
|
||||
self.last_scheduler_stats = scheduler_stats
|
||||
|
||||
def log(self):
|
||||
@ -136,6 +141,7 @@ class LoggingStatLogger(StatLoggerBase):
|
||||
self.prefix_caching_metrics.hit_rate * 100,
|
||||
)
|
||||
self.spec_decoding_logging.log(log_fn=log_fn)
|
||||
self.kv_transfer_logging.log(log_fn=log_fn)
|
||||
|
||||
def log_engine_initialized(self):
|
||||
if self.vllm_config.cache_config.num_gpu_blocks:
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from vllm.v1.spec_decode.metrics import SpecDecodingStats
|
||||
|
||||
@ -43,6 +43,7 @@ class SchedulerStats:
|
||||
default_factory=PrefixCacheStats)
|
||||
|
||||
spec_decoding_stats: Optional[SpecDecodingStats] = None
|
||||
kv_connector_stats: Optional[dict[str, Any]] = None
|
||||
|
||||
num_corrupted_reqs: int = 0
|
||||
|
||||
|
||||
@ -3,11 +3,15 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import NamedTuple, Optional
|
||||
from typing import TYPE_CHECKING, NamedTuple, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
|
||||
|
||||
class LogprobsLists(NamedTuple):
|
||||
|
||||
@ -78,6 +82,11 @@ class KVConnectorOutput:
|
||||
# [req_ids]
|
||||
finished_sending: Optional[set[str]] = None
|
||||
finished_recving: Optional[set[str]] = None
|
||||
kv_connector_stats: Optional["KVConnectorStats"] = None
|
||||
|
||||
def is_empty(self):
    """Return True when `finished_sending`, `finished_recving` and
    `kv_connector_stats` are all falsy (None or empty)."""
    has_payload = (self.finished_sending or self.finished_recving
                   or self.kv_connector_stats)
    return not has_payload
|
||||
|
||||
|
||||
# ModelRunnerOutput is serialized and sent to the scheduler process.
|
||||
|
||||
@ -29,15 +29,12 @@ class TopKTopPSampler(nn.Module):
|
||||
Implementations may update the logits tensor in-place.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None:
|
||||
def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None:
|
||||
super().__init__()
|
||||
self.logprobs_mode = logprobs_mode
|
||||
# flashinfer optimization does not apply if intermediate
|
||||
# logprobs/logits after top_k/top_p need to be returned
|
||||
if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS,
|
||||
LogprobsMode.PROCESSED_LOGPROBS
|
||||
if logprobs_mode not in ("processed_logits", "processed_logprobs"
|
||||
) and current_platform.is_cuda():
|
||||
if is_flashinfer_available:
|
||||
flashinfer_version = flashinfer.__version__
|
||||
@ -90,9 +87,9 @@ class TopKTopPSampler(nn.Module):
|
||||
"""
|
||||
logits = self.apply_top_k_top_p(logits, k, p)
|
||||
logits_to_return = None
|
||||
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
|
||||
if self.logprobs_mode == "processed_logits":
|
||||
logits_to_return = logits
|
||||
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
|
||||
elif self.logprobs_mode == "processed_logprobs":
|
||||
logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
|
||||
probs = logits.softmax(dim=-1, dtype=torch.float32)
|
||||
return random_sample(probs, generators), logits_to_return
|
||||
@ -115,7 +112,7 @@ class TopKTopPSampler(nn.Module):
|
||||
"PyTorch-native implementation.")
|
||||
return self.forward_native(logits, generators, k, p)
|
||||
assert self.logprobs_mode not in (
|
||||
LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS
|
||||
"processed_logits", "processed_logprobs"
|
||||
), "FlashInfer does not support returning logits/logprobs"
|
||||
# flashinfer sampling functions expect contiguous logits.
|
||||
# In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
|
||||
|
||||
@ -60,8 +60,7 @@ class Sampler(nn.Module):
|
||||
9. Return the final `SamplerOutput`.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS):
|
||||
def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"):
|
||||
super().__init__()
|
||||
self.topk_topp_sampler = TopKTopPSampler(logprobs_mode)
|
||||
self.pin_memory = is_pin_memory_available()
|
||||
@ -78,9 +77,9 @@ class Sampler(nn.Module):
|
||||
# is used for sampling (after penalties and temperature scaling).
|
||||
num_logprobs = sampling_metadata.max_num_logprobs
|
||||
if num_logprobs is not None:
|
||||
if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS:
|
||||
if self.logprobs_mode == "raw_logprobs":
|
||||
raw_logprobs = self.compute_logprobs(logits)
|
||||
elif self.logprobs_mode == LogprobsMode.RAW_LOGITS:
|
||||
elif self.logprobs_mode == "raw_logits":
|
||||
raw_logprobs = logits.clone()
|
||||
|
||||
# Use float32 for the logits.
|
||||
@ -156,9 +155,9 @@ class Sampler(nn.Module):
|
||||
if sampling_metadata.all_greedy:
|
||||
processed_logprobs = None
|
||||
if sampling_metadata.max_num_logprobs is not None:
|
||||
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
|
||||
if self.logprobs_mode == "processed_logits":
|
||||
processed_logprobs = logits
|
||||
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
|
||||
elif self.logprobs_mode == "processed_logprobs":
|
||||
processed_logprobs = self.compute_logprobs(logits)
|
||||
return greedy_sampled, processed_logprobs
|
||||
|
||||
|
||||
@ -90,13 +90,14 @@ def apply_grammar_bitmask(
|
||||
seq = sorted(scheduler_output.structured_output_request_ids.items(),
|
||||
key=lambda x: x[1])
|
||||
for req_id, _ in seq:
|
||||
logit_index = struct_out_req_batch_indices[req_id]
|
||||
num_spec_tokens = len(
|
||||
scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
|
||||
for i in range(1 + num_spec_tokens):
|
||||
sorted_bitmask[logit_index + i] = \
|
||||
grammar_bitmask[cumulative_index + i]
|
||||
out_indices.append(logit_index + i)
|
||||
if req_id in struct_out_req_batch_indices:
|
||||
logit_index = struct_out_req_batch_indices[req_id]
|
||||
for i in range(1 + num_spec_tokens):
|
||||
sorted_bitmask[logit_index + i] = \
|
||||
grammar_bitmask[cumulative_index + i]
|
||||
out_indices.append(logit_index + i)
|
||||
cumulative_index += 1 + num_spec_tokens
|
||||
grammar_bitmask = sorted_bitmask
|
||||
|
||||
|
||||
@ -107,9 +107,8 @@ class CPUModelRunner(GPUModelRunner):
|
||||
self.model = get_model(vllm_config=self.vllm_config)
|
||||
|
||||
if self.lora_config:
|
||||
self.model = self.load_lora_model(self.model, self.model_config,
|
||||
self.scheduler_config,
|
||||
self.lora_config, self.device)
|
||||
self.model = self.load_lora_model(self.model, self.vllm_config,
|
||||
self.device)
|
||||
|
||||
def get_model(self) -> nn.Module:
|
||||
return self.model
|
||||
|
||||
@ -2552,10 +2552,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.model = model_loader.load_model(
|
||||
vllm_config=self.vllm_config, model_config=self.model_config)
|
||||
if self.lora_config:
|
||||
self.model = self.load_lora_model(self.model,
|
||||
self.model_config,
|
||||
self.scheduler_config,
|
||||
self.lora_config,
|
||||
self.model = self.load_lora_model(self.model, self.vllm_config,
|
||||
self.device)
|
||||
if hasattr(self, "drafter"):
|
||||
logger.info("Loading drafter model...")
|
||||
|
||||
@ -13,6 +13,8 @@ from vllm.distributed.kv_transfer import (ensure_kv_transfer_shutdown,
|
||||
get_kv_transfer_group,
|
||||
has_kv_transfer_group)
|
||||
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
from vllm.forward_context import get_forward_context, set_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput,
|
||||
@ -119,4 +121,11 @@ class KVConnectorModelRunnerMixin:
|
||||
output.finished_sending, output.finished_recving = (
|
||||
kv_connector.get_finished(scheduler_output.finished_req_ids))
|
||||
|
||||
kv_connector.clear_connector_metadata()
|
||||
output.kv_connector_stats = KVConnectorModelRunnerMixin.\
|
||||
get_kv_connector_stats()
|
||||
|
||||
@staticmethod
|
||||
def get_kv_connector_stats() -> Optional[KVConnectorStats]:
|
||||
if has_kv_transfer_group():
|
||||
return get_kv_transfer_group().get_kv_connector_stats()
|
||||
return None
|
||||
|
||||
@ -11,7 +11,7 @@ import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import ModelConfig, SchedulerConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.layers import LoRAMapping
|
||||
@ -31,9 +31,7 @@ class LoRAModelRunnerMixin:
|
||||
|
||||
LORA_WARMUP_RANK = 8
|
||||
|
||||
def load_lora_model(self, model: nn.Module, model_config: ModelConfig,
|
||||
scheduler_config: SchedulerConfig,
|
||||
lora_config: LoRAConfig,
|
||||
def load_lora_model(self, model: nn.Module, vllm_config: VllmConfig,
|
||||
device: torch.device) -> nn.Module:
|
||||
|
||||
if not supports_lora(model):
|
||||
@ -44,19 +42,12 @@ class LoRAModelRunnerMixin:
|
||||
logger.warning("Regarding multimodal models, vLLM currently "
|
||||
"only supports adding LoRA to language model.")
|
||||
|
||||
# Use get_text_config() in case of multimodal models
|
||||
text_config = model_config.hf_config.get_text_config()
|
||||
|
||||
# Add LoRA Manager to the Model Runner
|
||||
self.lora_manager = LRUCacheWorkerLoRAManager(
|
||||
scheduler_config.max_num_seqs,
|
||||
scheduler_config.max_num_batched_tokens,
|
||||
model_config.get_vocab_size(),
|
||||
lora_config,
|
||||
vllm_config,
|
||||
device,
|
||||
model.embedding_modules,
|
||||
model.embedding_padding_modules,
|
||||
max_position_embeddings=text_config.max_position_embeddings,
|
||||
)
|
||||
return self.lora_manager.create_lora_manager(model)
|
||||
|
||||
|
||||
@ -1178,9 +1178,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
"or sharding the weights on more chips. "
|
||||
f"See the detailed error: {e}") from e
|
||||
if self.lora_config is not None:
|
||||
model = self.load_lora_model(model, self.model_config,
|
||||
self.scheduler_config,
|
||||
self.lora_config, self.device)
|
||||
model = self.load_lora_model(model, self.vllm_config, self.device)
|
||||
replace_set_lora(model)
|
||||
|
||||
# Sync all pending XLA execution during model initialization and weight
|
||||
|
||||
@ -1078,20 +1078,13 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
|
||||
"Regarding multimodal models, vLLM currently "
|
||||
"only supports adding LoRA to language model.")
|
||||
|
||||
# Use get_text_config() in case of multimodal models
|
||||
text_config = self.model_config.hf_config.get_text_config()
|
||||
|
||||
self.lora_manager = LRUCacheWorkerLoRAManager(
|
||||
self.scheduler_config.max_num_seqs,
|
||||
self.scheduler_config.max_num_batched_tokens,
|
||||
self.vocab_size,
|
||||
self.lora_config,
|
||||
self.vllm_config,
|
||||
self.device,
|
||||
self.model.embedding_modules,
|
||||
self.model.embedding_padding_modules,
|
||||
max_position_embeddings=text_config.
|
||||
max_position_embeddings,
|
||||
)
|
||||
|
||||
self.model = self.lora_manager.create_lora_manager(self.model)
|
||||
time_after_load = time.perf_counter()
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user