Merge remote-tracking branch 'origin/main' into one-pod-per-node-lb

Signed-off-by: Nick Hill <nhill@redhat.com>

# Conflicts:
#	vllm/v1/engine/core_client.py
This commit is contained in:
Nick Hill 2025-07-21 19:14:40 +01:00
commit 60ae223986
113 changed files with 2398 additions and 1061 deletions

View File

@ -6,6 +6,7 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
@ -78,17 +79,16 @@ function cpu_tests() {
# tests/quantization/test_ipex_quant.py"
# online serving
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
--endpoint /v1/completions'
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "

View File

@ -273,7 +273,7 @@ steps:
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min

View File

@ -46,7 +46,7 @@ body:
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time. While most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:

View File

@ -52,3 +52,36 @@ After branch cut, we approach finalizing the release branch with clear criteria
* Release branch specific changes (e.g. change version identifiers or CI fixes)
Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
## Manual validations
### E2E Performance Validation
Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
**Current Coverage:**
* Models: Llama3, Llama4, and Mixtral
* Hardware: NVIDIA H100 and AMD MI300x
* *Note: Coverage may change based on new model releases and hardware availability*
**Performance Validation Process:**
**Step 1: Get Access**
Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow.
**Step 2: Review Benchmark Setup**
Familiarize yourself with the benchmark configurations:
* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
**Step 3: Run the Benchmark**
Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
* **vLLM commit**: Set to the RC commit hash
**Step 4: Review Results**
Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit.
**Step 5: Performance Comparison**
Compare the current results against the previous release to verify no performance regressions have occurred. Here is an
example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms).

View File

@ -7,7 +7,7 @@
namespace {
#define MAX_SHM_RANK_NUM 8
#define PER_THREAD_SHM_BUFFER_BYTES (2 * 1024 * 1024)
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0);
#define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1)
#define MIN_THREAD_PROCESS_SIZE (256)
@ -34,9 +34,10 @@ struct KernelVecType<c10::Half> {
};
struct ThreadSHMContext {
volatile char _curr_thread_stamp;
volatile char _ready_thread_stamp;
char _padding1[6];
volatile char _curr_thread_stamp[2];
volatile char _ready_thread_stamp[2];
int local_stamp_buffer_idx;
int remote_stamp_buffer_idx;
int thread_id;
int thread_num;
int rank;
@ -45,23 +46,28 @@ struct ThreadSHMContext {
int swizzled_ranks[MAX_SHM_RANK_NUM];
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
size_t _thread_buffer_mask;
char _padding2[56];
size_t _thread_buffer_mask[2];
char _padding2[40];
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
const int group_size, void* thread_shm_ptr)
: _curr_thread_stamp(1),
_ready_thread_stamp(0),
: local_stamp_buffer_idx(0),
remote_stamp_buffer_idx(0),
thread_id(thread_id),
thread_num(thread_num),
rank(rank),
group_size(group_size),
_spinning_count(0),
_thread_buffer_mask(0) {
_spinning_count(0) {
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
TORCH_CHECK((size_t)this % 64 == 0);
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
_curr_thread_stamp[0] = 1;
_curr_thread_stamp[1] = 1;
_ready_thread_stamp[0] = 0;
_ready_thread_stamp[1] = 0;
_thread_buffer_mask[0] = 0;
_thread_buffer_mask[1] = 0;
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
shm_contexts[i] = nullptr;
thread_shm_ptrs[i] = nullptr;
@ -70,6 +76,11 @@ struct ThreadSHMContext {
set_context(rank, this, thread_shm_ptr);
}
void set_stamp_buffer_idx(int local, int remote) {
local_stamp_buffer_idx = local;
remote_stamp_buffer_idx = remote;
}
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
TORCH_CHECK(ptr);
@ -84,23 +95,27 @@ struct ThreadSHMContext {
T* get_thread_shm_ptr(int rank) {
return reinterpret_cast<T*>(
reinterpret_cast<int8_t*>(thread_shm_ptrs[rank]) +
(PER_THREAD_SHM_BUFFER_OFFSET & _thread_buffer_mask));
(PER_THREAD_SHM_BUFFER_OFFSET &
_thread_buffer_mask[local_stamp_buffer_idx]));
}
void next_buffer() { _thread_buffer_mask ^= 0xFFFFFFFFFFFFFFFF; }
void next_buffer() {
_thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
}
char get_curr_stamp() const { return _curr_thread_stamp; }
char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
char get_ready_stamp() const { return _ready_thread_stamp; }
char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
void next_stamp() {
_mm_mfence();
_curr_thread_stamp += 1;
_curr_thread_stamp[local_stamp_buffer_idx] += 1;
}
void commit_ready_stamp() {
_mm_mfence();
_ready_thread_stamp = _curr_thread_stamp;
_ready_thread_stamp[local_stamp_buffer_idx] =
_curr_thread_stamp[local_stamp_buffer_idx];
}
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
@ -117,10 +132,11 @@ struct ThreadSHMContext {
void wait_for_one(int rank, Cond&& cond) {
ThreadSHMContext* rank_ctx = shm_contexts[rank];
for (;;) {
char local_curr_stamp = get_curr_stamp();
char local_ready_stamp = get_ready_stamp();
char rank_curr_stamp = rank_ctx->get_curr_stamp();
char rank_ready_stamp = rank_ctx->get_ready_stamp();
char local_curr_stamp = get_curr_stamp(local_stamp_buffer_idx);
char local_ready_stamp = get_ready_stamp(local_stamp_buffer_idx);
char rank_curr_stamp = rank_ctx->get_curr_stamp(remote_stamp_buffer_idx);
char rank_ready_stamp =
rank_ctx->get_ready_stamp(remote_stamp_buffer_idx);
if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp,
rank_ready_stamp)) {
break;
@ -361,6 +377,15 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
}
}
}
void reset_threads_stamp_buffer_idx(ThreadSHMContext* ctx, int local,
int remote) {
int thread_num = ctx->thread_num;
for (int i = 0; i < thread_num; ++i) {
ThreadSHMContext* thread_ctx = ctx + i;
thread_ctx->set_stamp_buffer_idx(local, remote);
}
}
}; // namespace shm_cc_ops
namespace shm_cc_ops {
@ -632,6 +657,7 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst,
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
metadata->bind_tensor_list(tensor_list_with_metadata);
shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 0, 1);
shm_cc_ops::shm_cc_loop<int8_t>(
ctx, metadata->total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
@ -659,6 +685,7 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
torch::Tensor metadata_tensor =
torch::empty({sizeof(TensorListMeta)}, options);
shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 1, 0);
ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
ctx->get_thread_shm_ptr<void>(src),
@ -677,7 +704,7 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
ctx, metadata.total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num, bool fast_mode) {
ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
thread_ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
int64_t curr_shm_offset = 0;
while (curr_shm_offset < data_elem_num) {
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);

View File

@ -510,7 +510,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \
BITSANDBYTES_VERSION="0.46.1"; \
fi; \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
ENV VLLM_USAGE_SOURCE production-docker-image

View File

@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer pytest 'modelscope!=1.15.0'
pip install accelerate hf_transfer pytest modelscope
ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1

View File

@ -14,7 +14,7 @@ For example:
```python
from vllm import LLM
model = LLM(
llm = LLM(
model="cerebras/Cerebras-GPT-1.3B",
hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2
)

View File

@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../..
### Grafana Dashboard
vLLM also provides [a reference example](https://docs.vllm.ai/en/stable/examples/online_serving/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
@ -672,8 +672,7 @@ v0 has support for OpenTelemetry tracing:
`--collect-detailed-traces`
- [OpenTelemetry blog
post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing
docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html)
- [User-facing docs](../../examples/online_serving/opentelemetry.md)
- [Blog
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
- [IBM product

View File

@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
return tokenizer.apply_chat_template(chat, tokenize=False)
model = LLM(
llm = LLM(
model=model_id,
enable_lora=True,
max_lora_rank=64,
@ -329,7 +329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
}
outputs = model.generate(
outputs = llm.generate(
inputs,
sampling_params=SamplingParams(
temperature=0.2,

View File

@ -98,7 +98,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
```python
from vllm import LLM

View File

@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
!!! note
Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html).
For details see [supported hardware](supported_hardware.md).
Below are the steps to utilize BitBLAS with vLLM.

View File

@ -86,8 +86,9 @@ Load and run the model in `vllm`:
```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
result = model.generate("Hello my name is")
llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
result = llm.generate("Hello my name is")
print(result[0].outputs[0].text)
```
@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei
```python
from vllm import LLM
model = LLM("facebook/opt-125m", quantization="fp8")
llm = LLM("facebook/opt-125m", quantization="fp8")
# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
result = model.generate("Hello, my name is")
result = llm.generate("Hello, my name is")
print(result[0].outputs[0].text)
```

View File

@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM:
```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
```
To evaluate accuracy, you can use `lm_eval`:

View File

@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM:
```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
```
To evaluate accuracy, you can use `lm_eval`:

View File

@ -95,7 +95,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha
## Required Function Calling
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/usage/v1_guide.html#feature-model) for the V1 engine.
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.

View File

@ -166,6 +166,20 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
- This value is 4GB by default. Larger space can support more concurrent requests, longer context length. However, users should take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, the TP worker will be killed with `exitcode 9` due to out-of-memory.
### How to do performance tuning for vLLM CPU?
- First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`.
- Inference batch size is an important parameter for performance. A larger batch usually provides higher throughput, while a smaller batch provides lower latency. Tuning the max batch size, starting from the default value, to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM:
- `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impacts on the first token performance. The default value is set as:
- Offline Inference: `4096 * world_size`
- Online Serving: `2048 * world_size`
- `--max-num-seqs`, defines the limit of sequence numbers in a single batch, has more impacts on the output token performance.
- Offline Inference: `256 * world_size`
- Online Serving: `128 * world_size`
- vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details on tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use TP and PP together if there are enough CPU sockets and memory nodes.
### Which quantization configs does vLLM CPU support?
- vLLM CPU supports quantizations:

View File

@ -7,7 +7,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
For more information on CoreWeave's Tensorizer, please refer to
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html).
the [vLLM example script](../../examples/others/tensorize_vllm_model.md).
!!! note
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.

View File

@ -11,26 +11,51 @@ before returning them.
As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to
pooling models as they only work on the generation or decode stage, so performance may not improve as much.
For pooling models, we support the following `--task` options.
The selected option sets the default pooler used to extract the final hidden states:
If the model doesn't implement this interface, you can set `--task` which tells vLLM
to convert the model into a pooling model.
| Task | Pooling Type | Normalization | Softmax |
|---------------------------------|----------------|-----------------|-----------|
| Embedding (`embed`) | `LAST` | ✅︎ | ❌ |
| Classification (`classify`) | `LAST` | ❌ | ✅︎ |
| Sentence Pair Scoring (`score`) | \* | \* | \* |
| `--task` | Model type | Supported pooling tasks |
|------------|----------------------|-------------------------------|
| `embed` | Embedding model | `encode`, `embed` |
| `classify` | Classification model | `encode`, `classify`, `score` |
| `reward` | Reward model | `encode` |
\*The default pooler is always defined by the model.
## Pooling Tasks
!!! note
If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
In vLLM, we define the following pooling tasks and corresponding APIs:
| Task | APIs |
|------------|--------------------|
| `encode` | `encode` |
| `embed` | `embed`, `score`\* |
| `classify` | `classify` |
| `score` | `score` |
\*The `score` API falls back to `embed` task if the model does not support `score` task.
Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks].
By default, the pooler assigned to each task has the following attributes:
| Task | Pooling Type | Normalization | Softmax |
|------------|----------------|---------------|---------|
| `encode` | `ALL` | ❌ | ❌ |
| `embed` | `LAST` | ✅︎ | ❌ |
| `classify` | `LAST` | ❌ | ✅︎ |
These defaults may be overridden by the model's implementation in vLLM.
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
we attempt to override the defaults based on its Sentence Transformers configuration file (`modules.json`),
which takes priority over the model's defaults.
!!! tip
You can customize the model's pooling method via the `--override-pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults.
You can further customize this via the `--override-pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults.
!!! note
The above configuration may be disregarded if the model's implementation in vLLM defines its own pooler
that is not based on [PoolerConfig][vllm.config.PoolerConfig].
## Offline Inference
@ -149,11 +174,11 @@ You can change the output dimensions of embedding models that support Matryoshka
```python
from vllm import LLM, PoolingParams
model = LLM(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
outputs = model.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32))
llm = LLM(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
outputs = llm.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32))
print(outputs[0].outputs)
```

View File

@ -314,6 +314,13 @@ See [this page](generative_models.md) for more information on how to use generat
Specified using `--task generate`.
<style>
th {
white-space: nowrap;
min-width: 0 !important;
}
</style>
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |

View File

@ -28,10 +28,10 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(**vars(args))
llm = LLM(**vars(args))
# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)
outputs = llm.classify(prompts)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)

View File

@ -31,10 +31,10 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)
outputs = llm.embed(prompts)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)

View File

@ -27,10 +27,10 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(**vars(args))
llm = LLM(**vars(args))
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)
outputs = llm.score(text_1, texts_2)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)

View File

@ -30,11 +30,11 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
# Only text matching task is supported for now. See #16120
outputs = model.embed(prompts)
outputs = llm.embed(prompts)
# Print the outputs.
print("\nGenerated Outputs:")

View File

@ -30,10 +30,10 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))
# Print the outputs.
print("\nGenerated Outputs:")

View File

@ -25,7 +25,7 @@ def config_buckets():
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
def initialize_model():
def initialize_llm():
"""Create an LLM with speculative decoding."""
return LLM(
model="openlm-research/open_llama_7b",
@ -43,9 +43,9 @@ def initialize_model():
)
def process_requests(model: LLM, sampling_params: SamplingParams):
def process_requests(llm: LLM, sampling_params: SamplingParams):
"""Generate texts from prompts and print them."""
outputs = model.generate(prompts, sampling_params)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
@ -53,12 +53,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams):
def main():
"""Main function that sets up the model and processes prompts."""
"""Main function that sets up the llm and processes prompts."""
config_buckets()
model = initialize_model()
llm = initialize_llm()
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, top_k=1)
process_requests(model, sampling_params)
process_requests(llm, sampling_params)
if __name__ == "__main__":

View File

@ -140,7 +140,7 @@ datamodule_config = {
class PrithviMAE:
def __init__(self):
print("Initializing PrithviMAE model")
self.model = LLM(
self.llm = LLM(
model=os.path.join(os.path.dirname(__file__), "./model"),
skip_tokenizer_init=True,
dtype="float32",
@ -158,7 +158,7 @@ class PrithviMAE:
prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
outputs = self.model.encode(prompt, use_tqdm=False)
outputs = self.llm.encode(prompt, use_tqdm=False)
print("################ Inference done (it took seconds) ##############")
return outputs[0].outputs.data

View File

@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
# Models converted offline using this method can not only be more efficient
# and support the vllm score API, but also make the init parameters more
# concise, for example.
# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
# If you want to load the official original version, the init parameters are
# as follows.
def get_model() -> LLM:
def get_llm() -> LLM:
"""Initializes and returns the LLM model for Qwen3-Reranker."""
return LLM(
model=model_name,
@ -77,8 +77,8 @@ def main() -> None:
]
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
model = get_model()
outputs = model.score(queries, documents)
llm = get_llm()
outputs = llm.score(queries, documents)
print("-" * 30)
print([output.outputs.score for output in outputs])

View File

@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
if isinstance(vllm_model.model.llm_engine, LLMEngineV1):
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
v1_test_failed_model_execution(vllm_model)
def v1_test_failed_model_execution(vllm_model):
engine = vllm_model.model.llm_engine
engine = vllm_model.llm.llm_engine
mocked_execute_model = Mock(
side_effect=RuntimeError("Mocked Critical Error"))
engine.engine_core.engine_core.model_executor.execute_model =\

View File

@ -81,7 +81,7 @@ def test_chunked_prefill_recompute(
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)):
@ -118,10 +118,10 @@ def test_preemption(
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)
check_outputs_equal(
outputs_0_lst=hf_outputs,
@ -174,12 +174,12 @@ def test_preemption_infeasible(
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
req_outputs = vllm_model.model.generate(
req_outputs = vllm_model.llm.generate(
example_prompts,
sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and not hang.

View File

@ -784,7 +784,7 @@ class VllmRunner:
enforce_eager: Optional[bool] = False,
**kwargs,
) -> None:
self.model = LLM(
self.llm = LLM(
model=model_name,
task=task,
tokenizer=tokenizer_name,
@ -854,9 +854,9 @@ class VllmRunner:
videos=videos,
audios=audios)
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params,
**kwargs)
req_outputs = self.llm.generate(inputs,
sampling_params=sampling_params,
**kwargs)
outputs: list[tuple[list[list[int]], list[str]]] = []
for req_output in req_outputs:
@ -902,9 +902,9 @@ class VllmRunner:
videos=videos,
audios=audios)
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params,
**kwargs)
req_outputs = self.llm.generate(inputs,
sampling_params=sampling_params,
**kwargs)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
@ -924,8 +924,8 @@ class VllmRunner:
'''
assert sampling_params.logprobs is not None
req_outputs = self.model.generate(encoder_decoder_prompts,
sampling_params=sampling_params)
req_outputs = self.llm.generate(encoder_decoder_prompts,
sampling_params=sampling_params)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
@ -1018,7 +1018,7 @@ class VllmRunner:
videos=videos,
audios=audios)
outputs = self.model.beam_search(
outputs = self.llm.beam_search(
inputs,
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
returned_outputs = []
@ -1029,7 +1029,7 @@ class VllmRunner:
return returned_outputs
def classify(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.model.classify(prompts)
req_outputs = self.llm.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs]
def embed(self,
@ -1044,11 +1044,11 @@ class VllmRunner:
videos=videos,
audios=audios)
req_outputs = self.model.embed(inputs, *args, **kwargs)
req_outputs = self.llm.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]
def encode(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.model.encode(prompts)
req_outputs = self.llm.encode(prompts)
return [req_output.outputs.data for req_output in req_outputs]
def score(
@ -1058,18 +1058,18 @@ class VllmRunner:
*args,
**kwargs,
) -> list[float]:
req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
req_outputs = self.llm.score(text_1, text_2, *args, **kwargs)
return [req_output.outputs.score for req_output in req_outputs]
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
executor = self.model.llm_engine.model_executor
executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
del self.model
del self.llm
cleanup_dist_env_and_memory()

View File

@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
engine: LLMEngine = runner.model.llm_engine
engine: LLMEngine = runner.llm.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
# What is scheduled will run for num_scheduler_steps always.

View File

@ -28,7 +28,7 @@ def vllm_model(vllm_runner):
def test_stop_reason(vllm_model, example_prompts):
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
llm = vllm_model.model
llm = vllm_model.llm
# test stop token
outputs = llm.generate(example_prompts,

View File

@ -101,42 +101,42 @@ def _stop_token_id(llm):
def test_stop_strings():
# If V0, must set enforce_eager=False since we use
# async output processing below.
vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
if envs.VLLM_USE_V1:
_stop_basic(vllm_model)
_stop_basic(llm)
else:
_set_async_mode(vllm_model, True)
_stop_basic(vllm_model)
_set_async_mode(llm, True)
_stop_basic(llm)
_set_async_mode(vllm_model, False)
_stop_basic(vllm_model)
_set_async_mode(llm, False)
_stop_basic(llm)
if envs.VLLM_USE_V1:
_stop_multi_tokens(vllm_model)
_stop_multi_tokens(llm)
else:
_set_async_mode(vllm_model, True)
_stop_multi_tokens(vllm_model)
_set_async_mode(llm, True)
_stop_multi_tokens(llm)
_set_async_mode(vllm_model, False)
_stop_multi_tokens(vllm_model)
_set_async_mode(llm, False)
_stop_multi_tokens(llm)
if envs.VLLM_USE_V1:
_stop_partial_token(vllm_model)
_stop_partial_token(llm)
else:
_set_async_mode(vllm_model, True)
_stop_partial_token(vllm_model)
_set_async_mode(llm, True)
_stop_partial_token(llm)
_set_async_mode(vllm_model, False)
_stop_partial_token(vllm_model)
_set_async_mode(llm, False)
_stop_partial_token(llm)
if envs.VLLM_USE_V1:
# FIXME: this does not respect include_in_output=False
# _stop_token_id(vllm_model)
# _stop_token_id(llm)
pass
else:
_set_async_mode(vllm_model, True)
_stop_token_id(vllm_model)
_set_async_mode(llm, True)
_stop_token_id(llm)
_set_async_mode(vllm_model, False)
_stop_token_id(vllm_model)
_set_async_mode(llm, False)
_stop_token_id(llm)

View File

@ -77,6 +77,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
@pytest.mark.parametrize("sliding_window", [None, 64])
@torch.inference_mode
def test_flashinfer_decode_with_paged_kv(
kv_lens: list[int],
@ -85,6 +86,7 @@ def test_flashinfer_decode_with_paged_kv(
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
sliding_window: Optional[int],
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)
@ -136,17 +138,20 @@ def test_flashinfer_decode_with_paged_kv(
use_tensor_cores=(
(num_query_heads//num_kv_heads) > 4)
)
wrapper.plan(kv_indptr,
kv_indices,
kv_last_page_lens,
num_query_heads,
num_kv_heads,
head_size,
block_size,
"NONE",
q_data_type=dtype,
kv_data_type=dtype,
logits_soft_cap=soft_cap)
wrapper.plan(
kv_indptr,
kv_indices,
kv_last_page_lens,
num_query_heads,
num_kv_heads,
head_size,
block_size,
"NONE",
window_left=sliding_window - 1 if sliding_window is not None else -1,
q_data_type=dtype,
kv_data_type=dtype,
logits_soft_cap=soft_cap,
)
output = wrapper.run(query, key_value_cache)
@ -157,7 +162,8 @@ def test_flashinfer_decode_with_paged_kv(
kv_lens=kv_lens,
block_tables=block_tables,
scale=scale,
soft_cap=soft_cap)
soft_cap=soft_cap,
sliding_window=sliding_window)
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
f"{torch.max(torch.abs(output - ref_output))}"
@ -168,12 +174,17 @@ def test_flashinfer_decode_with_paged_kv(
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
@pytest.mark.parametrize("sliding_window", [None, 64])
@torch.inference_mode
def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int, dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float]) -> None:
def test_flashinfer_prefill_with_paged_kv(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
sliding_window: Optional[int],
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)
num_seqs = len(seq_lens)
@ -242,6 +253,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]],
num_kv_heads,
head_size,
block_size,
window_left=sliding_window - 1 if sliding_window is not None else -1,
q_data_type=dtype,
kv_data_type=dtype,
logits_soft_cap=soft_cap,
@ -259,7 +271,8 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]],
kv_lens=kv_lens,
block_tables=block_tables,
scale=scale,
soft_cap=soft_cap)
soft_cap=soft_cap,
sliding_window=sliding_window)
torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \
f"{torch.max(torch.abs(output - ref_output))}"

View File

@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
loaded_vllm_model = LLM(model=model_ref,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2)
loaded_llm = LLM(model=model_ref,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2)
tc_as_dict = tensorizer_config.to_serializable()
print("lora adapter created")
assert do_sample(loaded_vllm_model,
assert do_sample(loaded_llm,
sql_lora_files,
tensorizer_config_dict=tc_as_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 1")
assert do_sample(loaded_vllm_model,
assert do_sample(loaded_llm,
sql_lora_files,
tensorizer_config_dict=tc_as_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT

View File

@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens(
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
prompt_token_counts = [
len(tokenizer.encode(p)) for p in example_prompts
]
@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens(
vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get()
@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens(
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step(
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
disable_log_stats=False,
gpu_memory_utilization=0.3,
served_model_name=served_model_name) as vllm_model:
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metrics_tag_content = stat_logger.labels["model_name"]
if envs.VLLM_CI_USE_S3:

View File

@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
assert model_tokenizer.tokenizer_id == model_name
def check_model(model):

View File

@ -274,7 +274,7 @@ def test_models_preemption_recompute(
Tests that outputs are identical with and w/o preemptions (recompute).
"""
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.model.llm_engine.scheduler[0]
scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)

View File

@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
load_format="mistral") as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.model.chat([msg],
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS)
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
load_format="mistral") as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.model.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
model_output = outputs[0].outputs[0].text.strip()
@ -308,7 +308,7 @@ def test_mistral_guided_decoding(
f"Give an example JSON for an employee profile that "
f"fits this schema: {SAMPLE_JSON_SCHEMA}"
}]
outputs = vllm_model.model.chat(messages, sampling_params=params)
outputs = vllm_model.llm.chat(messages, sampling_params=params)
generated_text = outputs[0].outputs[0].text
json_response = json.loads(generated_text)

View File

@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model):
super().__init__()
self.model = vllm_model
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
# issues by randomizing the order.
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.model.embed(sentences, use_tqdm=False)
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]
outputs = self.model.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
outputs = self.llm.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores
@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
if model_info.architecture:
assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures)
in vllm_model.llm.llm_engine.model_config.architectures)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
with hf_runner(model_info.name,
is_sentence_transformer=True,
@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
max_num_seqs=8,
**vllm_extra_kwargs) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)

View File

@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner):
task="embed",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
d_rep = run_llm_encode(
llm,
@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
task="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)

View File

@ -87,10 +87,10 @@ def test_matryoshka(
task="embed",
dtype=dtype,
max_model_len=None) as vllm_model:
assert vllm_model.model.llm_engine.model_config.is_matryoshka
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = (
vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions:

View File

@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, task="embed",
max_model_len=None) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(model_info.name, task="embed",
max_model_len=256) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
# set 512 < max_model_len <= 2048
@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
else:
with vllm_runner(model_info.name, task="embed",
max_model_len=1024) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024

View File

@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode(
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids
@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode(
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids
@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
llm_output = vllm_model.model.encode(
llm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
assert llm_output == f"""truncate_prompt_tokens value

View File

@ -0,0 +1,649 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Create a reduced-layer version of the Maverick model for testing purposes.
This script creates a new model with fewer layers by:
1. Loading the original Maverick model configuration
2. Creating a reduced configuration
3. Generating compatible safetensors files with appropriate weights
4. Creating the necessary index files for vLLM compatibility
"""
import json
import shutil
from pathlib import Path
from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
GenerationConfig)
from vllm import LLM, SamplingParams
# Sample prompts used to smoke-test generation with the reduced model.
PROMPTS: list[str] = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
def run_maverick_serving(model: str):
    """Smoke-test a (reduced-layer) Maverick checkpoint by running a few
    sample generations through the vLLM ``LLM`` entry point, using options
    equivalent to the CLI invocation.
    """
    try:
        params = SamplingParams(temperature=0.8, top_p=0.95)
        engine = LLM(
            model=model,
            max_model_len=2048,
            enforce_eager=True,
            tensor_parallel_size=8,
            enable_expert_parallel=True,
            trust_remote_code=True,
            gpu_memory_utilization=0.4,
            kv_cache_dtype="fp8",
        )
        results = engine.generate(PROMPTS, params)

        # Echo every prompt/completion pair, separated by divider lines.
        print("\nGenerated Outputs:\n" + "-" * 60)
        for result in results:
            prompt = result.prompt
            generated_text = result.outputs[0].text
            print(f"Prompt: {prompt!r}")
            print(f"Output: {generated_text!r}")
            print("-" * 60)
    except Exception as e:
        # Surface the failure but keep the stack trace for the caller.
        print(f"Error initializing or running model: {e}")
        raise
def create_reduced_maverick_model(
    original_model_name:
    str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    output_dir: str = "/tmp/reduced_maverick",
    text_layers: int = 4,
    num_experts: int = 4,
    vision_layers: int = 2,
    force_recreate: bool = False,
) -> str:
    """
    Create a reduced-layer version of the Maverick model.
    Args:
        original_model_name: Name of the original Maverick model
        output_dir: Directory to save the reduced model
        text_layers: Number of text transformer layers
        num_experts: Number of experts per layer
        vision_layers: Number of vision transformer layers
        force_recreate: Whether to recreate if output_dir already exists
    Returns:
        Path to the created reduced model directory
    """
    print(
        f"Creating reduced Maverick model with {text_layers} text layers and "
        f"{vision_layers} vision layers...")
    # Create output directory
    output_path = Path(output_dir)
    if output_path.exists():
        if force_recreate:
            shutil.rmtree(output_path)
        else:
            # Reuse the existing artifact: the caller opted out of
            # regeneration, so return the existing directory unchanged.
            print(f"Output directory {output_dir} already exists. "
                  "Use --force-recreate to overwrite.")
            return str(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    try:
        print("Loading original model configuration...")
        original_config = AutoConfig.from_pretrained(original_model_name,
                                                     trust_remote_code=True)
        print("Creating reduced configuration...")
        reduced_config = create_reduced_config(original_config, text_layers,
                                               num_experts, vision_layers)
        config_path = output_path / "config.json"
        with open(config_path, "w") as f:
            json.dump(reduced_config, f, indent=2)
        print(f"Saved reduced config to {config_path}")
        print("Copying tokenizer files...")
        copy_tokenizer_files(original_model_name, output_path)
        print("Creating reduced safetensors files...")
        create_reduced_safetensors(original_config, reduced_config,
                                   output_path)
        print("Creating preprocessor config...")
        create_preprocessor_config(original_config, output_path)
        try:
            # Best-effort: the generation config is optional, so a failure
            # here is reported but does not abort model creation.
            gen_config = GenerationConfig.from_pretrained(original_model_name)
            gen_config.save_pretrained(output_path)
            print("Copied generation config")
        except Exception as e:
            print(f"Could not copy generation config: {e}")
        print(f"Successfully created reduced Maverick model at {output_path}")
        return str(output_path)
    except Exception as e:
        print(f"Error creating reduced model: {e}")
        # Clean up on failure
        if output_path.exists():
            shutil.rmtree(output_path)
        raise
def create_reduced_config(original_config: Any, text_layers: int,
                          num_experts: int,
                          vision_layers: int,
                          hidden_dim_divisor: int = 4) -> dict[str, Any]:
    """Create a reduced configuration based on the original.

    Args:
        original_config: Config object of the original model; must provide
            a ``to_dict()`` method (e.g. a HF ``PretrainedConfig``).
        text_layers: Number of text transformer layers to keep.
        num_experts: Number of MoE experts per layer to keep.
        vision_layers: Number of vision transformer layers to keep.
        hidden_dim_divisor: Factor by which ``hidden_size`` and ``head_dim``
            are shrunk. Defaults to 4, preserving the original behavior.

    Returns:
        A plain dict ready to be serialized as ``config.json``.
    """
    # Convert config to dictionary (mutated in place below).
    config_dict = original_config.to_dict()
    # Shrink the text tower: fewer layers, fewer experts, narrower dims.
    if "text_config" in config_dict:
        text_cfg = config_dict["text_config"]
        original_text_layers = text_cfg["num_hidden_layers"]
        text_cfg["num_hidden_layers"] = text_layers
        print(
            f"Reduced text layers from {original_text_layers} to {text_layers}"
        )
        original_num_experts = text_cfg["num_local_experts"]
        text_cfg["num_local_experts"] = num_experts
        print(
            f"Reduced num experts from {original_num_experts} to {num_experts}"
        )
        # hidden_size and head_dim are divided by the same factor so the
        # head count stays consistent (hidden_size == heads * head_dim
        # before and after the reduction).
        original_hidden_size = text_cfg["hidden_size"]
        new_hidden_size = original_hidden_size // hidden_dim_divisor
        text_cfg["hidden_size"] = new_hidden_size
        print(f"Reduced hidden size from {original_hidden_size} to "
              f"{new_hidden_size}")
        original_head_dim = text_cfg["head_dim"]
        new_head_dim = original_head_dim // hidden_dim_divisor
        text_cfg["head_dim"] = new_head_dim
        print(f"Reduced head dim from {original_head_dim} to {new_head_dim}")
    # Shrink the vision tower: layer count only.
    if "vision_config" in config_dict:
        original_vision_layers = config_dict["vision_config"][
            "num_hidden_layers"]
        config_dict["vision_config"]["num_hidden_layers"] = vision_layers
        print(f"Reduced vision layers from {original_vision_layers} "
              f"to {vision_layers}")
    # Update model name to indicate it's a reduced version
    config_dict["_name_or_path"] = (
        f"reduced_maverick_{text_layers}t_{vision_layers}v")
    return config_dict
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
    """Copy tokenizer files from the original model into ``output_path``.

    Any failure is downgraded to a printed warning rather than raised.
    """
    try:
        hf_tokenizer = AutoTokenizer.from_pretrained(
            original_model_name, trust_remote_code=True)
        hf_tokenizer.save_pretrained(output_path)
        print("Tokenizer files copied successfully")
    except Exception as e:
        print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any,
                               output_path: Path) -> None:
    """Create preprocessor_config.json for multimodal model.

    Loads the processor of the source checkpoint and re-saves it into
    ``output_path``. Failures are printed and then re-raised.
    """
    try:
        # Fall back to the canonical checkpoint name if the config carries
        # no _name_or_path value.
        hf_processor = AutoProcessor.from_pretrained(
            original_config._name_or_path
            or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
            trust_remote_code=True,
        )
        hf_processor.save_pretrained(output_path)
        print("Copied original preprocessor config")
    except Exception as e:
        print(f"Could not copy original preprocessor config: {e}")
        raise
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
                                                                          Any],
                               output_path: Path) -> None:
    """Create safetensors files with weights for the reduced model.

    All tensors are synthetic and sized from ``reduced_config``; saving is
    delegated to ``save_weights_to_safetensors``.
    NOTE(review): ``original_config`` is not referenced in this body.
    """
    print("Generating synthetic weights for reduced model...")
    text_config = reduced_config["text_config"]
    vision_config = reduced_config["vision_config"]
    # Accumulate every tensor into one dict, then save in a single pass.
    weights = {}
    print("Creating text model weights...")
    weights.update(create_text_model_weights(text_config))
    print("Creating vision model weights...")
    weights.update(create_vision_model_weights(vision_config))
    print("Creating shared model weights...")
    weights.update(create_shared_weights(text_config, vision_config))
    print("Saving weights to safetensors files...")
    save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(
        text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the text model with MoE structure.

    Tensor names mirror the checkpoint layout used by the strings below
    (``language_model.model.layers.<i>...``); values are random
    (``torch.randn``) or ones (``torch.ones``).
    """
    weights = {}
    vocab_size = text_config["vocab_size"]
    hidden_size = text_config["hidden_size"]
    intermediate_size = text_config["intermediate_size"]
    intermediate_size_mlp = text_config["intermediate_size_mlp"]
    num_layers = text_config["num_hidden_layers"]
    num_attention_heads = text_config["num_attention_heads"]
    num_key_value_heads = text_config.get("num_key_value_heads",
                                          num_attention_heads)
    # MoE specific parameters
    num_experts = text_config.get("num_local_experts")
    assert (num_experts
            is not None), "num_local_experts must be specified for MoE"
    # Derived from hidden_size; text_config["head_dim"] is not read here.
    # NOTE(review): confirm the two stay consistent with the reduced config.
    head_dim = hidden_size // num_attention_heads
    # Embedding layers
    # NOTE(review): embed_tokens is float16 while most tensors below are
    # bfloat16 -- confirm this dtype mix is intended.
    weights["language_model.model.embed_tokens.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.float16)
    # Transformer layers
    for layer_idx in range(num_layers):
        layer_prefix = f"language_model.model.layers.{layer_idx}"
        print(f"Creating weights for layer {layer_prefix}...")
        # Self-attention weights (separate q, k, v projections)
        # NOTE(review): k_proj is shaped (hidden, kv_heads*head_dim) while
        # v_proj is (kv_heads*head_dim, hidden); the orientations only match
        # when num_key_value_heads == num_attention_heads -- verify.
        weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
        weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
            hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
        weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
        weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
        print("Self-attention weights created.")
        # Feed-forward weights - MoE pattern based on interleave_moe_layer_step
        # For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
        # 0,2,4,... are dense
        interleave_step = text_config.get("interleave_moe_layer_step", 1)
        is_moe_layer = (interleave_step > 0
                        and (layer_idx + 1) % interleave_step == 0)
        if is_moe_layer:
            # MoE layer structure
            # 1. Router weights
            weights[
                f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
                    num_experts, hidden_size, dtype=torch.float16)
            # 2. Individual expert weights (not fused)
            for expert_idx in range(num_experts):
                expert_prefix = (
                    f"{layer_prefix}.feed_forward.experts.{expert_idx}")
                weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16)
                weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
                    intermediate_size, hidden_size, dtype=torch.bfloat16)
                weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
                    hidden_size, intermediate_size, dtype=torch.bfloat16)
                # Expert weight scales (FP8 quantization)
                weights[
                    f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
                        intermediate_size, 1, dtype=torch.bfloat16)
                weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
                    intermediate_size, 1, dtype=torch.bfloat16)
                weights[
                    f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
                        hidden_size, 1, dtype=torch.bfloat16)
            # 3. Shared expert weights
            shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
            weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16)
            weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
                intermediate_size, hidden_size, dtype=torch.bfloat16)
            weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
                hidden_size, intermediate_size, dtype=torch.bfloat16)
            print(f"MoE feed-forward weights created for layer {layer_idx}.")
        else:
            # Dense layer structure
            weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
                torch.randn(intermediate_size_mlp,
                            hidden_size,
                            dtype=torch.bfloat16))
            weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
                torch.randn(intermediate_size_mlp,
                            hidden_size,
                            dtype=torch.bfloat16))
            weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
                torch.randn(hidden_size,
                            intermediate_size_mlp,
                            dtype=torch.bfloat16))
            print(f"Dense feed-forward weights created for layer {layer_idx}.")
        # Layer norms (identity-initialized with ones)
        weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
            hidden_size, dtype=torch.bfloat16)
        weights[
            f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
                hidden_size, dtype=torch.bfloat16)
        print("Layer norms created.")
    # Final layer norm and output projection
    weights["language_model.model.norm.weight"] = torch.ones(
        hidden_size, dtype=torch.bfloat16)
    weights["language_model.lm_head.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.bfloat16)
    return weights
def create_vision_model_weights(
        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic bfloat16 weights for the vision transformer.

    For every layer: square q/k/v/o attention projections with zero biases,
    an expand/contract MLP (fc1/fc2), and two layer norms initialized to
    unit scale and zero shift.
    """
    hidden = vision_config["hidden_size"]
    inter = vision_config["intermediate_size"]
    weights: dict[str, torch.Tensor] = {}
    for idx in range(vision_config["num_hidden_layers"]):
        prefix = f"vision_model.model.layers.{idx}"
        # Attention projections: hidden x hidden weight + zero bias each.
        for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
            weights[f"{prefix}.self_attn.{proj}.weight"] = torch.randn(
                hidden, hidden, dtype=torch.bfloat16)
            weights[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16)
        # MLP: fc1 expands to the intermediate size, fc2 contracts back.
        weights[f"{prefix}.mlp.fc1.weight"] = torch.randn(
            inter, hidden, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc1.bias"] = torch.zeros(
            inter, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc2.weight"] = torch.randn(
            hidden, inter, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc2.bias"] = torch.zeros(
            hidden, dtype=torch.bfloat16)
        # Layer norms: weight of ones, bias of zeros.
        for norm in ("input_layernorm", "post_attention_layernorm"):
            weights[f"{prefix}.{norm}.weight"] = torch.ones(
                hidden, dtype=torch.bfloat16)
            weights[f"{prefix}.{norm}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16)
    return weights
def create_shared_weights(
    text_config: dict[str, Any],
    vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create weights for shared components (vision-language connector)"""
    # Single linear projection from vision feature space to text hidden size.
    projector = torch.randn(text_config["hidden_size"],
                            vision_config["projector_input_dim"],
                            dtype=torch.bfloat16)
    return {"multi_modal_projector.linear_1.weight": projector}
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
                                output_path: Path) -> None:
    """Save weights to safetensors files and create index.

    Tensors are greedily packed into shards of at most 5GB each,
    preserving insertion order. A ``model.safetensors.index.json`` file
    mapping every tensor name to its shard is always written, even for a
    single-file model.
    """
    # Determine how to shard the weights
    max_shard_size = 5 * 1024 * 1024 * 1024  # 5GB per shard
    # Calculate sizes and create shards (greedy, insertion order).
    shards: list[dict[str, torch.Tensor]] = []
    current_shard: dict[str, torch.Tensor] = {}
    current_size = 0
    for name, tensor in weights.items():
        tensor_size = tensor.numel() * tensor.element_size()
        # Close the current shard once this tensor would overflow it; a
        # single oversized tensor still gets a shard of its own.
        if current_size + tensor_size > max_shard_size and current_shard:
            shards.append(current_shard)
            current_shard = {}
            current_size = 0
        current_shard[name] = tensor
        current_size += tensor_size
    if current_shard:
        shards.append(current_shard)
    # Save shards and create index
    weight_map: dict[str, str] = {}
    if len(shards) == 1:
        # Single file
        filename = "model.safetensors"
        save_file(shards[0], output_path / filename)
        weight_map = {name: filename for name in shards[0]}
        # FIX: message previously printed the literal "(unknown)" instead
        # of the actual file path.
        print(f"Saved weights to single file: {output_path / filename}")
    else:
        # Multiple shards
        for i, shard in enumerate(shards):
            filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
            save_file(shard, output_path / filename)
            for name in shard:
                weight_map[name] = filename
            # FIX: message previously printed the literal "(unknown)"
            # instead of the shard path.
            print(f"Saved shard {i+1}/{len(shards)}: {output_path / filename}")
    # Create index file
    index_data = {
        "metadata": {
            "total_size":
            sum(tensor.numel() * tensor.element_size()
                for tensor in weights.values())
        },
        "weight_map": weight_map,
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f, indent=2)
    print(f"Created index file: {index_path}")
    print(f"Total model size: "
          f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
def run_reduced_model(model_path: str,
                      should_profile: bool = False,
                      **kwargs) -> None:
    """Test the created reduced model with vLLM."""
    print(f"\nTesting reduced model at {model_path}...")
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        **kwargs,
    )
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
    # Optionally wrap generation in a profiling session.
    if should_profile:
        llm.start_profile()
    results = llm.generate(PROMPTS, params)
    if should_profile:
        llm.stop_profile()
    print("Test generation successful!")
    for result in results:
        print(f"Prompt: {result.prompt}")
        print(f"Output: {result.outputs[0].text}")
        print("-" * 40)
@pytest.mark.parametrize(
    "original_model_name,text_layers,num_experts,vision_layers,",
    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
    original_model_name: str,
    text_layers: int,
    num_experts: int,
    vision_layers: int,
    enforce_eager: bool,
    tp: int,
    ep: bool,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
) -> None:
    """Build a reduced dummy Maverick checkpoint, then smoke-test it."""
    # Materialize the shrunken checkpoint on disk first.
    model_path = create_reduced_maverick_model(
        original_model_name=original_model_name,
        output_dir=output_dir,
        text_layers=text_layers,
        num_experts=num_experts,
        vision_layers=vision_layers,
        force_recreate=force_recreate,
    )
    print(f"\nReduced model created successfully at: {model_path}")
    # Then load and generate with it under the requested parallelism.
    run_reduced_model(
        model_path=model_path,
        should_profile=profile,
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
    )
def main():
    """Main function to create and test the reduced model."""
    import argparse

    def _build_parser():
        # CLI surface for creating/testing the reduced checkpoint.
        parser = argparse.ArgumentParser(
            description="Create a reduced-layer Maverick model")
        parser.add_argument(
            "--output-dir",
            default="/tmp/reduced_maverick",
            help="Output directory for the reduced model",
        )
        parser.add_argument(
            "--text-layers",
            type=int,
            default=4,
            help="Number of text transformer layers",
        )
        parser.add_argument("--num-experts",
                            type=int,
                            default=4,
                            help="Number of experts")
        parser.add_argument(
            "--vision-layers",
            type=int,
            default=2,
            help="Number of vision transformer layers",
        )
        parser.add_argument(
            "--force-recreate",
            action="store_true",
            help="Force recreation if output directory exists",
        )
        parser.add_argument("--test",
                            action="store_true",
                            help="Test the created model with vLLM")
        parser.add_argument("--profile",
                            action="store_true",
                            help="Profile the created model with vLLM")
        parser.add_argument(
            "--test-original",
            action="store_true",
            help="Test the original model with vLLM",
        )
        parser.add_argument(
            "--original-model",
            default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
            help="Original model name to base the reduction on",
        )
        return parser

    args = _build_parser().parse_args()
    if args.test:
        test_dummy_maverick(original_model_name=args.original_model,
                            output_dir=args.output_dir,
                            text_layers=args.text_layers,
                            num_experts=args.num_experts,
                            vision_layers=args.vision_layers,
                            force_recreate=args.force_recreate,
                            tp=2,
                            ep=True,
                            enforce_eager=True,
                            profile=args.profile)
    if args.test_original:
        run_maverick_serving(args.original_model)
if __name__ == "__main__":
    # main() returns None, so this exits with status 0 on success.
    raise SystemExit(main())

View File

@ -180,8 +180,7 @@ def test_chat(
) as vllm_model:
outputs = []
for msg in MSGS:
output = vllm_model.model.chat(msg,
sampling_params=SAMPLING_PARAMS)
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
max_model_len=8192,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = vllm_model.model.generate(prompt)
outputs = vllm_model.llm.generate(prompt)
assert len(outputs) == 1, f"{len(outputs)=}"
output: RequestOutput = outputs[0]

View File

@ -106,7 +106,7 @@ def run_test(
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
sampling_params = SamplingParams(
temperature=0,

View File

@ -85,7 +85,7 @@ def run_test(
enforce_eager=enforce_eager,
task=task,
**vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
if get_stop_token_ids is not None:

View File

@ -96,7 +96,7 @@ def _run_test(
dtype=dtype,
enforce_eager=True,
max_model_len=8192) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
texts = [
# this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad

View File

@ -56,7 +56,7 @@ def vllm_reranker(
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
) as vllm_model:
outputs = vllm_model.model.score(query, documents)
outputs = vllm_model.llm.score(query, documents)
return [output.outputs.score for output in outputs]

View File

@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = {
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
llm = LLM(
model=model_name,
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:
outputs = model.generate(prompt, params)
outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text)
del model
del llm
print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name]

View File

@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason="modelopt_fp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
llm = LLM(
model=model_name,
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:
outputs = model.generate(prompt, params)
outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text)
del model
del llm
print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name]

View File

@ -144,7 +144,7 @@ def test_quantization(
"model",
["jason9693/Qwen2.5-1.5B-apeach"],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dtype", ["float"])
def test_classify(
hf_runner,
vllm_runner,

View File

@ -8,7 +8,7 @@ import torch
import torch.nn as nn
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
from vllm.model_executor.models.gemma2 import Gemma2Model
from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
from vllm.sequence import IntermediateTensors
@ -26,12 +26,13 @@ class MyGemma2Embedding(nn.Module):
self.model = Gemma2Model(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
self.pooler = Pooler.from_config_with_defaults(
vllm_config.model_config.pooler_config,
pooling_type=PoolingType.LAST,
normalize=True,
softmax=False,
)
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler({
"encode": Pooler.for_encode(pooler_config),
"embed": Pooler.for_embed(pooler_config),
})
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)

View File

@ -25,25 +25,25 @@ MODEL_LEN_LEN = [
@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
def test_disable_sliding_window(model_len_len, ):
model, sliding_len, full_len = model_len_len
vllm_disabled_model = LLM(model, disable_sliding_window=True)
vllm_disabled_model.generate("Hi my name is")
model_config = vllm_disabled_model.llm_engine.model_config
disabled_llm = LLM(model, disable_sliding_window=True)
disabled_llm.generate("Hi my name is")
model_config = disabled_llm.llm_engine.model_config
assert model_config.max_model_len == sliding_len, (
"Max len expected to equal sliding_len of %s, but got %s", sliding_len,
model_config.max_model_len)
del vllm_disabled_model
del disabled_llm
cleanup_dist_env_and_memory()
vllm_enabled_model = LLM(model,
enforce_eager=True,
disable_sliding_window=False,
enable_prefix_caching=False)
vllm_enabled_model.generate("Hi my name is")
model_config = vllm_enabled_model.llm_engine.model_config
enabled_llm = LLM(model,
enforce_eager=True,
disable_sliding_window=False,
enable_prefix_caching=False)
enabled_llm.generate("Hi my name is")
model_config = enabled_llm.llm_engine.model_config
assert model_config.max_model_len == full_len, (
"Max len expected to equal full_len of %s, but got %s", full_len,
model_config.max_model_len)
del vllm_enabled_model
del enabled_llm
cleanup_dist_env_and_memory()

View File

@ -93,8 +93,8 @@ def test_mixed_requests(
# Run all the promopts
greedy_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts,
greedy_params)
req_outputs = vllm_model.llm.generate(example_prompts,
greedy_params)
# Verify number of cached tokens
for i in range(len(req_outputs)):
@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model):
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_batched_tokens,
)
engine: LLMEngine = runner.model.llm_engine
engine: LLMEngine = runner.llm.llm_engine
scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore
engine.scheduler[0] = scheduler

View File

@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
GPTQLinearMethod)
for name, submodule in (vllm_model.model.llm_engine.model_executor.
for name, submodule in (vllm_model.llm.llm_engine.model_executor.
driver_worker.model_runner.model.named_modules()):
if name == "lm_head":
assert isinstance(submodule.quant_method, linear_method_cls)

View File

@ -0,0 +1,91 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test ModelOpt quantization method setup and weight loading.
Run `pytest tests/quantization/test_modelopt.py`.
"""
import os
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
This module relies on V0 internals, so set VLLM_USE_V1=0.
"""
if not current_platform.is_cpu():
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.mark.skipif(not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.")
def test_modelopt_fp8_checkpoint_setup(vllm_runner):
"""Test ModelOpt FP8 checkpoint loading and structure validation."""
# TODO: provide a small publically available test checkpoint
model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
"TinyLlama-1.1B-Chat-v1.0-fp8-0710")
# Skip test if checkpoint doesn't exist
if not os.path.exists(model_path):
pytest.skip(f"Test checkpoint not found at {model_path}. "
"This test requires a local ModelOpt FP8 checkpoint.")
with vllm_runner(model_path, quantization="modelopt",
enforce_eager=True) as llm:
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj
# Check that ModelOpt quantization method is properly applied
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptFp8LinearMethod)
assert isinstance(qkv_proj.quant_method, ModelOptFp8LinearMethod)
assert isinstance(o_proj.quant_method, ModelOptFp8LinearMethod)
assert isinstance(gate_up_proj.quant_method,
ModelOptFp8LinearMethod)
assert isinstance(down_proj.quant_method, ModelOptFp8LinearMethod)
# Check weight dtype is FP8
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
assert o_proj.weight.dtype == torch.float8_e4m3fn
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
assert down_proj.weight.dtype == torch.float8_e4m3fn
# Check scales are present and have correct dtype
assert hasattr(qkv_proj, 'weight_scale')
assert hasattr(qkv_proj, 'input_scale')
assert qkv_proj.weight_scale.dtype == torch.float32
assert qkv_proj.input_scale.dtype == torch.float32
assert hasattr(o_proj, 'weight_scale')
assert hasattr(o_proj, 'input_scale')
assert o_proj.weight_scale.dtype == torch.float32
assert o_proj.input_scale.dtype == torch.float32
assert hasattr(gate_up_proj, 'weight_scale')
assert hasattr(gate_up_proj, 'input_scale')
assert gate_up_proj.weight_scale.dtype == torch.float32
assert gate_up_proj.input_scale.dtype == torch.float32
assert hasattr(down_proj, 'weight_scale')
assert hasattr(down_proj, 'input_scale')
assert down_proj.weight_scale.dtype == torch.float32
assert down_proj.input_scale.dtype == torch.float32
llm.apply_model(check_model)
# Run a simple generation test to ensure the model works
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
assert output
print(f"ModelOpt FP8 output: {output}")

View File

@ -107,11 +107,11 @@ def test_quark_fp8_parity(vllm_runner):
}
with (vllm_runner(quark_model_id, **llm_kwargs) as
quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
quark_model = (quark_handle.model.llm_engine.model_executor.
quark_model = (quark_handle.llm.llm_engine.model_executor.
driver_worker.model_runner.model)
quark_state_dict = quark_model.state_dict()
fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker.
fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker.
model_runner.model)
fp8_state_dict = fp8_model.state_dict()

View File

@ -111,7 +111,7 @@ def test_custom_quant(vllm_runner, model, monkeypatch):
quantization="custom_quant",
enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj

View File

@ -36,7 +36,7 @@ def test_ignore_eos(
ignore_eos=True)
for prompt in example_prompts:
ignore_eos_output = vllm_model.model.generate(
ignore_eos_output = vllm_model.llm.generate(
prompt, sampling_params=sampling_params)
output_length = len(ignore_eos_output[0].outputs[0].token_ids)
assert output_length == max_tokens

View File

@ -26,7 +26,7 @@ def test_logits_processor_force_generate(
dtype: str,
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
repeat_times = 2
enforced_answers = " vLLM"
vllm_token_ids = tokenizer.encode(enforced_answers,
@ -45,13 +45,13 @@ def test_logits_processor_force_generate(
)
# test logits_processors when prompt_logprobs is not None
vllm_model.model._add_request(
vllm_model.llm._add_request(
example_prompts[0],
params=params_with_logprobs,
)
# test prompt_logprobs is not None
vllm_model.model._add_request(
vllm_model.llm._add_request(
example_prompts[1],
params=SamplingParams(
prompt_logprobs=3,
@ -60,11 +60,11 @@ def test_logits_processor_force_generate(
)
# test grouped requests
vllm_model.model._add_request(
vllm_model.llm._add_request(
example_prompts[2],
params=SamplingParams(max_tokens=max_tokens),
)
outputs = vllm_model.model._run_engine(use_tqdm=False)
outputs = vllm_model.llm._run_engine(use_tqdm=False)
assert outputs[0].outputs[0].text == enforced_answers * repeat_times

View File

@ -64,7 +64,7 @@ def test_get_prompt_logprobs(
prompt_logprobs=num_top_logprobs,
temperature=0.0,
detokenize=detokenize)
vllm_results = vllm_model.model.generate(
vllm_results = vllm_model.llm.generate(
example_prompts, sampling_params=vllm_sampling_params)
# Test whether logprobs are included in the results.
@ -174,7 +174,7 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
logprobs=None,
temperature=0.0,
detokenize=detokenize)
results_logprobs_none = vllm_model.model.generate(
results_logprobs_none = vllm_model.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_none)
for i in range(len(results_logprobs_none)):

View File

@ -20,7 +20,7 @@ def v1(run_with_both_engines):
def _generate(
model: LLM,
llm: LLM,
prompt: str,
num_prompt_tokens: int,
temperature: float = 0,
@ -32,7 +32,7 @@ def _generate(
)
# [([output_token_ids, ], [output_text, ]), ]
output = model.generate([prompt], sampling_params=sampling_params)
output = llm.generate([prompt], sampling_params=sampling_params)
output_token_ids = output[0][0][0][num_prompt_tokens:]
# [0] first (and only) request output
@ -66,10 +66,10 @@ class TestOneTokenBadWord:
assert self.target_token_id not in output_token_ids
def _generate(self,
model: LLM,
llm: LLM,
bad_words: Optional[list[str]] = None) -> list[int]:
return _generate(
model=model,
llm=llm,
prompt=self.PROMPT,
num_prompt_tokens=self.num_prompt_tokens,
bad_words=bad_words,
@ -156,10 +156,10 @@ class TestTwoTokenBadWord:
or (self.neighbour_token_id2 in output_token_ids))
def _generate(self,
model: LLM,
llm: LLM,
bad_words: Optional[list[str]] = None) -> list[int]:
return _generate(
model=model,
llm=llm,
prompt=self.PROMPT,
num_prompt_tokens=self.num_prompt_tokens,
bad_words=bad_words,

View File

@ -49,7 +49,7 @@ def test_random_sample_with_seed(
sampling_params_seed_2 = copy.deepcopy(sampling_params)
sampling_params_seed_2.seed = 200
llm = vllm_model.model
llm = vllm_model.llm
for prompt in example_prompts:
for params in (

View File

@ -23,9 +23,9 @@ from vllm.transformers_utils.detokenizer_utils import (
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
MemorySnapshot, PlaceholderModule, StoreBoolean,
bind_kv_cache, common_broadcastable_dtype,
deprecate_kwargs, get_open_port, get_tcp_uri,
is_lossless_cast, join_host_port, make_zmq_path,
make_zmq_socket, memory_profiling,
current_stream, deprecate_kwargs, get_open_port,
get_tcp_uri, is_lossless_cast, join_host_port,
make_zmq_path, make_zmq_socket, memory_profiling,
merge_async_iterators, sha256, split_host_port,
split_zmq_path, supports_kw, swap_dict_values)
@ -957,3 +957,41 @@ def test_convert_ids_list_to_tokens():
]
tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
assert tokens == ['Hello', ',', ' world', '!']
def test_current_stream_multithread():
import threading
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
main_default_stream = torch.cuda.current_stream()
child_stream = torch.cuda.Stream()
thread_stream_ready = threading.Event()
thread_can_exit = threading.Event()
def child_thread_func():
with torch.cuda.stream(child_stream):
thread_stream_ready.set()
thread_can_exit.wait(timeout=10)
child_thread = threading.Thread(target=child_thread_func)
child_thread.start()
try:
assert thread_stream_ready.wait(
timeout=5), "Child thread failed to enter stream context in time"
main_current_stream = current_stream()
assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
# Notify child thread it can exit
thread_can_exit.set()
finally:
# Ensure child thread exits properly
child_thread.join(timeout=5)
if child_thread.is_alive():
pytest.fail("Child thread failed to exit properly")

View File

@ -393,7 +393,7 @@ def test_decode_prompt_logprobs_chunked_prefill(
logprobs=5,
prompt_logprobs=5,
temperature=0.0)
vllm_results = vllm_model.model.generate(
vllm_results = vllm_model.llm.generate(
example_prompts, sampling_params=vllm_sampling_params)
for idx, result in enumerate(vllm_results):

View File

@ -14,7 +14,7 @@ PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def model() -> LLM:
def llm() -> LLM:
return LLM(MODEL,
enforce_eager=True,
enable_prefix_caching=True,
@ -24,16 +24,16 @@ def model() -> LLM:
block_size=16)
def test_concurrent_partial_prefill(model):
outputs = model.generate([PROMPT] * 3)
def test_concurrent_partial_prefill(llm):
outputs = llm.generate([PROMPT] * 3)
assert len(outputs) == 3
for output in outputs:
assert len(output.outputs) == 1
def test_prefix_cache_stats_is_recorded(model):
def test_prefix_cache_stats_is_recorded(llm):
# 17 tokens will make sure first 16 tokens are cached in a block
input_tokens = {"prompt_token_ids": [101] * 17}
_ = model.generate([input_tokens])
outputs = model.generate([input_tokens])
_ = llm.generate([input_tokens])
outputs = llm.generate([input_tokens])
assert outputs[0].num_cached_tokens == 16

View File

@ -336,9 +336,10 @@ async def test_customize_loggers(monkeypatch):
await engine.do_log_stats()
assert len(engine.stat_loggers) == 1
assert len(engine.stat_loggers[0]) == 1
engine.stat_loggers[0][0].log.assert_called_once()
stat_loggers = engine.logger_manager.per_engine_logger_dict
assert len(stat_loggers) == 1
assert len(stat_loggers[0]) == 1
stat_loggers[0][0].log.assert_called_once()
@pytest.mark.asyncio(scope="module")

View File

@ -112,9 +112,9 @@ def test_compatibility_with_skip_tokenizer_init(
example_prompts,
structured_outputs=True,
)
model: LLM = vllm_model_skip_tokenizer_init.model
llm: LLM = vllm_model_skip_tokenizer_init.llm
with pytest.raises(ValueError):
_ = model.generate(example_prompts, sampling_params_list)
_ = llm.generate(example_prompts, sampling_params_list)
def test_parallel_sampling(vllm_model, example_prompts) -> None:
@ -125,8 +125,8 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
example_prompt: test fixture providing prompts for testing.
"""
sampling_params_list, n_list = _get_test_sampling_params(example_prompts)
model: LLM = vllm_model.model
outputs = model.generate(example_prompts, sampling_params_list)
llm: LLM = vllm_model.llm
outputs = llm.generate(example_prompts, sampling_params_list)
# Validate each request response
for out, n in zip(outputs, n_list):
@ -166,10 +166,10 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
speculative_config=speculative_config,
disable_log_stats=False,
) as vllm_model:
model: LLM = vllm_model.model
llm: LLM = vllm_model.llm
sampling_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens)
outputs = model.generate(example_prompts, sampling_params)
outputs = llm.generate(example_prompts, sampling_params)
n_prompts = len(example_prompts)
assert len(outputs) == n_prompts
@ -180,7 +180,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
total_tokens += len(out.outputs[0].token_ids)
assert total_tokens == max_tokens * n_prompts
metrics = model.get_metrics()
metrics = llm.get_metrics()
def find_metric(name) -> list[Metric]:
found = []

View File

@ -112,7 +112,7 @@ def _run_and_validate(
max_tokens: int,
do_apc: bool,
) -> None:
vllm_results = vllm_model.model.generate(
vllm_results = vllm_model.llm.generate(
test_prompts, sampling_params=vllm_sampling_params)
for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
@ -288,7 +288,7 @@ def test_get_logprobs_and_prompt_logprobs(
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
if do_apc and (temperature < 2.0
or batch_logprobs_composition != SAMPLE_PROMPT):
# Skip some test-cases to save time.
@ -378,7 +378,7 @@ def test_none_logprobs(vllm_model, example_prompts,
prompt_logprobs=None,
temperature=0.0,
)
results_logprobs_none = vllm_model.model.generate(
results_logprobs_none = vllm_model.llm.generate(
example_prompts,
sampling_params=sampling_params_logprobs_none,
)
@ -408,7 +408,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
logprobs=0,
prompt_logprobs=0,
temperature=0.0)
results_logprobs_zero = vllm_model.model.generate(
results_logprobs_zero = vllm_model.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_zero)
for i in range(len(results_logprobs_zero)):

View File

@ -14,30 +14,30 @@ PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def model() -> LLM:
def llm() -> LLM:
# Disable prefix caching so that we can test prompt logprobs.
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949
# is merged
return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
def test_n_gt_1(model):
def test_n_gt_1(llm):
"""ParallelSampling is supported."""
params = SamplingParams(n=3)
outputs = model.generate(PROMPT, params)
outputs = llm.generate(PROMPT, params)
assert len(outputs[0].outputs) == 3
def test_best_of(model):
def test_best_of(llm):
"""Raise a ValueError since best_of is deprecated."""
params = SamplingParams(n=2, best_of=3)
with pytest.raises(ValueError):
_ = model.generate(PROMPT, params)
_ = llm.generate(PROMPT, params)
def test_penalties(model):
def test_penalties(llm):
"""Check that we do not get errors if applied."""
params = SamplingParams(
@ -49,18 +49,18 @@ def test_penalties(model):
top_p=0.5,
top_k=3,
)
_ = model.generate(PROMPT, params)
_ = llm.generate(PROMPT, params)
def test_stop(model):
def test_stop(llm):
"""Check that we respect the stop words."""
output = model.generate(PROMPT, SamplingParams(temperature=0))
output = llm.generate(PROMPT, SamplingParams(temperature=0))
split_text = output[0].outputs[0].text.split()
STOP_IDX = 5
params = SamplingParams(temperature=0, stop=split_text[STOP_IDX])
output = model.generate(PROMPT, params)
output = llm.generate(PROMPT, params)
new_split_text = output[0].outputs[0].text.split()
# Output should not contain the stop word.
@ -69,40 +69,40 @@ def test_stop(model):
params = SamplingParams(temperature=0,
stop=split_text[STOP_IDX],
include_stop_str_in_output=True)
output = model.generate(PROMPT, params)
output = llm.generate(PROMPT, params)
new_split_text = output[0].outputs[0].text.split()
# Output should contain the stop word.
assert len(new_split_text) == STOP_IDX + 1
def test_stop_token_ids(model):
def test_stop_token_ids(llm):
"""Check that we respect the stop token ids."""
output = model.generate(PROMPT, SamplingParams(temperature=0))
output = llm.generate(PROMPT, SamplingParams(temperature=0))
stop_token_id_0 = output[0].outputs[0].token_ids[5]
stop_token_id_1 = output[0].outputs[0].token_ids[6]
stop_token_ids = [stop_token_id_1, stop_token_id_0]
params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
output = model.generate(PROMPT, params)
output = llm.generate(PROMPT, params)
assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
stop_token_ids = [stop_token_id_0, stop_token_id_1]
params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
output = model.generate(PROMPT, params)
output = llm.generate(PROMPT, params)
assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
def test_detokenize_false(model):
def test_detokenize_false(llm):
"""Check that detokenize=False option works."""
output = model.generate(PROMPT, SamplingParams(detokenize=False))
output = llm.generate(PROMPT, SamplingParams(detokenize=False))
assert len(output[0].outputs[0].token_ids) > 0
assert len(output[0].outputs[0].text) == 0
output = model.generate(
output = llm.generate(
PROMPT, SamplingParams(detokenize=False, logprobs=3,
prompt_logprobs=3))
assert len(output[0].outputs[0].token_ids) > 0
@ -118,28 +118,28 @@ def test_detokenize_false(model):
assert all(lp.decoded_token is None for lp in logprobs.values())
def test_bad_words(model):
def test_bad_words(llm):
"""Check that we respect bad words."""
output = model.generate(PROMPT, SamplingParams(temperature=0))
output = llm.generate(PROMPT, SamplingParams(temperature=0))
split_text = output[0].outputs[0].text.split()
bad_words_1 = " ".join(split_text[:2])
params = SamplingParams(temperature=0, bad_words=[bad_words_1])
output = model.generate(PROMPT, params)
output = llm.generate(PROMPT, params)
new_text = output[0].outputs[0].text
assert bad_words_1 not in new_text
bad_words_2 = new_text.split()[-1]
params = SamplingParams(temperature=0,
bad_words=[bad_words_1, bad_words_2])
output = model.generate(PROMPT, params)
output = llm.generate(PROMPT, params)
new_text = output[0].outputs[0].text
assert bad_words_1 not in new_text
assert bad_words_2 not in new_text
def test_logits_processor(model):
def test_logits_processor(llm):
"""Check that we reject logits processor."""
# This sample logits processor gives infinite score to the i-th token,
@ -150,47 +150,45 @@ def test_logits_processor(model):
return logits
with pytest.raises(ValueError):
_ = model.generate(PROMPT,
SamplingParams(logits_processors=[pick_ith]))
_ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))
def test_allowed_token_ids(model):
def test_allowed_token_ids(llm):
"""Check that we can use allowed_token_ids."""
TOKEN_ID = 10
allowed_token_ids = [TOKEN_ID]
output = model.generate(
PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids))
output = llm.generate(PROMPT,
SamplingParams(allowed_token_ids=allowed_token_ids))
assert output[0].outputs[0].token_ids[-1] == TOKEN_ID
# Reject empty allowed_token_ids.
with pytest.raises(ValueError):
_ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
# Reject negative token id.
with pytest.raises(ValueError):
_ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
# Reject out of vocabulary.
with pytest.raises(ValueError):
_ = model.generate(PROMPT,
SamplingParams(allowed_token_ids=[10000000]))
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
def test_priority(model):
def test_priority(llm):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with pytest.raises(ValueError):
_ = model.generate(PROMPT, priority=[1])
_ = llm.generate(PROMPT, priority=[1])
def test_seed(model):
def test_seed(llm):
"""Check that seed impacts randomness."""
out_1 = model.generate(PROMPT, SamplingParams(seed=42))
out_2 = model.generate(PROMPT, SamplingParams(seed=42))
out_3 = model.generate(PROMPT, SamplingParams(seed=43))
out_1 = llm.generate(PROMPT, SamplingParams(seed=42))
out_2 = llm.generate(PROMPT, SamplingParams(seed=42))
out_3 = llm.generate(PROMPT, SamplingParams(seed=43))
assert out_1[0].outputs[0].text == out_2[0].outputs[0].text
assert out_1[0].outputs[0].text != out_3[0].outputs[0].text

View File

@ -90,8 +90,10 @@ async def test_load(output_kind: RequestOutputKind,
def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
stats_loggers[engine_index] = self
def record(self, scheduler_stats: Optional[SchedulerStats],
iteration_stats: Optional[IterationStats]):
def record(self,
scheduler_stats: Optional[SchedulerStats],
iteration_stats: Optional[IterationStats],
engine_idx: int = 0):
if iteration_stats:
self.finished_req_count += len(
iteration_stats.finished_requests)

View File

@ -106,9 +106,9 @@ def test_v1_llm_by_default(monkeypatch):
m.delenv("VLLM_USE_V1")
# Should default to V1 for supported config.
model = LLM(MODEL, enforce_eager=True, enable_lora=True)
print(model.generate("Hello my name is"))
assert hasattr(model.llm_engine, "engine_core")
llm = LLM(MODEL, enforce_eager=True, enable_lora=True)
print(llm.generate("Hello my name is"))
assert hasattr(llm.llm_engine, "engine_core")
m.delenv("VLLM_USE_V1")

View File

@ -137,6 +137,13 @@ class Attention(nn.Module):
self.num_kv_heads = num_kv_heads
self.sliding_window = sliding_window
# For v1 we have backend agnostic iRoPE (local chunked attention)
# we have to store the flag on the layer so gpu model runner can
# set KVSpec appropriately (and pop it so it doesnt get passed to
# the backends)
if envs.VLLM_USE_V1:
self.use_irope = extra_impl_args.pop("use_irope", False)
quant_method = quant_config.get_quant_method(
self, prefix=prefix) if quant_config else None
if quant_method is not None and not isinstance(

View File

@ -94,7 +94,7 @@ ConfigT = TypeVar("ConfigT", bound=ConfigType)
TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
"score", "reward", "transcription", "draft"]
_ResolvedTask = Literal["generate", "transcription", "pooling", "embed",
_ResolvedTask = Literal["generate", "transcription", "encode", "embed",
"classify", "reward", "draft"]
RunnerOption = Literal["auto", "generate", "pooling", "draft"]
@ -103,7 +103,7 @@ RunnerType = Literal["generate", "pooling", "draft"]
_RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = {
"generate": ["generate", "transcription"],
"pooling": ["pooling", "embed", "classify", "reward"],
"pooling": ["encode", "embed", "classify", "reward"],
"draft": [],
}
@ -346,11 +346,11 @@ class ModelConfig:
"""Maximum number of data items per modality per prompt. Only applicable
for multimodal models."""
interleave_mm_strings: bool = False
"""Enable fully interleaved support for multimodal prompts, while using
"""Enable fully interleaved support for multimodal prompts, while using
--chat-template-content-format=string. Defaults to False."""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
use_async_output_proc: bool = True
"""Whether to use async output processor."""
@ -579,7 +579,7 @@ class ModelConfig:
# user-selected task
if runner_type == "pooling" and self.task == "auto":
selected_task = all_supported_tasks[runner_type][-1]
assert selected_task != "pooling"
assert selected_task != "encode"
self.task = selected_task
self.supported_runner_types = supported_runner_types
self.runner_type = runner_type
@ -884,7 +884,7 @@ class ModelConfig:
supported_tasks = list[_ResolvedTask]()
if registry.is_pooling_model(architectures):
supported_tasks.append("pooling")
supported_tasks.append("encode")
# For now, users must specify the task (other than "pooling")
# to use for pooling models
@ -1000,9 +1000,13 @@ class ModelConfig:
quant_cfg = self._parse_quant_hf_config()
if quant_cfg is not None:
# Use the community standard 'quant_method'
quant_method = quant_cfg.get("quant_method", "").lower()
# Normalize library names
quant_method = quant_method.replace("compressed_tensors",
"compressed-tensors")
quant_cfg["quant_method"] = quant_method
# Quantization methods which are overrides (i.e. they have a
@ -1017,6 +1021,8 @@ class ModelConfig:
"awq_marlin",
"ipex",
"moe_wna16",
"modelopt",
"modelopt_fp4",
]
quantization_methods = [
q for q in supported_quantization if q not in overrides
@ -3193,8 +3199,8 @@ class MultiModalConfig:
"""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
mm_processor_kwargs: Optional[dict[str, object]] = None
@ -4094,7 +4100,7 @@ class CompilationConfig:
- True: inductor compilation is used (custom_ops disabled by default).
One graph for symbolic shape and one graph per size in compile_sizes
are compiled using configurations in inductor_compile_config.
This setting is ignored if level<PIECEWISE."""
compile_sizes: Optional[list[Union[int, str]]] = None
"""Sizes to compile for inductor. In addition
@ -4393,7 +4399,7 @@ class VllmConfig:
As a shorthand, `-O<n>` can be used to directly specify the compilation
level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
Currently, -O <n> and -O=<n> are supported as well but this will likely be
Currently, -O <n> and -O=<n> are supported as well but this will likely be
removed in favor of clearer -O<n> syntax in the future.
NOTE: level 0 is the default level without any optimization. level 1 and 2

View File

@ -2,11 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional
from typing import Any, Optional, Union
import torch
from torch.distributed import ProcessGroup
from vllm.distributed.utils import pickle
from vllm.platforms import current_platform
from vllm.platforms.interface import CpuArchEnum
@ -26,7 +27,8 @@ class CpuCommunicator(DeviceCommunicatorBase):
if (current_platform.get_cpu_architecture()
== CpuArchEnum.X86) and hasattr(
torch.ops._C,
"init_shm_manager") and unique_name.startswith("tp"):
"init_shm_manager") and (unique_name.startswith("tp")
or unique_name.startswith("pp")):
self.dist_module = _CPUSHMDistributed(self)
def all_reduce(self, input_):
@ -94,6 +96,19 @@ class CpuCommunicator(DeviceCommunicatorBase):
input_size[dim + 1:])
return output_tensor
def send_tensor_dict(
self,
tensor_dict: dict[str, Union[torch.Tensor, Any]],
dst: int,
) -> None:
return self.dist_module.send_tensor_dict(tensor_dict, dst)
def recv_tensor_dict(
self,
src: int,
) -> dict[str, Union[torch.Tensor, Any]]:
return self.dist_module.recv_tensor_dict(src)
class _CPUSHMDistributed:
@ -143,3 +158,44 @@ class _CPUSHMDistributed:
input: torch.Tensor,
group: Optional[ProcessGroup] = None) -> None:
torch.ops._C.shm_all_gather(self.handle, input, output)
def send_tensor_dict(
self,
tensor_dict: dict[str, Union[torch.Tensor, Any]],
dst: int,
) -> None:
key_list = list(tensor_dict.keys())
value_list = list(tensor_dict.values())
size_list = []
for v in value_list:
if not isinstance(v, torch.Tensor):
raise RuntimeError(
"CpuCommunicator only supports sending tensors.")
size_list.append(v.size())
key_size_tensor = torch.frombuffer(pickle.dumps([key_list, size_list]),
dtype=torch.uint8)
value_list.append(key_size_tensor)
torch.ops._C.shm_send_tensor_list(self.handle, value_list, dst)
return None
def recv_tensor_dict(
self,
src: int,
) -> dict[str, Union[torch.Tensor, Any]]:
tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src)
value_list: list[torch.Tensor] = tensor_list[:-1]
key_size_tensor = tensor_list[-1]
key_size = pickle.loads(key_size_tensor.numpy().tobytes())
key_list = key_size[0]
size_list = key_size[1]
assert len(key_list) == len(size_list)
assert len(key_list) == len(value_list)
tensor_dict: dict[str, torch.Tensor] = {}
for key, size, t in zip(key_list, size_list, value_list):
tensor_dict[key] = t.view(size)
return tensor_dict

View File

@ -272,6 +272,9 @@ class GroupCoordinator:
self.use_custom_op_call = (current_platform.is_cuda_alike()
or current_platform.is_tpu())
self.use_cpu_custom_send_recv = (current_platform.is_cpu() and hasattr(
torch.ops._C, "init_shm_manager"))
@property
def first_rank(self):
"""Return the global rank of the first process in the group"""
@ -663,6 +666,11 @@ class GroupCoordinator:
dst = (self.rank_in_group + 1) % self.world_size
assert dst < self.world_size, f"Invalid dst rank ({dst})"
if self.use_cpu_custom_send_recv:
self.device_communicator.send_tensor_dict( # type: ignore
tensor_dict, dst)
return None
metadata_list: list[tuple[Any, Any]] = []
assert isinstance(
tensor_dict,
@ -718,6 +726,10 @@ class GroupCoordinator:
src = (self.rank_in_group - 1) % self.world_size
assert src < self.world_size, f"Invalid src rank ({src})"
if self.use_cpu_custom_send_recv:
return self.device_communicator.recv_tensor_dict( # type: ignore
src)
recv_metadata_list = self.recv_object(src=src)
tensor_dict: dict[str, Any] = {}
for key, value in recv_metadata_list:

View File

@ -1668,13 +1668,14 @@ class EngineArgs:
# cpu specific default values.
if current_platform.is_cpu():
world_size = self.pipeline_parallel_size * self.tensor_parallel_size
default_max_num_batched_tokens = {
UsageContext.LLM_CLASS: 4096,
UsageContext.OPENAI_API_SERVER: 2048,
UsageContext.LLM_CLASS: 4096 * world_size,
UsageContext.OPENAI_API_SERVER: 2048 * world_size,
}
default_max_num_seqs = {
UsageContext.LLM_CLASS: 128,
UsageContext.OPENAI_API_SERVER: 32,
UsageContext.LLM_CLASS: 256 * world_size,
UsageContext.OPENAI_API_SERVER: 128 * world_size,
}
use_context_value = usage_context.value if usage_context else None

View File

@ -1668,7 +1668,7 @@ async def init_app_state(
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
) if "pooling" in model_config.supported_tasks else None
) if "encode" in model_config.supported_tasks else None
state.openai_serving_embedding = OpenAIServingEmbedding(
engine_client,
model_config,

View File

@ -42,7 +42,7 @@ if TYPE_CHECKING:
VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
VLLM_PP_LAYER_PARTITION: Optional[str] = None
VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0
VLLM_CPU_OMP_THREADS_BIND: str = ""
VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
VLLM_CPU_MOE_PREPACK: bool = True
@ -430,9 +430,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
# (CPU backend only) CPU key-value cache space.
# default is 4 GiB
# default is None and will be set as 4 GB
"VLLM_CPU_KVCACHE_SPACE":
lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None,
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.

View File

@ -1,15 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Mapping, Set
from dataclasses import dataclass
from enum import IntEnum
from itertools import groupby
from typing import Callable, Optional, TypeVar, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig
from typing_extensions import assert_never
from vllm.config import ModelConfig, PoolerConfig
from vllm.model_executor.pooling_metadata import ( # noqa: E501
@ -21,6 +22,10 @@ from vllm.utils import resolve_obj_by_qualname
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata]
PoolingFn = Callable[
[Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
Union[torch.Tensor, list[torch.Tensor]]]
ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
class PoolingType(IntEnum):
@ -79,37 +84,81 @@ class Pooler(nn.Module, ABC):
"""The interface required for all poolers used in pooling models in vLLM."""
@staticmethod
def from_config_with_defaults(
def for_encode(
pooler_config: PoolerConfig,
pooling_type: PoolingType,
normalize: bool,
softmax: bool,
step_tag_id: Optional[int] = None,
returned_token_ids: Optional[list[int]] = None,
) -> "Pooler":
*,
default_pooling_type: PoolingType = PoolingType.ALL,
default_normalize: bool = False,
default_softmax: bool = False,
default_step_tag_id: Optional[int] = None,
default_returned_token_ids: Optional[list[int]] = None,
):
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
pooler_config=pooler_config,
pooling_type=pooling_type,
normalize=normalize,
softmax=softmax,
step_tag_id=step_tag_id,
returned_token_ids=returned_token_ids,
pooling_type=default_pooling_type,
normalize=default_normalize,
softmax=default_softmax,
step_tag_id=default_step_tag_id,
returned_token_ids=default_returned_token_ids,
)
if pooling_type == PoolingType.STEP:
if resolved_config.pooling_type == PoolingType.STEP:
return StepPooler.from_config(resolved_config)
return SimplePooler.from_config(resolved_config)
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
@staticmethod
def for_embed(
pooler_config: PoolerConfig,
*,
default_pooling_type: PoolingType = PoolingType.LAST,
default_normalize: bool = True,
default_softmax: bool = False,
):
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
pooler_config=pooler_config,
pooling_type=default_pooling_type,
normalize=default_normalize,
softmax=default_softmax,
)
return SimplePooler.from_config(resolved_config)
@staticmethod
def for_classify(
pooler_config: PoolerConfig,
classifier: Optional[ClassifierFn],
*,
default_pooling_type: PoolingType = PoolingType.LAST,
default_normalize: bool = False,
default_softmax: bool = True,
):
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
pooler_config=pooler_config,
pooling_type=default_pooling_type,
normalize=default_normalize,
softmax=default_softmax,
)
base_pooler = SimplePooler.from_config(resolved_config)
if classifier is None:
return base_pooler
return ClassifierPooler(
pooling=base_pooler.pooling,
classifier=classifier,
act_fn=base_pooler.head.activation,
)
@abstractmethod
def get_supported_tasks(self) -> Set[PoolingTask]:
"""Determine which pooling tasks are supported."""
raise NotImplementedError
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
"""
Construct the pooling parameters to use for a task,
or `None` if the task is not supported.
Construct the updated pooling parameters to use for a supported task.
"""
return None
return PoolingParamsUpdate()
@abstractmethod
def forward(
@ -127,9 +176,8 @@ def get_prompt_lens(
if isinstance(pooling_metadata, V1PoolingMetadata):
return pooling_metadata.prompt_lens
assert isinstance(hidden_states, torch.Tensor)
return PoolingTensors.from_pooling_metadata(
pooling_metadata, hidden_states.device).prompt_lens
pooling_metadata, hidden_states[0].device).prompt_lens
def get_prompt_token_ids(
@ -149,6 +197,21 @@ def get_prompt_token_ids(
]
def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]:
if isinstance(pooling_metadata, V0PoolingMetadata):
pooling_params = [p for _, p in pooling_metadata.seq_groups]
else:
pooling_params = pooling_metadata.pooling_params
tasks: list[PoolingTask] = [
task for pooling_param in pooling_params
if (task := pooling_param.task) is not None
]
assert len(pooling_params) == len(tasks)
return tasks
def get_classification_activation_function(config: PretrainedConfig):
return PoolerClassify()
@ -172,7 +235,8 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
return PoolerScore()
def build_output(all_data: torch.Tensor) -> PoolerOutput:
def build_output(
all_data: Union[torch.Tensor, list[torch.Tensor]], ) -> PoolerOutput:
all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data]
return PoolerOutput(outputs=all_outputs)
@ -193,12 +257,12 @@ class PoolingMethod(nn.Module, ABC):
raise NotImplementedError(f"Unsupported method: {pooling_type}")
@abstractmethod
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
def get_supported_tasks(self) -> Set[PoolingTask]:
raise NotImplementedError
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return PoolingParamsUpdate()
@abstractmethod
def forward_one(
self,
@ -237,16 +301,8 @@ class PoolingMethod(nn.Module, ABC):
class CLSPool(PoolingMethod):
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
# The equalities are split up to keep mypy happy
if (task == "encode" or task == "embed" or task == "classify"
or task == "score"):
return PoolingParamsUpdate()
assert_never(task)
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode", "embed", "classify", "score"}
def forward_one(
self,
@ -270,16 +326,8 @@ class CLSPool(PoolingMethod):
class LastPool(PoolingMethod):
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
# The equalities are split up to keep mypy happy
if (task == "encode" or task == "embed" or task == "classify"
or task == "score"):
return PoolingParamsUpdate()
assert_never(task)
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode", "embed", "classify", "score"}
def forward_one(
self,
@ -299,18 +347,8 @@ class LastPool(PoolingMethod):
class AllPool(PoolingMethod):
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
if task == "encode":
return PoolingParamsUpdate()
# The equalities are split up to keep mypy happy
if task == "embed" or task == "classify" or task == "score":
return None
assert_never(task)
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode"}
def forward_one(
self,
@ -327,28 +365,13 @@ class AllPool(PoolingMethod):
hidden_states: torch.Tensor,
prompt_lens: torch.Tensor,
) -> Union[list[torch.Tensor], torch.Tensor]:
offset = 0
pooled_data = list[torch.Tensor]()
for prompt_len in prompt_lens:
pooled_data.append(hidden_states[offset:offset + prompt_len])
offset += prompt_len
return pooled_data
return list(hidden_states.split_with_sizes(prompt_lens.tolist()))
class MeanPool(PoolingMethod):
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
# The equalities are split up to keep mypy happy
if (task == "encode" or task == "embed" or task == "classify"
or task == "score"):
return PoolingParamsUpdate()
assert_never(task)
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode", "embed", "classify", "score"}
def forward_one(
self,
@ -529,24 +552,6 @@ class SimplePooler(Pooler):
3. Returns structured results as `PoolerOutput`.
"""
@classmethod
def from_config_with_defaults( # type: ignore[override]
cls,
pooler_config: PoolerConfig,
pooling_type: PoolingType,
normalize: bool,
softmax: bool,
) -> "SimplePooler":
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
pooler_config=pooler_config,
pooling_type=pooling_type,
normalize=normalize,
softmax=softmax,
)
assert resolved_config.pooling_type != PoolingType.STEP
return cls.from_config(resolved_config)
@classmethod
def from_config(
cls,
@ -563,10 +568,10 @@ class SimplePooler(Pooler):
self.pooling = pooling
self.head = head
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
def get_supported_tasks(self) -> Set[PoolingTask]:
return self.pooling.get_supported_tasks()
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return self.pooling.get_pooling_updates(task)
def forward(
@ -627,18 +632,11 @@ class StepPooler(Pooler):
return pooled_data
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
if task == "encode":
return PoolingParamsUpdate(requires_token_ids=True)
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode"}
# The equalities are split up to keep mypy happy
if task == "embed" or task == "classify" or task == "score":
return None
assert_never(task)
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return PoolingParamsUpdate(requires_token_ids=True)
def forward(
self,
@ -650,68 +648,43 @@ class StepPooler(Pooler):
return build_output(pooled_data)
PoolingFn = Callable[
[Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
Union[torch.Tensor, list[torch.Tensor]]]
ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
class ClassifierPooler(nn.Module):
class ClassifierPooler(Pooler):
"""A pooling layer for classification tasks.
This layer does the following:
1. Applies a classification layer to the hidden states.
2. Optionally applies a pooler layer.
3. Applies an activation function to the output. In the case of
classification models it is either sigmoid or softmax. In the
case of scoring models, the same behavior is configuration
dependent, as in the sentence-transformers library.
3. Applies an activation function to the output.
"""
@staticmethod
def act_fn_for_seq_cls(config: ModelConfig):
return get_classification_activation_function(config.hf_config)
@staticmethod
def act_fn_for_cross_encoder(config: ModelConfig):
return get_cross_encoder_activation_function(config.hf_config)
def __init__(
self,
config: ModelConfig,
pooling: PoolingFn,
classifier: ClassifierFn,
act_fn: Optional[PoolerActivation] = None,
act_fn: PoolerActivation,
) -> None:
super().__init__()
self.pooling = pooling
self.classifier = classifier
self.act_fn = act_fn
self.classification_act_fn = get_classification_activation_function(
config.hf_config) if act_fn is None else act_fn
self.cross_encoder_act_fn = get_cross_encoder_activation_function(
config.hf_config) if act_fn is None else act_fn
def _get_act_fn(self, task: PoolingTask):
if task == "encode" or task == "classify":
return self.classification_act_fn
if task == "score":
return self.cross_encoder_act_fn
raise ValueError(f"Unsupported task: {task!r}")
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
# The equalities are split up to keep mypy happy
if task == "encode" or task == "classify" or task == "score":
return PoolingParamsUpdate()
if task == "embed":
return None
assert_never(task)
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"classify", "score"}
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
"""Pools sentence pair scores from the hidden_states."""
pooled_data = self.pooling(hidden_states, pooling_metadata)
# apply classifier once on the full batch if possible
@ -722,28 +695,59 @@ class ClassifierPooler(nn.Module):
else:
pooled_output = [self.classifier(data) for data in pooled_data]
task_list: list[PoolingTask]
if isinstance(pooling_metadata, V0PoolingMetadata):
task_list = [
task for _, pooling_param in pooling_metadata.seq_groups
if (task := pooling_param.task) is not None
]
else:
task_list = [
task for pooling_param in pooling_metadata.pooling_params
if (task := pooling_param.task) is not None
]
assert len(task_list) == len(pooled_output)
# shape of scores: (batch_size, num_labels)
if len(set(task_list)) <= 1:
act_fn = self._get_act_fn(task_list[0])
scores = act_fn(pooled_output)
else:
scores = torch.stack([
self._get_act_fn(task)(vecs)
for task, vecs in zip(task_list, pooled_output)
])
scores = self.act_fn(pooled_output)
return build_output(scores)
class DispatchPooler(Pooler):
"""Dispatches calls to a sub-pooler based on the pooling task."""
def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None:
super().__init__()
for task, pooler in poolers_by_task.items():
if task not in pooler.get_supported_tasks():
raise ValueError(
f"{pooler=} does not support {task=}. "
f"Supported tasks: {pooler.get_supported_tasks()}")
self.poolers_by_task = poolers_by_task
def get_supported_tasks(self) -> Set[PoolingTask]:
return set(self.poolers_by_task)
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return self.poolers_by_task[task].get_pooling_updates(task)
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
poolers_by_task = self.poolers_by_task
if isinstance(hidden_states, list):
hidden_states_lst = hidden_states
else:
prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
hidden_states_lst = list(hidden_states.split(prompt_lens.tolist()))
outputs = list[PoolingSequenceGroupOutput]()
offset = 0
for task, group in groupby(get_tasks(pooling_metadata)):
if not (pooler := poolers_by_task.get(task)):
raise ValueError(
f"Unsupported task: {task} "
f"Supported tasks: {self.get_supported_tasks()}")
num_items = len(list(group))
group_output: PoolerOutput = pooler(
hidden_states_lst[offset:offset + num_items],
pooling_metadata[offset:offset + num_items],
)
outputs.extend(group_output.outputs)
offset += num_items
return PoolerOutput(outputs)

View File

@ -75,20 +75,64 @@ class ModelOptFp8Config(QuantizationConfig):
def get_config_filenames(cls) -> list[str]:
return ["hf_quant_config.json"]
@classmethod
def override_quantization_method(
cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
"""Detect if this ModelOpt config should be used based on
quantization config."""
if hf_quant_cfg is None:
return None
# Use the community standard 'quant_method'
quant_method = hf_quant_cfg.get("quant_method", "").lower()
# Only proceed if the method is explicitly "modelopt"
if quant_method != "modelopt":
return None
# Look for ModelOpt-specific config structure
if "quantization" in hf_quant_cfg:
quant_config = hf_quant_cfg["quantization"]
if isinstance(quant_config, dict):
quant_algo = quant_config.get("quant_algo", "")
if "FP8" in quant_algo:
return "modelopt"
else:
# Check for compressed-tensors style config with specific quant_algo
quant_algo = hf_quant_cfg.get("quant_algo", "")
if isinstance(quant_algo, str) and "FP8" in quant_algo:
return "modelopt"
return None
@classmethod
def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config":
quant_config = cls.get_from_keys(config, ["quantization"])
quant_method = quant_config["quant_algo"]
kv_cache_quant_method = cls.get_from_keys(
config, ["quantization"]).get("kv_cache_quant_algo")
exclude_modules = cls.get_from_keys(
config, ["quantization"]).get("exclude_modules")
# Handle both ModelOpt format and compressed-tensors style format
if "quantization" in config:
# ModelOpt format: {"quantization": {"quant_algo": "..."}}
quant_config = cls.get_from_keys(config, ["quantization"])
if not isinstance(quant_config, dict):
raise ValueError(
"Expected 'quantization' to be a dictionary in config")
quant_method = quant_config.get("quant_algo", "")
if not quant_method:
raise ValueError("Missing 'quant_algo' in quantization config")
kv_cache_quant_method = quant_config.get("kv_cache_quant_algo")
exclude_modules = quant_config.get("exclude_modules")
else:
# Compressed-tensors style format:
# {"quant_algo": "...", "quant_method": "modelopt"}
quant_method = config.get("quant_algo", "")
kv_cache_quant_method = config.get("kv_cache_quant_algo")
exclude_modules = config.get("exclude_modules")
if quant_method not in QUANT_ALGOS:
raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}"
" quantizations in vLLM. Please check the "
"`hf_quant_config.json` file for your model's "
"quant configuration.")
raise ValueError(
f"ModelOpt currently only supports: {QUANT_ALGOS} "
"quantizations in vLLM. Please check the "
"`hf_quant_config.json` file for your model's "
"quant configuration.")
is_checkpoint_fp8_serialized = ("FP8" in quant_method)
return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method,
@ -434,7 +478,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
def __init__(
self,
is_checkpoint_nvfp4_serialized: bool,
kv_cache_quant_algo: str,
kv_cache_quant_algo: Optional[str],
exclude_modules: list[str],
group_size: int = 16,
) -> None:
@ -465,24 +509,138 @@ class ModelOptNvFp4Config(QuantizationConfig):
def get_config_filenames(cls) -> list[str]:
return ["hf_quant_config.json"]
@classmethod
def override_quantization_method(
cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
"""Detect if this ModelOpt FP4 config should be used based on
quantization config."""
if hf_quant_cfg is None:
return None
# Use the community standard 'quant_method'
quant_method = hf_quant_cfg.get("quant_method", "").lower()
# Only proceed if the method is explicitly "modelopt"
if quant_method != "modelopt":
return None
# Look for ModelOpt-specific config structure
if "quantization" in hf_quant_cfg:
quant_config = hf_quant_cfg["quantization"]
if isinstance(quant_config, dict):
quant_algo = quant_config.get("quant_algo", "")
if "NVFP4" in quant_algo:
return "modelopt_fp4"
else:
# Check for compressed-tensors style config with specific
# quant_algo field
quant_algo = hf_quant_cfg.get("quant_algo", "")
if isinstance(quant_algo, str) and "FP4" in quant_algo.upper():
return "modelopt_fp4"
return None
@classmethod
def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config":
quant_config = cls.get_from_keys(config, ["quantization"])
quant_method = quant_config["quant_algo"]
# Handle both traditional ModelOpt format and compressed-tensors
# style format
if "quantization" in config:
# Traditional ModelOpt format:
# {"quantization": {"quant_algo": "..."}}
quant_config = cls.get_from_keys(config, ["quantization"])
if not isinstance(quant_config, dict):
raise ValueError(
"Expected 'quantization' to be a dictionary in config")
quant_method = quant_config.get("quant_algo", "")
if not quant_method:
raise ValueError("Missing 'quant_algo' in quantization config")
# Handle kv_cache_quant_algo with proper type validation
kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo")
if kv_cache_quant_algo_raw is None:
# No KV cache quantization by default
kv_cache_quant_algo = None
elif isinstance(kv_cache_quant_algo_raw, str):
kv_cache_quant_algo = kv_cache_quant_algo_raw
else:
raise ValueError(f"kv_cache_quant_algo must be a string, got "
f"{type(kv_cache_quant_algo_raw)}")
# Handle group_size with proper type validation
group_size_raw = quant_config.get("group_size")
if group_size_raw is None:
group_size = 16 # Default value
elif isinstance(group_size_raw, int):
group_size = group_size_raw
else:
try:
group_size = int(group_size_raw)
except (ValueError, TypeError):
raise ValueError(f"group_size must be an integer, got "
f"{type(group_size_raw)}") from None
exclude_modules = quant_config.get("exclude_modules", [])
if not isinstance(exclude_modules, list):
raise ValueError(f"exclude_modules must be a list, got "
f"{type(exclude_modules)}")
else:
# Compressed-tensors style format:
# {"quant_algo": "...", "quant_method": "modelopt"}
quant_method = config.get("quant_algo", "")
# Handle kv_cache_quant_algo with proper type validation
kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo")
if kv_cache_quant_algo_raw is None:
# No KV cache quantization by default
kv_cache_quant_algo = None
elif isinstance(kv_cache_quant_algo_raw, str):
kv_cache_quant_algo = kv_cache_quant_algo_raw
else:
raise ValueError(f"kv_cache_quant_algo must be a string, got "
f"{type(kv_cache_quant_algo_raw)}")
# Handle group_size with proper type validation
group_size_raw = config.get("group_size")
if group_size_raw is None:
group_size = 16 # Default value
elif isinstance(group_size_raw, int):
group_size = group_size_raw
else:
try:
group_size = int(group_size_raw)
except (ValueError, TypeError):
raise ValueError(f"group_size must be an integer, got "
f"{type(group_size_raw)}") from None
exclude_modules = config.get("exclude_modules", [])
if not isinstance(exclude_modules, list):
raise ValueError(f"exclude_modules must be a list, got "
f"{type(exclude_modules)}")
if quant_method not in QUANT_ALGOS:
raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}"
" quantizations in vLLM. Please check the "
"`hf_quant_config.json` file for your model's "
"quant configuration.")
raise ValueError(
f"ModelOpt currently only supports: {QUANT_ALGOS} "
"quantizations in vLLM. Please check the "
"`hf_quant_config.json` file for your model's "
"quant configuration.")
is_checkpoint_nvfp4_serialized = ("NVFP4" in quant_method)
if ("group_size" and "kv_cache_quant_algo"
and "exclude_modules") not in quant_config:
raise ValueError("NVFP4 quantization requires group size and "
"kv_cache_quant_algo specified in "
"hf_quant_config.json")
kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
group_size = quant_config["group_size"]
exclude_modules = quant_config["exclude_modules"]
# For FP4, these fields are required
if is_checkpoint_nvfp4_serialized and "quantization" in config:
# Check if required fields are present in the quantization config
quant_config = config["quantization"]
required_fields = [
"group_size", "kv_cache_quant_algo", "exclude_modules"
]
missing_fields = [
field for field in required_fields if field not in quant_config
]
if missing_fields:
raise ValueError(
f"NVFP4 quantization requires the following fields in "
f"hf_quant_config.json: {missing_fields}")
return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo,
exclude_modules, group_size)

View File

@ -13,7 +13,6 @@ from .interfaces_base import VllmModelForPooling, is_pooling_model
if TYPE_CHECKING:
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import PoolingType
_T = TypeVar("_T", bound=type[nn.Module])
@ -34,16 +33,8 @@ def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str:
return model_name + pooling_suffix
def _create_pooling_model_cls(
orig_cls: _T,
*,
default_pooling_type: "PoolingType",
default_normalize: bool,
default_softmax: bool,
) -> _T:
def _create_pooling_model_cls(orig_cls: _T) -> _T:
# Lazy import
from vllm.model_executor.layers.pooler import Pooler
from .utils import AutoWeightsLoader, WeightsMapper
class ModelForPooling(orig_cls, VllmModelForPooling):
@ -71,15 +62,7 @@ def _create_pooling_model_cls(
self._init_pooler(vllm_config, prefix=prefix)
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=default_pooling_type,
normalize=default_normalize,
softmax=default_softmax,
)
raise NotImplementedError
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
# TODO: Support uninitialized params tracking
@ -132,14 +115,20 @@ def as_embedding_model(cls: _T) -> _T:
return cls
# Lazy import
from vllm.model_executor.layers.pooler import PoolingType
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
class ModelForEmbedding(_create_pooling_model_cls(cls)):
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler(
{
"encode": Pooler.for_encode(pooler_config),
"embed": Pooler.for_embed(pooler_config),
}, )
ModelForEmbedding = _create_pooling_model_cls(
cls,
default_pooling_type=PoolingType.LAST,
default_normalize=True,
default_softmax=False,
)
ModelForEmbedding.__name__ = \
_get_pooling_model_name(cls.__name__, "ForEmbedding")
@ -165,20 +154,14 @@ def as_seq_cls_model(cls: _T) -> _T:
# Lazy import
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.pooler import (ClassifierPooler,
PoolingType, SimplePooler)
DispatchPooler, Pooler,
PoolingMethod, PoolingType)
from vllm.model_executor.models.interfaces import SupportsCrossEncoding
from vllm.sequence import IntermediateTensors
from .utils import maybe_prefix
ModelForPooling = _create_pooling_model_cls(
cls,
default_pooling_type=PoolingType.LAST,
default_normalize=False,
default_softmax=True,
)
class ModelForSequenceClassification(ModelForPooling,
class ModelForSequenceClassification(_create_pooling_model_cls(cls),
SupportsCrossEncoding):
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
@ -198,19 +181,28 @@ def as_seq_cls_model(cls: _T) -> _T:
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
pooler = SimplePooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.LAST,
normalize=False,
softmax=True,
)
pooling_type_str = pooler_config.pooling_type
pooling_type = (PoolingType.LAST if pooling_type_str is None else
PoolingType[pooling_type_str])
self.pooler = ClassifierPooler(
vllm_config.model_config,
pooling=pooler.pooling,
classifier=self._classifier,
act_fn=pooler.head.activation,
)
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"classify":
ClassifierPooler(
pooling=PoolingMethod.from_pooling_type(pooling_type),
classifier=self._classifier,
act_fn=ClassifierPooler.act_fn_for_seq_cls(
vllm_config.model_config),
),
"score":
ClassifierPooler(
pooling=PoolingMethod.from_pooling_type(pooling_type),
classifier=self._classifier,
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
vllm_config.model_config),
),
})
def _classifier(self, x: torch.Tensor):
x, _ = self.score(x.float())
@ -259,14 +251,16 @@ def as_reward_model(cls: _T) -> _T:
return cls
# Lazy import
from vllm.model_executor.layers.pooler import PoolingType
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
ModelForReward = _create_pooling_model_cls(
cls,
default_pooling_type=PoolingType.ALL,
default_normalize=False,
default_softmax=False,
)
class ModelForReward(_create_pooling_model_cls(cls)):
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler(
{"encode": Pooler.for_encode(pooler_config)}, )
ModelForReward.__name__ = \
_get_pooling_model_name(cls.__name__, "ForReward")

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from collections.abc import Iterable, Set
from typing import Optional, Union
import torch
@ -17,7 +17,8 @@ from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
from vllm.model_executor.layers.pooler import (ClassifierPooler,
DispatchPooler, Pooler,
PoolingMethod,
PoolingParamsUpdate,
PoolingType)
@ -92,20 +93,29 @@ class BertPooler(Pooler):
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
def get_supported_tasks(self) -> Set[PoolingTask]:
return self.pooling.get_supported_tasks()
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return self.pooling.get_pooling_updates(task)
def _head(self, pooled_output: torch.Tensor):
pooled_output = self.dense(pooled_output)
pooled_output = self.activation(pooled_output)
return pooled_output
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[torch.Tensor, list[torch.Tensor]]:
pooled_output = self.pooling(hidden_states, pooling_metadata)
pooled_output = self.dense(pooled_output)
pooled_output = self.activation(pooled_output)
if isinstance(pooled_output, list):
pooled_output = [self._head(output) for output in pooled_output]
else:
pooled_output = self._head(pooled_output)
return pooled_output
@ -333,18 +343,19 @@ class BertModel(nn.Module, SupportsQuant):
packed_modules_mapping = {"qkv_proj": ["query", "key", "value"]}
def __init__(self,
*,
vllm_config: VllmConfig,
prefix: str = "",
embedding_class: type = BertEmbedding,
add_pooling_layer: bool = False):
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
embedding_class: type[nn.Module] = BertEmbedding,
) -> None:
super().__init__()
config = vllm_config.model_config.hf_config
self.embeddings = embedding_class(config)
self.encoder = BertEncoder(vllm_config=vllm_config,
prefix=f"{prefix}.encoder")
self.pooler = BertPooler(config) if add_pooling_layer else None
def forward(
self,
@ -366,8 +377,7 @@ class BertModel(nn.Module, SupportsQuant):
token_type_ids=token_type_ids)
return self.encoder(hidden_states)
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "query", "q"),
@ -395,10 +405,43 @@ class BertModel(nn.Module, SupportsQuant):
if name in params_dict:
other_weights.append((name, loaded_weight))
loader = AutoWeightsLoader(
self,
skip_prefixes=(["pooler."] if self.pooler is None else []),
return other_weights, loaded_stacked_params
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
other_weights, loaded_stacked_params = self._load_weights(weights)
loader = AutoWeightsLoader(self, skip_prefixes=["pooler."])
loaded_params = loader.load_weights(other_weights)
loaded_params.update(loaded_stacked_params)
return loaded_params
class BertPoolingModel(BertModel):
is_pooling_model = True
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
embedding_class: type[nn.Module] = BertEmbedding,
) -> None:
super().__init__(
vllm_config=vllm_config,
prefix=prefix,
embedding_class=embedding_class,
)
config = vllm_config.model_config.hf_config
self.pooler = BertPooler(config)
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
other_weights, loaded_stacked_params = self._load_weights(weights)
loader = AutoWeightsLoader(self)
loaded_params = loader.load_weights(other_weights)
loaded_params.update(loaded_stacked_params)
return loaded_params
@ -421,6 +464,8 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
super().__init__()
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.model = self._build_model(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
self.pooler = self._build_pooler(pooler_config)
@ -456,10 +501,15 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
embedding_class=BertEmbedding)
def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
return Pooler.from_config_with_defaults(pooler_config,
pooling_type=PoolingType.CLS,
normalize=True,
softmax=False)
return DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"embed":
Pooler.for_embed(
pooler_config,
default_pooling_type=PoolingType.CLS,
),
})
class BertForSequenceClassification(nn.Module, SupportsV0Only,
@ -481,16 +531,32 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only,
config = vllm_config.model_config.hf_config
self.num_labels = config.num_labels
self.bert = BertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "bert"),
embedding_class=BertEmbedding,
add_pooling_layer=True)
self.bert = BertPoolingModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "bert"),
embedding_class=BertEmbedding)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.pooler = ClassifierPooler(
vllm_config.model_config,
pooling=self.bert.pooler,
classifier=self.classifier,
)
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"classify":
ClassifierPooler(
pooling=self.bert.pooler,
classifier=self.classifier,
act_fn=ClassifierPooler.act_fn_for_seq_cls(
vllm_config.model_config),
),
"score":
ClassifierPooler(
pooling=self.bert.pooler,
classifier=self.classifier,
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
vllm_config.model_config),
),
})
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
loader = AutoWeightsLoader(self)

View File

@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
from ..layers.pooler import Pooler, PoolingType
from ..layers.pooler import DispatchPooler, Pooler
from .interfaces import SupportsPP
from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
@ -339,12 +339,16 @@ class GPT2ForSequenceClassification(nn.Module):
self.transformer = GPT2Model(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "gpt2"))
self.score = nn.Linear(config.n_embd, config.num_labels, bias=False)
pooler_config = vllm_config.model_config.pooler_config
self.pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.LAST,
normalize=False,
softmax=True)
assert pooler_config is not None
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"classify":
Pooler.for_classify(pooler_config, classifier=None),
})
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
loader = AutoWeightsLoader(self)

View File

@ -1,17 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Set
from typing import Optional, Union
import numpy as np
import torch
import torch.nn as nn
from typing_extensions import assert_never
from vllm.config import ModelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.pooler import (Pooler, PoolerHead,
PoolerNormalize,
from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
PoolerHead, PoolerNormalize,
PoolingParamsUpdate,
build_output, get_prompt_lens,
get_prompt_token_ids)
@ -135,18 +134,11 @@ class GritLMMeanPool(nn.Module):
return instruction_len
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
# The equalities are split up to keep mypy happy
if task == "encode" or task == "embed":
return PoolingParamsUpdate(requires_token_ids=True)
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode", "embed"}
if task == "classify" or task == "score":
return None
assert_never(task)
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return PoolingParamsUpdate(requires_token_ids=True)
def forward_one(
self,
@ -207,10 +199,10 @@ class GritLMPooler(Pooler):
self.pooling = GritLMMeanPool(model_config)
self.head = PoolerHead(PoolerNormalize())
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
def get_supported_tasks(self) -> Set[PoolingTask]:
return self.pooling.get_supported_tasks()
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return self.pooling.get_pooling_updates(task)
def forward(
@ -262,4 +254,11 @@ class GritLM(LlamaForCausalLM, SupportsV0Only):
super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
self.pooler = GritLMPooler(vllm_config.model_config)
pooler_config = vllm_config.model_config.pooler_config
if pooler_config is not None:
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"embed":
GritLMPooler(vllm_config.model_config),
})

View File

@ -22,7 +22,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
@ -429,12 +429,10 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
)
pooler_config = vllm_config.model_config.pooler_config
self.pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.ALL,
normalize=False,
softmax=False,
)
assert pooler_config is not None
self.pooler = DispatchPooler(
{"encode": Pooler.for_encode(pooler_config)}, )
def forward(
self,

View File

@ -19,8 +19,8 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
from vllm.model_executor.layers.pooler import (ClassifierPooler, PoolingType,
SimplePooler)
from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
PoolingType)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
@ -584,16 +584,15 @@ class JambaForSequenceClassification(JambaForCausalLM):
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
pooler = SimplePooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.LAST,
normalize=False,
softmax=False,
)
self.pooler = ClassifierPooler(
vllm_config.model_config,
pooling=pooler.pooling,
classifier=self.score,
act_fn=pooler.head.activation,
)
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"classify":
Pooler.for_classify(
pooler_config,
classifier=self.score,
default_pooling_type=PoolingType.LAST,
default_normalize=False,
default_softmax=False,
),
})

View File

@ -12,7 +12,7 @@ from vllm.inputs import TokensPrompt
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
@ -96,11 +96,17 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
self.score = JinaVLScorer(config)
self.pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.LAST,
normalize=False,
softmax=True)
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"classify":
Pooler.for_classify(pooler_config, classifier=None),
"score":
Pooler.for_classify(pooler_config, classifier=None),
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from collections.abc import Iterable, Set
from typing import Optional, Union
import torch
@ -13,7 +13,8 @@ from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
from vllm.model_executor.layers.pooler import (ClassifierPooler,
DispatchPooler, Pooler,
PoolingMethod,
PoolingParamsUpdate,
PoolingType)
@ -271,19 +272,27 @@ class ModernBertPooler(Pooler):
eps=config.norm_eps,
bias=config.norm_bias)
def get_pooling_updates(
self,
task: PoolingTask,
) -> Optional[PoolingParamsUpdate]:
def get_supported_tasks(self) -> Set[PoolingTask]:
return self.pooling.get_supported_tasks()
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return self.pooling.get_pooling_updates(task)
def _head(self, pooled_output: torch.Tensor):
return self.norm(self.act(self.dense(pooled_output)))
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[torch.Tensor, list[torch.Tensor]]:
pooled_output = self.pooling(hidden_states, pooling_metadata)
pooled_output = self.norm(self.act(self.dense(pooled_output)))
if isinstance(pooled_output, list):
pooled_output = [self._head(output) for output in pooled_output]
else:
pooled_output = self._head(pooled_output)
return pooled_output
@ -299,11 +308,28 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only,
self.model = ModernBertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "modernbert"))
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.pooler = ClassifierPooler(
vllm_config.model_config,
pooling=ModernBertPooler(config),
classifier=self.classifier,
)
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"classify":
ClassifierPooler(
pooling=ModernBertPooler(config),
classifier=self.classifier,
act_fn=ClassifierPooler.act_fn_for_seq_cls(
vllm_config.model_config),
),
"score":
ClassifierPooler(
pooling=ModernBertPooler(config),
classifier=self.classifier,
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
vllm_config.model_config),
),
})
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):

View File

@ -15,7 +15,8 @@ from torch import nn
from vllm.config import VllmConfig
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler
from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
PoolingType)
from vllm.sequence import IntermediateTensors
from .interfaces import SupportsLoRA, SupportsPP
@ -26,7 +27,7 @@ from .utils import AutoWeightsLoader, maybe_prefix
class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
is_pooling_model = True
pooler: SimplePooler
pooler: Pooler
packed_modules_mapping = {
"qkv_proj": [
@ -94,12 +95,12 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
vllm_config.model_config.hf_config.num_labels = 1
super().__init__(vllm_config=vllm_config, prefix=prefix)
pooler_config = vllm_config.model_config.pooler_config
self.pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.ALL,
normalize=False,
softmax=False)
assert pooler_config is not None
self.pooler = DispatchPooler(
{"encode": Pooler.for_encode(pooler_config)}, )
class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
@ -107,11 +108,17 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
vllm_config.model_config.hf_config.num_labels = 2
super().__init__(vllm_config=vllm_config, prefix=prefix)
pooler_config = vllm_config.model_config.pooler_config
self.pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.STEP,
normalize=False,
softmax=True,
step_tag_id=151651,
)
assert pooler_config is not None
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(
pooler_config,
default_pooling_type=PoolingType.STEP,
default_normalize=False,
default_softmax=True,
default_step_tag_id=151651,
)
})

View File

@ -9,7 +9,8 @@ from torch import nn
from transformers import RobertaConfig
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import ClassifierPooler, CLSPool
from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool,
DispatchPooler, Pooler)
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel
@ -63,16 +64,10 @@ class RobertaEmbedding(nn.Module):
# References:
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
pos_list = []
token_list = []
offset = 0
for seq_len in seq_lens:
pos_list.append(position_ids[offset:offset + seq_len])
token_list.append(input_ids[offset:offset + seq_len])
offset += seq_len
seq_lens_list = seq_lens.tolist()
new_pos_list = []
for positions, tokens in zip(pos_list, token_list):
for positions, tokens in zip(position_ids.split(seq_lens_list),
input_ids.split(seq_lens_list)):
# Verify assumption that incoming position are
# always a sequence from 0 to N.
expected_pos = torch.arange(positions.size()[0],
@ -184,15 +179,30 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
self.num_labels = config.num_labels
self.roberta = BertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "bert"),
embedding_class=RobertaEmbedding,
add_pooling_layer=False)
embedding_class=RobertaEmbedding)
self.classifier = RobertaClassificationHead(config)
self.pooler = ClassifierPooler(
vllm_config.model_config,
pooling=CLSPool(),
classifier=self.classifier,
)
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler({
"encode":
Pooler.for_encode(pooler_config),
"classify":
ClassifierPooler(
pooling=CLSPool(),
classifier=self.classifier,
act_fn=ClassifierPooler.act_fn_for_seq_cls(
vllm_config.model_config),
),
"score":
ClassifierPooler(
pooling=CLSPool(),
classifier=self.classifier,
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
vllm_config.model_config),
),
})
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
loader = AutoWeightsLoader(self)

View File

@ -38,6 +38,13 @@ class PoolingMetadata:
f"seq_data={self.seq_data}, "
f"prompt_lens={self.prompt_lens})")
def __getitem__(self, indices: slice):
return PoolingMetadata(
seq_groups=self.seq_groups[indices],
seq_data=dict(list(self.seq_data.items())[indices]),
prompt_lens=self.prompt_lens[indices],
)
@dataclass
class PoolingTensors:

View File

@ -104,8 +104,19 @@ class CpuPlatform(Platform):
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
import psutil
return psutil.virtual_memory().total
import vllm.envs as envs
from vllm.utils import GiB_bytes
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
if kv_cache_space is None:
kv_cache_space = 4 * GiB_bytes # type: ignore
logger.warning_once(
"Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
"for CPU backend is not set, using 4 by default.")
else:
kv_cache_space *= GiB_bytes
return kv_cache_space
@classmethod
def set_device(cls, device: torch.device) -> None:
@ -124,8 +135,6 @@ class CpuPlatform(Platform):
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
import vllm.envs as envs
from vllm.utils import GiB_bytes
model_config = vllm_config.model_config
if model_config is not None:
@ -162,20 +171,8 @@ class CpuPlatform(Platform):
" support fp16 for now, cast to bf16.")
model_config.dtype = torch.bfloat16
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
if kv_cache_space >= 0:
if kv_cache_space == 0:
cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore
logger.warning(
"Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
"for CPU backend is not set, using 4 by default.")
else:
cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa
else:
raise RuntimeError(
"Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
f" {kv_cache_space}, expect a positive integer value.")
cache_config.cpu_kvcache_space_bytes = \
CpuPlatform.get_device_total_memory()
parallel_config = vllm_config.parallel_config
if (parallel_config.world_size > 1
@ -216,8 +213,6 @@ class CpuPlatform(Platform):
False,
"nan_asserts":
False,
"memory_planning":
True,
"epilogue_fusion":
True,
})

View File

@ -42,7 +42,7 @@ def adapt_config_dict(config_dict: dict[str, Any],
config = PretrainedConfig.from_dict(config_dict)
logger.debug("Initialized config", config)
logger.debug("Initialized config %s", config)
return config

View File

@ -1383,12 +1383,11 @@ def find_nccl_library() -> str:
prev_set_stream = torch.cuda.set_stream
_current_stream = None
_current_stream_tls = threading.local()
def _patched_set_stream(stream: torch.cuda.Stream) -> None:
global _current_stream
_current_stream = stream
_current_stream_tls.value = stream
prev_set_stream(stream)
@ -1407,16 +1406,16 @@ def current_stream() -> torch.cuda.Stream:
from C/C++ code.
"""
from vllm.platforms import current_platform
global _current_stream
if _current_stream is None:
if not hasattr(_current_stream_tls,
"value") or _current_stream_tls.value is None:
# when this function is called before any stream is set,
# we return the default stream.
# On ROCm using the default 0 stream in combination with RCCL
# is hurting performance. Therefore creating a dedicated stream
# per process
_current_stream = torch.cuda.Stream() if current_platform.is_rocm(
) else torch.cuda.current_stream()
return _current_stream
_current_stream_tls.value = torch.cuda.Stream(
) if current_platform.is_rocm() else torch.cuda.current_stream()
return _current_stream_tls.value
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:

View File

@ -446,17 +446,12 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
use_irope: bool = False,
) -> None:
if kv_sharing_target_layer_name is not None:
raise NotImplementedError("KV sharing is not supported in V0.")
if logits_soft_cap is not None:
logger.warning_once("Torch SPDA does not support logits soft cap. "
"Outputs may be slightly off.")
if use_irope:
logger.warning_once(
"Using irope in Torch SPDA is not supported yet, it will fall"
" back to global attention for long context.")
self.paged_attn_impl = _get_paged_attn_impl()
self.num_heads = num_heads
self.head_size = head_size

View File

@ -352,7 +352,6 @@ class FlashAttentionImpl(AttentionImpl):
logits_soft_cap: Optional[float] = None,
attn_type: AttentionType = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
use_irope: bool = False,
) -> None:
self.num_heads = num_heads
self.head_size = head_size
@ -381,7 +380,6 @@ class FlashAttentionImpl(AttentionImpl):
"encoder/decoder cross-attention "
"are not implemented for "
"FlashAttentionImpl")
self.use_irope = use_irope
self.vllm_flash_attn_version = get_flash_attn_version()
if is_quantized_kv_cache(self.kv_cache_dtype) \
and not flash_attn_supports_fp8():

Some files were not shown because too many files have changed in this diff Show More