mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-22 08:28:01 +08:00
Merge remote-tracking branch 'origin/main' into one-pod-per-node-lb
Signed-off-by: Nick Hill <nhill@redhat.com> # Conflicts: # vllm/v1/engine/core_client.py
This commit is contained in:
commit
60ae223986
@ -6,6 +6,7 @@ set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
# used for TP/PP E2E test
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
@ -78,17 +79,16 @@ function cpu_tests() {
|
||||
# tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# online serving
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
--endpoint /v1/completions'
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
|
||||
@ -273,7 +273,7 @@ steps:
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: Examples Test # 25min
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
2
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
@ -46,7 +46,7 @@ body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
|
||||
33
RELEASE.md
33
RELEASE.md
@ -52,3 +52,36 @@ After branch cut, we approach finalizing the release branch with clear criteria
|
||||
* Release branch specific changes (e.g. change version identifiers or CI fixes)
|
||||
|
||||
Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
|
||||
|
||||
## Manual validations
|
||||
|
||||
### E2E Performance Validation
|
||||
|
||||
Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
|
||||
|
||||
**Current Coverage:**
|
||||
* Models: Llama3, Llama4, and Mixtral
|
||||
* Hardware: NVIDIA H100 and AMD MI300x
|
||||
* *Note: Coverage may change based on new model releases and hardware availability*
|
||||
|
||||
**Performance Validation Process:**
|
||||
|
||||
**Step 1: Get Access**
|
||||
Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow.
|
||||
|
||||
**Step 2: Review Benchmark Setup**
|
||||
Familiarize yourself with the benchmark configurations:
|
||||
* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
|
||||
* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
|
||||
|
||||
**Step 3: Run the Benchmark**
|
||||
Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
|
||||
* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
|
||||
* **vLLM commit**: Set to the RC commit hash
|
||||
|
||||
**Step 4: Review Results**
|
||||
Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit.
|
||||
|
||||
**Step 5: Performance Comparison**
|
||||
Compare the current results against the previous release to verify no performance regressions have occurred. Here is an
|
||||
example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms).
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
|
||||
namespace {
|
||||
#define MAX_SHM_RANK_NUM 8
|
||||
#define PER_THREAD_SHM_BUFFER_BYTES (2 * 1024 * 1024)
|
||||
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
|
||||
static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0);
|
||||
#define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1)
|
||||
#define MIN_THREAD_PROCESS_SIZE (256)
|
||||
@ -34,9 +34,10 @@ struct KernelVecType<c10::Half> {
|
||||
};
|
||||
|
||||
struct ThreadSHMContext {
|
||||
volatile char _curr_thread_stamp;
|
||||
volatile char _ready_thread_stamp;
|
||||
char _padding1[6];
|
||||
volatile char _curr_thread_stamp[2];
|
||||
volatile char _ready_thread_stamp[2];
|
||||
int local_stamp_buffer_idx;
|
||||
int remote_stamp_buffer_idx;
|
||||
int thread_id;
|
||||
int thread_num;
|
||||
int rank;
|
||||
@ -45,23 +46,28 @@ struct ThreadSHMContext {
|
||||
int swizzled_ranks[MAX_SHM_RANK_NUM];
|
||||
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
|
||||
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
|
||||
size_t _thread_buffer_mask;
|
||||
char _padding2[56];
|
||||
size_t _thread_buffer_mask[2];
|
||||
char _padding2[40];
|
||||
|
||||
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
|
||||
const int group_size, void* thread_shm_ptr)
|
||||
: _curr_thread_stamp(1),
|
||||
_ready_thread_stamp(0),
|
||||
: local_stamp_buffer_idx(0),
|
||||
remote_stamp_buffer_idx(0),
|
||||
thread_id(thread_id),
|
||||
thread_num(thread_num),
|
||||
rank(rank),
|
||||
group_size(group_size),
|
||||
_spinning_count(0),
|
||||
_thread_buffer_mask(0) {
|
||||
_spinning_count(0) {
|
||||
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
|
||||
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
|
||||
TORCH_CHECK((size_t)this % 64 == 0);
|
||||
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
|
||||
_curr_thread_stamp[0] = 1;
|
||||
_curr_thread_stamp[1] = 1;
|
||||
_ready_thread_stamp[0] = 0;
|
||||
_ready_thread_stamp[1] = 0;
|
||||
_thread_buffer_mask[0] = 0;
|
||||
_thread_buffer_mask[1] = 0;
|
||||
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
|
||||
shm_contexts[i] = nullptr;
|
||||
thread_shm_ptrs[i] = nullptr;
|
||||
@ -70,6 +76,11 @@ struct ThreadSHMContext {
|
||||
set_context(rank, this, thread_shm_ptr);
|
||||
}
|
||||
|
||||
void set_stamp_buffer_idx(int local, int remote) {
|
||||
local_stamp_buffer_idx = local;
|
||||
remote_stamp_buffer_idx = remote;
|
||||
}
|
||||
|
||||
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
|
||||
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
|
||||
TORCH_CHECK(ptr);
|
||||
@ -84,23 +95,27 @@ struct ThreadSHMContext {
|
||||
T* get_thread_shm_ptr(int rank) {
|
||||
return reinterpret_cast<T*>(
|
||||
reinterpret_cast<int8_t*>(thread_shm_ptrs[rank]) +
|
||||
(PER_THREAD_SHM_BUFFER_OFFSET & _thread_buffer_mask));
|
||||
(PER_THREAD_SHM_BUFFER_OFFSET &
|
||||
_thread_buffer_mask[local_stamp_buffer_idx]));
|
||||
}
|
||||
|
||||
void next_buffer() { _thread_buffer_mask ^= 0xFFFFFFFFFFFFFFFF; }
|
||||
void next_buffer() {
|
||||
_thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
|
||||
}
|
||||
|
||||
char get_curr_stamp() const { return _curr_thread_stamp; }
|
||||
char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
|
||||
|
||||
char get_ready_stamp() const { return _ready_thread_stamp; }
|
||||
char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
|
||||
|
||||
void next_stamp() {
|
||||
_mm_mfence();
|
||||
_curr_thread_stamp += 1;
|
||||
_curr_thread_stamp[local_stamp_buffer_idx] += 1;
|
||||
}
|
||||
|
||||
void commit_ready_stamp() {
|
||||
_mm_mfence();
|
||||
_ready_thread_stamp = _curr_thread_stamp;
|
||||
_ready_thread_stamp[local_stamp_buffer_idx] =
|
||||
_curr_thread_stamp[local_stamp_buffer_idx];
|
||||
}
|
||||
|
||||
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
|
||||
@ -117,10 +132,11 @@ struct ThreadSHMContext {
|
||||
void wait_for_one(int rank, Cond&& cond) {
|
||||
ThreadSHMContext* rank_ctx = shm_contexts[rank];
|
||||
for (;;) {
|
||||
char local_curr_stamp = get_curr_stamp();
|
||||
char local_ready_stamp = get_ready_stamp();
|
||||
char rank_curr_stamp = rank_ctx->get_curr_stamp();
|
||||
char rank_ready_stamp = rank_ctx->get_ready_stamp();
|
||||
char local_curr_stamp = get_curr_stamp(local_stamp_buffer_idx);
|
||||
char local_ready_stamp = get_ready_stamp(local_stamp_buffer_idx);
|
||||
char rank_curr_stamp = rank_ctx->get_curr_stamp(remote_stamp_buffer_idx);
|
||||
char rank_ready_stamp =
|
||||
rank_ctx->get_ready_stamp(remote_stamp_buffer_idx);
|
||||
if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp,
|
||||
rank_ready_stamp)) {
|
||||
break;
|
||||
@ -361,6 +377,15 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reset_threads_stamp_buffer_idx(ThreadSHMContext* ctx, int local,
|
||||
int remote) {
|
||||
int thread_num = ctx->thread_num;
|
||||
for (int i = 0; i < thread_num; ++i) {
|
||||
ThreadSHMContext* thread_ctx = ctx + i;
|
||||
thread_ctx->set_stamp_buffer_idx(local, remote);
|
||||
}
|
||||
}
|
||||
}; // namespace shm_cc_ops
|
||||
|
||||
namespace shm_cc_ops {
|
||||
@ -632,6 +657,7 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst,
|
||||
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
|
||||
metadata->bind_tensor_list(tensor_list_with_metadata);
|
||||
|
||||
shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 0, 1);
|
||||
shm_cc_ops::shm_cc_loop<int8_t>(
|
||||
ctx, metadata->total_bytes,
|
||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||
@ -659,6 +685,7 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
|
||||
torch::Tensor metadata_tensor =
|
||||
torch::empty({sizeof(TensorListMeta)}, options);
|
||||
|
||||
shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 1, 0);
|
||||
ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
|
||||
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
|
||||
ctx->get_thread_shm_ptr<void>(src),
|
||||
@ -677,7 +704,7 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
|
||||
ctx, metadata.total_bytes,
|
||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||
int64_t data_elem_num, bool fast_mode) {
|
||||
ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
|
||||
thread_ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
|
||||
int64_t curr_shm_offset = 0;
|
||||
while (curr_shm_offset < data_elem_num) {
|
||||
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
|
||||
|
||||
@ -510,7 +510,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install accelerate hf_transfer pytest 'modelscope!=1.15.0'
|
||||
pip install accelerate hf_transfer pytest modelscope
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image \
|
||||
TRITON_XPU_PROFILE 1
|
||||
|
||||
@ -14,7 +14,7 @@ For example:
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
model = LLM(
|
||||
llm = LLM(
|
||||
model="cerebras/Cerebras-GPT-1.3B",
|
||||
hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2
|
||||
)
|
||||
|
||||
@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../..
|
||||
|
||||
### Grafana Dashboard
|
||||
|
||||
vLLM also provides [a reference example](https://docs.vllm.ai/en/stable/examples/online_serving/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
|
||||
vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
|
||||
|
||||
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
|
||||
|
||||
@ -672,8 +672,7 @@ v0 has support for OpenTelemetry tracing:
|
||||
`--collect-detailed-traces`
|
||||
- [OpenTelemetry blog
|
||||
post](https://opentelemetry.io/blog/2024/llm-observability/)
|
||||
- [User-facing
|
||||
docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html)
|
||||
- [User-facing docs](../../examples/online_serving/opentelemetry.md)
|
||||
- [Blog
|
||||
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
|
||||
- [IBM product
|
||||
|
||||
@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
|
||||
return tokenizer.apply_chat_template(chat, tokenize=False)
|
||||
|
||||
|
||||
model = LLM(
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
@ -329,7 +329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
|
||||
}
|
||||
|
||||
|
||||
outputs = model.generate(
|
||||
outputs = llm.generate(
|
||||
inputs,
|
||||
sampling_params=SamplingParams(
|
||||
temperature=0.2,
|
||||
|
||||
@ -98,7 +98,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
||||
|
||||
If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
||||
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
|
||||
!!! note
|
||||
Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
|
||||
Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
|
||||
For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html).
|
||||
For details see [supported hardware](supported_hardware.md).
|
||||
|
||||
Below are the steps to utilize BitBLAS with vLLM.
|
||||
|
||||
|
||||
@ -86,8 +86,9 @@ Load and run the model in `vllm`:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
|
||||
result = model.generate("Hello my name is")
|
||||
|
||||
llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
|
||||
result = llm.generate("Hello my name is")
|
||||
print(result[0].outputs[0].text)
|
||||
```
|
||||
|
||||
@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
model = LLM("facebook/opt-125m", quantization="fp8")
|
||||
|
||||
llm = LLM("facebook/opt-125m", quantization="fp8")
|
||||
# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
|
||||
result = model.generate("Hello, my name is")
|
||||
result = llm.generate("Hello, my name is")
|
||||
print(result[0].outputs[0].text)
|
||||
```
|
||||
|
||||
|
||||
@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
|
||||
|
||||
llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
|
||||
```
|
||||
|
||||
To evaluate accuracy, you can use `lm_eval`:
|
||||
|
||||
@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
|
||||
|
||||
llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
|
||||
```
|
||||
|
||||
To evaluate accuracy, you can use `lm_eval`:
|
||||
|
||||
@ -95,7 +95,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha
|
||||
|
||||
## Required Function Calling
|
||||
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/usage/v1_guide.html#feature-model) for the V1 engine.
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
|
||||
|
||||
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
|
||||
|
||||
|
||||
@ -166,6 +166,20 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
|
||||
|
||||
- This value is 4GB by default. Larger space can support more concurrent requests, longer context length. However, users should take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, the TP worker will be killed with `exitcode 9` due to out-of-memory.
|
||||
|
||||
### How to do performance tuning for vLLM CPU?
|
||||
|
||||
- First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`.
|
||||
|
||||
- Inference batch size is a important parameter for the performance. Larger batch usually provides higher throughput, smaller batch provides lower latency. Tuning max batch size starts from default value to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM:
|
||||
- `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impacts on the first token performance. The default value is set as:
|
||||
- Offline Inference: `4096 * world_size`
|
||||
- Online Serving: `2048 * world_size`
|
||||
- `--max-num-seqs`, defines the limit of sequence numbers in a single batch, has more impacts on the output token performance.
|
||||
- Offline Inference: `256 * world_size`
|
||||
- Online Serving: `128 * world_size`
|
||||
|
||||
- vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more detials of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP togther if there are enough CPU sockets and memory nodes.
|
||||
|
||||
### Which quantization configs does vLLM CPU support?
|
||||
|
||||
- vLLM CPU supports quantizations:
|
||||
|
||||
@ -7,7 +7,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
|
||||
|
||||
For more information on CoreWeave's Tensorizer, please refer to
|
||||
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
|
||||
the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html).
|
||||
the [vLLM example script](../../examples/others/tensorize_vllm_model.md).
|
||||
|
||||
!!! note
|
||||
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
|
||||
|
||||
@ -11,26 +11,51 @@ before returning them.
|
||||
As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to
|
||||
pooling models as they only work on the generation or decode stage, so performance may not improve as much.
|
||||
|
||||
For pooling models, we support the following `--task` options.
|
||||
The selected option sets the default pooler used to extract the final hidden states:
|
||||
If the model doesn't implement this interface, you can set `--task` which tells vLLM
|
||||
to convert the model into a pooling model.
|
||||
|
||||
| Task | Pooling Type | Normalization | Softmax |
|
||||
|---------------------------------|----------------|-----------------|-----------|
|
||||
| Embedding (`embed`) | `LAST` | ✅︎ | ❌ |
|
||||
| Classification (`classify`) | `LAST` | ❌ | ✅︎ |
|
||||
| Sentence Pair Scoring (`score`) | \* | \* | \* |
|
||||
| `--task` | Model type | Supported pooling tasks |
|
||||
|------------|----------------------|-------------------------------|
|
||||
| `embed` | Embedding model | `encode`, `embed` |
|
||||
| `classify` | Classification model | `encode`, `classify`, `score` |
|
||||
| `reward` | Reward model | `encode` |
|
||||
|
||||
\*The default pooler is always defined by the model.
|
||||
## Pooling Tasks
|
||||
|
||||
!!! note
|
||||
If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
|
||||
In vLLM, we define the following pooling tasks and corresponding APIs:
|
||||
|
||||
| Task | APIs |
|
||||
|------------|--------------------|
|
||||
| `encode` | `encode` |
|
||||
| `embed` | `embed`, `score`\* |
|
||||
| `classify` | `classify` |
|
||||
| `score` | `score` |
|
||||
|
||||
\*The `score` API falls back to `embed` task if the model does not support `score` task.
|
||||
|
||||
Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks].
|
||||
|
||||
By default, the pooler assigned to each task has the following attributes:
|
||||
|
||||
| Task | Pooling Type | Normalization | Softmax |
|
||||
|------------|----------------|---------------|---------|
|
||||
| `encode` | `ALL` | ❌ | ❌ |
|
||||
| `embed` | `LAST` | ✅︎ | ❌ |
|
||||
| `classify` | `LAST` | ❌ | ✅︎ |
|
||||
|
||||
These defaults may be overridden by the model's implementation in vLLM.
|
||||
|
||||
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
|
||||
we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
|
||||
we attempt to override the defaults based on its Sentence Transformers configuration file (`modules.json`),
|
||||
which takes priority over the model's defaults.
|
||||
|
||||
!!! tip
|
||||
You can customize the model's pooling method via the `--override-pooler-config` option,
|
||||
which takes priority over both the model's and Sentence Transformers's defaults.
|
||||
You can further customize this via the `--override-pooler-config` option,
|
||||
which takes priority over both the model's and Sentence Transformers's defaults.
|
||||
|
||||
!!! note
|
||||
|
||||
The above configuration may be disregarded if the model's implementation in vLLM defines its own pooler
|
||||
that is not based on [PoolerConfig][vllm.config.PoolerConfig].
|
||||
|
||||
## Offline Inference
|
||||
|
||||
@ -149,11 +174,11 @@ You can change the output dimensions of embedding models that support Matryoshka
|
||||
```python
|
||||
from vllm import LLM, PoolingParams
|
||||
|
||||
model = LLM(model="jinaai/jina-embeddings-v3",
|
||||
task="embed",
|
||||
trust_remote_code=True)
|
||||
outputs = model.embed(["Follow the white rabbit."],
|
||||
pooling_params=PoolingParams(dimensions=32))
|
||||
llm = LLM(model="jinaai/jina-embeddings-v3",
|
||||
task="embed",
|
||||
trust_remote_code=True)
|
||||
outputs = llm.embed(["Follow the white rabbit."],
|
||||
pooling_params=PoolingParams(dimensions=32))
|
||||
print(outputs[0].outputs)
|
||||
```
|
||||
|
||||
|
||||
@ -314,6 +314,13 @@ See [this page](generative_models.md) for more information on how to use generat
|
||||
|
||||
Specified using `--task generate`.
|
||||
|
||||
<style>
|
||||
th {
|
||||
white-space: nowrap;
|
||||
min-width: 0 !important;
|
||||
}
|
||||
</style>
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
@ -28,10 +28,10 @@ def main(args: Namespace):
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="classify" for classification models
|
||||
model = LLM(**vars(args))
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate logits. The output is a list of ClassificationRequestOutputs.
|
||||
outputs = model.classify(prompts)
|
||||
outputs = llm.classify(prompts)
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
|
||||
@ -31,10 +31,10 @@ def main(args: Namespace):
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="embed" for embedding models
|
||||
model = LLM(**vars(args))
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
outputs = model.embed(prompts)
|
||||
outputs = llm.embed(prompts)
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
|
||||
@ -27,10 +27,10 @@ def main(args: Namespace):
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="score" for cross-encoder models
|
||||
model = LLM(**vars(args))
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate scores. The output is a list of ScoringRequestOutputs.
|
||||
outputs = model.score(text_1, texts_2)
|
||||
outputs = llm.score(text_1, texts_2)
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
|
||||
@ -30,11 +30,11 @@ def main(args: Namespace):
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="embed" for embedding models
|
||||
model = LLM(**vars(args))
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
# Only text matching task is supported for now. See #16120
|
||||
outputs = model.embed(prompts)
|
||||
outputs = llm.embed(prompts)
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:")
|
||||
|
||||
@ -30,10 +30,10 @@ def main(args: Namespace):
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="embed" for embedding models
|
||||
model = LLM(**vars(args))
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
|
||||
outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:")
|
||||
|
||||
@ -25,7 +25,7 @@ def config_buckets():
|
||||
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
|
||||
|
||||
|
||||
def initialize_model():
|
||||
def initialize_llm():
|
||||
"""Create an LLM with speculative decoding."""
|
||||
return LLM(
|
||||
model="openlm-research/open_llama_7b",
|
||||
@ -43,9 +43,9 @@ def initialize_model():
|
||||
)
|
||||
|
||||
|
||||
def process_requests(model: LLM, sampling_params: SamplingParams):
|
||||
def process_requests(llm: LLM, sampling_params: SamplingParams):
|
||||
"""Generate texts from prompts and print them."""
|
||||
outputs = model.generate(prompts, sampling_params)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
@ -53,12 +53,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams):
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function that sets up the model and processes prompts."""
|
||||
"""Main function that sets up the llm and processes prompts."""
|
||||
config_buckets()
|
||||
model = initialize_model()
|
||||
llm = initialize_llm()
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(max_tokens=100, top_k=1)
|
||||
process_requests(model, sampling_params)
|
||||
process_requests(llm, sampling_params)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -140,7 +140,7 @@ datamodule_config = {
|
||||
class PrithviMAE:
|
||||
def __init__(self):
|
||||
print("Initializing PrithviMAE model")
|
||||
self.model = LLM(
|
||||
self.llm = LLM(
|
||||
model=os.path.join(os.path.dirname(__file__), "./model"),
|
||||
skip_tokenizer_init=True,
|
||||
dtype="float32",
|
||||
@ -158,7 +158,7 @@ class PrithviMAE:
|
||||
|
||||
prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
|
||||
|
||||
outputs = self.model.encode(prompt, use_tqdm=False)
|
||||
outputs = self.llm.encode(prompt, use_tqdm=False)
|
||||
print("################ Inference done (it took seconds) ##############")
|
||||
|
||||
return outputs[0].outputs.data
|
||||
|
||||
@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
|
||||
# Models converted offline using this method can not only be more efficient
|
||||
# and support the vllm score API, but also make the init parameters more
|
||||
# concise, for example.
|
||||
# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
|
||||
# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
|
||||
|
||||
# If you want to load the official original version, the init parameters are
|
||||
# as follows.
|
||||
|
||||
|
||||
def get_model() -> LLM:
|
||||
def get_llm() -> LLM:
|
||||
"""Initializes and returns the LLM model for Qwen3-Reranker."""
|
||||
return LLM(
|
||||
model=model_name,
|
||||
@ -77,8 +77,8 @@ def main() -> None:
|
||||
]
|
||||
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
|
||||
|
||||
model = get_model()
|
||||
outputs = model.score(queries, documents)
|
||||
llm = get_llm()
|
||||
outputs = llm.score(queries, documents)
|
||||
|
||||
print("-" * 30)
|
||||
print([output.outputs.score for output in outputs])
|
||||
|
||||
@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
|
||||
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
|
||||
|
||||
with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
|
||||
if isinstance(vllm_model.model.llm_engine, LLMEngineV1):
|
||||
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
|
||||
v1_test_failed_model_execution(vllm_model)
|
||||
|
||||
|
||||
def v1_test_failed_model_execution(vllm_model):
|
||||
|
||||
engine = vllm_model.model.llm_engine
|
||||
engine = vllm_model.llm.llm_engine
|
||||
mocked_execute_model = Mock(
|
||||
side_effect=RuntimeError("Mocked Critical Error"))
|
||||
engine.engine_core.engine_core.model_executor.execute_model =\
|
||||
|
||||
@ -81,7 +81,7 @@ def test_chunked_prefill_recompute(
|
||||
disable_log_stats=False,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
@ -118,10 +118,10 @@ def test_preemption(
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
total_preemption = (
|
||||
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
|
||||
vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@ -174,12 +174,12 @@ def test_preemption_infeasible(
|
||||
) as vllm_model:
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens,
|
||||
ignore_eos=True)
|
||||
req_outputs = vllm_model.model.generate(
|
||||
req_outputs = vllm_model.llm.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
|
||||
# Verify the request is ignored and not hang.
|
||||
|
||||
@ -784,7 +784,7 @@ class VllmRunner:
|
||||
enforce_eager: Optional[bool] = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.model = LLM(
|
||||
self.llm = LLM(
|
||||
model=model_name,
|
||||
task=task,
|
||||
tokenizer=tokenizer_name,
|
||||
@ -854,9 +854,9 @@ class VllmRunner:
|
||||
videos=videos,
|
||||
audios=audios)
|
||||
|
||||
req_outputs = self.model.generate(inputs,
|
||||
sampling_params=sampling_params,
|
||||
**kwargs)
|
||||
req_outputs = self.llm.generate(inputs,
|
||||
sampling_params=sampling_params,
|
||||
**kwargs)
|
||||
|
||||
outputs: list[tuple[list[list[int]], list[str]]] = []
|
||||
for req_output in req_outputs:
|
||||
@ -902,9 +902,9 @@ class VllmRunner:
|
||||
videos=videos,
|
||||
audios=audios)
|
||||
|
||||
req_outputs = self.model.generate(inputs,
|
||||
sampling_params=sampling_params,
|
||||
**kwargs)
|
||||
req_outputs = self.llm.generate(inputs,
|
||||
sampling_params=sampling_params,
|
||||
**kwargs)
|
||||
|
||||
toks_str_logsprobs_prompt_logprobs = (
|
||||
self._final_steps_generate_w_logprobs(req_outputs))
|
||||
@ -924,8 +924,8 @@ class VllmRunner:
|
||||
'''
|
||||
|
||||
assert sampling_params.logprobs is not None
|
||||
req_outputs = self.model.generate(encoder_decoder_prompts,
|
||||
sampling_params=sampling_params)
|
||||
req_outputs = self.llm.generate(encoder_decoder_prompts,
|
||||
sampling_params=sampling_params)
|
||||
toks_str_logsprobs_prompt_logprobs = (
|
||||
self._final_steps_generate_w_logprobs(req_outputs))
|
||||
# Omit prompt logprobs if not required by sampling params
|
||||
@ -1018,7 +1018,7 @@ class VllmRunner:
|
||||
videos=videos,
|
||||
audios=audios)
|
||||
|
||||
outputs = self.model.beam_search(
|
||||
outputs = self.llm.beam_search(
|
||||
inputs,
|
||||
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
|
||||
returned_outputs = []
|
||||
@ -1029,7 +1029,7 @@ class VllmRunner:
|
||||
return returned_outputs
|
||||
|
||||
def classify(self, prompts: list[str]) -> list[list[float]]:
|
||||
req_outputs = self.model.classify(prompts)
|
||||
req_outputs = self.llm.classify(prompts)
|
||||
return [req_output.outputs.probs for req_output in req_outputs]
|
||||
|
||||
def embed(self,
|
||||
@ -1044,11 +1044,11 @@ class VllmRunner:
|
||||
videos=videos,
|
||||
audios=audios)
|
||||
|
||||
req_outputs = self.model.embed(inputs, *args, **kwargs)
|
||||
req_outputs = self.llm.embed(inputs, *args, **kwargs)
|
||||
return [req_output.outputs.embedding for req_output in req_outputs]
|
||||
|
||||
def encode(self, prompts: list[str]) -> list[list[float]]:
|
||||
req_outputs = self.model.encode(prompts)
|
||||
req_outputs = self.llm.encode(prompts)
|
||||
return [req_output.outputs.data for req_output in req_outputs]
|
||||
|
||||
def score(
|
||||
@ -1058,18 +1058,18 @@ class VllmRunner:
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> list[float]:
|
||||
req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
|
||||
req_outputs = self.llm.score(text_1, text_2, *args, **kwargs)
|
||||
return [req_output.outputs.score for req_output in req_outputs]
|
||||
|
||||
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||
executor = self.model.llm_engine.model_executor
|
||||
executor = self.llm.llm_engine.model_executor
|
||||
return executor.apply_model(func)
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
del self.model
|
||||
del self.llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
|
||||
@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
enforce_eager=enforce_eager)
|
||||
engine: LLMEngine = runner.model.llm_engine
|
||||
engine: LLMEngine = runner.llm.llm_engine
|
||||
|
||||
# In multi-step + chunked-prefill there is no separate single prompt step.
|
||||
# What is scheduled will run for num_scheduler_steps always.
|
||||
|
||||
@ -28,7 +28,7 @@ def vllm_model(vllm_runner):
|
||||
def test_stop_reason(vllm_model, example_prompts):
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
|
||||
stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
|
||||
llm = vllm_model.model
|
||||
llm = vllm_model.llm
|
||||
|
||||
# test stop token
|
||||
outputs = llm.generate(example_prompts,
|
||||
|
||||
@ -101,42 +101,42 @@ def _stop_token_id(llm):
|
||||
def test_stop_strings():
|
||||
# If V0, must set enforce_eager=False since we use
|
||||
# async output processing below.
|
||||
vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
|
||||
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
_stop_basic(vllm_model)
|
||||
_stop_basic(llm)
|
||||
else:
|
||||
_set_async_mode(vllm_model, True)
|
||||
_stop_basic(vllm_model)
|
||||
_set_async_mode(llm, True)
|
||||
_stop_basic(llm)
|
||||
|
||||
_set_async_mode(vllm_model, False)
|
||||
_stop_basic(vllm_model)
|
||||
_set_async_mode(llm, False)
|
||||
_stop_basic(llm)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
_stop_multi_tokens(vllm_model)
|
||||
_stop_multi_tokens(llm)
|
||||
else:
|
||||
_set_async_mode(vllm_model, True)
|
||||
_stop_multi_tokens(vllm_model)
|
||||
_set_async_mode(llm, True)
|
||||
_stop_multi_tokens(llm)
|
||||
|
||||
_set_async_mode(vllm_model, False)
|
||||
_stop_multi_tokens(vllm_model)
|
||||
_set_async_mode(llm, False)
|
||||
_stop_multi_tokens(llm)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
_stop_partial_token(vllm_model)
|
||||
_stop_partial_token(llm)
|
||||
else:
|
||||
_set_async_mode(vllm_model, True)
|
||||
_stop_partial_token(vllm_model)
|
||||
_set_async_mode(llm, True)
|
||||
_stop_partial_token(llm)
|
||||
|
||||
_set_async_mode(vllm_model, False)
|
||||
_stop_partial_token(vllm_model)
|
||||
_set_async_mode(llm, False)
|
||||
_stop_partial_token(llm)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
# FIXME: this does not respect include_in_output=False
|
||||
# _stop_token_id(vllm_model)
|
||||
# _stop_token_id(llm)
|
||||
pass
|
||||
else:
|
||||
_set_async_mode(vllm_model, True)
|
||||
_stop_token_id(vllm_model)
|
||||
_set_async_mode(llm, True)
|
||||
_stop_token_id(llm)
|
||||
|
||||
_set_async_mode(vllm_model, False)
|
||||
_stop_token_id(vllm_model)
|
||||
_set_async_mode(llm, False)
|
||||
_stop_token_id(llm)
|
||||
|
||||
@ -77,6 +77,7 @@ def ref_paged_attn(
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
|
||||
@pytest.mark.parametrize("sliding_window", [None, 64])
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_decode_with_paged_kv(
|
||||
kv_lens: list[int],
|
||||
@ -85,6 +86,7 @@ def test_flashinfer_decode_with_paged_kv(
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
sliding_window: Optional[int],
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
@ -136,17 +138,20 @@ def test_flashinfer_decode_with_paged_kv(
|
||||
use_tensor_cores=(
|
||||
(num_query_heads//num_kv_heads) > 4)
|
||||
)
|
||||
wrapper.plan(kv_indptr,
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
num_query_heads,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
block_size,
|
||||
"NONE",
|
||||
q_data_type=dtype,
|
||||
kv_data_type=dtype,
|
||||
logits_soft_cap=soft_cap)
|
||||
wrapper.plan(
|
||||
kv_indptr,
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
num_query_heads,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
block_size,
|
||||
"NONE",
|
||||
window_left=sliding_window - 1 if sliding_window is not None else -1,
|
||||
q_data_type=dtype,
|
||||
kv_data_type=dtype,
|
||||
logits_soft_cap=soft_cap,
|
||||
)
|
||||
|
||||
output = wrapper.run(query, key_value_cache)
|
||||
|
||||
@ -157,7 +162,8 @@ def test_flashinfer_decode_with_paged_kv(
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
soft_cap=soft_cap)
|
||||
soft_cap=soft_cap,
|
||||
sliding_window=sliding_window)
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
@ -168,12 +174,17 @@ def test_flashinfer_decode_with_paged_kv(
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
|
||||
@pytest.mark.parametrize("sliding_window", [None, 64])
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int, dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float]) -> None:
|
||||
def test_flashinfer_prefill_with_paged_kv(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
sliding_window: Optional[int],
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(seq_lens)
|
||||
@ -242,6 +253,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]],
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
block_size,
|
||||
window_left=sliding_window - 1 if sliding_window is not None else -1,
|
||||
q_data_type=dtype,
|
||||
kv_data_type=dtype,
|
||||
logits_soft_cap=soft_cap,
|
||||
@ -259,7 +271,8 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]],
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
soft_cap=soft_cap)
|
||||
soft_cap=soft_cap,
|
||||
sliding_window=sliding_window)
|
||||
torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
|
||||
@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
|
||||
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
|
||||
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
|
||||
|
||||
loaded_vllm_model = LLM(model=model_ref,
|
||||
load_format="tensorizer",
|
||||
enable_lora=True,
|
||||
enforce_eager=True,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
max_num_seqs=13,
|
||||
tensor_parallel_size=2,
|
||||
max_loras=2)
|
||||
loaded_llm = LLM(model=model_ref,
|
||||
load_format="tensorizer",
|
||||
enable_lora=True,
|
||||
enforce_eager=True,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
max_num_seqs=13,
|
||||
tensor_parallel_size=2,
|
||||
max_loras=2)
|
||||
|
||||
tc_as_dict = tensorizer_config.to_serializable()
|
||||
|
||||
print("lora adapter created")
|
||||
assert do_sample(loaded_vllm_model,
|
||||
assert do_sample(loaded_llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tc_as_dict,
|
||||
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
||||
|
||||
print("lora 1")
|
||||
assert do_sample(loaded_vllm_model,
|
||||
assert do_sample(loaded_llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tc_as_dict,
|
||||
lora_id=1) == EXPECTED_LORA_OUTPUT
|
||||
|
||||
@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens(
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.4) as vllm_model:
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
prompt_token_counts = [
|
||||
len(tokenizer.encode(p)) for p in example_prompts
|
||||
]
|
||||
@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens(
|
||||
vllm_prompt_token_count = sum(prompt_token_counts)
|
||||
|
||||
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
|
||||
**stat_logger.labels)._value.get()
|
||||
|
||||
@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens(
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.4) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
|
||||
**stat_logger.labels)._value.get()
|
||||
vllm_generation_count = 0
|
||||
@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step(
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
|
||||
**stat_logger.labels)._value.get()
|
||||
vllm_generation_count = 0
|
||||
@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.3,
|
||||
served_model_name=served_model_name) as vllm_model:
|
||||
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metrics_tag_content = stat_logger.labels["model_name"]
|
||||
|
||||
if envs.VLLM_CI_USE_S3:
|
||||
|
||||
@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner):
|
||||
output = vllm_model.embed("Write a short story about a robot that"
|
||||
" dreams for the first time.\n")
|
||||
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
model_tokenizer = vllm_model.model.llm_engine.tokenizer
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
|
||||
|
||||
# asserts on the bert model config file
|
||||
assert model_config.encoder_config["max_seq_length"] == 512
|
||||
@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
|
||||
output = vllm_model.embed("Write a short story about a robot that"
|
||||
" dreams for the first time.\n")
|
||||
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
model_tokenizer = vllm_model.model.llm_engine.tokenizer
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
|
||||
|
||||
# asserts on the bert model config file
|
||||
assert model_config.encoder_config["max_seq_length"] == 512
|
||||
@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
|
||||
output = vllm_model.embed("Write a short story about a robot that"
|
||||
" dreams for the first time.\n")
|
||||
|
||||
model_tokenizer = vllm_model.model.llm_engine.tokenizer
|
||||
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
|
||||
assert model_tokenizer.tokenizer_id == model_name
|
||||
|
||||
def check_model(model):
|
||||
|
||||
@ -274,7 +274,7 @@ def test_models_preemption_recompute(
|
||||
Tests that outputs are identical with and w/o preemptions (recompute).
|
||||
"""
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
scheduler = vllm_model.model.llm_engine.scheduler[0]
|
||||
scheduler = vllm_model.llm.llm_engine.scheduler[0]
|
||||
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
|
||||
preempt_vllm_outputs = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
|
||||
load_format="mistral") as vllm_model:
|
||||
for prompt in SYMBOLIC_LANG_PROMPTS:
|
||||
msg = {"role": "user", "content": prompt}
|
||||
outputs = vllm_model.model.chat([msg],
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
outputs = vllm_model.llm.chat([msg],
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
|
||||
|
||||
|
||||
@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
|
||||
load_format="mistral") as vllm_model:
|
||||
|
||||
msgs = copy.deepcopy(MSGS)
|
||||
outputs = vllm_model.model.chat(msgs,
|
||||
tools=TOOLS,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
outputs = vllm_model.llm.chat(msgs,
|
||||
tools=TOOLS,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
tool_parser = MistralToolParser(tokenizer)
|
||||
|
||||
model_output = outputs[0].outputs[0].text.strip()
|
||||
@ -308,7 +308,7 @@ def test_mistral_guided_decoding(
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {SAMPLE_JSON_SCHEMA}"
|
||||
}]
|
||||
outputs = vllm_model.model.chat(messages, sampling_params=params)
|
||||
outputs = vllm_model.llm.chat(messages, sampling_params=params)
|
||||
|
||||
generated_text = outputs[0].outputs[0].text
|
||||
json_response = json.loads(generated_text)
|
||||
|
||||
@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder):
|
||||
|
||||
def __init__(self, vllm_model):
|
||||
super().__init__()
|
||||
self.model = vllm_model
|
||||
self.llm = vllm_model
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def encode(
|
||||
@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
|
||||
# issues by randomizing the order.
|
||||
r = self.rng.permutation(len(sentences))
|
||||
sentences = [sentences[i] for i in r]
|
||||
outputs = self.model.embed(sentences, use_tqdm=False)
|
||||
outputs = self.llm.embed(sentences, use_tqdm=False)
|
||||
embeds = np.array(outputs)
|
||||
embeds = embeds[np.argsort(r)]
|
||||
return embeds
|
||||
@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder):
|
||||
queries = [s[0] for s in sentences]
|
||||
corpus = [s[1] for s in sentences]
|
||||
|
||||
outputs = self.model.score(queries,
|
||||
corpus,
|
||||
truncate_prompt_tokens=-1,
|
||||
use_tqdm=False)
|
||||
outputs = self.llm.score(queries,
|
||||
corpus,
|
||||
truncate_prompt_tokens=-1,
|
||||
use_tqdm=False)
|
||||
scores = np.array(outputs)
|
||||
scores = scores[np.argsort(r)]
|
||||
return scores
|
||||
@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
|
||||
|
||||
if model_info.architecture:
|
||||
assert (model_info.architecture
|
||||
in vllm_model.model.llm_engine.model_config.architectures)
|
||||
in vllm_model.llm.llm_engine.model_config.architectures)
|
||||
|
||||
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
|
||||
MTEB_EMBED_TASKS)
|
||||
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
|
||||
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
|
||||
|
||||
with hf_runner(model_info.name,
|
||||
is_sentence_transformer=True,
|
||||
@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
|
||||
max_num_seqs=8,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
if model_info.architecture:
|
||||
assert (model_info.architecture in model_config.architectures)
|
||||
|
||||
@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner):
|
||||
task="embed",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.model
|
||||
llm = vllm_model.llm
|
||||
|
||||
d_rep = run_llm_encode(
|
||||
llm,
|
||||
@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
|
||||
task="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.model
|
||||
llm = vllm_model.llm
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
|
||||
outputs = llm.generate(input, sampling_params=sampling_params)
|
||||
|
||||
@ -87,10 +87,10 @@ def test_matryoshka(
|
||||
task="embed",
|
||||
dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
assert vllm_model.model.llm_engine.model_config.is_matryoshka
|
||||
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
|
||||
|
||||
matryoshka_dimensions = (
|
||||
vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
|
||||
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
|
||||
assert matryoshka_dimensions is not None
|
||||
|
||||
if dimensions not in matryoshka_dimensions:
|
||||
|
||||
@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
|
||||
def test_default(model_info, vllm_runner):
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
max_model_len=None) as vllm_model:
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# set max_model_len <= 512
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
max_model_len=256) as vllm_model:
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 256
|
||||
|
||||
# set 512 < max_model_len <= 2048
|
||||
@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
else:
|
||||
with vllm_runner(model_info.name, task="embed",
|
||||
max_model_len=1024) as vllm_model:
|
||||
model_config = vllm_model.model.llm_engine.model_config
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 1024
|
||||
|
||||
|
||||
|
||||
@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,
|
||||
|
||||
with vllm_runner(model_name, task="embed",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
vllm_output = vllm_model.model.encode(
|
||||
vllm_output = vllm_model.llm.encode(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,
|
||||
|
||||
with vllm_runner(model_name, task="embed",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
vllm_output = vllm_model.model.encode(
|
||||
vllm_output = vllm_model.llm.encode(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
|
||||
model_name, task="embed",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
|
||||
llm_output = vllm_model.model.encode(
|
||||
llm_output = vllm_model.llm.encode(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
|
||||
assert llm_output == f"""truncate_prompt_tokens value
|
||||
|
||||
649
tests/models/multimodal/generation/test_maverick.py
Normal file
649
tests/models/multimodal/generation/test_maverick.py
Normal file
@ -0,0 +1,649 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Create a reduced-layer version of the Maverick model for testing purposes.
|
||||
|
||||
This script creates a new model with fewer layers by:
|
||||
1. Loading the original Maverick model configuration
|
||||
2. Creating a reduced configuration
|
||||
3. Generating compatible safetensors files with appropriate weights
|
||||
4. Creating the necessary index files for vLLM compatibility
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
|
||||
GenerationConfig)
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Sample prompts for testing
|
||||
PROMPTS: list[str] = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
|
||||
def run_maverick_serving(model: str):
|
||||
"""Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent
|
||||
options with reduced layers.
|
||||
"""
|
||||
|
||||
try:
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
llm = LLM(
|
||||
model=model,
|
||||
max_model_len=2048,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=8,
|
||||
enable_expert_parallel=True,
|
||||
trust_remote_code=True,
|
||||
gpu_memory_utilization=0.4,
|
||||
kv_cache_dtype="fp8",
|
||||
)
|
||||
|
||||
outputs = llm.generate(PROMPTS, sampling_params)
|
||||
|
||||
# Print the outputs
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}")
|
||||
print(f"Output: {generated_text!r}")
|
||||
print("-" * 60)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error initializing or running model: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def create_reduced_maverick_model(
    original_model_name:
    str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    output_dir: str = "/tmp/reduced_maverick",
    text_layers: int = 4,
    num_experts: int = 4,
    vision_layers: int = 2,
    force_recreate: bool = False,
) -> str:
    """
    Create a reduced-layer version of the Maverick model.

    Args:
        original_model_name: Name of the original Maverick model
        output_dir: Directory to save the reduced model
        text_layers: Number of text transformer layers
        num_experts: Number of experts per layer
        vision_layers: Number of vision transformer layers
        force_recreate: Whether to recreate if output_dir already exists

    Returns:
        Path to the created reduced model directory
    """
    print(
        f"Creating reduced Maverick model with {text_layers} text layers and "
        f"{vision_layers} vision layers...")

    output_path = Path(output_dir)
    # Early exit: keep an existing output unless recreation was requested.
    if output_path.exists() and not force_recreate:
        print(f"Output directory {output_dir} already exists. "
              "Use --force-recreate to overwrite.")
        return str(output_path)
    if output_path.exists():
        shutil.rmtree(output_path)

    output_path.mkdir(parents=True, exist_ok=True)

    try:
        print("Loading original model configuration...")
        original_config = AutoConfig.from_pretrained(original_model_name,
                                                     trust_remote_code=True)

        print("Creating reduced configuration...")
        reduced_config = create_reduced_config(original_config, text_layers,
                                               num_experts, vision_layers)

        config_path = output_path / "config.json"
        with open(config_path, "w") as f:
            json.dump(reduced_config, f, indent=2)
        print(f"Saved reduced config to {config_path}")

        print("Copying tokenizer files...")
        copy_tokenizer_files(original_model_name, output_path)

        print("Creating reduced safetensors files...")
        create_reduced_safetensors(original_config, reduced_config,
                                   output_path)

        print("Creating preprocessor config...")
        create_preprocessor_config(original_config, output_path)

        # The generation config is optional; continue without it on failure.
        try:
            gen_config = GenerationConfig.from_pretrained(original_model_name)
            gen_config.save_pretrained(output_path)
            print("Copied generation config")
        except Exception as e:
            print(f"Could not copy generation config: {e}")

        print(f"Successfully created reduced Maverick model at {output_path}")
        return str(output_path)

    except Exception as e:
        print(f"Error creating reduced model: {e}")
        # Remove the partial output so the next run starts from scratch.
        if output_path.exists():
            shutil.rmtree(output_path)
        raise
|
||||
|
||||
|
||||
def create_reduced_config(original_config: Any, text_layers: int,
                          num_experts: int,
                          vision_layers: int) -> dict[str, Any]:
    """Derive a shrunken config dict from the original model config.

    Layer and expert counts are replaced with the requested values; the
    text hidden size and head dim are divided by a fixed factor of 4.
    """
    config_dict = original_config.to_dict()

    # Shrink the language-model side.
    if "text_config" in config_dict:
        tcfg = config_dict["text_config"]

        prev_layers = tcfg["num_hidden_layers"]
        tcfg["num_hidden_layers"] = text_layers
        print(
            f"Reduced text layers from {prev_layers} to {text_layers}"
        )

        prev_experts = tcfg["num_local_experts"]
        tcfg["num_local_experts"] = num_experts
        print(
            f"Reduced num experts from {prev_experts} to {num_experts}"
        )

        hidden_dim_divisor = 4

        prev_hidden = tcfg["hidden_size"]
        shrunk_hidden = prev_hidden // hidden_dim_divisor
        tcfg["hidden_size"] = shrunk_hidden
        print(f"Reduced hidden size from {prev_hidden} to "
              f"{shrunk_hidden}")

        prev_head_dim = tcfg["head_dim"]
        shrunk_head_dim = prev_head_dim // hidden_dim_divisor
        tcfg["head_dim"] = shrunk_head_dim
        print(f"Reduced head dim from {prev_head_dim} to {shrunk_head_dim}")

    # Shrink the vision tower.
    if "vision_config" in config_dict:
        prev_vision = config_dict["vision_config"]["num_hidden_layers"]
        config_dict["vision_config"]["num_hidden_layers"] = vision_layers
        print(f"Reduced vision layers from {prev_vision} "
              f"to {vision_layers}")

    # Tag the config so the reduced checkpoint is self-describing.
    config_dict["_name_or_path"] = (
        f"reduced_maverick_{text_layers}t_{vision_layers}v")

    return config_dict
|
||||
|
||||
|
||||
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
    """Save the original model's tokenizer into the reduced model dir.

    Failures are non-fatal: a warning is printed and the reduced model
    is left without tokenizer files.
    """
    try:
        source_tokenizer = AutoTokenizer.from_pretrained(
            original_model_name, trust_remote_code=True)
        source_tokenizer.save_pretrained(output_path)
        print("Tokenizer files copied successfully")
    except Exception as e:
        print(f"Warning: Could not copy tokenizer files: {e}")
|
||||
|
||||
|
||||
def create_preprocessor_config(original_config: Any,
                               output_path: Path) -> None:
    """Create preprocessor_config.json for multimodal model.

    Copies the processor config of the original checkpoint. Unlike the
    tokenizer copy, a failure here is fatal (re-raised), since the
    multimodal pipeline cannot run without it.
    """
    try:
        source_name = (
            original_config._name_or_path
            or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8")
        processor = AutoProcessor.from_pretrained(source_name,
                                                  trust_remote_code=True)
        processor.save_pretrained(output_path)
        print("Copied original preprocessor config")
    except Exception as e:
        print(f"Could not copy original preprocessor config: {e}")
        raise
|
||||
|
||||
|
||||
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
                                                                          Any],
                               output_path: Path) -> None:
    """Create safetensors files with synthetic weights for the reduced model.

    Builds text, vision, and shared (projector) weight dicts in order and
    writes them out as sharded safetensors plus an index file.
    """
    print("Generating synthetic weights for reduced model...")

    text_config = reduced_config["text_config"]
    vision_config = reduced_config["vision_config"]

    weights: dict[str, torch.Tensor] = {}

    print("Creating text model weights...")
    weights.update(create_text_model_weights(text_config))

    print("Creating vision model weights...")
    weights.update(create_vision_model_weights(vision_config))

    print("Creating shared model weights...")
    weights.update(create_shared_weights(text_config, vision_config))

    print("Saving weights to safetensors files...")
    save_weights_to_safetensors(weights, output_path)
|
||||
|
||||
|
||||
def create_text_model_weights(
        text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the text model with MoE structure.

    Layer layout follows ``interleave_moe_layer_step``: for a step ``s > 0``
    every s-th layer (1-indexed) gets the MoE structure (router + experts +
    shared expert), the remaining layers are dense MLPs.
    """
    weights: dict[str, torch.Tensor] = {}

    vocab_size = text_config["vocab_size"]
    hidden = text_config["hidden_size"]
    expert_ffn = text_config["intermediate_size"]
    dense_ffn = text_config["intermediate_size_mlp"]
    n_layers = text_config["num_hidden_layers"]
    n_heads = text_config["num_attention_heads"]
    n_kv_heads = text_config.get("num_key_value_heads", n_heads)

    # MoE specific parameters
    n_experts = text_config.get("num_local_experts")
    assert (n_experts
            is not None), "num_local_experts must be specified for MoE"

    dim_per_head = hidden // n_heads
    # Loop-invariant: which layers get the MoE structure.
    moe_step = text_config.get("interleave_moe_layer_step", 1)

    # Token embedding table.
    weights["language_model.model.embed_tokens.weight"] = torch.randn(
        vocab_size, hidden, dtype=torch.float16)

    for idx in range(n_layers):
        prefix = f"language_model.model.layers.{idx}"
        print(f"Creating weights for layer {prefix}...")

        # Self-attention projections (separate q/k/v/o).
        # NOTE(review): q/k/o use (hidden, heads*head_dim) while v uses the
        # transposed order — kept exactly as-is to match loader expectations.
        attn = f"{prefix}.self_attn"
        weights[f"{attn}.q_proj.weight"] = torch.randn(
            hidden, n_heads * dim_per_head, dtype=torch.bfloat16)
        weights[f"{attn}.k_proj.weight"] = torch.randn(
            hidden, n_kv_heads * dim_per_head, dtype=torch.bfloat16)
        weights[f"{attn}.v_proj.weight"] = torch.randn(
            n_kv_heads * dim_per_head, hidden, dtype=torch.bfloat16)
        weights[f"{attn}.o_proj.weight"] = torch.randn(
            hidden, n_heads * dim_per_head, dtype=torch.bfloat16)
        print("Self-attention weights created.")

        ffn = f"{prefix}.feed_forward"
        if moe_step > 0 and (idx + 1) % moe_step == 0:
            # MoE layer: router over the experts.
            weights[f"{ffn}.router.weight"] = torch.randn(
                n_experts, hidden, dtype=torch.float16)

            # Individual (non-fused) expert weights.
            for expert in range(n_experts):
                ep = f"{ffn}.experts.{expert}"
                weights[f"{ep}.gate_proj.weight"] = torch.randn(
                    expert_ffn, hidden, dtype=torch.bfloat16)
                weights[f"{ep}.up_proj.weight"] = torch.randn(
                    expert_ffn, hidden, dtype=torch.bfloat16)
                weights[f"{ep}.down_proj.weight"] = torch.randn(
                    hidden, expert_ffn, dtype=torch.bfloat16)

                # Per-row weight scales (FP8 quantization checkpoints).
                weights[f"{ep}.gate_proj.weight_scale"] = torch.ones(
                    expert_ffn, 1, dtype=torch.bfloat16)
                weights[f"{ep}.up_proj.weight_scale"] = torch.ones(
                    expert_ffn, 1, dtype=torch.bfloat16)
                weights[f"{ep}.down_proj.weight_scale"] = torch.ones(
                    hidden, 1, dtype=torch.bfloat16)

            # Always-active shared expert.
            shared = f"{ffn}.shared_expert"
            weights[f"{shared}.gate_proj.weight"] = torch.randn(
                expert_ffn, hidden, dtype=torch.bfloat16)
            weights[f"{shared}.up_proj.weight"] = torch.randn(
                expert_ffn, hidden, dtype=torch.bfloat16)
            weights[f"{shared}.down_proj.weight"] = torch.randn(
                hidden, expert_ffn, dtype=torch.bfloat16)
            print(f"MoE feed-forward weights created for layer {idx}.")
        else:
            # Dense MLP layer.
            weights[f"{ffn}.gate_proj.weight"] = torch.randn(
                dense_ffn, hidden, dtype=torch.bfloat16)
            weights[f"{ffn}.up_proj.weight"] = torch.randn(
                dense_ffn, hidden, dtype=torch.bfloat16)
            weights[f"{ffn}.down_proj.weight"] = torch.randn(
                hidden, dense_ffn, dtype=torch.bfloat16)
            print(f"Dense feed-forward weights created for layer {idx}.")

        # Pre- and post-attention RMS norm gains.
        weights[f"{prefix}.input_layernorm.weight"] = torch.ones(
            hidden, dtype=torch.bfloat16)
        weights[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(
            hidden, dtype=torch.bfloat16)
        print("Layer norms created.")

    # Final norm and untied output projection.
    weights["language_model.model.norm.weight"] = torch.ones(
        hidden, dtype=torch.bfloat16)
    weights["language_model.lm_head.weight"] = torch.randn(
        vocab_size, hidden, dtype=torch.bfloat16)

    return weights
|
||||
|
||||
|
||||
def create_vision_model_weights(
        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the vision transformer stack.

    Each layer gets four square attention projections (with zero biases),
    a two-layer MLP, and two layer norms with biases.
    """
    weights: dict[str, torch.Tensor] = {}

    hidden = vision_config["hidden_size"]
    mlp_dim = vision_config["intermediate_size"]
    n_layers = vision_config["num_hidden_layers"]

    for idx in range(n_layers):
        prefix = f"vision_model.model.layers.{idx}"

        # Attention projections: square weight plus zero bias for each.
        for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
            weights[f"{prefix}.self_attn.{proj}.weight"] = torch.randn(
                hidden, hidden, dtype=torch.bfloat16)
            weights[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16)

        # Feed-forward: fc1 expands to mlp_dim, fc2 projects back.
        weights[f"{prefix}.mlp.fc1.weight"] = torch.randn(
            mlp_dim, hidden, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc1.bias"] = torch.zeros(
            mlp_dim, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc2.weight"] = torch.randn(
            hidden, mlp_dim, dtype=torch.bfloat16)
        weights[f"{prefix}.mlp.fc2.bias"] = torch.zeros(
            hidden, dtype=torch.bfloat16)

        # Layer norms carry both a gain and a bias here.
        for norm in ("input_layernorm", "post_attention_layernorm"):
            weights[f"{prefix}.{norm}.weight"] = torch.ones(
                hidden, dtype=torch.bfloat16)
            weights[f"{prefix}.{norm}.bias"] = torch.zeros(
                hidden, dtype=torch.bfloat16)

    return weights
|
||||
|
||||
|
||||
def create_shared_weights(
        text_config: dict[str, Any],
        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create weights for shared components (vision-language connector)"""
    # Single projection mapping vision features into the text hidden space.
    return {
        "multi_modal_projector.linear_1.weight":
        torch.randn(text_config["hidden_size"],
                    vision_config["projector_input_dim"],
                    dtype=torch.bfloat16),
    }
|
||||
|
||||
|
||||
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
                                output_path: Path) -> None:
    """Save weights to safetensors files and create index.

    Tensors are greedily packed (in insertion order) into shards of at most
    5GB, written either as a single ``model.safetensors`` or as
    ``model-XXXXX-of-YYYYY.safetensors`` files, and a
    ``model.safetensors.index.json`` mapping each tensor to its file is
    emitted so vLLM/HF loaders can locate tensors across shards.
    """
    # Determine how to shard the weights
    max_shard_size = 5 * 1024 * 1024 * 1024  # 5GB per shard

    # Calculate sizes and greedily pack tensors into shards.
    shards = []
    current_shard: dict[str, torch.Tensor] = {}
    current_size = 0

    for name, tensor in weights.items():
        tensor_size = tensor.numel() * tensor.element_size()

        # Start a new shard when this tensor would overflow the current one
        # (but never emit an empty shard).
        if current_size + tensor_size > max_shard_size and current_shard:
            shards.append(current_shard)
            current_shard = {}
            current_size = 0

        current_shard[name] = tensor
        current_size += tensor_size

    if current_shard:
        shards.append(current_shard)

    # Save shards and record which file holds each tensor.
    weight_map = {}

    if len(shards) == 1:
        # Single file
        filename = "model.safetensors"
        save_file(shards[0], output_path / filename)
        weight_map = {name: filename for name in shards[0]}
        # Fix: this message previously contained no path placeholder.
        print(f"Saved weights to single file: {output_path / filename}")
    else:
        # Multiple shards
        for i, shard in enumerate(shards):
            filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
            save_file(shard, output_path / filename)
            for name in shard:
                weight_map[name] = filename
            # Fix: this message previously contained no path placeholder.
            print(f"Saved shard {i+1}/{len(shards)}: {output_path / filename}")

    # Create index file
    index_data = {
        "metadata": {
            "total_size":
            sum(tensor.numel() * tensor.element_size()
                for tensor in weights.values())
        },
        "weight_map": weight_map,
    }

    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f, indent=2)

    print(f"Created index file: {index_path}")
    print(f"Total model size: "
          f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
|
||||
|
||||
|
||||
def run_reduced_model(model_path: str,
                      should_profile: bool = False,
                      **kwargs) -> None:
    """Test the created reduced model with vLLM.

    Extra keyword arguments are forwarded verbatim to the ``LLM``
    constructor (e.g. tensor_parallel_size, enforce_eager).
    """
    print(f"\nTesting reduced model at {model_path}...")

    engine = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        **kwargs,
    )

    params = SamplingParams(temperature=0.8,
                            top_p=0.95,
                            max_tokens=50)

    # Wrap generation in the torch profiler when requested.
    if should_profile:
        engine.start_profile()
    results = engine.generate(PROMPTS, params)
    if should_profile:
        engine.stop_profile()

    print("Test generation successful!")
    for result in results:
        print(f"Prompt: {result.prompt}")
        print(f"Output: "
              f"{result.outputs[0].text}")
        print("-" * 40)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "original_model_name,text_layers,num_experts,vision_layers,",
    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
    original_model_name: str,
    text_layers: int,
    num_experts: int,
    vision_layers: int,
    enforce_eager: bool,
    tp: int,
    ep: bool,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
) -> None:
    """End-to-end: build a reduced Maverick checkpoint, then serve it."""
    reduced_path = create_reduced_maverick_model(
        original_model_name=original_model_name,
        output_dir=output_dir,
        text_layers=text_layers,
        num_experts=num_experts,
        vision_layers=vision_layers,
        force_recreate=force_recreate,
    )
    print(f"\nReduced model created successfully at: {reduced_path}")

    run_reduced_model(model_path=reduced_path,
                      should_profile=profile,
                      enforce_eager=enforce_eager,
                      tensor_parallel_size=tp,
                      enable_expert_parallel=ep)
|
||||
|
||||
|
||||
def main():
    """Main function to create and test the reduced model."""
    import argparse

    cli = argparse.ArgumentParser(
        description="Create a reduced-layer Maverick model")
    cli.add_argument("--output-dir",
                     default="/tmp/reduced_maverick",
                     help="Output directory for the reduced model")
    cli.add_argument("--text-layers",
                     type=int,
                     default=4,
                     help="Number of text transformer layers")
    cli.add_argument("--num-experts",
                     type=int,
                     default=4,
                     help="Number of experts")
    cli.add_argument("--vision-layers",
                     type=int,
                     default=2,
                     help="Number of vision transformer layers")
    cli.add_argument("--force-recreate",
                     action="store_true",
                     help="Force recreation if output directory exists")
    cli.add_argument("--test",
                     action="store_true",
                     help="Test the created model with vLLM")
    cli.add_argument("--profile",
                     action="store_true",
                     help="Profile the created model with vLLM")
    cli.add_argument("--test-original",
                     action="store_true",
                     help="Test the original model with vLLM")
    cli.add_argument(
        "--original-model",
        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        help="Original model name to base the reduction on")

    args = cli.parse_args()

    # Build + serve the reduced checkpoint (tp/ep/eager fixed for CLI use).
    if args.test:
        test_dummy_maverick(original_model_name=args.original_model,
                            output_dir=args.output_dir,
                            text_layers=args.text_layers,
                            num_experts=args.num_experts,
                            vision_layers=args.vision_layers,
                            force_recreate=args.force_recreate,
                            tp=2,
                            ep=True,
                            enforce_eager=True,
                            profile=args.profile)

    # Optionally smoke-test the full original model as well.
    if args.test_original:
        run_maverick_serving(args.original_model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # exit() propagates main()'s return value (None -> exit status 0).
    exit(main())
|
||||
@ -180,8 +180,7 @@ def test_chat(
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
for msg in MSGS:
|
||||
output = vllm_model.model.chat(msg,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
outputs.extend(output)
|
||||
|
||||
@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.model.generate(prompt)
|
||||
outputs = vllm_model.llm.generate(prompt)
|
||||
|
||||
assert len(outputs) == 1, f"{len(outputs)=}"
|
||||
output: RequestOutput = outputs[0]
|
||||
|
||||
@ -106,7 +106,7 @@ def run_test(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.model
|
||||
llm = vllm_model.llm
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
|
||||
@ -85,7 +85,7 @@ def run_test(
|
||||
enforce_eager=enforce_eager,
|
||||
task=task,
|
||||
**vllm_runner_kwargs_) as vllm_model:
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
|
||||
vllm_kwargs: dict[str, Any] = {}
|
||||
if get_stop_token_ids is not None:
|
||||
|
||||
@ -96,7 +96,7 @@ def _run_test(
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
max_model_len=8192) as vllm_model:
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
texts = [
|
||||
# this is necessary because vllm_model.embed will not apply any
|
||||
# templating to the prompt, and therefore lacks an image_pad
|
||||
|
||||
@ -56,7 +56,7 @@ def vllm_reranker(
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.model.score(query, documents)
|
||||
outputs = vllm_model.llm.score(query, documents)
|
||||
|
||||
return [output.outputs.score for output in outputs]
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = {
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
def test_models(example_prompts, model_name) -> None:
|
||||
model = LLM(
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
trust_remote_code=True,
|
||||
@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None:
|
||||
# Note: these need to be run 1 at a time due to numerical precision,
|
||||
# since the expected strs were generated this way.
|
||||
for prompt in formatted_prompts:
|
||||
outputs = model.generate(prompt, params)
|
||||
outputs = llm.generate(prompt, params)
|
||||
generations.append(outputs[0].outputs[0].text)
|
||||
del model
|
||||
del llm
|
||||
|
||||
print(model_name, generations)
|
||||
expected_strs = EXPECTED_STRS_MAP[model_name]
|
||||
|
||||
@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
|
||||
reason="modelopt_fp4 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
def test_models(example_prompts, model_name) -> None:
|
||||
model = LLM(
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
trust_remote_code=True,
|
||||
@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
|
||||
# Note: these need to be run 1 at a time due to numerical precision,
|
||||
# since the expected strs were generated this way.
|
||||
for prompt in formatted_prompts:
|
||||
outputs = model.generate(prompt, params)
|
||||
outputs = llm.generate(prompt, params)
|
||||
generations.append(outputs[0].outputs[0].text)
|
||||
del model
|
||||
del llm
|
||||
|
||||
print(model_name, generations)
|
||||
expected_strs = EXPECTED_STRS_MAP[model_name]
|
||||
|
||||
@ -144,7 +144,7 @@ def test_quantization(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
def test_classify(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
|
||||
@ -8,7 +8,7 @@ import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.pooler import Pooler, PoolingType
|
||||
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
|
||||
from vllm.model_executor.models.gemma2 import Gemma2Model
|
||||
from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
|
||||
from vllm.sequence import IntermediateTensors
|
||||
@ -26,12 +26,13 @@ class MyGemma2Embedding(nn.Module):
|
||||
self.model = Gemma2Model(vllm_config=vllm_config,
|
||||
prefix=maybe_prefix(prefix, "model"))
|
||||
|
||||
self.pooler = Pooler.from_config_with_defaults(
|
||||
vllm_config.model_config.pooler_config,
|
||||
pooling_type=PoolingType.LAST,
|
||||
normalize=True,
|
||||
softmax=False,
|
||||
)
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler({
|
||||
"encode": Pooler.for_encode(pooler_config),
|
||||
"embed": Pooler.for_embed(pooler_config),
|
||||
})
|
||||
|
||||
self.make_empty_intermediate_tensors = (
|
||||
self.model.make_empty_intermediate_tensors)
|
||||
|
||||
@ -25,25 +25,25 @@ MODEL_LEN_LEN = [
|
||||
@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
|
||||
def test_disable_sliding_window(model_len_len, ):
|
||||
model, sliding_len, full_len = model_len_len
|
||||
vllm_disabled_model = LLM(model, disable_sliding_window=True)
|
||||
vllm_disabled_model.generate("Hi my name is")
|
||||
model_config = vllm_disabled_model.llm_engine.model_config
|
||||
disabled_llm = LLM(model, disable_sliding_window=True)
|
||||
disabled_llm.generate("Hi my name is")
|
||||
model_config = disabled_llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == sliding_len, (
|
||||
"Max len expected to equal sliding_len of %s, but got %s", sliding_len,
|
||||
model_config.max_model_len)
|
||||
|
||||
del vllm_disabled_model
|
||||
del disabled_llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
vllm_enabled_model = LLM(model,
|
||||
enforce_eager=True,
|
||||
disable_sliding_window=False,
|
||||
enable_prefix_caching=False)
|
||||
vllm_enabled_model.generate("Hi my name is")
|
||||
model_config = vllm_enabled_model.llm_engine.model_config
|
||||
enabled_llm = LLM(model,
|
||||
enforce_eager=True,
|
||||
disable_sliding_window=False,
|
||||
enable_prefix_caching=False)
|
||||
enabled_llm.generate("Hi my name is")
|
||||
model_config = enabled_llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == full_len, (
|
||||
"Max len expected to equal full_len of %s, but got %s", full_len,
|
||||
model_config.max_model_len)
|
||||
|
||||
del vllm_enabled_model
|
||||
del enabled_llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
@ -93,8 +93,8 @@ def test_mixed_requests(
|
||||
# Run all the promopts
|
||||
greedy_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=max_tokens)
|
||||
req_outputs = vllm_model.model.generate(example_prompts,
|
||||
greedy_params)
|
||||
req_outputs = vllm_model.llm.generate(example_prompts,
|
||||
greedy_params)
|
||||
|
||||
# Verify number of cached tokens
|
||||
for i in range(len(req_outputs)):
|
||||
@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model):
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_batched_tokens,
|
||||
)
|
||||
engine: LLMEngine = runner.model.llm_engine
|
||||
engine: LLMEngine = runner.llm.llm_engine
|
||||
|
||||
scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore
|
||||
engine.scheduler[0] = scheduler
|
||||
|
||||
@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
|
||||
linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
|
||||
GPTQLinearMethod)
|
||||
|
||||
for name, submodule in (vllm_model.model.llm_engine.model_executor.
|
||||
for name, submodule in (vllm_model.llm.llm_engine.model_executor.
|
||||
driver_worker.model_runner.model.named_modules()):
|
||||
if name == "lm_head":
|
||||
assert isinstance(submodule.quant_method, linear_method_cls)
|
||||
|
||||
91
tests/quantization/test_modelopt.py
Normal file
91
tests/quantization/test_modelopt.py
Normal file
@ -0,0 +1,91 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test ModelOpt quantization method setup and weight loading.
|
||||
|
||||
Run `pytest tests/quantization/test_modelopt.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
This module relies on V0 internals, so set VLLM_USE_V1=0.
|
||||
"""
|
||||
if not current_platform.is_cpu():
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("modelopt"),
|
||||
reason="ModelOpt FP8 is not supported on this GPU type.")
|
||||
def test_modelopt_fp8_checkpoint_setup(vllm_runner):
|
||||
"""Test ModelOpt FP8 checkpoint loading and structure validation."""
|
||||
# TODO: provide a small publically available test checkpoint
|
||||
model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
|
||||
"TinyLlama-1.1B-Chat-v1.0-fp8-0710")
|
||||
|
||||
# Skip test if checkpoint doesn't exist
|
||||
if not os.path.exists(model_path):
|
||||
pytest.skip(f"Test checkpoint not found at {model_path}. "
|
||||
"This test requires a local ModelOpt FP8 checkpoint.")
|
||||
|
||||
with vllm_runner(model_path, quantization="modelopt",
|
||||
enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
o_proj = layer.self_attn.o_proj
|
||||
gate_up_proj = layer.mlp.gate_up_proj
|
||||
down_proj = layer.mlp.down_proj
|
||||
|
||||
# Check that ModelOpt quantization method is properly applied
|
||||
from vllm.model_executor.layers.quantization.modelopt import (
|
||||
ModelOptFp8LinearMethod)
|
||||
assert isinstance(qkv_proj.quant_method, ModelOptFp8LinearMethod)
|
||||
assert isinstance(o_proj.quant_method, ModelOptFp8LinearMethod)
|
||||
assert isinstance(gate_up_proj.quant_method,
|
||||
ModelOptFp8LinearMethod)
|
||||
assert isinstance(down_proj.quant_method, ModelOptFp8LinearMethod)
|
||||
|
||||
# Check weight dtype is FP8
|
||||
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert o_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert down_proj.weight.dtype == torch.float8_e4m3fn
|
||||
|
||||
# Check scales are present and have correct dtype
|
||||
assert hasattr(qkv_proj, 'weight_scale')
|
||||
assert hasattr(qkv_proj, 'input_scale')
|
||||
assert qkv_proj.weight_scale.dtype == torch.float32
|
||||
assert qkv_proj.input_scale.dtype == torch.float32
|
||||
|
||||
assert hasattr(o_proj, 'weight_scale')
|
||||
assert hasattr(o_proj, 'input_scale')
|
||||
assert o_proj.weight_scale.dtype == torch.float32
|
||||
assert o_proj.input_scale.dtype == torch.float32
|
||||
|
||||
assert hasattr(gate_up_proj, 'weight_scale')
|
||||
assert hasattr(gate_up_proj, 'input_scale')
|
||||
assert gate_up_proj.weight_scale.dtype == torch.float32
|
||||
assert gate_up_proj.input_scale.dtype == torch.float32
|
||||
|
||||
assert hasattr(down_proj, 'weight_scale')
|
||||
assert hasattr(down_proj, 'input_scale')
|
||||
assert down_proj.weight_scale.dtype == torch.float32
|
||||
assert down_proj.input_scale.dtype == torch.float32
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
# Run a simple generation test to ensure the model works
|
||||
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
|
||||
assert output
|
||||
print(f"ModelOpt FP8 output: {output}")
|
||||
@ -107,11 +107,11 @@ def test_quark_fp8_parity(vllm_runner):
|
||||
}
|
||||
with (vllm_runner(quark_model_id, **llm_kwargs) as
|
||||
quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
|
||||
quark_model = (quark_handle.model.llm_engine.model_executor.
|
||||
quark_model = (quark_handle.llm.llm_engine.model_executor.
|
||||
driver_worker.model_runner.model)
|
||||
quark_state_dict = quark_model.state_dict()
|
||||
|
||||
fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker.
|
||||
fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
fp8_state_dict = fp8_model.state_dict()
|
||||
|
||||
|
||||
@ -111,7 +111,7 @@ def test_custom_quant(vllm_runner, model, monkeypatch):
|
||||
quantization="custom_quant",
|
||||
enforce_eager=True) as llm:
|
||||
|
||||
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
|
||||
model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
|
||||
layer = model.model.layers[0]
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
|
||||
|
||||
@ -36,7 +36,7 @@ def test_ignore_eos(
|
||||
ignore_eos=True)
|
||||
|
||||
for prompt in example_prompts:
|
||||
ignore_eos_output = vllm_model.model.generate(
|
||||
ignore_eos_output = vllm_model.llm.generate(
|
||||
prompt, sampling_params=sampling_params)
|
||||
output_length = len(ignore_eos_output[0].outputs[0].token_ids)
|
||||
assert output_length == max_tokens
|
||||
|
||||
@ -26,7 +26,7 @@ def test_logits_processor_force_generate(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
repeat_times = 2
|
||||
enforced_answers = " vLLM"
|
||||
vllm_token_ids = tokenizer.encode(enforced_answers,
|
||||
@ -45,13 +45,13 @@ def test_logits_processor_force_generate(
|
||||
)
|
||||
|
||||
# test logits_processors when prompt_logprobs is not None
|
||||
vllm_model.model._add_request(
|
||||
vllm_model.llm._add_request(
|
||||
example_prompts[0],
|
||||
params=params_with_logprobs,
|
||||
)
|
||||
|
||||
# test prompt_logprobs is not None
|
||||
vllm_model.model._add_request(
|
||||
vllm_model.llm._add_request(
|
||||
example_prompts[1],
|
||||
params=SamplingParams(
|
||||
prompt_logprobs=3,
|
||||
@ -60,11 +60,11 @@ def test_logits_processor_force_generate(
|
||||
)
|
||||
|
||||
# test grouped requests
|
||||
vllm_model.model._add_request(
|
||||
vllm_model.llm._add_request(
|
||||
example_prompts[2],
|
||||
params=SamplingParams(max_tokens=max_tokens),
|
||||
)
|
||||
|
||||
outputs = vllm_model.model._run_engine(use_tqdm=False)
|
||||
outputs = vllm_model.llm._run_engine(use_tqdm=False)
|
||||
|
||||
assert outputs[0].outputs[0].text == enforced_answers * repeat_times
|
||||
|
||||
@ -64,7 +64,7 @@ def test_get_prompt_logprobs(
|
||||
prompt_logprobs=num_top_logprobs,
|
||||
temperature=0.0,
|
||||
detokenize=detokenize)
|
||||
vllm_results = vllm_model.model.generate(
|
||||
vllm_results = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=vllm_sampling_params)
|
||||
|
||||
# Test whether logprobs are included in the results.
|
||||
@ -174,7 +174,7 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
|
||||
logprobs=None,
|
||||
temperature=0.0,
|
||||
detokenize=detokenize)
|
||||
results_logprobs_none = vllm_model.model.generate(
|
||||
results_logprobs_none = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=sampling_params_logprobs_none)
|
||||
|
||||
for i in range(len(results_logprobs_none)):
|
||||
|
||||
@ -20,7 +20,7 @@ def v1(run_with_both_engines):
|
||||
|
||||
|
||||
def _generate(
|
||||
model: LLM,
|
||||
llm: LLM,
|
||||
prompt: str,
|
||||
num_prompt_tokens: int,
|
||||
temperature: float = 0,
|
||||
@ -32,7 +32,7 @@ def _generate(
|
||||
)
|
||||
|
||||
# [([output_token_ids, ], [output_text, ]), ]
|
||||
output = model.generate([prompt], sampling_params=sampling_params)
|
||||
output = llm.generate([prompt], sampling_params=sampling_params)
|
||||
|
||||
output_token_ids = output[0][0][0][num_prompt_tokens:]
|
||||
# [0] first (and only) request output
|
||||
@ -66,10 +66,10 @@ class TestOneTokenBadWord:
|
||||
assert self.target_token_id not in output_token_ids
|
||||
|
||||
def _generate(self,
|
||||
model: LLM,
|
||||
llm: LLM,
|
||||
bad_words: Optional[list[str]] = None) -> list[int]:
|
||||
return _generate(
|
||||
model=model,
|
||||
llm=llm,
|
||||
prompt=self.PROMPT,
|
||||
num_prompt_tokens=self.num_prompt_tokens,
|
||||
bad_words=bad_words,
|
||||
@ -156,10 +156,10 @@ class TestTwoTokenBadWord:
|
||||
or (self.neighbour_token_id2 in output_token_ids))
|
||||
|
||||
def _generate(self,
|
||||
model: LLM,
|
||||
llm: LLM,
|
||||
bad_words: Optional[list[str]] = None) -> list[int]:
|
||||
return _generate(
|
||||
model=model,
|
||||
llm=llm,
|
||||
prompt=self.PROMPT,
|
||||
num_prompt_tokens=self.num_prompt_tokens,
|
||||
bad_words=bad_words,
|
||||
|
||||
@ -49,7 +49,7 @@ def test_random_sample_with_seed(
|
||||
sampling_params_seed_2 = copy.deepcopy(sampling_params)
|
||||
sampling_params_seed_2.seed = 200
|
||||
|
||||
llm = vllm_model.model
|
||||
llm = vllm_model.llm
|
||||
|
||||
for prompt in example_prompts:
|
||||
for params in (
|
||||
|
||||
@ -23,9 +23,9 @@ from vllm.transformers_utils.detokenizer_utils import (
|
||||
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
|
||||
MemorySnapshot, PlaceholderModule, StoreBoolean,
|
||||
bind_kv_cache, common_broadcastable_dtype,
|
||||
deprecate_kwargs, get_open_port, get_tcp_uri,
|
||||
is_lossless_cast, join_host_port, make_zmq_path,
|
||||
make_zmq_socket, memory_profiling,
|
||||
current_stream, deprecate_kwargs, get_open_port,
|
||||
get_tcp_uri, is_lossless_cast, join_host_port,
|
||||
make_zmq_path, make_zmq_socket, memory_profiling,
|
||||
merge_async_iterators, sha256, split_host_port,
|
||||
split_zmq_path, supports_kw, swap_dict_values)
|
||||
|
||||
@ -957,3 +957,41 @@ def test_convert_ids_list_to_tokens():
|
||||
]
|
||||
tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
|
||||
assert tokens == ['Hello', ',', ' world', '!']
|
||||
|
||||
|
||||
def test_current_stream_multithread():
|
||||
import threading
|
||||
if not torch.cuda.is_available():
|
||||
pytest.skip("CUDA not available")
|
||||
|
||||
main_default_stream = torch.cuda.current_stream()
|
||||
child_stream = torch.cuda.Stream()
|
||||
|
||||
thread_stream_ready = threading.Event()
|
||||
thread_can_exit = threading.Event()
|
||||
|
||||
def child_thread_func():
|
||||
with torch.cuda.stream(child_stream):
|
||||
thread_stream_ready.set()
|
||||
thread_can_exit.wait(timeout=10)
|
||||
|
||||
child_thread = threading.Thread(target=child_thread_func)
|
||||
child_thread.start()
|
||||
|
||||
try:
|
||||
assert thread_stream_ready.wait(
|
||||
timeout=5), "Child thread failed to enter stream context in time"
|
||||
|
||||
main_current_stream = current_stream()
|
||||
|
||||
assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
|
||||
assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
|
||||
|
||||
# Notify child thread it can exit
|
||||
thread_can_exit.set()
|
||||
|
||||
finally:
|
||||
# Ensure child thread exits properly
|
||||
child_thread.join(timeout=5)
|
||||
if child_thread.is_alive():
|
||||
pytest.fail("Child thread failed to exit properly")
|
||||
|
||||
@ -393,7 +393,7 @@ def test_decode_prompt_logprobs_chunked_prefill(
|
||||
logprobs=5,
|
||||
prompt_logprobs=5,
|
||||
temperature=0.0)
|
||||
vllm_results = vllm_model.model.generate(
|
||||
vllm_results = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=vllm_sampling_params)
|
||||
|
||||
for idx, result in enumerate(vllm_results):
|
||||
|
||||
@ -14,7 +14,7 @@ PROMPT = "Hello my name is Robert and I"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def model() -> LLM:
|
||||
def llm() -> LLM:
|
||||
return LLM(MODEL,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
@ -24,16 +24,16 @@ def model() -> LLM:
|
||||
block_size=16)
|
||||
|
||||
|
||||
def test_concurrent_partial_prefill(model):
|
||||
outputs = model.generate([PROMPT] * 3)
|
||||
def test_concurrent_partial_prefill(llm):
|
||||
outputs = llm.generate([PROMPT] * 3)
|
||||
assert len(outputs) == 3
|
||||
for output in outputs:
|
||||
assert len(output.outputs) == 1
|
||||
|
||||
|
||||
def test_prefix_cache_stats_is_recorded(model):
|
||||
def test_prefix_cache_stats_is_recorded(llm):
|
||||
# 17 tokens will make sure first 16 tokens are cached in a block
|
||||
input_tokens = {"prompt_token_ids": [101] * 17}
|
||||
_ = model.generate([input_tokens])
|
||||
outputs = model.generate([input_tokens])
|
||||
_ = llm.generate([input_tokens])
|
||||
outputs = llm.generate([input_tokens])
|
||||
assert outputs[0].num_cached_tokens == 16
|
||||
|
||||
@ -336,9 +336,10 @@ async def test_customize_loggers(monkeypatch):
|
||||
|
||||
await engine.do_log_stats()
|
||||
|
||||
assert len(engine.stat_loggers) == 1
|
||||
assert len(engine.stat_loggers[0]) == 1
|
||||
engine.stat_loggers[0][0].log.assert_called_once()
|
||||
stat_loggers = engine.logger_manager.per_engine_logger_dict
|
||||
assert len(stat_loggers) == 1
|
||||
assert len(stat_loggers[0]) == 1
|
||||
stat_loggers[0][0].log.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
|
||||
@ -112,9 +112,9 @@ def test_compatibility_with_skip_tokenizer_init(
|
||||
example_prompts,
|
||||
structured_outputs=True,
|
||||
)
|
||||
model: LLM = vllm_model_skip_tokenizer_init.model
|
||||
llm: LLM = vllm_model_skip_tokenizer_init.llm
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(example_prompts, sampling_params_list)
|
||||
_ = llm.generate(example_prompts, sampling_params_list)
|
||||
|
||||
|
||||
def test_parallel_sampling(vllm_model, example_prompts) -> None:
|
||||
@ -125,8 +125,8 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
|
||||
example_prompt: test fixture providing prompts for testing.
|
||||
"""
|
||||
sampling_params_list, n_list = _get_test_sampling_params(example_prompts)
|
||||
model: LLM = vllm_model.model
|
||||
outputs = model.generate(example_prompts, sampling_params_list)
|
||||
llm: LLM = vllm_model.llm
|
||||
outputs = llm.generate(example_prompts, sampling_params_list)
|
||||
|
||||
# Validate each request response
|
||||
for out, n in zip(outputs, n_list):
|
||||
@ -166,10 +166,10 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
||||
speculative_config=speculative_config,
|
||||
disable_log_stats=False,
|
||||
) as vllm_model:
|
||||
model: LLM = vllm_model.model
|
||||
llm: LLM = vllm_model.llm
|
||||
sampling_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=max_tokens)
|
||||
outputs = model.generate(example_prompts, sampling_params)
|
||||
outputs = llm.generate(example_prompts, sampling_params)
|
||||
|
||||
n_prompts = len(example_prompts)
|
||||
assert len(outputs) == n_prompts
|
||||
@ -180,7 +180,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
||||
total_tokens += len(out.outputs[0].token_ids)
|
||||
assert total_tokens == max_tokens * n_prompts
|
||||
|
||||
metrics = model.get_metrics()
|
||||
metrics = llm.get_metrics()
|
||||
|
||||
def find_metric(name) -> list[Metric]:
|
||||
found = []
|
||||
|
||||
@ -112,7 +112,7 @@ def _run_and_validate(
|
||||
max_tokens: int,
|
||||
do_apc: bool,
|
||||
) -> None:
|
||||
vllm_results = vllm_model.model.generate(
|
||||
vllm_results = vllm_model.llm.generate(
|
||||
test_prompts, sampling_params=vllm_sampling_params)
|
||||
|
||||
for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
|
||||
@ -288,7 +288,7 @@ def test_get_logprobs_and_prompt_logprobs(
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
|
||||
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
|
||||
if do_apc and (temperature < 2.0
|
||||
or batch_logprobs_composition != SAMPLE_PROMPT):
|
||||
# Skip some test-cases to save time.
|
||||
@ -378,7 +378,7 @@ def test_none_logprobs(vllm_model, example_prompts,
|
||||
prompt_logprobs=None,
|
||||
temperature=0.0,
|
||||
)
|
||||
results_logprobs_none = vllm_model.model.generate(
|
||||
results_logprobs_none = vllm_model.llm.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params_logprobs_none,
|
||||
)
|
||||
@ -408,7 +408,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
|
||||
logprobs=0,
|
||||
prompt_logprobs=0,
|
||||
temperature=0.0)
|
||||
results_logprobs_zero = vllm_model.model.generate(
|
||||
results_logprobs_zero = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=sampling_params_logprobs_zero)
|
||||
|
||||
for i in range(len(results_logprobs_zero)):
|
||||
|
||||
@ -14,30 +14,30 @@ PROMPT = "Hello my name is Robert and I"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def model() -> LLM:
|
||||
def llm() -> LLM:
|
||||
# Disable prefix caching so that we can test prompt logprobs.
|
||||
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949
|
||||
# is merged
|
||||
return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
|
||||
|
||||
|
||||
def test_n_gt_1(model):
|
||||
def test_n_gt_1(llm):
|
||||
"""ParallelSampling is supported."""
|
||||
|
||||
params = SamplingParams(n=3)
|
||||
outputs = model.generate(PROMPT, params)
|
||||
outputs = llm.generate(PROMPT, params)
|
||||
assert len(outputs[0].outputs) == 3
|
||||
|
||||
|
||||
def test_best_of(model):
|
||||
def test_best_of(llm):
|
||||
"""Raise a ValueError since best_of is deprecated."""
|
||||
|
||||
params = SamplingParams(n=2, best_of=3)
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(PROMPT, params)
|
||||
_ = llm.generate(PROMPT, params)
|
||||
|
||||
|
||||
def test_penalties(model):
|
||||
def test_penalties(llm):
|
||||
"""Check that we do not get errors if applied."""
|
||||
|
||||
params = SamplingParams(
|
||||
@ -49,18 +49,18 @@ def test_penalties(model):
|
||||
top_p=0.5,
|
||||
top_k=3,
|
||||
)
|
||||
_ = model.generate(PROMPT, params)
|
||||
_ = llm.generate(PROMPT, params)
|
||||
|
||||
|
||||
def test_stop(model):
|
||||
def test_stop(llm):
|
||||
"""Check that we respect the stop words."""
|
||||
|
||||
output = model.generate(PROMPT, SamplingParams(temperature=0))
|
||||
output = llm.generate(PROMPT, SamplingParams(temperature=0))
|
||||
split_text = output[0].outputs[0].text.split()
|
||||
|
||||
STOP_IDX = 5
|
||||
params = SamplingParams(temperature=0, stop=split_text[STOP_IDX])
|
||||
output = model.generate(PROMPT, params)
|
||||
output = llm.generate(PROMPT, params)
|
||||
new_split_text = output[0].outputs[0].text.split()
|
||||
|
||||
# Output should not contain the stop word.
|
||||
@ -69,40 +69,40 @@ def test_stop(model):
|
||||
params = SamplingParams(temperature=0,
|
||||
stop=split_text[STOP_IDX],
|
||||
include_stop_str_in_output=True)
|
||||
output = model.generate(PROMPT, params)
|
||||
output = llm.generate(PROMPT, params)
|
||||
new_split_text = output[0].outputs[0].text.split()
|
||||
|
||||
# Output should contain the stop word.
|
||||
assert len(new_split_text) == STOP_IDX + 1
|
||||
|
||||
|
||||
def test_stop_token_ids(model):
|
||||
def test_stop_token_ids(llm):
|
||||
"""Check that we respect the stop token ids."""
|
||||
|
||||
output = model.generate(PROMPT, SamplingParams(temperature=0))
|
||||
output = llm.generate(PROMPT, SamplingParams(temperature=0))
|
||||
|
||||
stop_token_id_0 = output[0].outputs[0].token_ids[5]
|
||||
stop_token_id_1 = output[0].outputs[0].token_ids[6]
|
||||
|
||||
stop_token_ids = [stop_token_id_1, stop_token_id_0]
|
||||
params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
|
||||
output = model.generate(PROMPT, params)
|
||||
output = llm.generate(PROMPT, params)
|
||||
assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
|
||||
|
||||
stop_token_ids = [stop_token_id_0, stop_token_id_1]
|
||||
params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
|
||||
output = model.generate(PROMPT, params)
|
||||
output = llm.generate(PROMPT, params)
|
||||
assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
|
||||
|
||||
|
||||
def test_detokenize_false(model):
|
||||
def test_detokenize_false(llm):
|
||||
"""Check that detokenize=False option works."""
|
||||
|
||||
output = model.generate(PROMPT, SamplingParams(detokenize=False))
|
||||
output = llm.generate(PROMPT, SamplingParams(detokenize=False))
|
||||
assert len(output[0].outputs[0].token_ids) > 0
|
||||
assert len(output[0].outputs[0].text) == 0
|
||||
|
||||
output = model.generate(
|
||||
output = llm.generate(
|
||||
PROMPT, SamplingParams(detokenize=False, logprobs=3,
|
||||
prompt_logprobs=3))
|
||||
assert len(output[0].outputs[0].token_ids) > 0
|
||||
@ -118,28 +118,28 @@ def test_detokenize_false(model):
|
||||
assert all(lp.decoded_token is None for lp in logprobs.values())
|
||||
|
||||
|
||||
def test_bad_words(model):
|
||||
def test_bad_words(llm):
|
||||
"""Check that we respect bad words."""
|
||||
|
||||
output = model.generate(PROMPT, SamplingParams(temperature=0))
|
||||
output = llm.generate(PROMPT, SamplingParams(temperature=0))
|
||||
split_text = output[0].outputs[0].text.split()
|
||||
|
||||
bad_words_1 = " ".join(split_text[:2])
|
||||
params = SamplingParams(temperature=0, bad_words=[bad_words_1])
|
||||
output = model.generate(PROMPT, params)
|
||||
output = llm.generate(PROMPT, params)
|
||||
new_text = output[0].outputs[0].text
|
||||
assert bad_words_1 not in new_text
|
||||
|
||||
bad_words_2 = new_text.split()[-1]
|
||||
params = SamplingParams(temperature=0,
|
||||
bad_words=[bad_words_1, bad_words_2])
|
||||
output = model.generate(PROMPT, params)
|
||||
output = llm.generate(PROMPT, params)
|
||||
new_text = output[0].outputs[0].text
|
||||
assert bad_words_1 not in new_text
|
||||
assert bad_words_2 not in new_text
|
||||
|
||||
|
||||
def test_logits_processor(model):
|
||||
def test_logits_processor(llm):
|
||||
"""Check that we reject logits processor."""
|
||||
|
||||
# This sample logits processor gives infinite score to the i-th token,
|
||||
@ -150,47 +150,45 @@ def test_logits_processor(model):
|
||||
return logits
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(PROMPT,
|
||||
SamplingParams(logits_processors=[pick_ith]))
|
||||
_ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))
|
||||
|
||||
|
||||
def test_allowed_token_ids(model):
|
||||
def test_allowed_token_ids(llm):
|
||||
"""Check that we can use allowed_token_ids."""
|
||||
|
||||
TOKEN_ID = 10
|
||||
allowed_token_ids = [TOKEN_ID]
|
||||
output = model.generate(
|
||||
PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids))
|
||||
output = llm.generate(PROMPT,
|
||||
SamplingParams(allowed_token_ids=allowed_token_ids))
|
||||
assert output[0].outputs[0].token_ids[-1] == TOKEN_ID
|
||||
|
||||
# Reject empty allowed_token_ids.
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
|
||||
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
|
||||
|
||||
# Reject negative token id.
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
|
||||
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
|
||||
|
||||
# Reject out of vocabulary.
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(PROMPT,
|
||||
SamplingParams(allowed_token_ids=[10000000]))
|
||||
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
|
||||
|
||||
|
||||
def test_priority(model):
|
||||
def test_priority(llm):
|
||||
"""Check that we reject requests with priority."""
|
||||
|
||||
# Reject all allowed token ids
|
||||
with pytest.raises(ValueError):
|
||||
_ = model.generate(PROMPT, priority=[1])
|
||||
_ = llm.generate(PROMPT, priority=[1])
|
||||
|
||||
|
||||
def test_seed(model):
|
||||
def test_seed(llm):
|
||||
"""Check that seed impacts randomness."""
|
||||
|
||||
out_1 = model.generate(PROMPT, SamplingParams(seed=42))
|
||||
out_2 = model.generate(PROMPT, SamplingParams(seed=42))
|
||||
out_3 = model.generate(PROMPT, SamplingParams(seed=43))
|
||||
out_1 = llm.generate(PROMPT, SamplingParams(seed=42))
|
||||
out_2 = llm.generate(PROMPT, SamplingParams(seed=42))
|
||||
out_3 = llm.generate(PROMPT, SamplingParams(seed=43))
|
||||
|
||||
assert out_1[0].outputs[0].text == out_2[0].outputs[0].text
|
||||
assert out_1[0].outputs[0].text != out_3[0].outputs[0].text
|
||||
|
||||
@ -90,8 +90,10 @@ async def test_load(output_kind: RequestOutputKind,
|
||||
def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
|
||||
stats_loggers[engine_index] = self
|
||||
|
||||
def record(self, scheduler_stats: Optional[SchedulerStats],
|
||||
iteration_stats: Optional[IterationStats]):
|
||||
def record(self,
|
||||
scheduler_stats: Optional[SchedulerStats],
|
||||
iteration_stats: Optional[IterationStats],
|
||||
engine_idx: int = 0):
|
||||
if iteration_stats:
|
||||
self.finished_req_count += len(
|
||||
iteration_stats.finished_requests)
|
||||
|
||||
@ -106,9 +106,9 @@ def test_v1_llm_by_default(monkeypatch):
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
# Should default to V1 for supported config.
|
||||
model = LLM(MODEL, enforce_eager=True, enable_lora=True)
|
||||
print(model.generate("Hello my name is"))
|
||||
assert hasattr(model.llm_engine, "engine_core")
|
||||
llm = LLM(MODEL, enforce_eager=True, enable_lora=True)
|
||||
print(llm.generate("Hello my name is"))
|
||||
assert hasattr(llm.llm_engine, "engine_core")
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
|
||||
@ -137,6 +137,13 @@ class Attention(nn.Module):
|
||||
self.num_kv_heads = num_kv_heads
|
||||
self.sliding_window = sliding_window
|
||||
|
||||
# For v1 we have backend agnostic iRoPE (local chunked attention)
|
||||
# we have to store the flag on the layer so gpu model runner can
|
||||
# set KVSpec appropriately (and pop it so it doesnt get passed to
|
||||
# the backends)
|
||||
if envs.VLLM_USE_V1:
|
||||
self.use_irope = extra_impl_args.pop("use_irope", False)
|
||||
|
||||
quant_method = quant_config.get_quant_method(
|
||||
self, prefix=prefix) if quant_config else None
|
||||
if quant_method is not None and not isinstance(
|
||||
|
||||
@ -94,7 +94,7 @@ ConfigT = TypeVar("ConfigT", bound=ConfigType)
|
||||
TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
|
||||
"score", "reward", "transcription", "draft"]
|
||||
|
||||
_ResolvedTask = Literal["generate", "transcription", "pooling", "embed",
|
||||
_ResolvedTask = Literal["generate", "transcription", "encode", "embed",
|
||||
"classify", "reward", "draft"]
|
||||
|
||||
RunnerOption = Literal["auto", "generate", "pooling", "draft"]
|
||||
@ -103,7 +103,7 @@ RunnerType = Literal["generate", "pooling", "draft"]
|
||||
|
||||
_RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = {
|
||||
"generate": ["generate", "transcription"],
|
||||
"pooling": ["pooling", "embed", "classify", "reward"],
|
||||
"pooling": ["encode", "embed", "classify", "reward"],
|
||||
"draft": [],
|
||||
}
|
||||
|
||||
@ -346,11 +346,11 @@ class ModelConfig:
|
||||
"""Maximum number of data items per modality per prompt. Only applicable
|
||||
for multimodal models."""
|
||||
interleave_mm_strings: bool = False
|
||||
"""Enable fully interleaved support for multimodal prompts, while using
|
||||
"""Enable fully interleaved support for multimodal prompts, while using
|
||||
--chat-template-content-format=string. Defaults to False."""
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
|
||||
use_async_output_proc: bool = True
|
||||
"""Whether to use async output processor."""
|
||||
@ -579,7 +579,7 @@ class ModelConfig:
|
||||
# user-selected task
|
||||
if runner_type == "pooling" and self.task == "auto":
|
||||
selected_task = all_supported_tasks[runner_type][-1]
|
||||
assert selected_task != "pooling"
|
||||
assert selected_task != "encode"
|
||||
self.task = selected_task
|
||||
self.supported_runner_types = supported_runner_types
|
||||
self.runner_type = runner_type
|
||||
@ -884,7 +884,7 @@ class ModelConfig:
|
||||
|
||||
supported_tasks = list[_ResolvedTask]()
|
||||
if registry.is_pooling_model(architectures):
|
||||
supported_tasks.append("pooling")
|
||||
supported_tasks.append("encode")
|
||||
|
||||
# For now, users must specify the task (other than "pooling")
|
||||
# to use for pooling models
|
||||
@ -1000,9 +1000,13 @@ class ModelConfig:
|
||||
quant_cfg = self._parse_quant_hf_config()
|
||||
|
||||
if quant_cfg is not None:
|
||||
# Use the community standard 'quant_method'
|
||||
quant_method = quant_cfg.get("quant_method", "").lower()
|
||||
|
||||
# Normalize library names
|
||||
quant_method = quant_method.replace("compressed_tensors",
|
||||
"compressed-tensors")
|
||||
|
||||
quant_cfg["quant_method"] = quant_method
|
||||
|
||||
# Quantization methods which are overrides (i.e. they have a
|
||||
@ -1017,6 +1021,8 @@ class ModelConfig:
|
||||
"awq_marlin",
|
||||
"ipex",
|
||||
"moe_wna16",
|
||||
"modelopt",
|
||||
"modelopt_fp4",
|
||||
]
|
||||
quantization_methods = [
|
||||
q for q in supported_quantization if q not in overrides
|
||||
@ -3193,8 +3199,8 @@ class MultiModalConfig:
|
||||
"""
|
||||
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
|
||||
|
||||
mm_processor_kwargs: Optional[dict[str, object]] = None
|
||||
@ -4094,7 +4100,7 @@ class CompilationConfig:
|
||||
- True: inductor compilation is used (custom_ops disabled by default).
|
||||
One graph for symbolic shape and one graph per size in compile_sizes
|
||||
are compiled using configurations in inductor_compile_config.
|
||||
|
||||
|
||||
This setting is ignored if level<PIECEWISE."""
|
||||
compile_sizes: Optional[list[Union[int, str]]] = None
|
||||
"""Sizes to compile for inductor. In addition
|
||||
@ -4393,7 +4399,7 @@ class VllmConfig:
|
||||
|
||||
As a shorthand, `-O<n>` can be used to directly specify the compilation
|
||||
level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
|
||||
Currently, -O <n> and -O=<n> are supported as well but this will likely be
|
||||
Currently, -O <n> and -O=<n> are supported as well but this will likely be
|
||||
removed in favor of clearer -O<n> syntax in the future.
|
||||
|
||||
NOTE: level 0 is the default level without any optimization. level 1 and 2
|
||||
|
||||
@ -2,11 +2,12 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch.distributed import ProcessGroup
|
||||
|
||||
from vllm.distributed.utils import pickle
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms.interface import CpuArchEnum
|
||||
|
||||
@ -26,7 +27,8 @@ class CpuCommunicator(DeviceCommunicatorBase):
|
||||
if (current_platform.get_cpu_architecture()
|
||||
== CpuArchEnum.X86) and hasattr(
|
||||
torch.ops._C,
|
||||
"init_shm_manager") and unique_name.startswith("tp"):
|
||||
"init_shm_manager") and (unique_name.startswith("tp")
|
||||
or unique_name.startswith("pp")):
|
||||
self.dist_module = _CPUSHMDistributed(self)
|
||||
|
||||
def all_reduce(self, input_):
|
||||
@ -94,6 +96,19 @@ class CpuCommunicator(DeviceCommunicatorBase):
|
||||
input_size[dim + 1:])
|
||||
return output_tensor
|
||||
|
||||
def send_tensor_dict(
|
||||
self,
|
||||
tensor_dict: dict[str, Union[torch.Tensor, Any]],
|
||||
dst: int,
|
||||
) -> None:
|
||||
return self.dist_module.send_tensor_dict(tensor_dict, dst)
|
||||
|
||||
def recv_tensor_dict(
|
||||
self,
|
||||
src: int,
|
||||
) -> dict[str, Union[torch.Tensor, Any]]:
|
||||
return self.dist_module.recv_tensor_dict(src)
|
||||
|
||||
|
||||
class _CPUSHMDistributed:
|
||||
|
||||
@ -143,3 +158,44 @@ class _CPUSHMDistributed:
|
||||
input: torch.Tensor,
|
||||
group: Optional[ProcessGroup] = None) -> None:
|
||||
torch.ops._C.shm_all_gather(self.handle, input, output)
|
||||
|
||||
def send_tensor_dict(
|
||||
self,
|
||||
tensor_dict: dict[str, Union[torch.Tensor, Any]],
|
||||
dst: int,
|
||||
) -> None:
|
||||
key_list = list(tensor_dict.keys())
|
||||
value_list = list(tensor_dict.values())
|
||||
size_list = []
|
||||
for v in value_list:
|
||||
if not isinstance(v, torch.Tensor):
|
||||
raise RuntimeError(
|
||||
"CpuCommunicator only supports sending tensors.")
|
||||
size_list.append(v.size())
|
||||
key_size_tensor = torch.frombuffer(pickle.dumps([key_list, size_list]),
|
||||
dtype=torch.uint8)
|
||||
value_list.append(key_size_tensor)
|
||||
|
||||
torch.ops._C.shm_send_tensor_list(self.handle, value_list, dst)
|
||||
|
||||
return None
|
||||
|
||||
def recv_tensor_dict(
|
||||
self,
|
||||
src: int,
|
||||
) -> dict[str, Union[torch.Tensor, Any]]:
|
||||
tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src)
|
||||
|
||||
value_list: list[torch.Tensor] = tensor_list[:-1]
|
||||
key_size_tensor = tensor_list[-1]
|
||||
|
||||
key_size = pickle.loads(key_size_tensor.numpy().tobytes())
|
||||
key_list = key_size[0]
|
||||
size_list = key_size[1]
|
||||
assert len(key_list) == len(size_list)
|
||||
assert len(key_list) == len(value_list)
|
||||
|
||||
tensor_dict: dict[str, torch.Tensor] = {}
|
||||
for key, size, t in zip(key_list, size_list, value_list):
|
||||
tensor_dict[key] = t.view(size)
|
||||
return tensor_dict
|
||||
|
||||
@ -272,6 +272,9 @@ class GroupCoordinator:
|
||||
self.use_custom_op_call = (current_platform.is_cuda_alike()
|
||||
or current_platform.is_tpu())
|
||||
|
||||
self.use_cpu_custom_send_recv = (current_platform.is_cpu() and hasattr(
|
||||
torch.ops._C, "init_shm_manager"))
|
||||
|
||||
@property
|
||||
def first_rank(self):
|
||||
"""Return the global rank of the first process in the group"""
|
||||
@ -663,6 +666,11 @@ class GroupCoordinator:
|
||||
dst = (self.rank_in_group + 1) % self.world_size
|
||||
assert dst < self.world_size, f"Invalid dst rank ({dst})"
|
||||
|
||||
if self.use_cpu_custom_send_recv:
|
||||
self.device_communicator.send_tensor_dict( # type: ignore
|
||||
tensor_dict, dst)
|
||||
return None
|
||||
|
||||
metadata_list: list[tuple[Any, Any]] = []
|
||||
assert isinstance(
|
||||
tensor_dict,
|
||||
@ -718,6 +726,10 @@ class GroupCoordinator:
|
||||
src = (self.rank_in_group - 1) % self.world_size
|
||||
assert src < self.world_size, f"Invalid src rank ({src})"
|
||||
|
||||
if self.use_cpu_custom_send_recv:
|
||||
return self.device_communicator.recv_tensor_dict( # type: ignore
|
||||
src)
|
||||
|
||||
recv_metadata_list = self.recv_object(src=src)
|
||||
tensor_dict: dict[str, Any] = {}
|
||||
for key, value in recv_metadata_list:
|
||||
|
||||
@ -1668,13 +1668,14 @@ class EngineArgs:
|
||||
|
||||
# cpu specific default values.
|
||||
if current_platform.is_cpu():
|
||||
world_size = self.pipeline_parallel_size * self.tensor_parallel_size
|
||||
default_max_num_batched_tokens = {
|
||||
UsageContext.LLM_CLASS: 4096,
|
||||
UsageContext.OPENAI_API_SERVER: 2048,
|
||||
UsageContext.LLM_CLASS: 4096 * world_size,
|
||||
UsageContext.OPENAI_API_SERVER: 2048 * world_size,
|
||||
}
|
||||
default_max_num_seqs = {
|
||||
UsageContext.LLM_CLASS: 128,
|
||||
UsageContext.OPENAI_API_SERVER: 32,
|
||||
UsageContext.LLM_CLASS: 256 * world_size,
|
||||
UsageContext.OPENAI_API_SERVER: 128 * world_size,
|
||||
}
|
||||
|
||||
use_context_value = usage_context.value if usage_context else None
|
||||
|
||||
@ -1668,7 +1668,7 @@ async def init_app_state(
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
) if "pooling" in model_config.supported_tasks else None
|
||||
) if "encode" in model_config.supported_tasks else None
|
||||
state.openai_serving_embedding = OpenAIServingEmbedding(
|
||||
engine_client,
|
||||
model_config,
|
||||
|
||||
@ -42,7 +42,7 @@ if TYPE_CHECKING:
|
||||
VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
|
||||
VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
|
||||
VLLM_PP_LAYER_PARTITION: Optional[str] = None
|
||||
VLLM_CPU_KVCACHE_SPACE: int = 0
|
||||
VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = ""
|
||||
VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
|
||||
VLLM_CPU_MOE_PREPACK: bool = True
|
||||
@ -430,9 +430,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
|
||||
|
||||
# (CPU backend only) CPU key-value cache space.
|
||||
# default is 4 GiB
|
||||
# default is None and will be set as 4 GB
|
||||
"VLLM_CPU_KVCACHE_SPACE":
|
||||
lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
|
||||
lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0"))
|
||||
if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None,
|
||||
|
||||
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
|
||||
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
|
||||
|
||||
@ -1,15 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Mapping, Set
|
||||
from dataclasses import dataclass
|
||||
from enum import IntEnum
|
||||
from itertools import groupby
|
||||
from typing import Callable, Optional, TypeVar, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import PretrainedConfig
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import ModelConfig, PoolerConfig
|
||||
from vllm.model_executor.pooling_metadata import ( # noqa: E501
|
||||
@ -21,6 +22,10 @@ from vllm.utils import resolve_obj_by_qualname
|
||||
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
|
||||
|
||||
PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata]
|
||||
PoolingFn = Callable[
|
||||
[Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
|
||||
Union[torch.Tensor, list[torch.Tensor]]]
|
||||
ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
|
||||
|
||||
|
||||
class PoolingType(IntEnum):
|
||||
@ -79,37 +84,81 @@ class Pooler(nn.Module, ABC):
|
||||
"""The interface required for all poolers used in pooling models in vLLM."""
|
||||
|
||||
@staticmethod
|
||||
def from_config_with_defaults(
|
||||
def for_encode(
|
||||
pooler_config: PoolerConfig,
|
||||
pooling_type: PoolingType,
|
||||
normalize: bool,
|
||||
softmax: bool,
|
||||
step_tag_id: Optional[int] = None,
|
||||
returned_token_ids: Optional[list[int]] = None,
|
||||
) -> "Pooler":
|
||||
*,
|
||||
default_pooling_type: PoolingType = PoolingType.ALL,
|
||||
default_normalize: bool = False,
|
||||
default_softmax: bool = False,
|
||||
default_step_tag_id: Optional[int] = None,
|
||||
default_returned_token_ids: Optional[list[int]] = None,
|
||||
):
|
||||
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
|
||||
pooler_config=pooler_config,
|
||||
pooling_type=pooling_type,
|
||||
normalize=normalize,
|
||||
softmax=softmax,
|
||||
step_tag_id=step_tag_id,
|
||||
returned_token_ids=returned_token_ids,
|
||||
pooling_type=default_pooling_type,
|
||||
normalize=default_normalize,
|
||||
softmax=default_softmax,
|
||||
step_tag_id=default_step_tag_id,
|
||||
returned_token_ids=default_returned_token_ids,
|
||||
)
|
||||
|
||||
if pooling_type == PoolingType.STEP:
|
||||
if resolved_config.pooling_type == PoolingType.STEP:
|
||||
return StepPooler.from_config(resolved_config)
|
||||
|
||||
return SimplePooler.from_config(resolved_config)
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
@staticmethod
|
||||
def for_embed(
|
||||
pooler_config: PoolerConfig,
|
||||
*,
|
||||
default_pooling_type: PoolingType = PoolingType.LAST,
|
||||
default_normalize: bool = True,
|
||||
default_softmax: bool = False,
|
||||
):
|
||||
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
|
||||
pooler_config=pooler_config,
|
||||
pooling_type=default_pooling_type,
|
||||
normalize=default_normalize,
|
||||
softmax=default_softmax,
|
||||
)
|
||||
|
||||
return SimplePooler.from_config(resolved_config)
|
||||
|
||||
@staticmethod
|
||||
def for_classify(
|
||||
pooler_config: PoolerConfig,
|
||||
classifier: Optional[ClassifierFn],
|
||||
*,
|
||||
default_pooling_type: PoolingType = PoolingType.LAST,
|
||||
default_normalize: bool = False,
|
||||
default_softmax: bool = True,
|
||||
):
|
||||
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
|
||||
pooler_config=pooler_config,
|
||||
pooling_type=default_pooling_type,
|
||||
normalize=default_normalize,
|
||||
softmax=default_softmax,
|
||||
)
|
||||
base_pooler = SimplePooler.from_config(resolved_config)
|
||||
if classifier is None:
|
||||
return base_pooler
|
||||
|
||||
return ClassifierPooler(
|
||||
pooling=base_pooler.pooling,
|
||||
classifier=classifier,
|
||||
act_fn=base_pooler.head.activation,
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
"""Determine which pooling tasks are supported."""
|
||||
raise NotImplementedError
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
"""
|
||||
Construct the pooling parameters to use for a task,
|
||||
or `None` if the task is not supported.
|
||||
Construct the updated pooling parameters to use for a supported task.
|
||||
"""
|
||||
return None
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
@abstractmethod
|
||||
def forward(
|
||||
@ -127,9 +176,8 @@ def get_prompt_lens(
|
||||
if isinstance(pooling_metadata, V1PoolingMetadata):
|
||||
return pooling_metadata.prompt_lens
|
||||
|
||||
assert isinstance(hidden_states, torch.Tensor)
|
||||
return PoolingTensors.from_pooling_metadata(
|
||||
pooling_metadata, hidden_states.device).prompt_lens
|
||||
pooling_metadata, hidden_states[0].device).prompt_lens
|
||||
|
||||
|
||||
def get_prompt_token_ids(
|
||||
@ -149,6 +197,21 @@ def get_prompt_token_ids(
|
||||
]
|
||||
|
||||
|
||||
def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]:
|
||||
if isinstance(pooling_metadata, V0PoolingMetadata):
|
||||
pooling_params = [p for _, p in pooling_metadata.seq_groups]
|
||||
else:
|
||||
pooling_params = pooling_metadata.pooling_params
|
||||
|
||||
tasks: list[PoolingTask] = [
|
||||
task for pooling_param in pooling_params
|
||||
if (task := pooling_param.task) is not None
|
||||
]
|
||||
assert len(pooling_params) == len(tasks)
|
||||
|
||||
return tasks
|
||||
|
||||
|
||||
def get_classification_activation_function(config: PretrainedConfig):
|
||||
return PoolerClassify()
|
||||
|
||||
@ -172,7 +235,8 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
|
||||
return PoolerScore()
|
||||
|
||||
|
||||
def build_output(all_data: torch.Tensor) -> PoolerOutput:
|
||||
def build_output(
|
||||
all_data: Union[torch.Tensor, list[torch.Tensor]], ) -> PoolerOutput:
|
||||
all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data]
|
||||
return PoolerOutput(outputs=all_outputs)
|
||||
|
||||
@ -193,12 +257,12 @@ class PoolingMethod(nn.Module, ABC):
|
||||
raise NotImplementedError(f"Unsupported method: {pooling_type}")
|
||||
|
||||
@abstractmethod
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
@abstractmethod
|
||||
def forward_one(
|
||||
self,
|
||||
@ -237,16 +301,8 @@ class PoolingMethod(nn.Module, ABC):
|
||||
|
||||
class CLSPool(PoolingMethod):
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
# The equalities are split up to keep mypy happy
|
||||
if (task == "encode" or task == "embed" or task == "classify"
|
||||
or task == "score"):
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
assert_never(task)
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"encode", "embed", "classify", "score"}
|
||||
|
||||
def forward_one(
|
||||
self,
|
||||
@ -270,16 +326,8 @@ class CLSPool(PoolingMethod):
|
||||
|
||||
class LastPool(PoolingMethod):
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
# The equalities are split up to keep mypy happy
|
||||
if (task == "encode" or task == "embed" or task == "classify"
|
||||
or task == "score"):
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
assert_never(task)
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"encode", "embed", "classify", "score"}
|
||||
|
||||
def forward_one(
|
||||
self,
|
||||
@ -299,18 +347,8 @@ class LastPool(PoolingMethod):
|
||||
|
||||
class AllPool(PoolingMethod):
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
if task == "encode":
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
# The equalities are split up to keep mypy happy
|
||||
if task == "embed" or task == "classify" or task == "score":
|
||||
return None
|
||||
|
||||
assert_never(task)
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"encode"}
|
||||
|
||||
def forward_one(
|
||||
self,
|
||||
@ -327,28 +365,13 @@ class AllPool(PoolingMethod):
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_lens: torch.Tensor,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
offset = 0
|
||||
pooled_data = list[torch.Tensor]()
|
||||
|
||||
for prompt_len in prompt_lens:
|
||||
pooled_data.append(hidden_states[offset:offset + prompt_len])
|
||||
offset += prompt_len
|
||||
|
||||
return pooled_data
|
||||
return list(hidden_states.split_with_sizes(prompt_lens.tolist()))
|
||||
|
||||
|
||||
class MeanPool(PoolingMethod):
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
# The equalities are split up to keep mypy happy
|
||||
if (task == "encode" or task == "embed" or task == "classify"
|
||||
or task == "score"):
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
assert_never(task)
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"encode", "embed", "classify", "score"}
|
||||
|
||||
def forward_one(
|
||||
self,
|
||||
@ -529,24 +552,6 @@ class SimplePooler(Pooler):
|
||||
3. Returns structured results as `PoolerOutput`.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def from_config_with_defaults( # type: ignore[override]
|
||||
cls,
|
||||
pooler_config: PoolerConfig,
|
||||
pooling_type: PoolingType,
|
||||
normalize: bool,
|
||||
softmax: bool,
|
||||
) -> "SimplePooler":
|
||||
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
|
||||
pooler_config=pooler_config,
|
||||
pooling_type=pooling_type,
|
||||
normalize=normalize,
|
||||
softmax=softmax,
|
||||
)
|
||||
assert resolved_config.pooling_type != PoolingType.STEP
|
||||
|
||||
return cls.from_config(resolved_config)
|
||||
|
||||
@classmethod
|
||||
def from_config(
|
||||
cls,
|
||||
@ -563,10 +568,10 @@ class SimplePooler(Pooler):
|
||||
self.pooling = pooling
|
||||
self.head = head
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return self.pooling.get_supported_tasks()
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return self.pooling.get_pooling_updates(task)
|
||||
|
||||
def forward(
|
||||
@ -627,18 +632,11 @@ class StepPooler(Pooler):
|
||||
|
||||
return pooled_data
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
if task == "encode":
|
||||
return PoolingParamsUpdate(requires_token_ids=True)
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"encode"}
|
||||
|
||||
# The equalities are split up to keep mypy happy
|
||||
if task == "embed" or task == "classify" or task == "score":
|
||||
return None
|
||||
|
||||
assert_never(task)
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return PoolingParamsUpdate(requires_token_ids=True)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -650,68 +648,43 @@ class StepPooler(Pooler):
|
||||
return build_output(pooled_data)
|
||||
|
||||
|
||||
PoolingFn = Callable[
|
||||
[Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
|
||||
Union[torch.Tensor, list[torch.Tensor]]]
|
||||
ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
|
||||
|
||||
|
||||
class ClassifierPooler(nn.Module):
|
||||
class ClassifierPooler(Pooler):
|
||||
"""A pooling layer for classification tasks.
|
||||
|
||||
This layer does the following:
|
||||
1. Applies a classification layer to the hidden states.
|
||||
2. Optionally applies a pooler layer.
|
||||
3. Applies an activation function to the output. In the case of
|
||||
classification models it is either sigmoid or softmax. In the
|
||||
case of scoring models, the same behavior is configuration
|
||||
dependent, as in the sentence-transformers library.
|
||||
3. Applies an activation function to the output.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def act_fn_for_seq_cls(config: ModelConfig):
|
||||
return get_classification_activation_function(config.hf_config)
|
||||
|
||||
@staticmethod
|
||||
def act_fn_for_cross_encoder(config: ModelConfig):
|
||||
return get_cross_encoder_activation_function(config.hf_config)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: ModelConfig,
|
||||
pooling: PoolingFn,
|
||||
classifier: ClassifierFn,
|
||||
act_fn: Optional[PoolerActivation] = None,
|
||||
act_fn: PoolerActivation,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.pooling = pooling
|
||||
self.classifier = classifier
|
||||
self.act_fn = act_fn
|
||||
|
||||
self.classification_act_fn = get_classification_activation_function(
|
||||
config.hf_config) if act_fn is None else act_fn
|
||||
self.cross_encoder_act_fn = get_cross_encoder_activation_function(
|
||||
config.hf_config) if act_fn is None else act_fn
|
||||
|
||||
def _get_act_fn(self, task: PoolingTask):
|
||||
if task == "encode" or task == "classify":
|
||||
return self.classification_act_fn
|
||||
if task == "score":
|
||||
return self.cross_encoder_act_fn
|
||||
|
||||
raise ValueError(f"Unsupported task: {task!r}")
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
# The equalities are split up to keep mypy happy
|
||||
if task == "encode" or task == "classify" or task == "score":
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
if task == "embed":
|
||||
return None
|
||||
|
||||
assert_never(task)
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"classify", "score"}
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> PoolerOutput:
|
||||
"""Pools sentence pair scores from the hidden_states."""
|
||||
pooled_data = self.pooling(hidden_states, pooling_metadata)
|
||||
|
||||
# apply classifier once on the full batch if possible
|
||||
@ -722,28 +695,59 @@ class ClassifierPooler(nn.Module):
|
||||
else:
|
||||
pooled_output = [self.classifier(data) for data in pooled_data]
|
||||
|
||||
task_list: list[PoolingTask]
|
||||
if isinstance(pooling_metadata, V0PoolingMetadata):
|
||||
task_list = [
|
||||
task for _, pooling_param in pooling_metadata.seq_groups
|
||||
if (task := pooling_param.task) is not None
|
||||
]
|
||||
else:
|
||||
task_list = [
|
||||
task for pooling_param in pooling_metadata.pooling_params
|
||||
if (task := pooling_param.task) is not None
|
||||
]
|
||||
|
||||
assert len(task_list) == len(pooled_output)
|
||||
|
||||
# shape of scores: (batch_size, num_labels)
|
||||
if len(set(task_list)) <= 1:
|
||||
act_fn = self._get_act_fn(task_list[0])
|
||||
scores = act_fn(pooled_output)
|
||||
else:
|
||||
scores = torch.stack([
|
||||
self._get_act_fn(task)(vecs)
|
||||
for task, vecs in zip(task_list, pooled_output)
|
||||
])
|
||||
scores = self.act_fn(pooled_output)
|
||||
|
||||
return build_output(scores)
|
||||
|
||||
|
||||
class DispatchPooler(Pooler):
|
||||
"""Dispatches calls to a sub-pooler based on the pooling task."""
|
||||
|
||||
def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None:
|
||||
super().__init__()
|
||||
|
||||
for task, pooler in poolers_by_task.items():
|
||||
if task not in pooler.get_supported_tasks():
|
||||
raise ValueError(
|
||||
f"{pooler=} does not support {task=}. "
|
||||
f"Supported tasks: {pooler.get_supported_tasks()}")
|
||||
|
||||
self.poolers_by_task = poolers_by_task
|
||||
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return set(self.poolers_by_task)
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return self.poolers_by_task[task].get_pooling_updates(task)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> PoolerOutput:
|
||||
poolers_by_task = self.poolers_by_task
|
||||
|
||||
if isinstance(hidden_states, list):
|
||||
hidden_states_lst = hidden_states
|
||||
else:
|
||||
prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
|
||||
hidden_states_lst = list(hidden_states.split(prompt_lens.tolist()))
|
||||
|
||||
outputs = list[PoolingSequenceGroupOutput]()
|
||||
offset = 0
|
||||
for task, group in groupby(get_tasks(pooling_metadata)):
|
||||
if not (pooler := poolers_by_task.get(task)):
|
||||
raise ValueError(
|
||||
f"Unsupported task: {task} "
|
||||
f"Supported tasks: {self.get_supported_tasks()}")
|
||||
|
||||
num_items = len(list(group))
|
||||
group_output: PoolerOutput = pooler(
|
||||
hidden_states_lst[offset:offset + num_items],
|
||||
pooling_metadata[offset:offset + num_items],
|
||||
)
|
||||
|
||||
outputs.extend(group_output.outputs)
|
||||
offset += num_items
|
||||
|
||||
return PoolerOutput(outputs)
|
||||
|
||||
@ -75,20 +75,64 @@ class ModelOptFp8Config(QuantizationConfig):
|
||||
def get_config_filenames(cls) -> list[str]:
|
||||
return ["hf_quant_config.json"]
|
||||
|
||||
@classmethod
|
||||
def override_quantization_method(
|
||||
cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
|
||||
"""Detect if this ModelOpt config should be used based on
|
||||
quantization config."""
|
||||
|
||||
if hf_quant_cfg is None:
|
||||
return None
|
||||
|
||||
# Use the community standard 'quant_method'
|
||||
quant_method = hf_quant_cfg.get("quant_method", "").lower()
|
||||
|
||||
# Only proceed if the method is explicitly "modelopt"
|
||||
if quant_method != "modelopt":
|
||||
return None
|
||||
|
||||
# Look for ModelOpt-specific config structure
|
||||
if "quantization" in hf_quant_cfg:
|
||||
quant_config = hf_quant_cfg["quantization"]
|
||||
if isinstance(quant_config, dict):
|
||||
quant_algo = quant_config.get("quant_algo", "")
|
||||
if "FP8" in quant_algo:
|
||||
return "modelopt"
|
||||
else:
|
||||
# Check for compressed-tensors style config with specific quant_algo
|
||||
quant_algo = hf_quant_cfg.get("quant_algo", "")
|
||||
if isinstance(quant_algo, str) and "FP8" in quant_algo:
|
||||
return "modelopt"
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config":
|
||||
quant_config = cls.get_from_keys(config, ["quantization"])
|
||||
quant_method = quant_config["quant_algo"]
|
||||
kv_cache_quant_method = cls.get_from_keys(
|
||||
config, ["quantization"]).get("kv_cache_quant_algo")
|
||||
exclude_modules = cls.get_from_keys(
|
||||
config, ["quantization"]).get("exclude_modules")
|
||||
# Handle both ModelOpt format and compressed-tensors style format
|
||||
if "quantization" in config:
|
||||
# ModelOpt format: {"quantization": {"quant_algo": "..."}}
|
||||
quant_config = cls.get_from_keys(config, ["quantization"])
|
||||
if not isinstance(quant_config, dict):
|
||||
raise ValueError(
|
||||
"Expected 'quantization' to be a dictionary in config")
|
||||
quant_method = quant_config.get("quant_algo", "")
|
||||
if not quant_method:
|
||||
raise ValueError("Missing 'quant_algo' in quantization config")
|
||||
kv_cache_quant_method = quant_config.get("kv_cache_quant_algo")
|
||||
exclude_modules = quant_config.get("exclude_modules")
|
||||
else:
|
||||
# Compressed-tensors style format:
|
||||
# {"quant_algo": "...", "quant_method": "modelopt"}
|
||||
quant_method = config.get("quant_algo", "")
|
||||
kv_cache_quant_method = config.get("kv_cache_quant_algo")
|
||||
exclude_modules = config.get("exclude_modules")
|
||||
|
||||
if quant_method not in QUANT_ALGOS:
|
||||
raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}"
|
||||
" quantizations in vLLM. Please check the "
|
||||
"`hf_quant_config.json` file for your model's "
|
||||
"quant configuration.")
|
||||
raise ValueError(
|
||||
f"ModelOpt currently only supports: {QUANT_ALGOS} "
|
||||
"quantizations in vLLM. Please check the "
|
||||
"`hf_quant_config.json` file for your model's "
|
||||
"quant configuration.")
|
||||
is_checkpoint_fp8_serialized = ("FP8" in quant_method)
|
||||
|
||||
return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method,
|
||||
@ -434,7 +478,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
|
||||
def __init__(
|
||||
self,
|
||||
is_checkpoint_nvfp4_serialized: bool,
|
||||
kv_cache_quant_algo: str,
|
||||
kv_cache_quant_algo: Optional[str],
|
||||
exclude_modules: list[str],
|
||||
group_size: int = 16,
|
||||
) -> None:
|
||||
@ -465,24 +509,138 @@ class ModelOptNvFp4Config(QuantizationConfig):
|
||||
def get_config_filenames(cls) -> list[str]:
|
||||
return ["hf_quant_config.json"]
|
||||
|
||||
@classmethod
|
||||
def override_quantization_method(
|
||||
cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
|
||||
"""Detect if this ModelOpt FP4 config should be used based on
|
||||
quantization config."""
|
||||
if hf_quant_cfg is None:
|
||||
return None
|
||||
|
||||
# Use the community standard 'quant_method'
|
||||
quant_method = hf_quant_cfg.get("quant_method", "").lower()
|
||||
|
||||
# Only proceed if the method is explicitly "modelopt"
|
||||
if quant_method != "modelopt":
|
||||
return None
|
||||
|
||||
# Look for ModelOpt-specific config structure
|
||||
if "quantization" in hf_quant_cfg:
|
||||
quant_config = hf_quant_cfg["quantization"]
|
||||
if isinstance(quant_config, dict):
|
||||
quant_algo = quant_config.get("quant_algo", "")
|
||||
if "NVFP4" in quant_algo:
|
||||
return "modelopt_fp4"
|
||||
else:
|
||||
# Check for compressed-tensors style config with specific
|
||||
# quant_algo field
|
||||
quant_algo = hf_quant_cfg.get("quant_algo", "")
|
||||
if isinstance(quant_algo, str) and "FP4" in quant_algo.upper():
|
||||
return "modelopt_fp4"
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config":
|
||||
quant_config = cls.get_from_keys(config, ["quantization"])
|
||||
quant_method = quant_config["quant_algo"]
|
||||
# Handle both traditional ModelOpt format and compressed-tensors
|
||||
# style format
|
||||
if "quantization" in config:
|
||||
# Traditional ModelOpt format:
|
||||
# {"quantization": {"quant_algo": "..."}}
|
||||
quant_config = cls.get_from_keys(config, ["quantization"])
|
||||
if not isinstance(quant_config, dict):
|
||||
raise ValueError(
|
||||
"Expected 'quantization' to be a dictionary in config")
|
||||
|
||||
quant_method = quant_config.get("quant_algo", "")
|
||||
if not quant_method:
|
||||
raise ValueError("Missing 'quant_algo' in quantization config")
|
||||
|
||||
# Handle kv_cache_quant_algo with proper type validation
|
||||
kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo")
|
||||
if kv_cache_quant_algo_raw is None:
|
||||
# No KV cache quantization by default
|
||||
kv_cache_quant_algo = None
|
||||
elif isinstance(kv_cache_quant_algo_raw, str):
|
||||
kv_cache_quant_algo = kv_cache_quant_algo_raw
|
||||
else:
|
||||
raise ValueError(f"kv_cache_quant_algo must be a string, got "
|
||||
f"{type(kv_cache_quant_algo_raw)}")
|
||||
|
||||
# Handle group_size with proper type validation
|
||||
group_size_raw = quant_config.get("group_size")
|
||||
if group_size_raw is None:
|
||||
group_size = 16 # Default value
|
||||
elif isinstance(group_size_raw, int):
|
||||
group_size = group_size_raw
|
||||
else:
|
||||
try:
|
||||
group_size = int(group_size_raw)
|
||||
except (ValueError, TypeError):
|
||||
raise ValueError(f"group_size must be an integer, got "
|
||||
f"{type(group_size_raw)}") from None
|
||||
|
||||
exclude_modules = quant_config.get("exclude_modules", [])
|
||||
if not isinstance(exclude_modules, list):
|
||||
raise ValueError(f"exclude_modules must be a list, got "
|
||||
f"{type(exclude_modules)}")
|
||||
else:
|
||||
# Compressed-tensors style format:
|
||||
# {"quant_algo": "...", "quant_method": "modelopt"}
|
||||
quant_method = config.get("quant_algo", "")
|
||||
|
||||
# Handle kv_cache_quant_algo with proper type validation
|
||||
kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo")
|
||||
if kv_cache_quant_algo_raw is None:
|
||||
# No KV cache quantization by default
|
||||
kv_cache_quant_algo = None
|
||||
elif isinstance(kv_cache_quant_algo_raw, str):
|
||||
kv_cache_quant_algo = kv_cache_quant_algo_raw
|
||||
else:
|
||||
raise ValueError(f"kv_cache_quant_algo must be a string, got "
|
||||
f"{type(kv_cache_quant_algo_raw)}")
|
||||
|
||||
# Handle group_size with proper type validation
|
||||
group_size_raw = config.get("group_size")
|
||||
if group_size_raw is None:
|
||||
group_size = 16 # Default value
|
||||
elif isinstance(group_size_raw, int):
|
||||
group_size = group_size_raw
|
||||
else:
|
||||
try:
|
||||
group_size = int(group_size_raw)
|
||||
except (ValueError, TypeError):
|
||||
raise ValueError(f"group_size must be an integer, got "
|
||||
f"{type(group_size_raw)}") from None
|
||||
|
||||
exclude_modules = config.get("exclude_modules", [])
|
||||
if not isinstance(exclude_modules, list):
|
||||
raise ValueError(f"exclude_modules must be a list, got "
|
||||
f"{type(exclude_modules)}")
|
||||
|
||||
if quant_method not in QUANT_ALGOS:
|
||||
raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}"
|
||||
" quantizations in vLLM. Please check the "
|
||||
"`hf_quant_config.json` file for your model's "
|
||||
"quant configuration.")
|
||||
raise ValueError(
|
||||
f"ModelOpt currently only supports: {QUANT_ALGOS} "
|
||||
"quantizations in vLLM. Please check the "
|
||||
"`hf_quant_config.json` file for your model's "
|
||||
"quant configuration.")
|
||||
is_checkpoint_nvfp4_serialized = ("NVFP4" in quant_method)
|
||||
if ("group_size" and "kv_cache_quant_algo"
|
||||
and "exclude_modules") not in quant_config:
|
||||
raise ValueError("NVFP4 quantization requires group size and "
|
||||
"kv_cache_quant_algo specified in "
|
||||
"hf_quant_config.json")
|
||||
kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
|
||||
group_size = quant_config["group_size"]
|
||||
exclude_modules = quant_config["exclude_modules"]
|
||||
|
||||
# For FP4, these fields are required
|
||||
if is_checkpoint_nvfp4_serialized and "quantization" in config:
|
||||
# Check if required fields are present in the quantization config
|
||||
quant_config = config["quantization"]
|
||||
required_fields = [
|
||||
"group_size", "kv_cache_quant_algo", "exclude_modules"
|
||||
]
|
||||
missing_fields = [
|
||||
field for field in required_fields if field not in quant_config
|
||||
]
|
||||
if missing_fields:
|
||||
raise ValueError(
|
||||
f"NVFP4 quantization requires the following fields in "
|
||||
f"hf_quant_config.json: {missing_fields}")
|
||||
|
||||
return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo,
|
||||
exclude_modules, group_size)
|
||||
|
||||
|
||||
@ -13,7 +13,6 @@ from .interfaces_base import VllmModelForPooling, is_pooling_model
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.pooler import PoolingType
|
||||
|
||||
_T = TypeVar("_T", bound=type[nn.Module])
|
||||
|
||||
@ -34,16 +33,8 @@ def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str:
|
||||
return model_name + pooling_suffix
|
||||
|
||||
|
||||
def _create_pooling_model_cls(
|
||||
orig_cls: _T,
|
||||
*,
|
||||
default_pooling_type: "PoolingType",
|
||||
default_normalize: bool,
|
||||
default_softmax: bool,
|
||||
) -> _T:
|
||||
def _create_pooling_model_cls(orig_cls: _T) -> _T:
|
||||
# Lazy import
|
||||
from vllm.model_executor.layers.pooler import Pooler
|
||||
|
||||
from .utils import AutoWeightsLoader, WeightsMapper
|
||||
|
||||
class ModelForPooling(orig_cls, VllmModelForPooling):
|
||||
@ -71,15 +62,7 @@ def _create_pooling_model_cls(
|
||||
self._init_pooler(vllm_config, prefix=prefix)
|
||||
|
||||
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = Pooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=default_pooling_type,
|
||||
normalize=default_normalize,
|
||||
softmax=default_softmax,
|
||||
)
|
||||
raise NotImplementedError
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
# TODO: Support uninitialized params tracking
|
||||
@ -132,14 +115,20 @@ def as_embedding_model(cls: _T) -> _T:
|
||||
return cls
|
||||
|
||||
# Lazy import
|
||||
from vllm.model_executor.layers.pooler import PoolingType
|
||||
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
|
||||
|
||||
class ModelForEmbedding(_create_pooling_model_cls(cls)):
|
||||
|
||||
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler(
|
||||
{
|
||||
"encode": Pooler.for_encode(pooler_config),
|
||||
"embed": Pooler.for_embed(pooler_config),
|
||||
}, )
|
||||
|
||||
ModelForEmbedding = _create_pooling_model_cls(
|
||||
cls,
|
||||
default_pooling_type=PoolingType.LAST,
|
||||
default_normalize=True,
|
||||
default_softmax=False,
|
||||
)
|
||||
ModelForEmbedding.__name__ = \
|
||||
_get_pooling_model_name(cls.__name__, "ForEmbedding")
|
||||
|
||||
@ -165,20 +154,14 @@ def as_seq_cls_model(cls: _T) -> _T:
|
||||
# Lazy import
|
||||
from vllm.model_executor.layers.linear import RowParallelLinear
|
||||
from vllm.model_executor.layers.pooler import (ClassifierPooler,
|
||||
PoolingType, SimplePooler)
|
||||
DispatchPooler, Pooler,
|
||||
PoolingMethod, PoolingType)
|
||||
from vllm.model_executor.models.interfaces import SupportsCrossEncoding
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .utils import maybe_prefix
|
||||
|
||||
ModelForPooling = _create_pooling_model_cls(
|
||||
cls,
|
||||
default_pooling_type=PoolingType.LAST,
|
||||
default_normalize=False,
|
||||
default_softmax=True,
|
||||
)
|
||||
|
||||
class ModelForSequenceClassification(ModelForPooling,
|
||||
class ModelForSequenceClassification(_create_pooling_model_cls(cls),
|
||||
SupportsCrossEncoding):
|
||||
|
||||
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
|
||||
@ -198,19 +181,28 @@ def as_seq_cls_model(cls: _T) -> _T:
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
pooler = SimplePooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=PoolingType.LAST,
|
||||
normalize=False,
|
||||
softmax=True,
|
||||
)
|
||||
pooling_type_str = pooler_config.pooling_type
|
||||
pooling_type = (PoolingType.LAST if pooling_type_str is None else
|
||||
PoolingType[pooling_type_str])
|
||||
|
||||
self.pooler = ClassifierPooler(
|
||||
vllm_config.model_config,
|
||||
pooling=pooler.pooling,
|
||||
classifier=self._classifier,
|
||||
act_fn=pooler.head.activation,
|
||||
)
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"classify":
|
||||
ClassifierPooler(
|
||||
pooling=PoolingMethod.from_pooling_type(pooling_type),
|
||||
classifier=self._classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_seq_cls(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
"score":
|
||||
ClassifierPooler(
|
||||
pooling=PoolingMethod.from_pooling_type(pooling_type),
|
||||
classifier=self._classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
})
|
||||
|
||||
def _classifier(self, x: torch.Tensor):
|
||||
x, _ = self.score(x.float())
|
||||
@ -259,14 +251,16 @@ def as_reward_model(cls: _T) -> _T:
|
||||
return cls
|
||||
|
||||
# Lazy import
|
||||
from vllm.model_executor.layers.pooler import PoolingType
|
||||
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
|
||||
|
||||
ModelForReward = _create_pooling_model_cls(
|
||||
cls,
|
||||
default_pooling_type=PoolingType.ALL,
|
||||
default_normalize=False,
|
||||
default_softmax=False,
|
||||
)
|
||||
class ModelForReward(_create_pooling_model_cls(cls)):
|
||||
|
||||
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler(
|
||||
{"encode": Pooler.for_encode(pooler_config)}, )
|
||||
|
||||
ModelForReward.__name__ = \
|
||||
_get_pooling_model_name(cls.__name__, "ForReward")
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Iterable, Set
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
@ -17,7 +17,8 @@ from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
|
||||
from vllm.model_executor.layers.pooler import (ClassifierPooler,
|
||||
DispatchPooler, Pooler,
|
||||
PoolingMethod,
|
||||
PoolingParamsUpdate,
|
||||
PoolingType)
|
||||
@ -92,20 +93,29 @@ class BertPooler(Pooler):
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.activation = nn.Tanh()
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return self.pooling.get_supported_tasks()
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return self.pooling.get_pooling_updates(task)
|
||||
|
||||
def _head(self, pooled_output: torch.Tensor):
|
||||
pooled_output = self.dense(pooled_output)
|
||||
pooled_output = self.activation(pooled_output)
|
||||
return pooled_output
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
pooled_output = self.pooling(hidden_states, pooling_metadata)
|
||||
pooled_output = self.dense(pooled_output)
|
||||
pooled_output = self.activation(pooled_output)
|
||||
|
||||
if isinstance(pooled_output, list):
|
||||
pooled_output = [self._head(output) for output in pooled_output]
|
||||
else:
|
||||
pooled_output = self._head(pooled_output)
|
||||
|
||||
return pooled_output
|
||||
|
||||
|
||||
@ -333,18 +343,19 @@ class BertModel(nn.Module, SupportsQuant):
|
||||
|
||||
packed_modules_mapping = {"qkv_proj": ["query", "key", "value"]}
|
||||
|
||||
def __init__(self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
embedding_class: type = BertEmbedding,
|
||||
add_pooling_layer: bool = False):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
embedding_class: type[nn.Module] = BertEmbedding,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
self.embeddings = embedding_class(config)
|
||||
self.encoder = BertEncoder(vllm_config=vllm_config,
|
||||
prefix=f"{prefix}.encoder")
|
||||
self.pooler = BertPooler(config) if add_pooling_layer else None
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -366,8 +377,7 @@ class BertModel(nn.Module, SupportsQuant):
|
||||
token_type_ids=token_type_ids)
|
||||
return self.encoder(hidden_states)
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "query", "q"),
|
||||
@ -395,10 +405,43 @@ class BertModel(nn.Module, SupportsQuant):
|
||||
if name in params_dict:
|
||||
other_weights.append((name, loaded_weight))
|
||||
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["pooler."] if self.pooler is None else []),
|
||||
return other_weights, loaded_stacked_params
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
other_weights, loaded_stacked_params = self._load_weights(weights)
|
||||
|
||||
loader = AutoWeightsLoader(self, skip_prefixes=["pooler."])
|
||||
loaded_params = loader.load_weights(other_weights)
|
||||
loaded_params.update(loaded_stacked_params)
|
||||
return loaded_params
|
||||
|
||||
|
||||
class BertPoolingModel(BertModel):
|
||||
|
||||
is_pooling_model = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
embedding_class: type[nn.Module] = BertEmbedding,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
vllm_config=vllm_config,
|
||||
prefix=prefix,
|
||||
embedding_class=embedding_class,
|
||||
)
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
self.pooler = BertPooler(config)
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
other_weights, loaded_stacked_params = self._load_weights(weights)
|
||||
|
||||
loader = AutoWeightsLoader(self)
|
||||
loaded_params = loader.load_weights(other_weights)
|
||||
loaded_params.update(loaded_stacked_params)
|
||||
return loaded_params
|
||||
@ -421,6 +464,8 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
|
||||
super().__init__()
|
||||
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.model = self._build_model(vllm_config=vllm_config,
|
||||
prefix=maybe_prefix(prefix, "model"))
|
||||
self.pooler = self._build_pooler(pooler_config)
|
||||
@ -456,10 +501,15 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
|
||||
embedding_class=BertEmbedding)
|
||||
|
||||
def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
|
||||
return Pooler.from_config_with_defaults(pooler_config,
|
||||
pooling_type=PoolingType.CLS,
|
||||
normalize=True,
|
||||
softmax=False)
|
||||
return DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"embed":
|
||||
Pooler.for_embed(
|
||||
pooler_config,
|
||||
default_pooling_type=PoolingType.CLS,
|
||||
),
|
||||
})
|
||||
|
||||
|
||||
class BertForSequenceClassification(nn.Module, SupportsV0Only,
|
||||
@ -481,16 +531,32 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only,
|
||||
config = vllm_config.model_config.hf_config
|
||||
|
||||
self.num_labels = config.num_labels
|
||||
self.bert = BertModel(vllm_config=vllm_config,
|
||||
prefix=maybe_prefix(prefix, "bert"),
|
||||
embedding_class=BertEmbedding,
|
||||
add_pooling_layer=True)
|
||||
self.bert = BertPoolingModel(vllm_config=vllm_config,
|
||||
prefix=maybe_prefix(prefix, "bert"),
|
||||
embedding_class=BertEmbedding)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
self.pooler = ClassifierPooler(
|
||||
vllm_config.model_config,
|
||||
pooling=self.bert.pooler,
|
||||
classifier=self.classifier,
|
||||
)
|
||||
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"classify":
|
||||
ClassifierPooler(
|
||||
pooling=self.bert.pooler,
|
||||
classifier=self.classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_seq_cls(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
"score":
|
||||
ClassifierPooler(
|
||||
pooling=self.bert.pooler,
|
||||
classifier=self.classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
loader = AutoWeightsLoader(self)
|
||||
|
||||
@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from ..layers.pooler import Pooler, PoolingType
|
||||
from ..layers.pooler import DispatchPooler, Pooler
|
||||
from .interfaces import SupportsPP
|
||||
from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
|
||||
make_empty_intermediate_tensors_factory, make_layers,
|
||||
@ -339,12 +339,16 @@ class GPT2ForSequenceClassification(nn.Module):
|
||||
self.transformer = GPT2Model(vllm_config=vllm_config,
|
||||
prefix=maybe_prefix(prefix, "gpt2"))
|
||||
self.score = nn.Linear(config.n_embd, config.num_labels, bias=False)
|
||||
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
self.pooler = Pooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=PoolingType.LAST,
|
||||
normalize=False,
|
||||
softmax=True)
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"classify":
|
||||
Pooler.for_classify(pooler_config, classifier=None),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
loader = AutoWeightsLoader(self)
|
||||
|
||||
@ -1,17 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Set
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.pooler import (Pooler, PoolerHead,
|
||||
PoolerNormalize,
|
||||
from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
|
||||
PoolerHead, PoolerNormalize,
|
||||
PoolingParamsUpdate,
|
||||
build_output, get_prompt_lens,
|
||||
get_prompt_token_ids)
|
||||
@ -135,18 +134,11 @@ class GritLMMeanPool(nn.Module):
|
||||
|
||||
return instruction_len
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
# The equalities are split up to keep mypy happy
|
||||
if task == "encode" or task == "embed":
|
||||
return PoolingParamsUpdate(requires_token_ids=True)
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"encode", "embed"}
|
||||
|
||||
if task == "classify" or task == "score":
|
||||
return None
|
||||
|
||||
assert_never(task)
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return PoolingParamsUpdate(requires_token_ids=True)
|
||||
|
||||
def forward_one(
|
||||
self,
|
||||
@ -207,10 +199,10 @@ class GritLMPooler(Pooler):
|
||||
self.pooling = GritLMMeanPool(model_config)
|
||||
self.head = PoolerHead(PoolerNormalize())
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return self.pooling.get_supported_tasks()
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return self.pooling.get_pooling_updates(task)
|
||||
|
||||
def forward(
|
||||
@ -262,4 +254,11 @@ class GritLM(LlamaForCausalLM, SupportsV0Only):
|
||||
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
|
||||
|
||||
self.pooler = GritLMPooler(vllm_config.model_config)
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
if pooler_config is not None:
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"embed":
|
||||
GritLMPooler(vllm_config.model_config),
|
||||
})
|
||||
|
||||
@ -22,7 +22,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.pooler import Pooler, PoolingType
|
||||
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
@ -429,12 +429,10 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
|
||||
)
|
||||
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
self.pooler = Pooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=PoolingType.ALL,
|
||||
normalize=False,
|
||||
softmax=False,
|
||||
)
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler(
|
||||
{"encode": Pooler.for_encode(pooler_config)}, )
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
||||
@ -19,8 +19,8 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
|
||||
from vllm.model_executor.layers.pooler import (ClassifierPooler, PoolingType,
|
||||
SimplePooler)
|
||||
from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
|
||||
PoolingType)
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
|
||||
@ -584,16 +584,15 @@ class JambaForSequenceClassification(JambaForCausalLM):
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
pooler = SimplePooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=PoolingType.LAST,
|
||||
normalize=False,
|
||||
softmax=False,
|
||||
)
|
||||
|
||||
self.pooler = ClassifierPooler(
|
||||
vllm_config.model_config,
|
||||
pooling=pooler.pooling,
|
||||
classifier=self.score,
|
||||
act_fn=pooler.head.activation,
|
||||
)
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"classify":
|
||||
Pooler.for_classify(
|
||||
pooler_config,
|
||||
classifier=self.score,
|
||||
default_pooling_type=PoolingType.LAST,
|
||||
default_normalize=False,
|
||||
default_softmax=False,
|
||||
),
|
||||
})
|
||||
|
||||
@ -12,7 +12,7 @@ from vllm.inputs import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.pooler import Pooler, PoolingType
|
||||
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
@ -96,11 +96,17 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
|
||||
|
||||
self.score = JinaVLScorer(config)
|
||||
|
||||
self.pooler = Pooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=PoolingType.LAST,
|
||||
normalize=False,
|
||||
softmax=True)
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"classify":
|
||||
Pooler.for_classify(pooler_config, classifier=None),
|
||||
"score":
|
||||
Pooler.for_classify(pooler_config, classifier=None),
|
||||
})
|
||||
|
||||
@classmethod
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Iterable, Set
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
@ -13,7 +13,8 @@ from vllm.config import VllmConfig
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.linear import (QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
|
||||
from vllm.model_executor.layers.pooler import (ClassifierPooler,
|
||||
DispatchPooler, Pooler,
|
||||
PoolingMethod,
|
||||
PoolingParamsUpdate,
|
||||
PoolingType)
|
||||
@ -271,19 +272,27 @@ class ModernBertPooler(Pooler):
|
||||
eps=config.norm_eps,
|
||||
bias=config.norm_bias)
|
||||
|
||||
def get_pooling_updates(
|
||||
self,
|
||||
task: PoolingTask,
|
||||
) -> Optional[PoolingParamsUpdate]:
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return self.pooling.get_supported_tasks()
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return self.pooling.get_pooling_updates(task)
|
||||
|
||||
def _head(self, pooled_output: torch.Tensor):
|
||||
return self.norm(self.act(self.dense(pooled_output)))
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
pooled_output = self.pooling(hidden_states, pooling_metadata)
|
||||
pooled_output = self.norm(self.act(self.dense(pooled_output)))
|
||||
|
||||
if isinstance(pooled_output, list):
|
||||
pooled_output = [self._head(output) for output in pooled_output]
|
||||
else:
|
||||
pooled_output = self._head(pooled_output)
|
||||
|
||||
return pooled_output
|
||||
|
||||
|
||||
@ -299,11 +308,28 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only,
|
||||
self.model = ModernBertModel(vllm_config=vllm_config,
|
||||
prefix=maybe_prefix(prefix, "modernbert"))
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
self.pooler = ClassifierPooler(
|
||||
vllm_config.model_config,
|
||||
pooling=ModernBertPooler(config),
|
||||
classifier=self.classifier,
|
||||
)
|
||||
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"classify":
|
||||
ClassifierPooler(
|
||||
pooling=ModernBertPooler(config),
|
||||
classifier=self.classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_seq_cls(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
"score":
|
||||
ClassifierPooler(
|
||||
pooling=ModernBertPooler(config),
|
||||
classifier=self.classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
|
||||
|
||||
@ -15,7 +15,8 @@ from torch import nn
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler
|
||||
from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
|
||||
PoolingType)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .interfaces import SupportsLoRA, SupportsPP
|
||||
@ -26,7 +27,7 @@ from .utils import AutoWeightsLoader, maybe_prefix
|
||||
class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
|
||||
|
||||
is_pooling_model = True
|
||||
pooler: SimplePooler
|
||||
pooler: Pooler
|
||||
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": [
|
||||
@ -94,12 +95,12 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
vllm_config.model_config.hf_config.num_labels = 1
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
self.pooler = Pooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=PoolingType.ALL,
|
||||
normalize=False,
|
||||
softmax=False)
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler(
|
||||
{"encode": Pooler.for_encode(pooler_config)}, )
|
||||
|
||||
|
||||
class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
|
||||
@ -107,11 +108,17 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
vllm_config.model_config.hf_config.num_labels = 2
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
self.pooler = Pooler.from_config_with_defaults(
|
||||
pooler_config,
|
||||
pooling_type=PoolingType.STEP,
|
||||
normalize=False,
|
||||
softmax=True,
|
||||
step_tag_id=151651,
|
||||
)
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(
|
||||
pooler_config,
|
||||
default_pooling_type=PoolingType.STEP,
|
||||
default_normalize=False,
|
||||
default_softmax=True,
|
||||
default_step_tag_id=151651,
|
||||
)
|
||||
})
|
||||
|
||||
@ -9,7 +9,8 @@ from torch import nn
|
||||
from transformers import RobertaConfig
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.pooler import ClassifierPooler, CLSPool
|
||||
from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool,
|
||||
DispatchPooler, Pooler)
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel
|
||||
@ -63,16 +64,10 @@ class RobertaEmbedding(nn.Module):
|
||||
# References:
|
||||
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
|
||||
# - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
|
||||
pos_list = []
|
||||
token_list = []
|
||||
offset = 0
|
||||
for seq_len in seq_lens:
|
||||
pos_list.append(position_ids[offset:offset + seq_len])
|
||||
token_list.append(input_ids[offset:offset + seq_len])
|
||||
offset += seq_len
|
||||
|
||||
seq_lens_list = seq_lens.tolist()
|
||||
new_pos_list = []
|
||||
for positions, tokens in zip(pos_list, token_list):
|
||||
for positions, tokens in zip(position_ids.split(seq_lens_list),
|
||||
input_ids.split(seq_lens_list)):
|
||||
# Verify assumption that incoming position are
|
||||
# always a sequence from 0 to N.
|
||||
expected_pos = torch.arange(positions.size()[0],
|
||||
@ -184,15 +179,30 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
|
||||
self.num_labels = config.num_labels
|
||||
self.roberta = BertModel(vllm_config=vllm_config,
|
||||
prefix=maybe_prefix(prefix, "bert"),
|
||||
embedding_class=RobertaEmbedding,
|
||||
add_pooling_layer=False)
|
||||
embedding_class=RobertaEmbedding)
|
||||
self.classifier = RobertaClassificationHead(config)
|
||||
|
||||
self.pooler = ClassifierPooler(
|
||||
vllm_config.model_config,
|
||||
pooling=CLSPool(),
|
||||
classifier=self.classifier,
|
||||
)
|
||||
pooler_config = vllm_config.model_config.pooler_config
|
||||
assert pooler_config is not None
|
||||
|
||||
self.pooler = DispatchPooler({
|
||||
"encode":
|
||||
Pooler.for_encode(pooler_config),
|
||||
"classify":
|
||||
ClassifierPooler(
|
||||
pooling=CLSPool(),
|
||||
classifier=self.classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_seq_cls(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
"score":
|
||||
ClassifierPooler(
|
||||
pooling=CLSPool(),
|
||||
classifier=self.classifier,
|
||||
act_fn=ClassifierPooler.act_fn_for_cross_encoder(
|
||||
vllm_config.model_config),
|
||||
),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
loader = AutoWeightsLoader(self)
|
||||
|
||||
@ -38,6 +38,13 @@ class PoolingMetadata:
|
||||
f"seq_data={self.seq_data}, "
|
||||
f"prompt_lens={self.prompt_lens})")
|
||||
|
||||
def __getitem__(self, indices: slice):
|
||||
return PoolingMetadata(
|
||||
seq_groups=self.seq_groups[indices],
|
||||
seq_data=dict(list(self.seq_data.items())[indices]),
|
||||
prompt_lens=self.prompt_lens[indices],
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PoolingTensors:
|
||||
|
||||
@ -104,8 +104,19 @@ class CpuPlatform(Platform):
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
import psutil
|
||||
return psutil.virtual_memory().total
|
||||
import vllm.envs as envs
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
|
||||
if kv_cache_space is None:
|
||||
kv_cache_space = 4 * GiB_bytes # type: ignore
|
||||
logger.warning_once(
|
||||
"Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
|
||||
"for CPU backend is not set, using 4 by default.")
|
||||
else:
|
||||
kv_cache_space *= GiB_bytes
|
||||
|
||||
return kv_cache_space
|
||||
|
||||
@classmethod
|
||||
def set_device(cls, device: torch.device) -> None:
|
||||
@ -124,8 +135,6 @@ class CpuPlatform(Platform):
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
import vllm.envs as envs
|
||||
from vllm.utils import GiB_bytes
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if model_config is not None:
|
||||
@ -162,20 +171,8 @@ class CpuPlatform(Platform):
|
||||
" support fp16 for now, cast to bf16.")
|
||||
model_config.dtype = torch.bfloat16
|
||||
|
||||
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
|
||||
|
||||
if kv_cache_space >= 0:
|
||||
if kv_cache_space == 0:
|
||||
cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore
|
||||
logger.warning(
|
||||
"Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
|
||||
"for CPU backend is not set, using 4 by default.")
|
||||
else:
|
||||
cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
|
||||
f" {kv_cache_space}, expect a positive integer value.")
|
||||
cache_config.cpu_kvcache_space_bytes = \
|
||||
CpuPlatform.get_device_total_memory()
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
if (parallel_config.world_size > 1
|
||||
@ -216,8 +213,6 @@ class CpuPlatform(Platform):
|
||||
False,
|
||||
"nan_asserts":
|
||||
False,
|
||||
"memory_planning":
|
||||
True,
|
||||
"epilogue_fusion":
|
||||
True,
|
||||
})
|
||||
|
||||
@ -42,7 +42,7 @@ def adapt_config_dict(config_dict: dict[str, Any],
|
||||
|
||||
config = PretrainedConfig.from_dict(config_dict)
|
||||
|
||||
logger.debug("Initialized config", config)
|
||||
logger.debug("Initialized config %s", config)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
@ -1383,12 +1383,11 @@ def find_nccl_library() -> str:
|
||||
|
||||
prev_set_stream = torch.cuda.set_stream
|
||||
|
||||
_current_stream = None
|
||||
_current_stream_tls = threading.local()
|
||||
|
||||
|
||||
def _patched_set_stream(stream: torch.cuda.Stream) -> None:
|
||||
global _current_stream
|
||||
_current_stream = stream
|
||||
_current_stream_tls.value = stream
|
||||
prev_set_stream(stream)
|
||||
|
||||
|
||||
@ -1407,16 +1406,16 @@ def current_stream() -> torch.cuda.Stream:
|
||||
from C/C++ code.
|
||||
"""
|
||||
from vllm.platforms import current_platform
|
||||
global _current_stream
|
||||
if _current_stream is None:
|
||||
if not hasattr(_current_stream_tls,
|
||||
"value") or _current_stream_tls.value is None:
|
||||
# when this function is called before any stream is set,
|
||||
# we return the default stream.
|
||||
# On ROCm using the default 0 stream in combination with RCCL
|
||||
# is hurting performance. Therefore creating a dedicated stream
|
||||
# per process
|
||||
_current_stream = torch.cuda.Stream() if current_platform.is_rocm(
|
||||
) else torch.cuda.current_stream()
|
||||
return _current_stream
|
||||
_current_stream_tls.value = torch.cuda.Stream(
|
||||
) if current_platform.is_rocm() else torch.cuda.current_stream()
|
||||
return _current_stream_tls.value
|
||||
|
||||
|
||||
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
|
||||
|
||||
@ -446,17 +446,12 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
if kv_sharing_target_layer_name is not None:
|
||||
raise NotImplementedError("KV sharing is not supported in V0.")
|
||||
if logits_soft_cap is not None:
|
||||
logger.warning_once("Torch SPDA does not support logits soft cap. "
|
||||
"Outputs may be slightly off.")
|
||||
if use_irope:
|
||||
logger.warning_once(
|
||||
"Using irope in Torch SPDA is not supported yet, it will fall"
|
||||
" back to global attention for long context.")
|
||||
self.paged_attn_impl = _get_paged_attn_impl()
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
|
||||
@ -352,7 +352,6 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: AttentionType = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
@ -381,7 +380,6 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"FlashAttentionImpl")
|
||||
self.use_irope = use_irope
|
||||
self.vllm_flash_attn_version = get_flash_attn_version()
|
||||
if is_quantized_kv_cache(self.kv_cache_dtype) \
|
||||
and not flash_attn_supports_fp8():
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user