Revert "Merge commit '6e8d8c4afbddf725b34ef938616701869f5b3462' into sage/dbo-full-cudagraphsh"

This reverts commit 5215c80a4988e81d2f5971e02d50d3785cab5ae8, reversing
changes made to dd2a94fd9d90d0c04772380c220b5ec81bd0b61e.
yewentao256 · 2025-08-19 12:17:31 -07:00
parent a0a11bc0b5 · commit 9f04a6cf57
149 changed files with 1970 additions and 3770 deletions

View File

@ -104,6 +104,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {

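For context on these nightly-benchmark JSON files: as far as I can tell, each key in `server_parameters` is turned into a CLI flag for the vLLM server, with an empty-string value meaning a bare boolean switch such as `--disable-log-stats`. A minimal sketch of that assumed mapping (the helper below is hypothetical, not vLLM code):

```python
# Hedged sketch: how a server_parameters dict could map to CLI flags.
# params_to_cli_args is a hypothetical helper, not part of vLLM.
def params_to_cli_args(params: dict) -> list[str]:
    args = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if value == "":                 # empty string -> boolean switch
            args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args

server_parameters = {
    "tensor_parallel_size": 1,
    "swap_space": 16,
    "disable_log_stats": "",
    "disable_log_requests": "",
    "load_format": "dummy",
}
print(params_to_cli_args(server_parameters))
# ['--tensor-parallel-size', '1', '--swap-space', '16',
#  '--disable-log-stats', '--disable-log-requests', '--load-format', 'dummy']
```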
View File

@ -11,6 +11,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,

View File

@ -35,6 +35,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
@ -89,6 +90,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
@ -143,6 +145,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
@ -194,6 +197,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
@ -247,6 +251,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
@ -300,6 +305,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,

View File

@ -17,6 +17,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -49,6 +50,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -81,6 +83,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -114,6 +117,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -149,6 +153,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -184,6 +189,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,

View File

@ -17,6 +17,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -49,6 +50,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -82,6 +84,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -115,6 +118,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -150,6 +154,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -186,6 +191,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,

View File

@ -17,6 +17,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -49,6 +50,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -81,6 +83,7 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -114,6 +117,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
@ -149,6 +153,7 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,

View File

@ -7,6 +7,7 @@
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -25,6 +26,7 @@
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -43,6 +45,7 @@
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -57,7 +60,8 @@
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {

View File

@ -647,31 +647,13 @@ steps:
- label: Blackwell Test
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
- csrc/
- vllm/
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py
- pytest -v -s tests/kernels/test_cutlass_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
##### 1 GPU test #####
##### multi gpus test #####
@ -822,7 +804,6 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_multi_loras_with_tp.py
- label: Weight Loading Multiple GPU Test # 33min

View File

@ -1,45 +1,13 @@
# Security Policy
## Reporting security issues
## Reporting a Vulnerability
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
## Issue triage
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
## Threat model
---
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
## Issue severity
We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
### CRITICAL Severity
Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥9.0.
### HIGH Severity
Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9
### MODERATE Severity
Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9
### LOW Severity
Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0
## Prenotification policy
For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues.
* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.

View File

@ -22,13 +22,6 @@ from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, (
"intermediate_size {} is not divisible by tp {}.".format(numerator, denominator)
)
class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
@ -610,7 +603,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
ensure_divisibility(intermediate_size, args.tp_size)
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
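The check removed here guarded the shard-size arithmetic shown just above it: `2 * intermediate_size // args.tp_size` silently truncates when `intermediate_size` is not a multiple of the tensor-parallel size. A small illustration of that failure mode (the numbers are made up for the example, not taken from the benchmark):

```python
# Illustration of why the removed ensure_divisibility() assert existed.
intermediate_size = 11008          # example FFN width
for tp_size in (2, 3):
    shard = 2 * intermediate_size // tp_size
    exact = (shard * tp_size == 2 * intermediate_size)
    print(f"tp={tp_size}: shard={shard}, exact={exact}")
# tp=2: shard=11008, exact=True
# tp=3: shard=7338,  exact=False  (the remainder is silently dropped)
```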

View File

@ -1,156 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random_flash,
)
logger = init_logger(__name__)
@torch.inference_mode()
def run_benchmark(
num_tokens: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
kv_cache_dtype: str,
kv_cache_layout: str,
num_iters: int,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
current_platform.seed_everything(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
value = torch.randn_like(key)
# prepare the slot mapping.
# each token is assigned a unique slot in the KV-cache.
num_slots = block_size * num_blocks
if num_tokens > num_slots:
raise ValueError("num_tokens cannot exceed the total number of cache slots")
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
key_caches, value_caches = create_kv_caches_with_random_flash(
num_blocks,
block_size,
1, # num_layers
num_heads,
head_size,
kv_cache_dtype,
dtype,
device=device,
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
ops.reshape_and_cache_flash(
key,
value,
key_cache,
value_cache,
slot_mapping,
kv_cache_dtype,
k_scale,
v_scale,
)
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
# warm-up
run_cuda_benchmark(3)
lat = run_cuda_benchmark(num_iters)
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
return lat
def main(args):
rows = []
for layout in ["NHD", "HND"]:
for exp in range(1, 17):
n_tok = 2**exp
lat = run_benchmark(
num_tokens=n_tok,
num_heads=args.num_heads,
head_size=args.head_size,
block_size=args.block_size,
num_blocks=args.num_blocks,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout,
num_iters=args.iters,
device="cuda",
)
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--num-heads", type=int, default=128)
parser.add_argument(
"--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128,
)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--num-blocks", type=int, default=128 * 512)
parser.add_argument(
"--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="bfloat16",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8"],
default="auto",
)
parser.add_argument("--iters", type=int, default=100)
args = parser.parse_args()
main(args)

View File

@ -4,16 +4,49 @@
# ruff: noqa: E501
import time
# Import DeepGEMM functions
import deep_gemm
import torch
from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
# Import vLLM functions
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8,
w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
# Copied from
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
def per_token_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Convert tensor to FP8 format with per-token scaling."""
assert x.dim() == 2 and x.size(1) % 128 == 0
m, n = x.shape
x_view = x.view(m, -1, 128)
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
# Copied from
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
def per_block_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Convert tensor to FP8 format with per-block scaling."""
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
dtype=x.dtype,
device=x.device)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
def benchmark_shape(m: int,
@ -36,14 +69,14 @@ def benchmark_shape(m: int,
# Pre-quantize B for all implementations
# (weights can be pre-quantized offline)
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
# Block size configuration
block_size = [128, 128]
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
@ -52,7 +85,7 @@ def benchmark_shape(m: int,
# === DeepGEMM Implementation ===
def deepgemm_gemm():
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
(B_deepgemm, B_scale_deepgemm),
C_deepgemm)
return C_deepgemm

View File

@ -5,7 +5,6 @@
#include "cuda_utils.h"
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM
#include "quantization/fp8/amd/quant_utils.cuh"
@ -262,26 +261,14 @@ __global__ void reshape_and_cache_kernel(
}
}
// Used by vectorization_utils to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
cache_t* __restrict__ key_cache, // NHD or HND, shape see comments below
cache_t* __restrict__ value_cache, // same above
cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads,
// head_size]
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
// head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride,
@ -295,58 +282,25 @@ __global__ void reshape_and_cache_flash_kernel(
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int n_elems = num_heads * head_size;
// pointers to the beginning of the source row for this token.
const scalar_t* __restrict__ key_src = key + token_idx * key_stride;
const scalar_t* __restrict__ value_src = value + token_idx * value_stride;
// find the start position inside the kv-cache for this token.
cache_t* __restrict__ key_dst =
key_cache + block_idx * block_stride + block_offset * page_stride;
cache_t* __restrict__ value_dst =
value_cache + block_idx * block_stride + block_offset * page_stride;
// this is true for the NHD layout where `head_stride == head_size`
const bool is_contiguous_heads = (head_stride == head_size);
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
if (is_contiguous_heads) {
// NHD layout
// kv cache: [num_blocks, block_size, num_heads, head_size]
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
blockDim.x, k_op);
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
threadIdx.x, blockDim.x, v_op);
} else {
// HND layout: heads are strided, but each head_size segment is contiguous
// kv cache: [num_blocks, num_heads, block_size, head_size]
const int lane = threadIdx.x & 31; // 0..31 within warp
const int warp_id = threadIdx.x >> 5; // warp index within block
const int warps_per_block = blockDim.x >> 5;
for (int head = warp_id; head < num_heads; head += warps_per_block) {
const scalar_t* __restrict__ k_src_h = key_src + head * head_size;
const scalar_t* __restrict__ v_src_h = value_src + head * head_size;
cache_t* __restrict__ k_dst_h =
key_dst + static_cast<int64_t>(head) * head_stride;
cache_t* __restrict__ v_dst_h =
value_dst + static_cast<int64_t>(head) * head_stride;
// within each head, let the 32 threads of the warp perform the vector
// copy
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
k_op);
vectorize_with_alignment<VEC_SIZE>(v_src_h, v_dst_h, head_size, lane, 32,
v_op);
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_key_value_idx = block_idx * block_stride +
block_offset * page_stride +
head_idx * head_stride + head_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
key_cache[tgt_key_value_idx] = tgt_key;
value_cache[tgt_key_value_idx] = tgt_value;
} else {
key_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
value_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
}
}
}
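For readers following the index math in the restored scalar loop: the flat offset combines the block, in-block slot, head, and within-head position, and the same formula underlies both layouts (only `head_stride` differs). A small Python sketch of that addressing, using the stride names from the kernel signature (not a reimplementation of the kernel):

```python
# Hedged sketch of the KV-cache addressing used in the kernel above.
# NHD cache [num_blocks, block_size, num_heads, head_size]: head_stride == head_size
# HND cache [num_blocks, num_heads, block_size, head_size]: head_stride == block_size * head_size
def cache_offset(slot_idx: int, i: int, *, block_size: int, head_size: int,
                 block_stride: int, page_stride: int, head_stride: int) -> int:
    block_idx, block_offset = divmod(slot_idx, block_size)
    head_idx, head_offset = divmod(i, head_size)
    return (block_idx * block_stride + block_offset * page_stride
            + head_idx * head_stride + head_offset)
```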

View File

@ -24,12 +24,9 @@
#ifndef USE_ROCM
#include <cub/util_type.cuh>
#include <cub/cub.cuh>
#include <cuda/std/functional>
using AddOp = cuda::std::plus<float>;
#else
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
using AddOp = cub::Sum;
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
@ -65,6 +62,7 @@ __launch_bounds__(TPB) __global__
const int thread_row_offset = blockIdx.x * num_cols;
cub::Sum sum;
float threadData(-FLT_MAX);
// Don't touch finished rows.
@ -94,7 +92,7 @@ __launch_bounds__(TPB) __global__
threadData += exp((static_cast<float>(input[idx]) - float_max));
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
if (threadIdx.x == 0)
{

View File

@ -1,3 +1,4 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
@ -15,7 +16,6 @@ ARG PYTHON_VERSION=3.12
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
@ -206,16 +206,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV VLLM_USE_PRECOMPILED=""
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
export VLLM_USE_PRECOMPILED=1 && \
echo "Using precompiled wheels"; \
else \
unset VLLM_USE_PRECOMPILED && \
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
fi
ARG VLLM_USE_PRECOMPILED=""
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
@ -232,6 +223,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
@ -245,9 +238,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
echo "Cleaning up extra wheels in dist/..." && \
# Identify the most recent manylinux1_x86_64 wheel
KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
if [ -n "$KEEP_WHEEL" ]; then \
echo "Keeping wheel: $KEEP_WHEEL"; \
find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
fi; \
fi
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
@ -283,6 +289,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
@ -363,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
fi
# Install vllm wheel first, so that torch etc will be installed.
# !bang
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
@ -427,33 +435,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
CUDA_MAJOR="${CUDA_VERSION%%.*}"
CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
CUDA_MINOR="${CUDA_MINOR%%.*}"
if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
git clone --recursive --shallow-submodules \
${DEEPGEMM_GIT_REPO} deepgemm
echo "🏗️ Building DeepGEMM"
pushd deepgemm
git checkout ${DEEPGEMM_GIT_REF}
# Build DeepGEMM
# (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
rm -rf build dist
rm -rf *.egg-info
python3 setup.py bdist_wheel
uv pip install --system dist/*.whl
popd
rm -rf deepgemm
else
echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
fi
BASH
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
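Relating to the `TODO` above about `VLLM_USE_PRECOMPILED` being sensitive to truthiness (so that `=0` still counts as enabled): a plain truthiness test on the raw environment string treats any non-empty value as true. A hedged sketch of the difference; `use_precompiled()` is a hypothetical helper for illustration, not code from vLLM's setup.py:

```python
import os

# Naive check: any non-empty string, including "0", enables the feature.
naive_enabled = bool(os.environ.get("VLLM_USE_PRECOMPILED", ""))

# Stricter, explicit interpretation (hypothetical helper, not vLLM code):
def use_precompiled() -> bool:
    return os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true", "yes")
```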

View File

@ -172,36 +172,6 @@ Multi-image input can be extended to perform video captioning. We show this with
print(generated_text)
```
#### Custom RGBA Background Color
When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`.
??? code
```python
from vllm import LLM
# Default white background (no configuration needed)
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Custom black background for dark theme
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
)
# Custom brand color background (e.g., blue)
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
)
```
!!! note
- The `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)` where each value is 0-255
- This setting only affects RGBA images with transparency; RGB images are unchanged
- If not specified, the default white background `(255, 255, 255)` is used for backward compatibility
### Video Inputs
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
@ -508,20 +478,6 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
#### Custom RGBA Background Color
To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`:
```bash
# Example: Black background for dark theme
vllm serve llava-hf/llava-1.5-7b-hf \
--media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}'
# Example: Custom gray background
vllm serve llava-hf/llava-1.5-7b-hf \
--media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}'
```
### Audio Inputs
Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).

View File

@ -15,10 +15,6 @@ Speculative decoding is a technique which improves inter-token latency in memory
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
!!! warning
In vllm v0.10.0, speculative decoding with a draft model is not supported.
If you use the following code, you will get a `NotImplementedError`.
??? code
```python
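# A minimal sketch, assuming the `speculative_config` argument of vllm.LLM
# (the original code block is not shown in this diff context, and exact keys
# may differ between vLLM versions): a draft model speculating 5 tokens at a time.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-6.7b",
    speculative_config={
        "model": "facebook/opt-125m",
        "num_speculative_tokens": 5,
    },
)
outputs = llm.generate(["The future of AI is"], SamplingParams(temperature=0.8))
for output in outputs:
    print(output.outputs[0].text)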

View File

@ -103,7 +103,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
}
],
response_format={
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "car-description",

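For context, the snippet above belongs to a chat-completions request payload sent to a vLLM OpenAI-compatible server. The same structured-output request can be made with the official `openai` client roughly as follows, using the `response_format` form shown in the changed lines (a minimal sketch: the model name is a placeholder and the server is assumed to be already running locally):

```python
from openai import OpenAI
from pydantic import BaseModel

class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: str

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    # Placeholder model name; use whatever model the server was launched with.
    model="Qwen/Qwen2.5-7B-Instruct",
    messages=[{
        "role": "user",
        "content": "Generate a JSON with the brand, model and car_type "
                   "of the most iconic car from the 90's",
    }],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "car-description",
            "schema": CarDescription.model_json_schema(),
        },
    },
)
print(completion.choices[0].message.content)
```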
View File

@ -1,6 +1,6 @@
# --8<-- [start:installation]
vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS.
vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS.
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
@ -23,20 +23,20 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source.
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
uv pip install -r requirements/cpu.txt
uv pip install -e .
pip install -r requirements/cpu.txt
pip install -e .
```
!!! note
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
!!! example "Troubleshooting"
If the build fails with errors like the following where standard C++ headers cannot be found, try to remove and reinstall your
If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
[Command Line Tools for Xcode](https://developer.apple.com/download/all/).
```text

View File

@ -1,4 +1,4 @@
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
```bash
sudo apt-get update -y
@ -6,34 +6,28 @@ sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certific
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
Second, clone vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
Third, install Python packages for vLLM CPU backend building:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend auto
uv pip install -r requirements/cpu.txt --torch-backend auto
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
Finally, build and install vLLM CPU backend:
```bash
VLLM_TARGET_DEVICE=cpu python setup.py install
```
If you want to develop vLLM, install it in editable mode instead.
If you want to develop vllm, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu python setup.py develop

View File

@ -1,6 +1,6 @@
# --8<-- [start:installation]
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform.
Currently the CPU implementation for s390x architecture supports FP32 datatype only.
@ -40,32 +40,21 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \
. "$HOME/.cargo/env"
```
Execute the following commands to build and install vLLM from source.
Execute the following commands to build and install vLLM from the source.
!!! tip
Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.
Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM.
```bash
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
uv pip install -v \
--torch-backend auto \
pip install -v \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
-r requirements-build.txt \
-r requirements-cpu.txt \
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
uv pip install dist/*.whl
pip install dist/*.whl
```
??? console "pip"
```bash
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
pip install -v \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
-r requirements-build.txt \
-r requirements-cpu.txt \
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
pip install dist/*.whl
```
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
@ -74,19 +63,19 @@ Execute the following commands to build and install vLLM from source.
```bash
docker build -f docker/Dockerfile.s390x \
--tag vllm-cpu-env .
--tag vllm-cpu-env .
# Launch OpenAI server
# Launching OpenAI server
docker run --rm \
--privileged true \
--shm-size 4g \
-p 8000:8000 \
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
vllm-cpu-env \
--model meta-llama/Llama-3.2-1B-Instruct \
--dtype float \
other vLLM OpenAI server arguments
--privileged=true \
--shm-size=4g \
-p 8000:8000 \
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
vllm-cpu-env \
--model=meta-llama/Llama-3.2-1B-Instruct \
--dtype=float \
other vLLM OpenAI server arguments
```
# --8<-- [end:build-image-from-source]

View File

@ -713,7 +713,6 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ |
### Pooling Models

View File

@ -423,6 +423,51 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
)
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
enforce_eager=True,
mm_processor_kwargs={
"max_image_size": {"longest_edge": 384},
},
limit_mm_per_prompt={modality: 1},
)
prompts = [
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={modality: 1},
)
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1"
@ -496,6 +541,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
# Nemontron_VL
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
limit_mm_per_prompt={modality: 1},
)
assert modality == "image"
placeholder = "<image>"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
[{"role": "user", "content": f"{placeholder}\n{question}"}]
for question in questions
]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# Keye-VL
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@ -551,41 +634,6 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
)
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=4,
tensor_parallel_size=8,
gpu_memory_utilization=0.4,
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [
[
{
"role": "user",
"content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
}
]
for question in questions
]
prompts = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False
)
stop_token_ids = None
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -828,6 +876,41 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
)
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=4,
tensor_parallel_size=8,
gpu_memory_utilization=0.4,
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [
[
{
"role": "user",
"content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
}
]
for question in questions
]
prompts = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False
)
stop_token_ids = None
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -853,44 +936,6 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
)
# Nemontron_VL
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
limit_mm_per_prompt={modality: 1},
)
assert modality == "image"
placeholder = "<image>"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
[{"role": "user", "content": f"{placeholder}\n{question}"}]
for question in questions
]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# NVLM-D
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1248,6 +1293,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1281,111 +1357,6 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
)
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
enforce_eager=True,
mm_processor_kwargs={
"max_image_size": {"longest_edge": 384},
},
limit_mm_per_prompt={modality: 1},
)
prompts = [
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Step3
def run_step3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "stepfun-ai/step3-fp8"
# NOTE: Below are verified configurations for step3-fp8
# on 8xH100 GPUs.
engine_args = EngineArgs(
model=model_name,
max_num_batched_tokens=4096,
gpu_memory_utilization=0.85,
tensor_parallel_size=8,
limit_mm_per_prompt={modality: 1},
reasoning_parser="step3",
)
prompts = [
"<begin▁of▁sentence> You are a helpful assistant. <|BOT|>user\n "
f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={modality: 1},
)
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
model_example_map = {
"aria": run_aria,
"aya_vision": run_aya_vision,
@ -1402,9 +1373,9 @@ model_example_map = {
"idefics3": run_idefics3,
"interns1": run_interns1,
"internvl_chat": run_internvl,
"nemotron_vl": run_nemotron_vl,
"keye_vl": run_keye_vl,
"kimi_vl": run_kimi_vl,
"llama4": run_llama4,
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
@ -1414,8 +1385,8 @@ model_example_map = {
"minicpmv": run_minicpmv,
"mistral3": run_mistral3,
"mllama": run_mllama,
"llama4": run_llama4,
"molmo": run_molmo,
"nemotron_vl": run_nemotron_vl,
"NVLM_D": run_nvlm_d,
"ovis": run_ovis,
"paligemma": run_paligemma,
@ -1430,7 +1401,6 @@ model_example_map = {
"qwen2_5_omni": run_qwen2_5_omni,
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
"step3": run_step3,
"tarsier": run_tarsier,
"tarsier2": run_tarsier2,
}

View File

@ -197,53 +197,6 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_hyperclovax_seed_vision(
question: str, image_urls: list[str]
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=16384,
limit_mm_per_prompt={"image": len(image_urls)},
)
message = {"role": "user", "content": list()}
for _image_url in image_urls:
message["content"].append(
{
"type": "image",
"image": _image_url,
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
}
)
message["content"].append(
{
"type": "text",
"text": question,
}
)
prompt = tokenizer.apply_chat_template(
[
message,
],
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
)
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
@ -272,6 +225,34 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=16,
enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={
"max_image_size": {"longest_edge": 384},
},
)
placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
)
prompt = (
f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "internlm/Intern-S1"
@ -335,36 +316,49 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
def load_hyperclovax_seed_vision(
question: str, image_urls: list[str]
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
max_model_len=131072,
tensor_parallel_size=8,
trust_remote_code=True,
max_model_len=16384,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
message = {"role": "user", "content": list()}
for _image_url in image_urls:
message["content"].append(
{
"type": "image",
"image": _image_url,
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
}
)
message["content"].append(
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
"type": "text",
"text": question,
}
]
)
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
prompt = tokenizer.apply_chat_template(
[
message,
],
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
)
@ -469,6 +463,40 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
)
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=131072,
tensor_parallel_size=8,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
}
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@ -926,62 +954,6 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=16,
enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={
"max_image_size": {"longest_edge": 384},
},
)
placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
)
prompt = (
f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "stepfun-ai/step3-fp8"
# NOTE: Below are verified configurations for step3-fp8
# on 8xH100 GPUs.
engine_args = EngineArgs(
model=model_name,
max_num_batched_tokens=4096,
gpu_memory_utilization=0.85,
tensor_parallel_size=8,
limit_mm_per_prompt={"image": len(image_urls)},
reasoning_parser="step3",
)
prompt = (
"<begin▁of▁sentence> You are a helpful assistant. <|BOT|>user\n "
f"{'<im_patch>' * len(image_urls)}{question} <|EOT|><|BOT|"
">assistant\n<think>\n"
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier-7b"
@ -1034,16 +1006,16 @@ model_example_map = {
"deepseek_vl_v2": load_deepseek_vl2,
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"idefics3": load_idefics3,
"interns1": load_interns1,
"internvl_chat": load_internvl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"keye_vl": load_keye_vl,
"kimi_vl": load_kimi_vl,
"llama4": load_llama4,
"llava": load_llava,
"llava-next": load_llava_next,
"llava-onevision": load_llava_onevision,
"llama4": load_llama4,
"mistral3": load_mistral3,
"mllama": load_mllama,
"NVLM_D": load_nvlm_d,
@ -1056,7 +1028,6 @@ model_example_map = {
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm,
"step3": load_step3,
"tarsier": load_tarsier,
"tarsier2": load_tarsier2,
}

View File

@ -293,8 +293,6 @@ class Proxy:
# add params to request
kv_prepare_request = request.copy()
kv_prepare_request["max_tokens"] = 1
if "max_completion_tokens" in kv_prepare_request:
kv_prepare_request["max_completion_tokens"] = 1
# prefill stage
prefill_instance = self.schedule(self.prefill_cycler)

View File

@ -128,8 +128,6 @@ async def handle_request():
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1
if "max_completion_tokens" in prefill_request:
prefill_request["max_completion_tokens"] = 1
global count
global prefill_instances

View File

@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support)
openai >= 1.87.0, <= 1.90.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support)
pydantic >= 2.10
prometheus_client >= 0.18.0
pillow # Required for image processing

View File

@ -11,4 +11,6 @@ torchaudio==2.7.1
# These must be updated alongside torch
torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
# FlashInfer should be updated together with the Dockerfile
flashinfer_python==0.2.9rc2

View File

@ -22,9 +22,7 @@ aiohttp==3.10.11
aiohttp-cors==0.8.1
# via ray
aiosignal==1.3.1
# via
# aiohttp
# ray
# via aiohttp
albucore==0.0.16
# via terratorch
albumentations==1.4.6
@ -139,7 +137,7 @@ contourpy==1.3.0
# via matplotlib
cramjam==2.9.0
# via fastparquet
cupy-cuda12x==13.3.0
cupy-cuda12x==13.5.1
# via ray
cycler==0.12.1
# via matplotlib
@ -226,7 +224,6 @@ frozenlist==1.5.0
# via
# aiohttp
# aiosignal
# ray
fsspec==2024.9.0
# via
# datasets
@ -603,10 +600,18 @@ opencv-python-headless==4.11.0.86
opentelemetry-api==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
opentelemetry-exporter-prometheus==0.56b0
# via ray
opentelemetry-proto==1.36.0
# via ray
opentelemetry-sdk==1.35.0
# via mlflow-skinny
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# ray
opentelemetry-semantic-conventions==0.56b0
# via opentelemetry-sdk
packaging==24.2
@ -697,7 +702,9 @@ pqdm==0.2.0
pretrainedmodels==0.7.4
# via segmentation-models-pytorch
prometheus-client==0.22.0
# via ray
# via
# opentelemetry-exporter-prometheus
# ray
propcache==0.2.0
# via yarl
proto-plus==1.26.1
@ -707,6 +714,7 @@ protobuf==5.28.3
# google-api-core
# googleapis-common-protos
# mlflow-skinny
# opentelemetry-proto
# proto-plus
# ray
# tensorboardx
@ -854,7 +862,7 @@ rasterio==1.4.3
# rioxarray
# terratorch
# torchgeo
ray==2.43.0
ray==2.48.0
# via -r requirements/test.in
redis==5.2.0
# via tensorizer

setup.py
View File

@ -7,6 +7,7 @@ import json
import logging
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
@ -281,10 +282,69 @@ class cmake_build_ext(build_ext):
self.copy_file(file, dst_file)
class repackage_wheel(build_ext):
class precompiled_wheel_utils:
"""Extracts libraries and other files from an existing wheel."""
def get_base_commit_in_main_branch(self) -> str:
@staticmethod
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
import tempfile
import zipfile
temp_dir = None
try:
if not os.path.isfile(wheel_url_or_path):
wheel_filename = wheel_url_or_path.split("/")[-1]
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_url_or_path} "
f"to {wheel_path}")
from urllib.request import urlretrieve
urlretrieve(wheel_url_or_path, filename=wheel_path)
else:
wheel_path = wheel_url_or_path
print(f"Using existing wheel at {wheel_path}")
package_data_patch = {}
with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
]
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members = list(
filter(lambda x: x.filename in files_to_copy,
wheel.filelist))
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
for file in file_members:
print(f"[extract] {file.filename}")
target_path = os.path.join(".", file.filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with wheel.open(file.filename) as src, open(
target_path, "wb") as dst:
shutil.copyfileobj(src, dst)
pkg = os.path.dirname(file.filename).replace("/", ".")
package_data_patch.setdefault(pkg, []).append(
os.path.basename(file.filename))
return package_data_patch
finally:
if temp_dir is not None:
print(f"Removing temporary directory {temp_dir}")
shutil.rmtree(temp_dir)
@staticmethod
def get_base_commit_in_main_branch() -> str:
# Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
@ -297,6 +357,10 @@ class repackage_wheel(build_ext):
]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
# In Docker build context, .git may be immutable or missing.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
return upstream_main_commit
# Check if the upstream_main_commit exists in the local repo
try:
subprocess.check_output(
@ -329,92 +393,15 @@ class repackage_wheel(build_ext):
"wheel may not be compatible with your dev branch: %s", err)
return "nightly"
def run(self) -> None:
assert _is_cuda(
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is None:
base_commit = self.get_base_commit_in_main_branch()
wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fall back to the nightly wheel if the latest commit's wheel is unavailable;
# in this rare case, the nightly release CI hasn't finished on main yet.
if not is_url_available(wheel_location):
wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
import zipfile
if os.path.isfile(wheel_location):
wheel_path = wheel_location
print(f"Using existing wheel={wheel_path}")
else:
# Download the wheel from a given URL, assume
# the filename is the last part of the URL
wheel_filename = wheel_location.split("/")[-1]
import tempfile
# create a temporary directory to store the wheel
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_location} to {wheel_path}")
from urllib.request import urlretrieve
try:
urlretrieve(wheel_location, filename=wheel_path)
except Exception as e:
from setuptools.errors import SetupError
raise SetupError(
f"Failed to get vLLM wheel from {wheel_location}") from e
with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet
]
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
for file in file_members:
print(f"Extracting and including {file.filename} "
"from existing wheel")
package_name = os.path.dirname(file.filename).replace("/", ".")
file_name = os.path.basename(file.filename)
if package_name not in package_data:
package_data[package_name] = []
wheel.extract(file)
if file_name.endswith(".py"):
# python files shouldn't be added to package_data
continue
package_data[package_name].append(file_name)
def _no_device() -> bool:
return VLLM_TARGET_DEVICE == "empty"
def _is_cuda() -> bool:
# Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
return True
has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
and not (_is_neuron() or _is_tpu()))
@ -639,16 +626,37 @@ package_data = {
]
}
# If using precompiled, extract and patch package_data (in advance of setup)
if envs.VLLM_USE_PRECOMPILED:
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
wheel_url = wheel_location
else:
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
from urllib.request import urlopen
try:
with urlopen(wheel_url) as resp:
if resp.status != 200:
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
except Exception as e:
print(f"[warn] Falling back to nightly wheel: {e}")
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
wheel_url)
for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files)
if _no_device():
ext_modules = []
if not ext_modules:
if not ext_modules or envs.VLLM_USE_PRECOMPILED:
# Disable build_ext when using precompiled wheel
cmdclass = {}
else:
cmdclass = {
"build_ext":
repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
}
cmdclass = {"build_ext": cmake_build_ext}
setup(
# static metadata should rather go in pyproject.toml
@ -663,9 +671,7 @@ setup(
["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile",
"mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.2.9rc2"],
"video": [] # Kept for backwards compatibility
},
cmdclass=cmdclass,
package_data=package_data,
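For context on the file-selection logic in precompiled_wheel_utils above: the regex (derived from glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True), as noted in the removed code) picks up Python sources under vllm/vllm_flash_attn while skipping hidden files. A small self-contained check, with illustrative file names:
import re

compiled_regex = re.compile(
    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
assert compiled_regex.match("vllm/vllm_flash_attn/flash_attn_interface.py")
assert compiled_regex.match("vllm/vllm_flash_attn/layers/rotary.py")
assert not compiled_regex.match("vllm/vllm_flash_attn/.hidden.py")
assert not compiled_regex.match("vllm/_C.abi3.so")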

View File

@ -136,15 +136,12 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"test_model",
[
TestAllReduceRMSNormModel,
TestAllReduceFusedAddRMSNormModel,
TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
# TODO: Enable with torch==2.8.0
# TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
])
@pytest.mark.parametrize("test_model", [
TestAllReduceRMSNormModel,
TestAllReduceFusedAddRMSNormModel,
TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [8])
@pytest.mark.parametrize("hidden_size", [16])

View File

@ -28,6 +28,7 @@ def test_mp_reducer(monkeypatch):
max_model_len=32,
gpu_memory_utilization=0.1,
disable_log_stats=True,
disable_log_requests=True,
)
async_llm = AsyncLLM.from_engine_args(

View File

@ -5,7 +5,7 @@ import json
from argparse import ArgumentError
from contextlib import nullcontext
from dataclasses import dataclass, field
from typing import Annotated, Literal, Optional, Union
from typing import Annotated, Literal, Optional
import pytest
@ -136,8 +136,6 @@ class DummyConfig:
"""List with variable length"""
list_literal: list[Literal[1, 2]] = field(default_factory=list)
"""List with literal choices"""
list_union: list[Union[str, type[object]]] = field(default_factory=list)
"""List with union type"""
literal_literal: Literal[Literal[1], Literal[2]] = 1
"""Literal of literals with default 1"""
json_tip: dict = field(default_factory=dict)
@ -189,9 +187,6 @@ def test_get_kwargs():
assert kwargs["list_literal"]["type"] is int
assert kwargs["list_literal"]["nargs"] == "+"
assert kwargs["list_literal"]["choices"] == [1, 2]
# lists with unions should become str type.
# If not, we cannot know which type to use for parsing
assert kwargs["list_union"]["type"] is str
# literals of literals should have merged choices
assert kwargs["literal_literal"]["choices"] == [1, 2]
# dict should have json tip in help

View File

@ -1,10 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import torch
import vllm._custom_ops as ops
from vllm.utils.deep_gemm import per_block_cast_to_fp8
def per_token_cast_to_fp8(
@ -20,6 +20,29 @@ def per_token_cast_to_fp8(
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
def per_block_cast_to_fp8(
x: torch.Tensor, block_size_k: int,
block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros(
(
int(math.ceil(m / block_size_k)) * block_size_k,
int(math.ceil(n / block_size_n)) * block_size_n,
),
dtype=x.dtype,
device=x.device,
)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, block_size_k,
x_padded.size(1) // block_size_k, block_size_n)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
return x_scaled_sub, scales
def make_non_quant_weights(
e: int,
n: int,
@ -76,9 +99,11 @@ def make_block_quant_fp8_weights(
for i in range(e):
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
block_size=[block_k, block_n])
block_size_k=block_k,
block_size_n=block_n)
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
block_size=[block_k, block_n])
block_size_k=block_k,
block_size_n=block_n)
return w1, w2, w1_s, w2_s
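A minimal shape sketch for the per_block_cast_to_fp8 helper added above, assuming a PyTorch build with float8_e4m3fn support; the tensor sizes are arbitrary:
import torch

x = torch.randn(300, 500, dtype=torch.bfloat16)
q, scales = per_block_cast_to_fp8(x, block_size_k=128, block_size_n=128)
assert q.shape == (300, 500)              # quantized values keep the original shape
assert q.dtype == torch.float8_e4m3fn
assert scales.shape == (3, 4)             # one scale per (128, 128) block: ceil(300/128) x ceil(500/128)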

View File

@ -1,103 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedPrepareAndFinalize, BatchedTritonExperts)
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.utils.deep_gemm import calc_diff, is_deep_gemm_supported
from .test_deepgemm import make_block_quant_fp8_weights
BLOCK_SIZE = [128, 128]
@pytest.mark.skipif(not is_deep_gemm_supported(),
reason="Requires deep_gemm kernels")
@pytest.mark.parametrize("E", [16, 32]) # number of experts
@pytest.mark.parametrize("T", [256, 512]) # tokens per expert
@pytest.mark.parametrize("K", [128, 256]) # hidden dim
@pytest.mark.parametrize("N", [512, 1024]) # intermediate dim per expert
@pytest.mark.parametrize("topk", [2, 4])
def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
monkeypatch):
"""Compare BatchedDeepGemmExperts to BatchedTritonExperts."""
monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
device = "cuda"
w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(E, N, K, BLOCK_SIZE)
M = E * T # total tokens
a = torch.randn(M, K, device=device, dtype=torch.bfloat16) / 10.0
fp8_info = torch.finfo(torch.float8_e4m3fn)
a.clamp_(fp8_info.min, fp8_info.max)
# random router outputs → top-k indices / weights
router_logits = torch.randn(M, E, device=device, dtype=torch.float32)
topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
# token number for each expert
cnt = torch.bincount(topk_ids.flatten(), minlength=E)
max_cnt = int(cnt.max().item())
# next power of 2 for max token number
max_num_tokens = 1 << (max_cnt - 1).bit_length()
prep_finalize = BatchedPrepareAndFinalize(
max_num_tokens=max_num_tokens,
num_local_experts=E,
num_dispatchers=1,
rank=0,
)
# triton (reference)
triton_experts = BatchedTritonExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=1,
use_fp8_w8a8=True,
per_act_token_quant=False,
block_shape=BLOCK_SIZE,
)
mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts)
out_triton = mk_triton(
hidden_states=a,
w1=w1,
w2=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False,
w1_scale=w1_s,
w2_scale=w2_s,
global_num_experts=E,
)
# deepgemm
deepgemm_experts = BatchedDeepGemmExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=1,
block_shape=BLOCK_SIZE,
per_act_token_quant=False,
)
mk_deepgemm = FusedMoEModularKernel(prep_finalize, deepgemm_experts)
out_deepgemm = mk_deepgemm(
hidden_states=a,
w1=w1,
w2=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False,
w1_scale=w1_s,
w2_scale=w2_s,
global_num_experts=E,
)
diff = calc_diff(out_deepgemm, out_triton)
assert diff < 1e-3, f"Output diff too large: {diff}"
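One detail worth calling out from the removed test above: max_num_tokens is rounded up to the next power of two via (max_cnt - 1).bit_length(). A tiny standalone check of that rounding:
def next_pow2(n: int) -> int:
    return 1 << (n - 1).bit_length()

assert next_pow2(1) == 1
assert next_pow2(5) == 8
assert next_pow2(8) == 8
assert next_pow2(9) == 16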

View File

@ -12,8 +12,10 @@ import torch
from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import cdiv
from vllm.utils.deep_gemm import per_block_cast_to_fp8
def cdiv(a, b):
return (a + b - 1) // b
def per_token_cast_to_fp8(
@ -30,6 +32,21 @@ def per_token_cast_to_fp8(
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
def per_block_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128),
device=x.device,
dtype=x.dtype)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn)
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
(4, 8192, 7168, 4096),
(4, 8192, 2048, 7168),

View File

@ -20,8 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
is_deep_gemm_supported)
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights
@ -47,7 +46,7 @@ requires_deep_ep = pytest.mark.skipif(
)
requires_deep_gemm = pytest.mark.skipif(
not is_deep_gemm_supported(),
not has_deep_gemm(),
reason="Requires deep_gemm kernels",
)

View File

@ -15,11 +15,16 @@ import torch
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8)
from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
per_block_cast_to_fp8)
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
BLOCK_SIZE = [128, 128]
requires_deep_gemm = pytest.mark.skipif(
not has_deep_gemm(),
reason="Requires deep_gemm kernels",
)
def make_block_quant_fp8_weights(
e: int,
@ -64,12 +69,8 @@ def make_block_quant_fp8_weights(
dtype=torch.float32)
for i in range(e):
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
block_size=block_size,
use_ue8m0=True)
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
block_size=block_size,
use_ue8m0=True)
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
return w1, w2, w1_s, w2_s
@ -147,8 +148,7 @@ NUM_EXPERTS = [32]
@pytest.mark.parametrize("mnk", MNKs)
@pytest.mark.parametrize("topk", TOPKS)
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
@pytest.mark.skipif(not is_deep_gemm_supported(),
reason="Requires deep_gemm kernels")
@requires_deep_gemm
def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch):
with monkeypatch.context() as m:

View File

@ -5,7 +5,8 @@ from typing import Optional
import torch
import vllm._custom_ops as ops
from tests.kernels.quant_utils import per_block_cast_to_int8
from tests.kernels.quant_utils import (per_block_cast_to_fp8,
per_block_cast_to_int8)
from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
@ -14,7 +15,6 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.utils import round_up
from vllm.utils.deep_gemm import per_block_cast_to_fp8
def triton_moe(

View File

@ -222,6 +222,25 @@ def native_per_token_group_quant_int8(x,
DEFAULT_BLOCK_SHAPE = [128, 128]
def per_block_cast_to_fp8(
x: torch.Tensor,
block_shape: list[int] = DEFAULT_BLOCK_SHAPE,
) -> tuple[torch.Tensor, torch.Tensor]:
block_m, block_n = block_shape
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)),
dtype=x.dtype,
device=x.device)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
return x_scaled_sub, scales
def per_block_cast_to_int8(
x: torch.Tensor,
block_shape: list[int] = DEFAULT_BLOCK_SHAPE,

View File

@ -117,7 +117,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size)
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)
As = As_fp8.to(torch.float32)
Bs = Bs_fp8.to(torch.float32)

View File

@ -559,6 +559,8 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
m_a_scales = m_g if per_act_token else 1
n_b_scales = n_g if per_out_ch else 1
print("shape:", m_g, n_g, k_g)
# Create group-specific A and B (FP8) and output (FP16/FP32)
a_g = to_fp8(torch.randn((m_g, k_g), device=device))
b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())
@ -637,4 +639,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
for g in range(num_experts):
baseline = baseline_tensors[g]
c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
print(baseline)
print(c)
print("*")
torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)

View File

@ -1,158 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Script to test multi loras service with tp >= 2
"""
from tests.utils import multi_gpu_test
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
MODEL_PATH = "Qwen/Qwen3-0.6B"
LORA_NAME_PATH_MAP = {
"Alice": "charent/self_cognition_Alice",
"Bob": "charent/self_cognition_Bob",
"Cat": "charent/self_cognition_Bob", # same as Bob
}
LORA_NAME_ID_MAP = {}
INCREASE_LORA_ID = 0
LORA_RANK = 8
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
LORA_TEST_EXPECTED = [
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
"I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501
]
def format_chatml_messages(prompt: str):
return [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": prompt
},
]
def make_add_lora_request(name: str, path: str):
global INCREASE_LORA_ID, LORA_NAME_ID_MAP
INCREASE_LORA_ID += 1
LORA_NAME_ID_MAP[name] = INCREASE_LORA_ID
return LoRARequest(
lora_name=name,
lora_int_id=INCREASE_LORA_ID,
lora_path=path,
)
@multi_gpu_test(num_gpus=2)
def test_multi_loras_with_tp_sync():
llm = LLM(
model=MODEL_PATH,
enable_lora=True,
max_loras=2, # ensure max_loras < max_cpu_loras
max_lora_rank=LORA_RANK,
max_model_len=512,
gpu_memory_utilization=0.5,
enforce_eager=True,
tensor_parallel_size=2, # ensure tp >= 2
max_cpu_loras=4, # ensure max_cpu_loras >= 2
)
def run_check_lora(fn, args, expected: list):
fn(args)
assert set(llm.llm_engine.list_loras()) == set(expected)
# simulate add loras with CLI args
# likes: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob`
run_check_lora(
llm.llm_engine.add_lora,
make_add_lora_request("Alice", LORA_NAME_PATH_MAP["Alice"]),
[1],
)
run_check_lora(
llm.llm_engine.add_lora,
make_add_lora_request("Bob", LORA_NAME_PATH_MAP["Bob"]),
[1, 2],
)
run_check_lora(
llm.llm_engine.add_lora,
make_add_lora_request("Cat", LORA_NAME_PATH_MAP["Cat"]),
[1, 2, 3],
)
# set temperature = 0 for greedy search
sampling_params = SamplingParams(temperature=0, max_tokens=64)
def call_llm_get_outputs(prompt: str, lora_name: str):
lora_request = LoRARequest(
lora_name=lora_name,
lora_int_id=LORA_NAME_ID_MAP[lora_name],
lora_path=LORA_NAME_PATH_MAP[lora_name],
)
messages = format_chatml_messages(prompt)
outputs = llm.chat(
[messages],
sampling_params,
chat_template_kwargs={
"enable_thinking": False
}, # for those loras, ensure enable_thinking=False
lora_request=lora_request,
use_tqdm=False,
)
output_text = outputs[0].outputs[0].text
return output_text
def reload_lora(name: str):
"""
reload a lora to simulate the case:
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
for dynamic lora loading and unloading
"""
remove_lora_response = llm.llm_engine.remove_lora(
lora_id=LORA_NAME_ID_MAP[name])
add_lora_response = llm.llm_engine.add_lora(
make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
print(f"{remove_lora_response=}, {add_lora_response=}")
def check_outputs(outputs: str, expected: str):
print(f"{prompt=}.\n{expected_output=}\n{output_text=}")
print("\n----------------------------\n")
assert outputs == expected
for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)
# call Bob, ignore what it is output
call_llm_get_outputs(prompt, "Bob")
print("After call Bob:")
# call Alice
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)
# reload Bob Lora
reload_lora("Bob")
print("After reload Bob:")
# call Alice
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)
# reload Alice Lora
reload_lora("Alice")
print("After reload Alice:")
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)

View File

@ -4,6 +4,8 @@ from dataclasses import dataclass
from typing import Optional
import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
import vllm
from vllm.assets.image import ImageAsset
@ -183,6 +185,10 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
current_platform.is_rocm(),
reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
)
@pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
)
def test_qwen25vl_lora(qwen25vl_lora_files):
"""Test Qwen 2.5 VL model with LoRA"""
config = TestConfig(model_path=QWEN25VL_MODEL_PATH,

View File

@ -702,38 +702,13 @@ VLM_TEST_SETTINGS = {
"smolvlm": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
),
"tarsier": VLMTestInfo(
models=["omni-research/Tarsier-7b"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
patch_hf_runner=model_utils.tarsier_patch_hf_runner,
),
"tarsier2": VLMTestInfo(
models=["omni-research/Tarsier2-Recap-7b"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO,
),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skip("Model initialization hangs")],
),
### Tensor parallel / multi-gpu broadcast tests
"chameleon-broadcast": VLMTestInfo(
models=["facebook/chameleon-7b"],

View File

@ -818,15 +818,3 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
thinker.get_output_embeddings = lambda: thinker.lm_head
hf_model.model = thinker
return hf_model
def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
from vllm.model_executor.models.tarsier import get_vision_encoder_info
vision_encoder_info = get_vision_encoder_info(hf_model.config)
hf_processor = hf_model.processor
if hf_processor.patch_size is None:
hf_processor.patch_size = vision_encoder_info.get_patch_size()
return hf_model

View File

@ -1,51 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize("num_frames", [32, 128])
@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
def test_processor_override(
model_id: str,
expected_toks_per_frame: int,
expected_grid_t: int,
fps: int,
num_frames: int,
):
"""Ensure GLM4vMultiModalProcessor can handle video frames properly."""
ctx = build_model_context(
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass
video_assets = VideoAsset(name="baby_reading", num_frames=num_frames)
prompt = "<|begin_of_video|><|video|><|end_of_video|>"
video, metadata = video_assets.np_ndarrays, video_assets.metadata
metadata["fps"] = fps
mm_data = {"video": [(video, metadata)]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
video_tok_count = processed_inputs["prompt_token_ids"].count(
video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t

View File

@ -16,7 +16,7 @@ def test_multimodal_processor(model_id):
model_impl="transformers",
)
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
image_pil = ImageAsset('cherry_blossom').pil_image
mm_data = {"image": image_pil}

View File

@ -465,7 +465,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False),
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
trust_remote_code=True),
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
"Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501
"VoxtralForConditionalGeneration": _HfExamplesInfo(
@ -525,7 +526,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
}
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"),
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
"TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
}

View File

@ -34,7 +34,8 @@ def check_implementation(
with runner_test(model, **kwargs_test, **kwargs) as model_test:
model_config = model_test.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
outputs_test = model_test.generate_greedy_logprobs(*args)
@ -134,7 +135,8 @@ def test_quantization(
enforce_eager=True,
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
transformers_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
@ -147,25 +149,6 @@ def test_quantization(
)
@pytest.mark.parametrize(
"model",
[
# Layers live in `layers`
"Qwen/Qwen3-Embedding-0.6B",
# Layers live in `model.layers`
"meta-llama/Llama-3.2-1B-Instruct"
],
)
def test_embed_loading(vllm_runner, model):
with vllm_runner(model,
max_model_len=1024,
enforce_eager=True,
runner="pooling",
model_impl="transformers") as model_test:
model_config = model_test.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
@pytest.mark.parametrize(
"model",
["jason9693/Qwen2.5-1.5B-apeach"],
@ -186,7 +169,8 @@ def test_classify(
dtype=dtype,
model_impl="transformers") as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
vllm_outputs = vllm_model.classify(example_prompts)

View File

@ -16,7 +16,7 @@ NUM_EXPECTED_TOKENS = 10
NUM_REQUESTS = 10000
# Scenarios to test for num generated token.
ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
@pytest.fixture(scope="function")

View File

@ -3,10 +3,9 @@
from pathlib import Path
import numpy as np
import pytest
from PIL import Image, ImageChops
from vllm.multimodal.image import ImageMediaIO, convert_image_mode
from vllm.multimodal.image import convert_image_mode
ASSETS_DIR = Path(__file__).parent / "assets"
assert ASSETS_DIR.exists()
@ -36,115 +35,3 @@ def test_rgba_to_rgb():
assert converted_image_numpy[i][j][0] == 255
assert converted_image_numpy[i][j][1] == 255
assert converted_image_numpy[i][j][2] == 255
def test_rgba_to_rgb_custom_background(tmp_path):
"""Test RGBA to RGB conversion with custom background colors."""
# Create a simple RGBA image with transparent and opaque pixels
rgba_image = Image.new("RGBA", (10, 10),
(255, 0, 0, 255)) # Red with full opacity
# Make top-left quadrant transparent
for i in range(5):
for j in range(5):
rgba_image.putpixel((i, j), (0, 0, 0, 0)) # Fully transparent
# Save the test image to tmp_path
test_image_path = tmp_path / "test_rgba.png"
rgba_image.save(test_image_path)
# Test 1: Default white background (backward compatibility)
image_io_default = ImageMediaIO()
converted_default = image_io_default.load_file(test_image_path)
default_numpy = np.array(converted_default)
# Check transparent pixels are white
assert default_numpy[0][0][0] == 255 # R
assert default_numpy[0][0][1] == 255 # G
assert default_numpy[0][0][2] == 255 # B
# Check opaque pixels remain red
assert default_numpy[5][5][0] == 255 # R
assert default_numpy[5][5][1] == 0 # G
assert default_numpy[5][5][2] == 0 # B
# Test 2: Custom black background via kwargs
image_io_black = ImageMediaIO(rgba_background_color=(0, 0, 0))
converted_black = image_io_black.load_file(test_image_path)
black_numpy = np.array(converted_black)
# Check transparent pixels are black
assert black_numpy[0][0][0] == 0 # R
assert black_numpy[0][0][1] == 0 # G
assert black_numpy[0][0][2] == 0 # B
# Check opaque pixels remain red
assert black_numpy[5][5][0] == 255 # R
assert black_numpy[5][5][1] == 0 # G
assert black_numpy[5][5][2] == 0 # B
# Test 3: Custom blue background via kwargs (as list)
image_io_blue = ImageMediaIO(rgba_background_color=[0, 0, 255])
converted_blue = image_io_blue.load_file(test_image_path)
blue_numpy = np.array(converted_blue)
# Check transparent pixels are blue
assert blue_numpy[0][0][0] == 0 # R
assert blue_numpy[0][0][1] == 0 # G
assert blue_numpy[0][0][2] == 255 # B
# Test 4: Test with load_bytes method
with open(test_image_path, 'rb') as f:
image_data = f.read()
image_io_green = ImageMediaIO(rgba_background_color=(0, 255, 0))
converted_green = image_io_green.load_bytes(image_data)
green_numpy = np.array(converted_green)
# Check transparent pixels are green
assert green_numpy[0][0][0] == 0 # R
assert green_numpy[0][0][1] == 255 # G
assert green_numpy[0][0][2] == 0 # B
def test_rgba_background_color_validation():
"""Test that invalid rgba_background_color values are properly rejected."""
# Test invalid types
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color="255,255,255")
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color=255)
# Test wrong number of elements
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color=(255, 255))
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color=(255, 255, 255, 255))
# Test non-integer values
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color=(255.0, 255.0, 255.0))
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color=(255, "255", 255))
# Test out of range values
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color=(256, 255, 255))
with pytest.raises(ValueError,
match="rgba_background_color must be a list or tuple"):
ImageMediaIO(rgba_background_color=(255, -1, 255))
# Test that valid values work
ImageMediaIO(rgba_background_color=(0, 0, 0)) # Should not raise
ImageMediaIO(rgba_background_color=[255, 255, 255]) # Should not raise
ImageMediaIO(rgba_background_color=(128, 128, 128)) # Should not raise
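For reference, the behaviour these removed tests exercised amounts to flattening RGBA pixels onto a configurable background colour. A standalone PIL sketch of that idea (not the ImageMediaIO implementation itself; the helper name is illustrative):
from PIL import Image

def flatten_rgba(img: Image.Image, background=(255, 255, 255)) -> Image.Image:
    base = Image.new("RGBA", img.size, background + (255,))
    return Image.alpha_composite(base, img.convert("RGBA")).convert("RGB")

rgba = Image.new("RGBA", (4, 4), (255, 0, 0, 0))          # fully transparent red
assert flatten_rgba(rgba, (0, 0, 0)).getpixel((0, 0)) == (0, 0, 0)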

View File

@ -2,15 +2,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import nullcontext
from typing import Optional, cast
from types import MethodType
from typing import cast
from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from transformers import ProcessorMixin
from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
MultiModalKwargsItem,
@ -1012,91 +1013,57 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
)
class DummyProcessor:
class _ProcessorProxy:
def __init__(self, a: int = 0, b: int = 0) -> None:
def __init__(self, processor: ProcessorMixin) -> None:
super().__init__()
self.a = a
self.b = b
self.__processor = processor
def __getattr__(self, key: str):
return getattr(self.__processor, key)
def __call__(
self,
a: int = 0,
c: int = 0,
return_tensors: Optional[str] = None,
) -> dict[str, int]:
return dict(a=a, c=c)
text=None,
images=None,
videos=None,
exists=None,
return_tensors=None,
):
return dict(exists=exists)
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
# yapf: disable
@pytest.mark.parametrize(
("config_kwargs", "inference_kwargs", "expected_kwargs"),
("call_kwargs", "expected_kwargs"),
[
({"a": 1}, {}, {"a": 1, "b": 0}),
({}, {"a": 1}, {"a": 1, "b": 0}),
# inference_kwargs should take precedence
({"a": 1}, {"a": 2}, {"a": 2, "b": 0}),
# Should ignore extra kwargs
({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}),
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
# Should ignore invalid kwargs
({"does_not_exist": 100}, {"exists": None}),
({"exists": 1}, {"exists": 1}),
({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
],
)
# yapf: enable
def test_hf_processor_init_kwargs(
model_id,
config_kwargs,
inference_kwargs,
expected_kwargs,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer = cast(AnyTokenizer, object())
def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
model_config = ModelConfig(model_id)
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
tokenizer=mock_tokenizer,
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
orig_get_hf_processor = processor.info.get_hf_processor
def get_hf_processor(self, **kwargs):
assert kwargs == call_kwargs
return _ProcessorProxy(orig_get_hf_processor())
processor.info.get_hf_processor = MethodType(get_hf_processor,
processor.info)
out_kwargs = processor._call_hf_processor(
prompt="",
mm_data={},
mm_kwargs=call_kwargs,
tok_kwargs={},
)
processor = ctx.get_hf_processor(
DummyProcessor, # type: ignore[arg-type]
**inference_kwargs,
)
for k, v in expected_kwargs.items():
assert getattr(processor, k) == v
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
@pytest.mark.parametrize(
("config_kwargs", "inference_kwargs", "expected_kwargs"),
[
({"a": 1}, {}, {"a": 1, "c": 0}),
({}, {"a": 1}, {"a": 1, "c": 0}),
# inference_kwargs should take precedence
({"a": 1}, {"a": 2}, {"a": 2, "c": 0}),
# Should ignore extra kwargs
({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}),
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
],
)
# yapf: enable
def test_hf_processor_call_kwargs(
model_id,
config_kwargs,
inference_kwargs,
expected_kwargs,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer = cast(AnyTokenizer, object())
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
tokenizer=mock_tokenizer,
)
processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type]
result = ctx.call_hf_processor(processor, {}, inference_kwargs)
assert result == expected_kwargs
assert out_kwargs == expected_kwargs
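The rewritten test above patches processor.info.get_hf_processor with types.MethodType so the replacement is bound to that one instance and still receives self. A standalone sketch of the binding pattern (class and method names are illustrative):
from types import MethodType

class Info:
    def get_hf_processor(self, **kwargs):
        return "original"

info = Info()

def patched(self, **kwargs):
    return ("patched", kwargs)

info.get_hf_processor = MethodType(patched, info)   # bound to this instance only
assert info.get_hf_processor(fps=2) == ("patched", {"fps": 2})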

View File

@ -1,26 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
@pytest.mark.parametrize(
"model_path",
[("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")])
def test_llama(vllm_runner, example_prompts, model_path):
with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens=20)
print(vllm_outputs)
assert vllm_outputs
@pytest.mark.parametrize(
"model_path",
[("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")])
def test_qwen(vllm_runner, example_prompts, model_path):
with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens=20)
print(vllm_outputs)
assert vllm_outputs

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from torch_xla._internal import tpu
import vllm
from vllm.lora.request import LoRARequest
@ -28,31 +27,25 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch):
yield
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
def setup_vllm(num_loras: int) -> vllm.LLM:
return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
num_scheduler_steps=1,
max_model_len=256,
max_seq_len_to_capture=256,
max_num_seqs=8,
tensor_parallel_size=tp,
enable_lora=True,
max_loras=num_loras,
max_lora_rank=8)
TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips()
] if tpu.num_available_chips() > 1 else [1]
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_single_lora(tp: int):
def test_single_lora():
"""
This test ensures we can run a single LoRA adapter on the TPU backend.
We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
will force Qwen2.5-3B-Instruct to claim 1+1=1.
"""
llm = setup_vllm(1, tp)
llm = setup_vllm(1)
prompt = "What is 1+1? \n"
@ -70,8 +63,7 @@ def test_single_lora(tp: int):
assert int(answer) == 1
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_lora_hotswapping(tp: int):
def test_lora_hotswapping():
"""
This test ensures we can run multiple LoRA adapters on the TPU backend, even
if we only have space to store 1.
@ -87,7 +79,7 @@ def test_lora_hotswapping(tp: int):
for i in range(1, 5)
]
llm = setup_vllm(1, tp)
llm = setup_vllm(1)
prompt = "What is 1+1? \n"
@ -102,8 +94,7 @@ def test_lora_hotswapping(tp: int):
assert int(answer) == i + 1
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_multi_lora(tp: int):
def test_multi_lora():
"""
This test ensures we can run multiple LoRA adapters on the TPU backend, when
we have enough space to store all of them.
@ -118,7 +109,7 @@ def test_multi_lora(tp: int):
for i in range(1, 5)
]
llm = setup_vllm(4, tp)
llm = setup_vllm(4)
prompt = "What is 1+1? \n"

View File

@ -26,10 +26,12 @@ if not current_platform.is_cuda():
TEXT_ENGINE_ARGS = AsyncEngineArgs(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
disable_log_requests=True,
)
VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
enforce_eager=True)
enforce_eager=True,
disable_log_requests=True)
TEXT_PROMPT = "Hello my name is Robert and"

View File

@ -213,29 +213,3 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
assert len(num_accepted_tokens_per_pos) == 1
assert isinstance(num_accepted_tokens_per_pos[0], Vector)
assert len(num_accepted_tokens_per_pos[0].values) == 5
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str,
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1")
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM(
model=model,
skip_tokenizer_init=True,
enforce_eager=True,
)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError, match="cannot pass text prompts when"):
llm.generate("abc", sampling_params)
outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
sampling_params=sampling_params)
assert len(outputs) > 0
completions = outputs[0].outputs
assert len(completions) > 0
assert completions[0].text == ""
assert completions[0].token_ids

View File

@ -25,6 +25,7 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
enforce_eager=True,
disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
data_parallel_size=DP_SIZE,
)

View File

@ -44,9 +44,9 @@ from vllm.attention.layer import Attention
from vllm.attention.ops.paged_attn import PagedAttention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
make_tensor_with_pad)
from vllm.utils.flashinfer import use_trtllm_decode_attention
logger = init_logger(__name__)
@ -56,6 +56,7 @@ if TYPE_CHECKING:
class FlashInferBackend(AttentionBackend):
cached_sm100a_supported: Optional[bool] = None
@staticmethod
def get_name() -> str:
@ -122,6 +123,47 @@ class FlashInferBackend(AttentionBackend):
else:
raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
@staticmethod
def use_trtllm_decode_attention(
batch_size: int,
max_seq_len: int,
kv_cache_dtype: str,
num_qo_heads: Optional[int],
num_kv_heads: Optional[int],
attn_head_size: Optional[int],
) -> bool:
if FlashInferBackend.cached_sm100a_supported is None:
FlashInferBackend.cached_sm100a_supported = (
current_platform.has_device_capability(100))
if not FlashInferBackend.cached_sm100a_supported:
return False
# Check if the dimensions are supported by TRTLLM decode attention
if (attn_head_size is None or num_qo_heads is None
or num_kv_heads is None or num_qo_heads // num_kv_heads > 8
or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128):
return False
env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION
if env_value is not None:
logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s",
env_value)
# Environment variable is set - respect it
# Check explicitly against "0" because the path is
# enabled automatically when the batch size condition
# is satisfied.
no_use_trtllm = (env_value == "0")
if not no_use_trtllm:
logger.info_once("Using TRTLLM decode attention.")
return not no_use_trtllm
else:
# Environment variable not set - use auto-detection
use_trtllm = (FlashInferBackend.cached_sm100a_supported
and batch_size <= 256 and max_seq_len < 131072
and kv_cache_dtype == "auto")
if use_trtllm:
logger.warning_once(
"Using TRTLLM decode attention (auto-detected).")
return use_trtllm
@dataclass
class PerLayerParameters:
@ -1114,7 +1156,7 @@ class FlashInferImpl(AttentionImpl):
assert decode_meta.decode_wrapper._sm_scale == softmax_scale
# TODO: @pavanimajety Remove this once the switch happens
# inside flashinfer.
if not use_trtllm_decode_attention(
if not FlashInferBackend.use_trtllm_decode_attention(
num_decode_tokens, attn_metadata.max_decode_seq_len,
kv_cache_dtype, attn_metadata.num_qo_heads,
attn_metadata.num_kv_heads, attn_metadata.head_dim):
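To make the auto-detection branch in use_trtllm_decode_attention above easier to follow, here is a condensed, hypothetical restatement of its thresholds; only the numeric limits and the dtype check come from the diff, and the function name below is illustrative:
def _auto_use_trtllm_decode(sm100a: bool, batch_size: int,
                            max_seq_len: int, kv_cache_dtype: str) -> bool:
    return (sm100a and batch_size <= 256 and max_seq_len < 131072
            and kv_cache_dtype == "auto")

assert _auto_use_trtllm_decode(True, 128, 4096, "auto")
assert not _auto_use_trtllm_decode(True, 512, 4096, "auto")     # batch too large
assert not _auto_use_trtllm_decode(True, 128, 200_000, "auto")  # sequence too long
assert not _auto_use_trtllm_decode(False, 128, 4096, "auto")    # not SM100a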

View File

@ -267,15 +267,8 @@ def _support_torch_compile(
code.co_filename)
return inline_call(parent, func, args, kwargs)
# Disable the C++ compilation of symbolic shape guards. C++-fication
# of symbolic shape guards can improve guard overhead. But, since
# vllm skip guards anyways, setting this flag to False can improve
# compile time.
with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards",
False), patch.object(
InliningInstructionTranslator,
'inline_call',
patched_inline_call):
with patch.object(InliningInstructionTranslator, 'inline_call',
patched_inline_call):
output = self.compiled_callable(*args, **kwargs)
return output
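The comment removed above explains the motivation: C++-compiled symbolic shape guards make guard evaluation cheaper, but since vLLM skips guards anyway, disabling the C++-fication shortens compile time. A minimal sketch of the config patch in isolation, assuming a PyTorch version that exposes this flag:
import torch

with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False):
    compiled = torch.compile(lambda x: x * 2, dynamic=True)
    print(compiled(torch.ones(3)))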

View File

@ -164,6 +164,3 @@ class AttnFusionPass(VllmInductorPass):
logger.debug("Fused quantization onto %s attention nodes", count)
self.dump_graph(graph, "after_attn_fusion")
self.end_and_log()
def uuid(self):
return VllmInductorPass.hash_source(self, AttentionStaticQuantPattern)

View File

@ -76,10 +76,9 @@ class InductorPass(CustomGraphPass):
for src in srcs:
if isinstance(src, str):
src_str = src
elif isinstance(src, (types.FunctionType, type)):
elif isinstance(src, types.FunctionType):
src_str = inspect.getsource(src)
else:
# object instance
src_str = inspect.getsource(src.__class__)
hasher.update(src_str.encode("utf-8"))
return hasher.hexdigest()

View File

@ -7,7 +7,7 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.platforms import current_platform
if current_platform.is_cuda():
if current_platform.is_cuda_alike():
from .fusion import FusionPass
from .collective_fusion import AllReduceFusionPass, AsyncTPPass
from .fusion_attn import AttnFusionPass

View File

@ -11,7 +11,6 @@ import textwrap
import uuid
import warnings
from collections import Counter
from collections.abc import Mapping
from contextlib import contextmanager
from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
replace)
@ -39,8 +38,8 @@ from vllm.transformers_utils.config import (
ConfigFormat, get_config, get_hf_image_processor_config,
get_hf_text_config, get_pooling_config,
get_sentence_transformer_tokenizer_config, is_encoder_decoder,
maybe_override_with_speculators_target_model, try_get_generation_config,
try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope)
try_get_generation_config, try_get_safetensors_metadata,
try_get_tokenizer_config, uses_mrope)
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
# yapf conflicts with isort for this block
@ -535,15 +534,6 @@ class ModelConfig:
"affect the random state of the Python process that "
"launched vLLM.", self.seed)
if self.runner != "draft":
# If we're not running the draft model, check for speculators config
# If speculators config, set model / tokenizer to be target model
self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501
model=self.model,
tokenizer=self.tokenizer,
revision=self.revision,
trust_remote_code=self.trust_remote_code)
# Keep set served_model_name before maybe_model_redirect(self.model)
self.served_model_name = get_served_model_name(self.model,
self.served_model_name)
@ -615,8 +605,8 @@ class ModelConfig:
self.config_format,
hf_overrides_kw=hf_overrides_kw,
hf_overrides_fn=hf_overrides_fn)
self.hf_config = hf_config
self.hf_text_config = get_hf_text_config(self.hf_config)
self.attention_chunk_size = getattr(self.hf_text_config,
"attention_chunk_size", None)
@ -812,17 +802,12 @@ class ModelConfig:
def _get_transformers_backend_cls(self) -> str:
"""Determine which Transformers backend class will be used if
`model_impl` is set to `transformers` or `auto`."""
if getattr(self, "runner_type", self.runner) == "pooling":
return "TransformersModel"
if self.hf_config != self.hf_text_config:
# If 'hf_text_config' is the same as 'hf_config'. If not, it is
# probably a composite config, i.e. multimodal
return "TransformersForMultimodalLM"
return "TransformersForCausalLM"
def using_transformers_backend(self) -> bool:
"""Check if the model is using the Transformers backend class."""
return self.architecture == self._get_transformers_backend_cls()
else:
return "TransformersForCausalLM"
@property
def registry(self):
@ -885,12 +870,6 @@ class ModelConfig:
return None
def set_disable_mm_preprocessor_cache(self, value: bool) -> None:
mm_config = self.get_multimodal_config()
self.disable_mm_preprocessor_cache = value
mm_config.disable_mm_preprocessor_cache = value
def _get_encoder_config(self):
return get_sentence_transformer_tokenizer_config(
self.model, self.revision)
@ -1580,18 +1559,7 @@ class ModelConfig:
return self.multimodal_config
def try_get_generation_config(self) -> dict[str, Any]:
"""
This method attempts to retrieve the non-default values of the
generation config for this model.
The generation config can contain information about special tokens, as
well as sampling parameters, which is why this method exists separately
from `get_diff_sampling_param`.
Returns:
A dictionary containing the non-default generation config.
"""
if self.generation_config in {"auto", "vllm"}:
if self.generation_config in ("auto", "vllm"):
config = try_get_generation_config(
self.hf_config_path or self.model,
trust_remote_code=self.trust_remote_code,
@ -1610,18 +1578,13 @@ class ModelConfig:
def get_diff_sampling_param(self) -> dict[str, Any]:
"""
This method returns a dictionary containing the non-default sampling
parameters with `override_generation_config` applied.
The default sampling parameters are:
- vLLM's neutral defaults if `self.generation_config="vllm"`
- the model's defaults if `self.generation_config="auto"`
- as defined in `generation_config.json` if
`self.generation_config="path/to/generation_config/dir"`
This method returns a dictionary containing the parameters
that differ from the default sampling parameters. If
`generation_config` is `"vllm"`, an empty dictionary is returned.
Returns:
A dictionary containing the non-default sampling parameters.
dict[str, Any]: A dictionary with the differing sampling
parameters; if `generation_config` is `"vllm"`, an empty dictionary.
"""
if self.generation_config == "vllm":
config = {}
@ -3021,13 +2984,10 @@ class SpeculativeConfig:
"Chunked prefill and EAGLE are not compatible "
"when using V0.")
from vllm.transformers_utils.configs import (
SpeculatorsConfig)
from vllm.transformers_utils.configs.eagle import (
EAGLEConfig)
if isinstance(self.draft_model_config.hf_config,
(EAGLEConfig, SpeculatorsConfig)):
EAGLEConfig):
pass
else:
eagle_config = EAGLEConfig(
@ -3186,19 +3146,10 @@ class SpeculativeConfig:
"speculative decoding is > 1, but got "
f"{self.disable_by_batch_size=}")
from vllm.transformers_utils.configs import SpeculatorsConfig
eagle3_target_supported = ["llama"]
if self.draft_model_config and isinstance(
self.draft_model_config.hf_config, SpeculatorsConfig):
eagle3_target_supported.append("qwen")
if self.method == "eagle3" and self.target_model_config and not any(
supported_model in
self.target_model_config.hf_text_config.model_type
for supported_model in eagle3_target_supported):
if self.method == "eagle3" and self.target_model_config and \
"llama" not in self.target_model_config.hf_text_config.model_type:
raise ValueError(
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
"Eagle3 is only supported for Llama models. "
f"Got {self.target_model_config.hf_text_config.model_type=}")
return self
@ -3392,16 +3343,7 @@ class MultiModalConfig:
999 if envs.VLLM_USE_V1 else 1,
)
def merge_mm_processor_kwargs(
self,
inference_kwargs: Mapping[str, object],
) -> dict[str, object]:
"""
Get the keyword arguments to pass to the multi-modal processor
according to the extra arguments passed during inference.
"""
kwargs = self.mm_processor_kwargs or {}
return kwargs | dict(inference_kwargs)
# TODO: Add configs to init vision tower or not.
@config

View File

@ -1,257 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import uuid
from typing import Any, Optional
import ray
import torch
from ray.exceptions import RayChannelError
from ray.experimental.channel.communicator import (Communicator,
TorchTensorAllocator)
from torch.distributed import ReduceOp
from vllm.distributed.device_communicators.base_device_communicator import (
DeviceCommunicatorBase)
from vllm.distributed.parallel_state import get_pp_group
from vllm.logger import init_logger
from vllm.utils import current_stream
logger = init_logger(__name__)
class RayPPCommunicator(Communicator):
"""
Communicator to be used for pipeline parallelism in Ray Compiled Graph.
This wraps around the vLLM _PP GroupCoordinator.
This class is not thread-safe.
"""
_comm: Optional[DeviceCommunicatorBase]
def __init__(
self,
world_size: int,
comm_id: Any,
rank: Optional[int],
actor_handles: list["ray.actor.ActorHandle"],
cuda_stream: Optional[torch.cuda.Stream],
use_communication_streams: bool = False,
):
"""
Initialize a RayPPCommunicator that can be used to communicate with
other Ray Compiled Graph actors for pipeline parallelism.
Args:
world_size: The number of participating actors.
comm_id: A unique communicator ID. This is just to conform with
the Ray Communicator API and is not used.
rank: The rank of this actor. If None, then the caller is not a
participant of the RayPPCommunicator group (e.g., the Ray
driver).
actor_handles: A list of actor handles.
cuda_stream: A CUDA stream to dispatch communication ops to. This
is not supported.
use_communication_streams: Whether to use communication streams.
This is not supported.
"""
self._world_size = world_size
self._rank: Optional[int] = None
self._actor_handles = actor_handles
if use_communication_streams:
raise NotImplementedError(
"use_communication_streams is not supported")
if cuda_stream is not None and cuda_stream != current_stream():
raise ValueError(
"cuda_stream other than the current stream is not supported")
if rank is not None:
# Rank is not None, this is Ray worker
assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned"
self._comm = get_pp_group().device_communicator
# Since we wrap around the vLLM _PP communicator, we use
# the rank from the vLLM communicator, and ignore the rank
# passed in from Ray.
# TODO(rui): refactor the Ray Communicator API so that
# it also supports no rank passed in.
self._rank = self._comm.rank_in_group
self._build_actor_rank_mapping()
else:
# Rank is None, this is Ray driver
self._comm = None
self._closed = False
def _build_actor_rank_mapping(self):
"""
Use collective communication to build a mapping from actor IDs to ranks.
This should be called once during initialization.
"""
if self._comm is None:
return {}
current_actor = ray.get_runtime_context().current_actor
actor_id_str = current_actor._actor_id.hex()
# Ray actor IDs are 32-character hex strings (128 bits)
ACTOR_ID_LEN = 32
actor_id_bytes = actor_id_str.encode('utf-8')
assert len(
actor_id_bytes
) == ACTOR_ID_LEN, f"Unexpected actor ID length: {len(actor_id_bytes)}"
actor_id_tensor = torch.frombuffer(
actor_id_bytes, dtype=torch.uint8).to(self._comm.device)
# All-gather full actor IDs from all actors
gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0)
# Build mapping: actor_id -> device_comm_rank
self._actor_id_to_rank = {}
for rank in range(self._world_size):
start_idx = rank * ACTOR_ID_LEN
end_idx = (rank + 1) * ACTOR_ID_LEN
actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy(
).tobytes()
actor_id = actor_bytes.decode('utf-8')
self._actor_id_to_rank[actor_id] = rank
def initialize(self, rank: int) -> None:
# No additional initialization is needed.
pass
def get_actor_handles(self) -> list["ray.actor.ActorHandle"]:
return self._actor_handles
def get_rank(self, actor: ray.actor.ActorHandle) -> int:
"""
Return the given actor's rank using device communicator collective ops.
"""
assert hasattr(self, '_actor_id_to_rank'), (
"Actor rank mapping not built. "
"This should have been done during initialization.")
actor_id_str = actor._actor_id.hex()
if actor_id_str in self._actor_id_to_rank:
return self._actor_id_to_rank[actor_id_str] # type: ignore
else:
raise ValueError(f"Actor {actor} not found in communicator group")
def get_self_rank(self) -> Optional[int]:
"""
Return this actor's rank.
"""
return self._rank
def get_world_size(self) -> int:
"""
Return the number of ranks in the RayPPCommunicator group.
"""
return self._world_size
def send(self, buf: "torch.Tensor", peer_rank: int) -> None:
"""
Send a torch.Tensor to a peer.
This returns when the send kernel has been queued, but the kernel may
not have completed. Therefore, the caller should ensure that there are
no concurrent writes to the sent `buf` until the send has finished.
That is, either all writes should be submitted on the current stream
(self._cuda_stream) or, if on a different stream, that stream should
synchronize with the current stream.
Args:
buf: The torch.Tensor to send. It should already be on this
actor's default device.
peer_rank: The rank of the actor to send to.
"""
if self._closed:
raise RayChannelError("RayPPCommunicator has been destroyed.")
assert self._comm is not None
self._comm.send(buf, peer_rank)
def recv(
self,
shape: tuple[int],
dtype: "torch.dtype",
peer_rank: int,
allocator: TorchTensorAllocator,
) -> "torch.Tensor":
"""
Receive a torch.Tensor from a peer and synchronize the current stream.
After this call returns, the receive buffer is safe to read from
any stream. A RayChannelError will be raised if an error occurred
(e.g., the remote actor died), in which case the buffer is not safe to read.
Args:
shape: The shape of the tensor to receive.
dtype: The dtype of the tensor to receive.
peer_rank: The rank of the actor to receive from.
allocator: The allocator to use to create the received tensor.
This is ignored for this implementation.
"""
if self._closed:
raise RayChannelError("RayPPCommunicator has been destroyed.")
assert self._comm is not None
size = torch.Size(shape)
buf = self._comm.recv(size, dtype, src=peer_rank)
# Buffer values are undefined if NCCL ops are aborted. Therefore, we
# need to synchronize here and check that the channel is still
# open to ensure that the receive buffer is valid.
# TODO(swang): Avoid CUDA synchronization.
current_stream().synchronize()
if self._closed:
raise RayChannelError("RayPPCommunicator has been destroyed.")
return buf
def allgather(
self,
send_buf: "torch.Tensor",
recv_buf: "torch.Tensor",
):
raise NotImplementedError("allgather is not supported")
def allreduce(
self,
send_buf: "torch.Tensor",
recv_buf: "torch.Tensor",
op: ReduceOp = ReduceOp.SUM,
):
raise NotImplementedError("allreduce is not supported")
def reducescatter(
self,
send_buf: "torch.Tensor",
recv_buf: "torch.Tensor",
op: ReduceOp = ReduceOp.SUM,
):
raise NotImplementedError("reducescatter is not supported")
@property
def recv_stream(self):
return torch.cuda.StreamContext(current_stream())
@property
def send_stream(self):
return torch.cuda.StreamContext(current_stream())
def destroy(self) -> None:
# Just sets a flag; vLLM manages the lifecycle of the underlying
# _PP GroupCoordinator.
self._closed = True
def get_transport_name(self) -> str:
return "nccl"
@classmethod
def generate_communicator_id(cls) -> Any:
return uuid.uuid4()
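For reference, a minimal sketch of how this communicator is wired into Ray Compiled Graph; it mirrors the RayDistributedExecutor hunk removed later in this diff and is not part of this file.
from ray.experimental.channel.accelerator_context import (
    register_accelerator_context)
from vllm.distributed.device_communicators.ray_communicator import (
    RayPPCommunicator)

register_accelerator_context(torch_module_name="cuda",
                             communicator_cls=RayPPCommunicator)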

View File

@ -18,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
import regex as re
import torch
from pydantic import TypeAdapter, ValidationError
from typing_extensions import TypeIs, deprecated
from typing_extensions import TypeIs
import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
@ -217,12 +217,10 @@ Additionally, list elements can be passed individually using `+`:
elif contains_type(type_hints, list):
type_hint = get_type(type_hints, list)
types = get_args(type_hint)
list_type = types[0]
if get_origin(list_type) is Union:
msg = "List type must contain str if it is a Union."
assert str in get_args(list_type), msg
list_type = str
kwargs[name]["type"] = list_type
assert len(types) == 1, (
"List type must have exactly one type. Got "
f"{type_hint} with types {types}")
kwargs[name]["type"] = types[0]
kwargs[name]["nargs"] = "+"
elif contains_type(type_hints, int):
kwargs[name]["type"] = int
@ -986,28 +984,8 @@ class EngineArgs:
provided as a JSON string input via CLI arguments or directly as a
dictionary from the engine.
"""
from vllm.transformers_utils.config import get_config
from vllm.transformers_utils.configs.speculators.base import (
SpeculatorsConfig)
if self.speculative_config is None:
hf_config = get_config(self.hf_config_path or self.model,
self.trust_remote_code, self.revision,
self.code_revision, self.config_format)
# if loading a SpeculatorsConfig, load the speculative_config
# details from the config directly
# no user input required / expected
if isinstance(hf_config, SpeculatorsConfig):
# Create an empty speculative_config since none was provided
self.speculative_config = {}
self.speculative_config[
"num_speculative_tokens"] = hf_config.num_lookahead_tokens
self.speculative_config["model"] = self.model
self.speculative_config["method"] = hf_config.method
else:
return None
return None
# Note(Shangming): These parameters are not obtained from the cli arg
# '--speculative-config' and must be passed in when creating the engine
@ -1227,18 +1205,6 @@ class EngineArgs:
enable_multimodal_encoder_data_parallel,
)
supports_mm_preprocessor_cache = (self.data_parallel_size == 1
or data_parallel_external_lb)
if (not supports_mm_preprocessor_cache
and model_config.is_multimodal_model
and not model_config.disable_mm_preprocessor_cache):
logger.warning(
"Multi-modal preprocessor cache is not compatible "
"with data parallelism when there does not exist a "
"one-to-one correspondance between API process and "
"EngineCore process, so the cache will be disabled.")
model_config.set_disable_mm_preprocessor_cache(True)
speculative_config = self.create_speculative_config(
target_model_config=model_config,
target_parallel_config=parallel_config,
@ -1712,23 +1678,7 @@ class EngineArgs:
@dataclass
class AsyncEngineArgs(EngineArgs):
"""Arguments for asynchronous vLLM engine."""
enable_log_requests: bool = False
@property
@deprecated(
"`disable_log_requests` is deprecated and has been replaced with "
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
"`enable_log_requests` instead.")
def disable_log_requests(self) -> bool:
return not self.enable_log_requests
@disable_log_requests.setter
@deprecated(
"`disable_log_requests` is deprecated and has been replaced with "
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
"`enable_log_requests` instead.")
def disable_log_requests(self, value: bool):
self.enable_log_requests = not value
disable_log_requests: bool = False
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser,
@ -1739,15 +1689,9 @@ class AsyncEngineArgs(EngineArgs):
load_general_plugins()
if not async_args_only:
parser = EngineArgs.add_cli_args(parser)
parser.add_argument('--enable-log-requests',
action=argparse.BooleanOptionalAction,
default=AsyncEngineArgs.enable_log_requests,
help='Enable logging requests.')
parser.add_argument('--disable-log-requests',
action=argparse.BooleanOptionalAction,
default=not AsyncEngineArgs.enable_log_requests,
help='[DEPRECATED] Disable logging requests.',
deprecated=True)
action='store_true',
help='Disable logging requests.')
current_platform.pre_register_and_update(parser)
return parser
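A minimal usage sketch of the restored flag (the model name is illustrative); after this revert, request logging is controlled by the boolean field `disable_log_requests` rather than `enable_log_requests`.
import vllm

engine_args = vllm.AsyncEngineArgs(model="facebook/opt-125m",
                                   disable_log_requests=True)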

View File

@ -30,7 +30,7 @@ from vllm.sampling_params import SamplingParams
from vllm.sequence import ExecuteModelRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device, deprecate_kwargs, weak_bind
from vllm.utils import Device, weak_bind
logger = init_logger(__name__)
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@ -554,20 +554,14 @@ class AsyncLLMEngine(EngineClient):
return LLMEngine._get_executor_cls(engine_config)
@classmethod
@deprecate_kwargs(
"disable_log_requests",
additional_message=("This argument will have no effect. "
"Use `enable_log_requests` instead."),
)
def from_vllm_config(
cls,
vllm_config: VllmConfig,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
enable_log_requests: bool = False,
disable_log_stats: bool = False,
disable_log_requests: bool = True, # Deprecated, will be removed
cls,
vllm_config: VllmConfig,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
disable_log_requests: bool = False,
disable_log_stats: bool = False,
) -> "AsyncLLMEngine":
"""Create an AsyncLLMEngine from the EngineArgs."""
@ -575,7 +569,7 @@ class AsyncLLMEngine(EngineClient):
vllm_config=vllm_config,
executor_class=cls._get_executor_cls(vllm_config),
start_engine_loop=start_engine_loop,
log_requests=enable_log_requests,
log_requests=not disable_log_requests,
log_stats=not disable_log_stats,
usage_context=usage_context,
stat_loggers=stat_loggers,
@ -604,7 +598,7 @@ class AsyncLLMEngine(EngineClient):
usage_context=usage_context,
stat_loggers=stat_loggers,
disable_log_stats=engine_args.disable_log_stats,
enable_log_requests=engine_args.enable_log_requests,
disable_log_requests=engine_args.disable_log_requests,
)
@property

View File

@ -34,7 +34,6 @@ from vllm.outputs import RequestOutput
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import deprecate_kwargs
from vllm.worker.model_runner_base import InputProcessingError
logger = init_logger(__name__)
@ -121,20 +120,10 @@ class MQLLMEngine:
return ENGINE_DEAD_ERROR()
@classmethod
@deprecate_kwargs(
"disable_log_requests",
additional_message=("This argument will have no effect. "
"Use `enable_log_requests` instead."),
)
def from_vllm_config(
cls,
vllm_config: VllmConfig,
usage_context: UsageContext,
enable_log_requests: bool,
disable_log_stats: bool,
ipc_path: str,
disable_log_requests: bool = True, # Deprecated, will be removed
) -> "MQLLMEngine":
def from_vllm_config(cls, vllm_config: VllmConfig,
usage_context: UsageContext,
disable_log_requests: bool, disable_log_stats: bool,
ipc_path: str) -> "MQLLMEngine":
# Setup plugins for each process
from vllm.plugins import load_general_plugins
load_general_plugins()
@ -147,7 +136,7 @@ class MQLLMEngine:
ipc_path=ipc_path,
usage_context=usage_context,
use_async_sockets=use_async_sockets,
log_requests=enable_log_requests,
log_requests=(not disable_log_requests),
log_stats=(not disable_log_stats),
)
@ -161,7 +150,7 @@ class MQLLMEngine:
ipc_path=ipc_path,
vllm_config=vllm_config,
usage_context=usage_context,
enable_log_requests=engine_args.enable_log_requests,
disable_log_requests=engine_args.disable_log_requests,
disable_log_stats=engine_args.disable_log_stats,
)
@ -447,7 +436,7 @@ def signal_handler(*_) -> None:
def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
ipc_path: str, disable_log_stats: bool,
enable_log_requests: bool, engine_alive):
disable_log_requests: bool, engine_alive):
try:
# Ensure we can serialize transformer config before spawning
maybe_register_config_serialize_by_value()
@ -456,7 +445,7 @@ def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
vllm_config=vllm_config,
usage_context=usage_context,
disable_log_stats=disable_log_stats,
enable_log_requests=enable_log_requests,
disable_log_requests=disable_log_requests,
ipc_path=ipc_path)
signal.signal(signal.SIGTERM, signal_handler)

View File

@ -2,7 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
import signal
import sys
from typing import Optional
import uvloop
@ -16,10 +18,10 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser,
validate_parsed_serve_args)
from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
show_filtered_argument_or_group_from_help)
from vllm.executor.multiproc_worker_utils import _add_prefix
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri,
set_process_title)
from vllm.utils import FlexibleArgumentParser, get_tcp_uri
from vllm.v1.engine.core import EngineCoreProc
from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
from vllm.v1.executor.abstract import Executor
@ -75,7 +77,7 @@ def run_headless(args: argparse.Namespace):
if args.api_server_count > 1:
raise ValueError("api_server_count can't be set in headless mode")
# set_process_title("Headless_ProcManager")
# Create the EngineConfig.
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
usage_context = UsageContext.OPENAI_API_SERVER
@ -140,6 +142,8 @@ def run_multi_api_server(args: argparse.Namespace):
orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
# set_process_title("ProcManager")
if num_api_servers > 1:
setup_multiprocess_prometheus()
@ -163,9 +167,8 @@ def run_multi_api_server(args: argparse.Namespace):
if model_config.is_multimodal_model and not (
orig_disable_mm_preprocessor_cache):
logger.warning(
"Multi-modal preprocessor cache is not compatible "
"with api_server_count > 1, so the cache will be disabled.")
logger.warning("Multi-model preprocessor cache will be disabled "
"for api_server_count > 1")
executor_class = Executor.get_class(vllm_config)
log_stats = not engine_args.disable_log_stats
@ -224,10 +227,12 @@ def run_api_server_worker_proc(listen_address,
**uvicorn_kwargs) -> None:
"""Entrypoint for individual API server worker processes."""
# Set process title and add process-specific prefix to stdout and stderr.
server_index = client_config.get("client_index", 0) if client_config else 0
set_process_title("APIServer", str(server_index))
decorate_logs()
# Add process-specific prefix to stdout and stderr.
from multiprocessing import current_process
process_name = current_process().name
pid = os.getpid()
_add_prefix(sys.stdout, process_name, pid)
_add_prefix(sys.stderr, process_name, pid)
uvloop.run(
run_server_worker(listen_address, sock, args, client_config,

View File

@ -11,6 +11,7 @@ import multiprocessing
import os
import signal
import socket
import sys
import tempfile
import uuid
from argparse import Namespace
@ -94,15 +95,15 @@ from vllm.entrypoints.openai.serving_transcription import (
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.entrypoints.utils import (cli_env_setup, load_aware_call,
log_non_default_args, with_cancellation)
from vllm.executor.multiproc_worker_utils import _add_prefix
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.transformers_utils.tokenizer import MistralTokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
get_open_zmq_ipc_path, is_valid_ipv6_address,
set_ulimit)
from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
is_valid_ipv6_address, set_process_title, set_ulimit)
from vllm.v1.metrics.prometheus import get_prometheus_registry
from vllm.version import __version__ as VLLM_VERSION
@ -199,18 +200,15 @@ async def build_async_engine_client_from_engine_args(
from vllm.v1.engine.async_llm import AsyncLLM
async_llm: Optional[AsyncLLM] = None
client_count = client_config.pop(
"client_count") if client_config else 1
client_index = client_config.pop(
"client_index") if client_config else 0
try:
async_llm = AsyncLLM.from_vllm_config(
vllm_config=vllm_config,
usage_context=usage_context,
enable_log_requests=engine_args.enable_log_requests,
disable_log_requests=engine_args.disable_log_requests,
disable_log_stats=engine_args.disable_log_stats,
client_addresses=client_config,
client_count=client_count,
client_index=client_index)
# Don't keep the dummy data in memory
@ -230,7 +228,7 @@ async def build_async_engine_client_from_engine_args(
engine_client = AsyncLLMEngine.from_vllm_config(
vllm_config=vllm_config,
usage_context=usage_context,
enable_log_requests=engine_args.enable_log_requests,
disable_log_requests=engine_args.disable_log_requests,
disable_log_stats=engine_args.disable_log_stats)
yield engine_client
finally:
@ -275,7 +273,7 @@ async def build_async_engine_client_from_engine_args(
target=run_mp_engine,
args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path,
engine_args.disable_log_stats,
engine_args.enable_log_requests, engine_alive))
engine_args.disable_log_requests, engine_alive))
engine_process.start()
engine_pid = engine_process.pid
assert engine_pid is not None, "Engine process failed to start."
@ -1573,10 +1571,10 @@ async def init_app_state(
else:
served_model_names = [args.model]
if args.enable_log_requests:
request_logger = RequestLogger(max_log_len=args.max_log_len)
else:
if args.disable_log_requests:
request_logger = None
else:
request_logger = RequestLogger(max_log_len=args.max_log_len)
base_model_paths = [
BaseModelPath(name=name, model_path=args.model)
@ -1810,7 +1808,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
"""Run a single-worker API server."""
# Add process-specific prefix to stdout and stderr.
decorate_logs("APIServer")
process_name = "APIServer"
pid = os.getpid()
_add_prefix(sys.stdout, process_name, pid)
_add_prefix(sys.stderr, process_name, pid)
listen_address, sock = setup_server(args)
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
@ -1827,7 +1828,7 @@ async def run_server_worker(listen_address,
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
server_index = client_config.get("client_index", 0) if client_config else 0
set_process_title("APIServer", str(server_index))
# Load logging config for uvicorn if specified
log_config = load_log_config(args.log_config_file)
if log_config is not None:

View File

@ -859,15 +859,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
'are supported.'
)
# if tool_choice is "required" but the "tools" list is empty,
# override the data to behave like "none" to align with
# OpenAI's behavior.
if data["tool_choice"] == "required" and isinstance(
data["tools"], list) and len(data["tools"]) == 0:
data["tool_choice"] = "none"
del data["tools"]
return data
# ensure that if "tool_choice" is specified as an object,
# it matches a valid tool
correct_usage_message = 'Correct usage: `{"type": "function",' \

View File

@ -324,10 +324,10 @@ async def run_batch(
else:
served_model_names = [args.model]
if args.enable_log_requests:
request_logger = RequestLogger(max_log_len=args.max_log_len)
else:
if args.disable_log_requests:
request_logger = None
else:
request_logger = RequestLogger(max_log_len=args.max_log_len)
base_model_paths = [
BaseModelPath(name=name, model_path=args.model)

View File

@ -55,7 +55,6 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
VLLM_XLA_USE_SPMD: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
@ -69,6 +68,7 @@ if TYPE_CHECKING:
MAX_JOBS: Optional[str] = None
NVCC_THREADS: Optional[str] = None
VLLM_USE_PRECOMPILED: bool = False
VLLM_DOCKER_BUILD_CONTEXT: bool = False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_NO_DEPRECATION_WARNING: bool = False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@ -126,7 +126,6 @@ if TYPE_CHECKING:
VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
VLLM_TPU_USING_PATHWAYS: bool = False
VLLM_USE_DEEP_GEMM: bool = False
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
VLLM_USE_FLASHINFER_MOE_FP8: bool = False
VLLM_USE_FLASHINFER_MOE_FP4: bool = False
VLLM_XGRAMMAR_CACHE_MB: int = 0
@ -214,7 +213,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Target device of vLLM, supporting [cuda (by default),
# rocm, neuron, cpu]
"VLLM_TARGET_DEVICE":
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
# Maximum number of compilation jobs to run in parallel.
# By default this is the number of CPUs
@ -229,8 +228,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vllm will use precompiled binaries (*.so)
"VLLM_USE_PRECOMPILED":
lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
# Used to mark that setup.py is running in a Docker build context,
# in order to force the use of precompiled binaries.
"VLLM_DOCKER_BUILD_CONTEXT":
lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
("1", "true"),
# Whether to force using nightly wheel in python build.
# This is used for testing the nightly wheel in python build.
@ -500,13 +505,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
),
# If the env var is set, it uses a Ray Communicator wrapping
# vLLM's pipeline parallelism communicator to interact with Ray's
# Compiled Graph. Otherwise, it uses Ray's NCCL communicator.
# This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
"VLLM_USE_RAY_WRAPPED_PP_COMM":
lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))),
# Use dedicated multiprocess context for workers.
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD":
@ -911,14 +909,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_DEEP_GEMM":
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
# JIT all the required kernels before model execution so there is no
# JIT'ing in the hot-path. However, this warmup increases the engine
# startup time by a couple of minutes.
# Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup.
"VLLM_SKIP_DEEP_GEMM_WARMUP":
lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))),
# Allow use of FlashInfer MoE kernels for fused moe ops.
"VLLM_USE_FLASHINFER_MOE_FP8":
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),

View File

@ -3,20 +3,21 @@
import asyncio
import os
import sys
import threading
import uuid
from dataclasses import dataclass
from multiprocessing import Queue
from multiprocessing.connection import wait
from multiprocessing.process import BaseProcess
from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union
from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
TypeVar, Union)
import torch
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context,
run_method)
from vllm.utils import _maybe_force_spawn, get_mp_context, run_method
logger = init_logger(__name__)
@ -24,6 +25,10 @@ T = TypeVar('T')
_TERMINATE = "TERMINATE" # sentinel
# ANSI color codes
CYAN = '\033[1;36m'
RESET = '\033[0;0m'
JOIN_TIMEOUT_S = 2
@ -208,7 +213,9 @@ def _run_worker_process(
# Add process-specific prefix to stdout and stderr
process_name = get_mp_context().current_process().name
decorate_logs(process_name)
pid = os.getpid()
_add_prefix(sys.stdout, process_name, pid)
_add_prefix(sys.stderr, process_name, pid)
# Initialize worker
worker = worker_factory(vllm_config, rank)
@ -253,6 +260,33 @@ def _run_worker_process(
logger.info("Worker exiting")
def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
"""Prepend each output line with process-specific prefix"""
prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
file_write = file.write
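# Wrap file.write so every new output line begins with the prefix;
# the `start_new_line` attribute tracks whether the next write starts a line.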
def write_with_prefix(s: str):
if not s:
return
if file.start_new_line: # type: ignore[attr-defined]
file_write(prefix)
idx = 0
while (next_idx := s.find('\n', idx)) != -1:
next_idx += 1
file_write(s[idx:next_idx])
if next_idx == len(s):
file.start_new_line = True # type: ignore[attr-defined]
return
file_write(prefix)
idx = next_idx
file_write(s[idx:])
file.start_new_line = False # type: ignore[attr-defined]
file.start_new_line = True # type: ignore[attr-defined]
file.write = write_with_prefix # type: ignore[method-assign]
def set_multiprocessing_worker_envs(parallel_config):
""" Set up environment variables that should be used when there are workers
in a multiprocessing environment. This should be called by the parent

View File

@ -608,21 +608,6 @@ class RayDistributedExecutor(DistributedExecutorBase):
forward_dag = MultiOutputNode(outputs)
if envs.VLLM_USE_RAY_WRAPPED_PP_COMM:
from ray.experimental.channel.accelerator_context import (
register_accelerator_context)
from vllm.distributed.device_communicators.ray_communicator import (
RayPPCommunicator)
register_accelerator_context(torch_module_name="cuda",
communicator_cls=RayPPCommunicator)
logger.info("Using RayPPCommunicator "
"(which wraps vLLM _PP GroupCoordinator) "
"for Ray Compiled Graph communication.")
else:
logger.info("Using Ray's NCCL communicator for "
"Ray Compiled Graph communication.")
return forward_dag.experimental_compile(
enable_asyncio=enable_asyncio,
_overlap_gpu_communication=envs.

View File

@ -11,7 +11,7 @@ from typing_extensions import TypeVar
from vllm.jsontree import JSONTree, json_map_leaves
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.utils import get_allowed_kwarg_only_overrides
from vllm.utils import resolve_mm_processor_kwargs
if TYPE_CHECKING:
from vllm.config import ModelConfig
@ -154,11 +154,14 @@ class InputProcessingContext(InputContext):
assert callable(hf_processor)
mm_config = self.model_config.get_multimodal_config()
merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
base_kwargs = mm_config.mm_processor_kwargs
if base_kwargs is None:
base_kwargs = {}
allowed_kwargs = get_allowed_kwarg_only_overrides(
merged_kwargs = resolve_mm_processor_kwargs(
base_kwargs,
kwargs,
hf_processor,
merged_kwargs,
requires_kw_only=False,
allow_var_kwargs=True,
)
@ -170,9 +173,7 @@ class InputProcessingContext(InputContext):
return x
try:
output = hf_processor(**data,
**allowed_kwargs,
return_tensors="pt")
output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
# this emulates output.to(dtype=self.model_config.dtype)
if isinstance(output, BatchFeature):
cast_output = json_map_leaves(maybe_cast_dtype, output.data)
@ -188,7 +189,7 @@ class InputProcessingContext(InputContext):
except Exception as exc:
msg = (f"Failed to apply {type(hf_processor).__name__} "
f"on data={data} with kwargs={allowed_kwargs}")
f"on data={data} with kwargs={merged_kwargs}")
raise ValueError(msg) from exc

View File

@ -682,14 +682,12 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
def slice_lora_b(
self, lora_b: list[Union[torch.Tensor, None]]
) -> list[Union[torch.Tensor, None]]:
sliced_lora_b = [None] * self.n_slices
for i, (shard_id, shard_size) in enumerate(
zip(self.output_ids, self.output_slices)):
if (lora_b_i := lora_b[i]) is not None:
sliced_lora_b[i] = lora_b_i[:,
shard_size * shard_id:shard_size *
(shard_id + 1)]
return sliced_lora_b
lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size *
(shard_id + 1)]
return lora_b
def slice_bias(
self, bias: list[Union[torch.Tensor,

View File

@ -4,9 +4,7 @@ import functools
from typing import Any, Optional
import torch
from tqdm import tqdm
import vllm.envs as env
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
@ -19,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8)
from vllm.utils import has_deep_gemm, run_once
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous
logger = init_logger(__name__)
@ -84,65 +82,6 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
return True
@run_once
def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
num_topk: int):
"""
DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the
input tensor shapes. In this function, we construct all possible input
tensor shapes so all the kernels are JIT'ed and cached.
Note that this warmup is expected to happen during the model profile
call and not during actual model inference.
"""
assert w1.size(0) == w2.size(0), (
"w1 and w2 must have the same number of experts")
block_m = deep_gemm_block_shape()[0]
num_experts = w1.size(0)
device = w1.device
# This is the maximum GroupedGemm M size that we expect to run
# the grouped_gemm with.
MAX_M = compute_aligned_M(env.VLLM_FUSED_MOE_CHUNK_SIZE,
num_topk,
num_experts,
block_m,
expert_tokens_meta=None)
# Distribute expert-ids evenly.
MAX_BLOCKS = MAX_M // block_m
expert_ids_block = torch.randint(low=0,
high=num_experts,
size=(MAX_BLOCKS, ),
device=device,
dtype=torch.int32)
expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0)
def _warmup(w: torch.Tensor, w_scale: torch.Tensor):
_, n, k = w.size()
a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn)
a1q_scales = torch.empty((MAX_M, k // block_m),
device=device,
dtype=torch.float32)
out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16)
pbar = tqdm(total=MAX_BLOCKS,
desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})")
num_tokens = MAX_M
while num_tokens > 0:
m_grouped_fp8_gemm_nt_contiguous(
(a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale),
out[:num_tokens], expert_ids[:num_tokens])
pbar.update(1)
num_tokens = num_tokens - block_m
_warmup(w1, w1_scale)
_warmup(w2, w2_scale)
class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
def __init__(self):
@ -217,20 +156,6 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
):
assert self.block_shape is not None
assert a1q_scale is not None
assert w1_scale is not None
assert w2_scale is not None
if not env.VLLM_SKIP_DEEP_GEMM_WARMUP:
# DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing
# to happen during actual model inference. The
# `warmup_deepgemm_gg_contiguous_kernels` function is a `run_once`-decorated
# function that executes during the model profile run. This warmup
# should create all the required JITs for the current model.
warmup_deepgemm_gg_contiguous_kernels(w1,
w2,
w1_scale,
w2_scale,
num_topk=topk_ids.size(1))
a1q = hidden_states
_, N, K = w1.size()

View File

@ -144,13 +144,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
"apply_router_weight_on_input is only implemented for topk=1")
a1 = a1 * topk_weights.to(a1.dtype)
if quant_config.is_block_quantized:
# Quant and Dispatch
if quant_config.per_act_token_quant:
a1q, a1q_scale = moe_kernel_quantize_input(
a1,
a1_scale,
quant_dtype=quant_config.quant_dtype,
per_act_token_quant=quant_config.per_act_token_quant,
per_act_token_quant=True,
block_shape=quant_config.block_shape,
)
if a1q_scale is not None and a1q_scale.numel() == 1:
@ -163,10 +162,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
rank_topk_weights=topk_weights,
num_experts=num_experts)
else:
# Dispatch and Quant
# DeepEP kernels only support dispatching block-quantized
# activation scales.
# Dispatch in bfloat16
# DeepEP kernels only support dispatching per-token-quant
# quantization, so dispatch in bfloat16.
(expert_x, _, expert_tokens_meta, expert_topk_ids,
expert_topk_weights) = self._do_dispatch(
tokens=a1,
@ -174,7 +171,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
rank_topk_ids=topk_ids,
rank_topk_weights=topk_weights,
num_experts=num_experts)
# Quantize after dispatch.
# quantize now
expert_x_scale = None
if expert_x.numel() != 0:
expert_x, expert_x_scale = moe_kernel_quantize_input(

View File

@ -761,8 +761,8 @@ def get_moe_wna16_block_config(config: dict[str,
def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
num_experts: int, bit: int):
return current_platform.is_cuda() and bit == 4 and \
group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6
return bit == 4 and group_size in [32, 64, 128] and \
num_valid_tokens / num_experts <= 6
def get_default_config(

View File

@ -17,14 +17,9 @@ from vllm.model_executor.layers.fused_moe import (
FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa
FlashInferCutlassMoEPrepareAndFinalize)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa
WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
from vllm.model_executor.layers.quantization.utils import replace_parameter
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
build_flashinfer_fp4_cutlass_moe_kernel,
flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
check_moe_marlin_supports_layer, marlin_make_workspace_new,
marlin_moe_permute_scales)
@ -33,7 +28,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
prepare_moe_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
swizzle_blockscale)
cutlass_fp4_supported, swizzle_blockscale)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
from vllm.model_executor.utils import set_weight_attrs
@ -101,14 +96,8 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
def __init__(self):
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
detect_nvfp4_moe_support)
_nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
self.use_marlin = _nvfp4.use_marlin
self.use_marlin = not cutlass_fp4_supported()
self.group_size = 16
self.fused_experts = None # type: ignore[assignment]
def create_weights(self, layer: torch.nn.Module, num_experts: int,
hidden_size: int, intermediate_size_per_partition: int,
@ -211,14 +200,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
layer.w2_weight = torch.nn.Parameter(layer.w2_weight_packed.data,
requires_grad=False)
# reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
if self.allow_flashinfer_cutlass:
w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data,
layer.w13_weight_scale.data,
dim=-2)
layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)
if not torch.allclose(layer.w13_weight_global_scale[:, 0],
layer.w13_weight_global_scale[:, 1]):
logger.warning_once(
@ -265,21 +246,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
layer.w2_input_scale_quant = torch.nn.Parameter(
(layer.w2_input_global_scale), requires_grad=False)
def maybe_swap_experts_impl(self, moe_parallel_config):
if not self.allow_flashinfer_cutlass:
return
self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
moe_parallel_config)
def select_gemm_impl(self, prepare_finalize, moe):
"""Return the appropriate GEMM experts implementation."""
assert moe is not None and prepare_finalize is not None
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501
select_nvfp4_gemm_impl)
return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
logger)
def apply(
self,
layer: torch.nn.Module,
@ -337,23 +303,10 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
global_num_experts=global_num_experts,
expert_map=expert_map)
# FlashInfer fused experts path
if self.fused_experts is not None:
return flashinfer_fp4_cutlass_moe_forward(
self.fused_experts,
layer,
x,
topk_weights,
topk_ids,
activation=activation,
global_num_experts=global_num_experts,
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input,
)
assert expert_map is None, ("Expert Parallelism / expert_map "
"is currently not supported for "
"CompressedTensorsW4A4MoeMethod.")
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
cutlass_moe_fp4)

View File

@ -10,11 +10,10 @@ import torch
from torch.nn.parameter import Parameter
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.linear import LinearMethodBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
QuantizationConfig)
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
get_linear_quant_method)
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
@ -111,23 +110,8 @@ class GPTQConfig(QuantizationConfig):
return cls(weight_bits, group_size, desc_act, lm_head_quantized,
dynamic)
def get_quant_method(
self, layer: torch.nn.Module, prefix: str
) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]:
if isinstance(layer, FusedMoE):
# GPTQ MoE support: fall back to MoeWNA16 for broad compatibility
from .moe_wna16 import MoeWNA16Config
config = {
"quant_method": "gptq",
"bits": self.weight_bits,
"group_size": self.group_size,
"sym": True, # GPTQ typically uses symmetric quantization
"lm_head": False,
}
return MoeWNA16Config.from_config(config).get_quant_method(
layer, prefix)
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["GPTQLinearMethod"]:
return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)

View File

@ -10,8 +10,11 @@ from torch.nn.parameter import Parameter
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.distributed import get_ep_group
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize)
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@ -20,9 +23,6 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
build_flashinfer_fp4_cutlass_moe_kernel,
flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1)
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights,
swap_w13_to_w31)
@ -35,6 +35,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp, requantize_with_max_scale)
from vllm.model_executor.parameter import (ModelWeightParameter,
PerTensorScaleParameter)
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils.flashinfer import has_flashinfer_moe
@ -868,12 +869,28 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
def __init__(self, quant_config: ModelOptNvFp4Config):
self.quant_config = quant_config
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
detect_nvfp4_moe_support)
_nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
self.use_marlin = _nvfp4.use_marlin
self.cutlass_nvfp4_supported = cutlass_fp4_supported()
self.use_marlin = False
self.allow_flashinfer_cutlass = False
if envs.VLLM_USE_FLASHINFER_MOE_FP4:
if self.cutlass_nvfp4_supported and current_platform.is_cuda() \
and current_platform.is_device_capability(100):
logger.info_once(
"Using FlashInfer kernels for ModelOptNvFp4FusedMoE.")
self.allow_flashinfer_cutlass = True
else:
logger.warning_once(
"Flashinfer CUTLASS Fused MoE not supported "
"or found on the current platform.")
if not self.cutlass_nvfp4_supported:
if is_fp4_marlin_supported():
self.use_marlin = True
else:
raise ValueError("Current platform does not support NVFP4"
" quantization. Please use Blackwell and"
" above.")
self.fused_experts = None # type: ignore
@ -883,8 +900,29 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
):
if not self.allow_flashinfer_cutlass:
return
self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
moe_parallel_config)
logger.debug_once("FlashInferExperts")
# default to TP/EP case only
experts_kwargs: dict[str, Any] = {
"use_nvfp4_w4a4": True,
"use_dp": moe_parallel_config.dp_size > 1,
"ep_rank": moe_parallel_config.ep_rank,
"ep_size": moe_parallel_config.ep_size,
"tp_rank": moe_parallel_config.tp_rank,
"tp_size": moe_parallel_config.tp_size,
}
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
FlashInferExperts)
experts = FlashInferExperts(**experts_kwargs)
self.fused_experts = mk.FusedMoEModularKernel(
FlashInferCutlassMoEPrepareAndFinalize(
quant_dtype=torch.uint8,
# meaning 2x e2m1 values packed in one uint8, a kernel requirement
),
experts,
)
# This method updates self.fused_experts;
# select_gemm_impl is only called when prepare_finalize is not None
@ -893,12 +931,32 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
def select_gemm_impl(self, prepare_finalize,
moe) -> mk.FusedMoEPermuteExpertsUnpermute:
assert moe is not None and prepare_finalize is not None
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501
select_nvfp4_gemm_impl)
assert moe is not None
assert prepare_finalize is not None
experts = None
all2all_manager = get_ep_group().device_communicator.all2all_manager
assert all2all_manager is not None
if self.allow_flashinfer_cutlass:
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
FlashInferExperts)
logger.debug_once("Using FlashInferExperts")
experts = FlashInferExperts(
use_nvfp4_w4a4=True,
use_dp=moe.moe_parallel_config.dp_size > 1,
ep_rank=moe.moe_parallel_config.ep_rank,
ep_size=moe.moe_parallel_config.ep_size,
tp_rank=moe.moe_parallel_config.tp_rank,
tp_size=moe.moe_parallel_config.tp_size,
)
else:
assert moe.dp_size > 1
logger.debug_once("Using CutlassExpertsFp4")
# Currently CutlassExpertsFp4 doesn't support DP
raise ValueError("CutlassExpertsFp4 doesn't support DP. "
"Use flashinfer CUTLASS FusedMoE backend instead "
"(set VLLM_USE_FLASHINFER_MOE_FP4=1)")
return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
logger)
return experts
def uses_weight_scale_2_pattern(self) -> bool:
"""
@ -1004,8 +1062,18 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
gemm1_weight_scale = layer.w13_weight_scale.data
if self.allow_flashinfer_cutlass:
gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1(
gemm1_weight, gemm1_weight_scale, dim=-2)
dim = -2
size = gemm1_weight.size(dim)
assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
half = size // 2
# Reorder weight
w1, w3 = gemm1_weight.split(half, dim=dim)
gemm1_weight = torch.cat([w3, w1], dim=dim).contiguous()
# Reorder scale
s1, s3 = gemm1_weight_scale.split(half, dim=dim)
gemm1_weight_scale = torch.cat([s3, s1], dim=dim).contiguous()
layer.w13_weight = Parameter(gemm1_weight, requires_grad=False)
layer.w13_weight_scale = Parameter(gemm1_weight_scale,
@ -1149,15 +1217,49 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input)
else:
out = flashinfer_fp4_cutlass_moe_forward(
self.fused_experts,
layer,
x,
topk_weights,
topk_ids,
# TP or DP case
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
is_valid_flashinfer_cutlass_fused_moe)
assert is_valid_flashinfer_cutlass_fused_moe(
x, layer.w13_weight, layer.w2_weight), (
"Flashinfer CUTLASS Fused MoE not applicable!")
a1_gscale = layer.w13_input_scale_quant
a2_gscale = layer.w2_input_scale_quant
extra_expert_args = {
'g1_alphas': layer.g1_alphas,
'g2_alphas': layer.g2_alphas,
'out_dtype': x.dtype,
# Avoid confusion with a1_scale and a2_scale
# which are batch-size related.
'a1_gscale': a1_gscale,
'a2_gscale': a2_gscale,
}
extra_prepare_args = {
'use_dp': layer.dp_size > 1,
'local_tokens': x.shape[0],
'a1_gscale': a1_gscale,
}
extra_finalize_args = {
'use_dp': layer.dp_size > 1,
'local_tokens': x.shape[0],
}
out = self.fused_experts(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False, # TODO(shuw): fix later, now output is high prec
activation=activation,
global_num_experts=global_num_experts,
expert_map=expert_map,
w1_scale=layer.w13_blockscale_swizzled,
w2_scale=layer.w2_blockscale_swizzled,
apply_router_weight_on_input=apply_router_weight_on_input,
extra_expert_args=extra_expert_args,
extra_prepare_args=extra_prepare_args,
extra_finalize_args=extra_finalize_args,
)
return out

View File

@ -1,154 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility helpers for NVFP4 + FlashInfer fused-MoE path"""
from __future__ import annotations
from typing import Optional
import torch
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize)
from vllm.platforms import current_platform
logger = init_logger(__name__)
__all__ = [
"is_flashinfer_fp4_cutlass_moe_available",
"reorder_w1w3_to_w3w1",
"build_flashinfer_fp4_cutlass_moe_kernel",
"flashinfer_fp4_cutlass_moe_forward",
]
def is_flashinfer_fp4_cutlass_moe_available() -> bool:
"""Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and current_platform.is_cuda()
and current_platform.is_device_capability(100))
def reorder_w1w3_to_w3w1(weight: torch.Tensor,
scale: torch.Tensor,
dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]:
"""Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`"""
size = weight.size(dim)
assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
half = size // 2
w1, w3 = weight.split(half, dim=dim)
s1, s3 = scale.split(half, dim=dim)
return (torch.cat([w3, w1],
dim=dim).contiguous(), torch.cat([s3, s1],
dim=dim).contiguous())
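A tiny sketch (shapes are illustrative) of the reorder performed above: the two halves along `dim` are swapped, and the scales are permuted in lockstep with the weights.
w = torch.arange(8, dtype=torch.float32).reshape(4, 2)      # rows 0-1 = w1, rows 2-3 = w3
s = torch.arange(8, 16, dtype=torch.float32).reshape(4, 2)
w_new, s_new = reorder_w1w3_to_w3w1(w, s, dim=-2)
# w_new now stacks the w3 rows before the w1 rows; s_new is reordered the same way.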
def build_flashinfer_fp4_cutlass_moe_kernel(
moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel:
"""Create *and return* a FlashInfer CUTLASS fused-MoE modular kernel"""
experts = FlashInferExperts(
use_nvfp4_w4a4=True,
use_dp=moe_parallel_config.dp_size > 1,
ep_rank=moe_parallel_config.ep_rank,
ep_size=moe_parallel_config.ep_size,
tp_rank=moe_parallel_config.tp_rank,
tp_size=moe_parallel_config.tp_size,
)
logger.debug_once("FlashInferExperts (util)")
return mk.FusedMoEModularKernel(
FlashInferCutlassMoEPrepareAndFinalize(quant_dtype=torch.uint8),
experts,
)
def flashinfer_fp4_cutlass_moe_forward(
fused_experts: mk.FusedMoEModularKernel,
layer: torch.nn.Module,
x: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
activation: str,
global_num_experts: int,
expert_map: Optional[torch.Tensor],
apply_router_weight_on_input: bool,
) -> torch.Tensor:
"""Common forward wrapper for FlashInfer NV-FP4 fused-MoE"""
assert is_valid_flashinfer_cutlass_fused_moe(
x, layer.w13_weight,
layer.w2_weight), ("FlashInfer CUTLASS fused-MoE not applicable!")
a1_gscale = layer.w13_input_scale_quant
a2_gscale = layer.w2_input_scale_quant
extra_expert_args = {
"g1_alphas": layer.g1_alphas,
"g2_alphas": layer.g2_alphas,
# Avoid confusion with a1_scale and a2_scale
# which are batch-size related.
"a1_gscale": a1_gscale,
"a2_gscale": a2_gscale,
"out_dtype": x.dtype,
}
extra_prepare_args = {
"use_dp": layer.dp_size > 1,
"local_tokens": x.shape[0],
"a1_gscale": a1_gscale,
}
extra_finalize_args = {
"use_dp": layer.dp_size > 1,
"local_tokens": x.shape[0],
}
return fused_experts(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False, # TODO(shuw): fix later, now output is high prec
activation=activation,
global_num_experts=global_num_experts,
expert_map=expert_map,
w1_scale=layer.w13_blockscale_swizzled,
w2_scale=layer.w2_blockscale_swizzled,
apply_router_weight_on_input=apply_router_weight_on_input,
extra_expert_args=extra_expert_args,
extra_prepare_args=extra_prepare_args,
extra_finalize_args=extra_finalize_args,
)
def select_nvfp4_gemm_impl(
allow_flashinfer_cutlass: bool,
moe, # FusedMoEConfig
logger):
"""Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers"""
# lazy import
from vllm.distributed import get_ep_group
all2all_manager = get_ep_group().device_communicator.all2all_manager
assert all2all_manager is not None
if allow_flashinfer_cutlass:
logger.debug_once("Using FlashInferExperts")
return FlashInferExperts(
use_nvfp4_w4a4=True,
use_dp=moe.moe_parallel_config.dp_size > 1,
ep_rank=moe.moe_parallel_config.ep_rank,
ep_size=moe.moe_parallel_config.ep_size,
tp_rank=moe.moe_parallel_config.tp_rank,
tp_size=moe.moe_parallel_config.tp_size,
)
# native cutlass experts currently don't support DP; TP case won't call this
raise ValueError(
"CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS "
"Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)")

View File

@ -1,59 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
is_flashinfer_fp4_cutlass_moe_available)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
is_fp4_marlin_supported)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"]
_logger = init_logger(__name__)
@dataclass(frozen=True)
class NvFp4Support:
"""Result container for NV-FP4 capability probing."""
cutlass_supported: bool
allow_flashinfer_cutlass: bool
use_marlin: bool
def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
"""Detect platform support for NV-FP4 fused-MoE path"""
cutlass_supported = cutlass_fp4_supported()
allow_flashinfer = (cutlass_supported
and is_flashinfer_fp4_cutlass_moe_available())
if allow_flashinfer:
_logger.info_once("Using FlashInfer kernels for %s.", class_name
or "NVFP4 path")
else:
if envs.VLLM_USE_FLASHINFER_MOE_FP4:
_logger.warning_once(
"FlashInfer kernels unavailable for %s on current platform.",
class_name or "NVFP4 path",
)
use_marlin = False
if not cutlass_supported:
if is_fp4_marlin_supported():
use_marlin = True
_logger.info_once("Falling back to Marlin FP4 MoE kernel.")
else:
raise ValueError(
"Current platform does not support NVFP4 quantization. "
"Please use Blackwell GPUs or enable FlashInfer.")
return NvFp4Support(
cutlass_supported=cutlass_supported,
allow_flashinfer_cutlass=allow_flashinfer,
use_marlin=use_marlin,
)

View File

@ -34,8 +34,7 @@ from vllm.model_executor.model_loader.weight_utils import (
filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
pt_weights_iterator, safetensors_weights_iterator)
from vllm.model_executor.models import is_pooling_model
from vllm.model_executor.utils import (get_moe_expert_mapping,
get_packed_modules_mapping,
from vllm.model_executor.utils import (get_packed_modules_mapping,
set_weight_attrs)
from vllm.platforms import current_platform
@ -44,12 +43,6 @@ from vllm.platforms import current_platform
logger = init_logger(__name__)
def is_moe_model(model: torch.nn.Module) -> bool:
"""Checks if the model contains FusedMoE layers."""
return bool(any(
isinstance(module, FusedMoE) for module in model.modules()))
class BitsAndBytesModelLoader(BaseModelLoader):
"""Model loader to load model weights with BitAndBytes quantization."""
@ -68,8 +61,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# Store all module names (from transformers) that support
# BNB quantization.
self.target_modules: list[str] = []
# Store the mapping of expert parameters for MoE models.
self.expert_params_mapping: list[tuple[str, str, int, str]] = []
# mapping weight names from transformers to vllm.
self.weight_mapper: Callable = lambda name: name
self.pre_quant: bool = False
@ -422,8 +413,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# in case model has a mixture of disk-merged and disk-split
# weights with same last name.
self.target_modules.append(name)
elif isinstance(module, FusedMoE) and hasattr(
module.quant_method, "quant_config"):
elif (isinstance(module, FusedMoE)
and hasattr(module.quant_method, "quant_config")):
if not hasattr(model, "get_expert_mapping"):
raise AttributeError(
f"MoE Model {type(model).__name__} does not support "
"BitsAndBytes quantization yet. Ensure this model has "
"'get_expert_mapping' method.")
# TODO: support FusedMoE with prequant and 8bit.
if self.pre_quant:
raise ValueError(
@ -434,9 +430,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
"BitsAndBytes 8bit quantization with FusedMoE is not "
"supported yet.")
# Get the corresponding weight name using module name and
# expert_params_mapping.
for exp in self.expert_params_mapping:
# get_expert_mapping.
expert_mapping = model.get_expert_mapping()
for exp in expert_mapping:
weight_name = exp[1]
rep_name = name.replace("experts",
"") + weight_name.removesuffix(".")
@ -468,7 +464,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
elif isinstance(module, (RowParallelLinear, )):
self.column_sharded_weights_modules.append(name)
elif isinstance(module, FusedMoE):
expert_mapping = self.expert_params_mapping
expert_mapping = model.get_expert_mapping()
for exp in expert_mapping:
if exp[-1] == "w2":
weight_name = exp[1]
@ -520,13 +516,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
self.is_pool_model = is_pooling_model(model)
self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))
if is_moe_model(model):
self.expert_params_mapping = get_moe_expert_mapping(model)
if not self.expert_params_mapping:
raise AttributeError(
f"MoE Model {type(model).__name__} does not support "
"BitsAndBytes quantization yet. Ensure this model has "
"'get_expert_mapping' method.")
# For some models like Molmo, we need to use hf_to_vllm_mapper
# to ensure correct loading of weights.
if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
@ -580,10 +569,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
"""
from bitsandbytes.functional import QuantState
if not self.expert_params_mapping:
if not hasattr(model, "get_expert_mapping"):
return dict()
expert_mapping = self.expert_params_mapping
expert_mapping = model.get_expert_mapping()
expert_qs_dict = {}
for name, module in model.named_modules():
if not isinstance(module, FusedMoE):

View File

@ -123,10 +123,16 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
return self.ctx.get_hf_config(AyaVisionConfig)
def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
return self.get_hf_processor(**kwargs).image_processor
# Temporary workaround since this processor has multiple image tokens
# See https://github.com/huggingface/transformers/issues/38350
processor._check_special_mm_tokens = lambda *args, **kwargs: None
return processor
def get_image_processor(self) -> GotOcr2ImageProcessor:
return self.get_hf_processor().image_processor
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}

View File

@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor(
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
if not mm_data:
if mm_data:
processed_outputs = self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(prompt=prompt, **mm_data),
dict(**mm_kwargs, **tok_kwargs),
)
pixel_values = processed_outputs["pixel_values"]
# split pixel values into patches corresponding to each image
images_spatial_crop = processed_outputs["images_spatial_crop"]
patches_per_image = [
x.prod().item() + 1 for x in images_spatial_crop
]
pixel_values = pixel_values.split(patches_per_image)
processed_outputs["pixel_values"] = pixel_values
else:
tokenizer = self.info.get_tokenizer()
return tokenizer(prompt,
add_special_tokens=True,
return_tensors="pt")
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
mm_kwargs=mm_kwargs,
tok_kwargs=tok_kwargs,
)
pixel_values = processed_outputs["pixel_values"]
# split pixel values into patches corresponding to each image
images_spatial_crop = processed_outputs["images_spatial_crop"]
patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop]
pixel_values = pixel_values.split(patches_per_image)
processed_outputs["pixel_values"] = pixel_values
processed_outputs = tokenizer(prompt,
add_special_tokens=True,
return_tensors="pt")
return processed_outputs
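For clarity on the patches_per_image split above, a toy example with made-up shapes (the "+ 1" presumably accounts for the global view alongside the local crops):

import torch

images_spatial_crop = torch.tensor([[2, 3], [1, 1]])                    # crops per image
patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop]  # [7, 2]
pixel_values = torch.randn(sum(patches_per_image), 3, 384, 384)
per_image = pixel_values.split(patches_per_image)                       # tuple of 2 tensors
assert [p.shape[0] for p in per_image] == patches_per_image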

View File

@ -761,6 +761,12 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
class Florence2ProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()
def get_hf_processor(self):
return self.ctx.get_hf_processor()
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}

View File

@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(FuyuProcessor, **kwargs)
def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor:
return self.get_hf_processor(**kwargs).image_processor
def get_image_processor(self) -> FuyuImageProcessor:
return self.get_hf_processor().image_processor
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}

View File

@ -46,7 +46,6 @@ from vllm.model_executor.model_loader.weight_utils import (
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
from .interfaces import SupportsQuant
from .utils import (AutoWeightsLoader, extract_layer_index,
is_pp_missing_parameter, make_layers, maybe_prefix)
@ -69,7 +68,6 @@ class Gemma3nAltUp(nn.Module):
altup_num_inputs: int,
altup_coef_clip: float,
altup_active_idx: int,
quant_config: QuantizationConfig,
prefix: str,
):
super().__init__()
@ -82,7 +80,6 @@ class Gemma3nAltUp(nn.Module):
altup_num_inputs,
altup_num_inputs,
bias=False,
quant_config=quant_config,
prefix=f"{prefix}.correction_coefs",
return_bias=False,
)
@ -90,7 +87,6 @@ class Gemma3nAltUp(nn.Module):
altup_num_inputs,
altup_num_inputs**2,
bias=False,
quant_config=quant_config,
prefix=f"{prefix}.prediction_coefs",
return_bias=False,
)
@ -98,7 +94,6 @@ class Gemma3nAltUp(nn.Module):
hidden_size,
altup_num_inputs,
bias=False,
quant_config=quant_config,
prefix=f"{prefix}.modality_router",
return_bias=False,
)
@ -405,7 +400,6 @@ class Gemma3nDecoderLayer(nn.Module):
altup_num_inputs=config.altup_num_inputs,
altup_coef_clip=config.altup_coef_clip,
altup_active_idx=config.altup_active_idx,
quant_config=quant_config,
prefix=f"{prefix}.altup",
)
self.self_attn = Gemma3nAttention(
@ -533,7 +527,7 @@ class Gemma3nDecoderLayer(nn.Module):
@support_torch_compile
class Gemma3nTextModel(nn.Module, SupportsQuant):
class Gemma3nTextModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
@ -546,7 +540,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
config.hidden_size,
quant_config=quant_config,
prefix=f"{prefix}.embed_tokens",
)
self.embed_scale = torch.tensor(
@ -556,7 +549,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
self.embed_tokens_per_layer = VocabParallelEmbedding(
config.vocab_size_per_layer_input,
config.num_hidden_layers * config.hidden_size_per_layer_input,
quant_config=quant_config,
prefix=f"{prefix}.per_layer_embed_tokens",
)
self.embed_scale_per_layer = torch.tensor(
@ -590,7 +582,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
gather_output=True,
return_bias=False,
quant_config=quant_config,
prefix=f"{prefix}.altup_projections.{idx-1}",
prefix=f"{prefix}.{idx-1}.altup_projections",
) for idx in range(1, self.config.altup_num_inputs)
])
self.altup_unembed_projections = nn.ModuleList([
@ -601,7 +593,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
gather_output=True,
return_bias=False,
quant_config=quant_config,
prefix=f"{prefix}.altup_unembed_projections.{idx-1}",
prefix=f"{prefix}.{idx-1}.altup_unembed_projections",
) for idx in range(1, self.config.altup_num_inputs)
])
@ -782,7 +774,7 @@ class Gemma3nModel(nn.Module):
**kwargs)
class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant):
class Gemma3nForConditionalGeneration(nn.Module):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",

View File

@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1}
def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor:
return self.get_hf_processor(**kwargs).image_processor
def get_image_processor(self) -> Glm4vImageProcessor:
return self.get_hf_processor().image_processor
def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor:
return self.get_hf_processor(**kwargs).video_processor
def get_video_processor(self) -> Glm4vVideoProcessor:
return self.get_hf_processor().video_processor
def _get_vision_info(
self,
@ -937,7 +937,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
total_frames: int) -> list[int]:
video_processor = self.get_video_processor()
video_fps = metadata.get("fps", video_processor.fps)
video_fps = metadata.get("fps", 2.0)
meta_frames = metadata.get("total_num_frames", total_frames)
max_frame_idx = meta_frames - 1
duration = metadata.get("duration",
@ -1120,7 +1120,11 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
video_placeholder,
)
video_grid_thw_lst.append(video_outputs["video_grid_thw"])
grid_t = len(video_outputs["video_grid_thw"])
_, grid_h, grid_w = video_outputs["video_grid_thw"][0]
grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
video_grid_thw_lst.append(grid_thw)
pixel_values_videos_lst.append(
video_outputs["pixel_values_videos"])
video_outputs = dict(

View File

@ -392,7 +392,21 @@ class H2OVLProcessor(BaseInternVLProcessor):
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> H2OVLProcessor:
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
H2OVLProcessor,
config=self.get_hf_config(),

View File

@ -25,7 +25,8 @@ import torch
import torch.nn as nn
from timm.layers import LayerNorm, LayerNorm2d
from timm.models.regnet import RegStage
from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig,
SiglipVisionConfig)
from transformers.modeling_utils import no_init_weights
from vllm.config import VllmConfig
@ -79,9 +80,26 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs]
class HCXVisionProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()
def get_vision_encoder_info(self):
return get_vision_encoder_info(self.get_hf_config())
def get_hf_processor(
self,
**kwargs: object,
):
processor_cls = type(
AutoProcessor.from_pretrained(
self.ctx.model_config.model,
trust_remote_code=self.ctx.model_config.trust_remote_code,
))
return self.ctx.get_hf_processor(
processor_cls,
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}

View File

@ -88,7 +88,15 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
class Idefics3ProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> Idefics3Processor:
def get_hf_processor(
self,
*,
size: Optional[dict[str, int]] = None,
**kwargs: object,
) -> Idefics3Processor:
if size is not None:
kwargs["size"] = size
return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:

View File

@ -665,7 +665,14 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models."""
@abstractmethod
def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> BaseInternVLProcessor:
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
@ -875,12 +882,27 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
return max(max_frames_per_video, 1)
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> InternVLProcessor:
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
kwargs["video_token"] = self.get_video_token()
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
video_token=self.get_video_token(),
**kwargs,
)
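The explicit keyword plumbing above follows a simple rule: only forward arguments the caller actually set, so processor defaults from the HF config survive. A standalone sketch of that rule (generic names, not the vLLM API):

from typing import Optional

def build_processor_kwargs(
    *,
    min_dynamic_patch: Optional[int] = None,
    max_dynamic_patch: Optional[int] = None,
    dynamic_image_size: Optional[bool] = None,
    **kwargs: object,
) -> dict[str, object]:
    # Forward only the values that were explicitly provided.
    if min_dynamic_patch is not None:
        kwargs["min_dynamic_patch"] = min_dynamic_patch
    if max_dynamic_patch is not None:
        kwargs["max_dynamic_patch"] = max_dynamic_patch
    if dynamic_image_size is not None:
        kwargs["dynamic_image_size"] = dynamic_image_size
    return dict(kwargs)

# build_processor_kwargs(max_dynamic_patch=12) -> {"max_dynamic_patch": 12}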

View File

@ -44,6 +44,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import (
cached_image_processor_from_config)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@ -978,8 +980,72 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
class KeyeProcessingInfo(BaseProcessingInfo):
def get_image_processor(self, **kwargs: object):
return self.get_hf_processor(**kwargs).image_processor
def get_hf_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
return self.ctx.get_hf_processor(
image_processor=self.get_image_processor(
min_pixels=min_pixels,
max_pixels=max_pixels,
size=size,
),
**kwargs,
)
def _get_image_processor_kwargs(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
if self.ctx.model_config.mm_processor_kwargs:
kwargs.update(self.ctx.model_config.mm_processor_kwargs)
if min_pixels is not None:
kwargs["min_pixels"] = min_pixels
if size is None:
size = {"shortest_edge": min_pixels}
else:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
kwargs["max_pixels"] = max_pixels
if size is None:
size = {"longest_edge": max_pixels}
else:
size["longest_edge"] = max_pixels
if size is not None:
kwargs["size"] = size
return kwargs
def get_image_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
return cached_image_processor_from_config(
self.ctx.model_config,
**self._get_image_processor_kwargs(
min_pixels=min_pixels,
max_pixels=max_pixels,
size=size,
**kwargs,
),
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}
@ -1180,6 +1246,20 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
def _get_data_parser(self) -> MultiModalDataParser:
return KeyeMultiModalDataParser()
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs)
return self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(text=prompt, **mm_data),
dict(**mm_kwargs, **tok_kwargs),
)
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,

View File

@ -51,25 +51,6 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
if getattr(config, "norm_before_residual", False):
self._residual_norm = self._norm_before_residual
else:
self._residual_norm = self._norm_after_residual
def _norm_before_residual(
self,
hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
hidden_states = self.hidden_norm(hidden_states)
residual = hidden_states
return hidden_states, residual
def _norm_after_residual(
self,
hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
residual = hidden_states
hidden_states = self.hidden_norm(hidden_states)
return hidden_states, residual
def forward(
self,
positions: torch.Tensor,
@ -78,10 +59,9 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
residual: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor]:
residual = hidden_states
embeds = self.input_layernorm(embeds)
hidden_states, residual = self._residual_norm(
hidden_states=hidden_states)
hidden_states = self.hidden_norm(hidden_states)
hidden_states = torch.cat([embeds, hidden_states], dim=-1)
# Self Attention
@ -122,7 +102,7 @@ class LlamaModel(nn.Module):
self.layers = nn.ModuleList([
LlamaDecoderLayer(
config=self.config,
self.config,
prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"),
)
])
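For context on the removed _norm_before_residual / _norm_after_residual helpers earlier in this file, a minimal sketch of the two orderings (hidden size and inputs are made up; nn.RMSNorm requires torch >= 2.4, whereas vLLM uses its own RMSNorm):

import torch
import torch.nn as nn

hidden_norm = nn.RMSNorm(16)
x = torch.randn(2, 16)

# norm *before* residual: the residual branch carries the normalized activations
h = hidden_norm(x)
residual_before = h

# norm *after* residual: the residual branch carries the raw activations
residual_after = x
h2 = hidden_norm(x)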

View File

@ -8,9 +8,11 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
import torch
import torch.nn as nn
from packaging.version import Version
from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
PixtralVisionConfig, PretrainedConfig,
SiglipVisionConfig)
from transformers import __version__ as TRANSFORMERS_VERSION
from transformers.models.llava import LlavaProcessor
from transformers.models.pixtral import PixtralProcessor
@ -305,14 +307,29 @@ class PixtralHFMultiModalProcessor(
pixel_values = processed_outputs.get("pixel_values")
if pixel_values is not None:
# Avoid padding since we need the output for each image to be
# independent of other images for the cache to work correctly
image_sizes = processed_outputs["image_sizes"]
assert len(pixel_values) == len(image_sizes)
# Before/after https://github.com/huggingface/transformers/pull/35122
if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"):
images = mm_data["images"]
assert isinstance(images, list)
processed_outputs["pixel_values"] = [
p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes)
]
# Original output: (1, num_images, C, H, W)
# New output: (num_images, C, H, W)
assert (isinstance(pixel_values, list)
and len(pixel_values) == 1)
assert (isinstance(pixel_values[0], list)
and len(pixel_values[0]) == len(images))
processed_outputs["pixel_values"] = pixel_values[0]
else:
# Avoid padding since we need the output for each image to be
# independent of other images for the cache to work correctly
image_sizes = processed_outputs["image_sizes"]
assert len(pixel_values) == len(image_sizes)
processed_outputs["pixel_values"] = [
p[:, :h, :w]
for p, (h, w) in zip(pixel_values, image_sizes)
]
return processed_outputs
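To make the modern (non-legacy) branch above concrete, here is a toy crop with made-up shapes, showing how padding is removed per image using image_sizes:

import torch

pixel_values = [torch.randn(3, 64, 64), torch.randn(3, 64, 64)]   # padded to 64x64
image_sizes = torch.tensor([[48, 32], [64, 20]])                   # (height, width) per image
cropped = [p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes)]
assert [tuple(c.shape[1:]) for c in cropped] == [(48, 32), (64, 20)]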
@ -767,10 +784,17 @@ class MantisProcessingInfo(LlavaProcessingInfo):
vision_info = self.get_vision_encoder_info()
kwargs.setdefault("patch_size", vision_info.get_patch_size())
kwargs.setdefault(
"vision_feature_select_strategy",
hf_config.vision_feature_select_strategy,
)
if Version(TRANSFORMERS_VERSION) < Version("4.48"):
# BUG: num_additional_image_tokens = 0 but treated as 1,
# so we set vision_feature_select_strategy to None to offset this
kwargs.setdefault("vision_feature_select_strategy", None)
else:
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
kwargs.setdefault(
"vision_feature_select_strategy",
hf_config.vision_feature_select_strategy,
)
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)

View File

@ -331,8 +331,10 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
return hf_processor
def get_image_processor(self, **kwargs: object):
return self.get_hf_processor(**kwargs).image_processor
def get_image_processor(self):
hf_processor = self.get_hf_processor()
image_processor = hf_processor.image_processor # type: ignore
return image_processor
def get_model_version(self):
return get_version_by_config(self.get_hf_config())

Some files were not shown because too many files have changed in this diff.