mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 03:57:02 +08:00
Revert "Merge commit '6e8d8c4afbddf725b34ef938616701869f5b3462' into sage/dbo-full-cudagraphsh"
This reverts commit 5215c80a4988e81d2f5971e02d50d3785cab5ae8, reversing changes made to dd2a94fd9d90d0c04772380c220b5ec81bd0b61e.
This commit is contained in:
parent
a0a11bc0b5
commit
9f04a6cf57
@ -104,6 +104,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
|
||||
@ -35,6 +35,7 @@
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
@ -89,6 +90,7 @@
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
@ -143,6 +145,7 @@
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
@ -194,6 +197,7 @@
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
@ -247,6 +251,7 @@
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
@ -300,6 +305,7 @@
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -49,6 +50,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -81,6 +83,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -114,6 +117,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -149,6 +153,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -184,6 +189,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -49,6 +50,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -82,6 +84,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -115,6 +118,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -150,6 +154,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -186,6 +191,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -49,6 +50,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -81,6 +83,7 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -114,6 +117,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
@ -149,6 +153,7 @@
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
@ -25,6 +26,7 @@
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
@ -43,6 +45,7 @@
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
@ -57,7 +60,8 @@
|
||||
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
|
||||
"qps_list": [2],
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"disable_log_requests": "",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"speculative_config": {
|
||||
|
||||
@ -647,31 +647,13 @@ steps:
|
||||
- label: Blackwell Test
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- csrc/attention/mla/
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/fusion.py
|
||||
- csrc/
|
||||
- vllm/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py
|
||||
- pytest -v -s tests/kernels/test_cutlass_mla_decode.py
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
# Fusion
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
@ -822,7 +804,6 @@ steps:
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_multi_loras_with_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
|
||||
40
SECURITY.md
40
SECURITY.md
@ -1,45 +1,13 @@
|
||||
# Security Policy
|
||||
|
||||
## Reporting security issues
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
|
||||
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
|
||||
|
||||
## Issue triage
|
||||
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
|
||||
|
||||
Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
|
||||
|
||||
## Threat model
|
||||
---
|
||||
|
||||
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
|
||||
|
||||
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
|
||||
|
||||
## Issue severity
|
||||
|
||||
We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
|
||||
|
||||
### CRITICAL Severity
|
||||
|
||||
Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed. Examples include remote code execution via the network and deserialization issues that allow exploit chains. These issues are generally rated CVSS ≥ 9.0.
|
||||
|
||||
### HIGH Severity
|
||||
|
||||
Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust. Examples include RCE in advanced deployment modes (e.g. multi-node), or high-impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9.
|
||||
|
||||
### MODERATE Severity
|
||||
|
||||
Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9.
|
||||
|
||||
### LOW Severity
|
||||
|
||||
Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0.
|
||||
|
||||
## Prenotification policy
|
||||
|
||||
For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues.
|
||||
|
||||
* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
|
||||
|
||||
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
|
||||
|
||||
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
|
||||
|
||||
@ -22,13 +22,6 @@ from vllm.utils import FlexibleArgumentParser
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
|
||||
|
||||
def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator.

    Args:
        numerator: Value that must be an exact multiple of ``denominator``.
        denominator: The divisor to check against.

    Raises:
        AssertionError: If ``numerator % denominator`` is non-zero. The
            exception is raised explicitly (instead of through an ``assert``
            statement) so the check is still enforced when Python runs with
            ``-O``, which strips ``assert`` statements.
    """
    if numerator % denominator != 0:
        raise AssertionError(
            f"intermediate_size {numerator} is not divisible by tp {denominator}."
        )
|
||||
|
||||
|
||||
class BenchmarkConfig(TypedDict):
|
||||
BLOCK_SIZE_M: int
|
||||
BLOCK_SIZE_N: int
|
||||
@ -610,7 +603,7 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
ensure_divisibility(intermediate_size, args.tp_size)
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
|
||||
@ -1,156 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
import torch
|
||||
from tabulate import tabulate
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
FlexibleArgumentParser,
|
||||
create_kv_caches_with_random_flash,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@torch.inference_mode()
def run_benchmark(
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    kv_cache_layout: str,
    num_iters: int,
    device: str = "cuda",
) -> float:
    """Return latency (seconds) for given num_tokens.

    Times ``ops.reshape_and_cache_flash`` on random key/value tensors of shape
    ``[num_tokens, num_heads, head_size]`` written into a freshly created
    single-layer KV cache, and returns the mean wall-clock time per call.

    Args:
        num_tokens: Number of tokens to write; each gets a distinct cache slot.
        num_heads: Number of heads in the key/value tensors.
        head_size: Size of each head.
        block_size: Number of slots per cache block.
        num_blocks: Number of cache blocks to allocate.
        dtype: Dtype of the key/value tensors.
        kv_cache_dtype: KV-cache dtype string passed to the cache factory and
            the kernel (e.g. ``"auto"`` or ``"fp8"``).
        kv_cache_layout: Cache layout string forwarded as ``cache_layout``.
        num_iters: Number of timed kernel launches (after 3 warm-up launches).
        device: Device on which tensors are created.

    Returns:
        Average seconds per ``reshape_and_cache_flash`` call.

    Raises:
        ValueError: If ``kv_cache_dtype`` is ``"fp8"`` and ``head_size`` is not
            a multiple of 16, if ``num_iters`` is not positive, or if
            ``num_tokens`` exceeds ``block_size * num_blocks``.
    """
    # Validate all arguments up front so that bad input fails before any
    # seeding, default-device change, or tensor allocation takes place.
    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
    if num_iters < 1:
        # The timed loop divides by the iteration count.
        raise ValueError("num_iters must be a positive integer")
    num_slots = block_size * num_blocks
    if num_tokens > num_slots:
        raise ValueError("num_tokens cannot exceed the total number of cache slots")

    current_platform.seed_everything(42)
    torch.set_default_device(device)

    # create random key / value tensors [T, H, D].
    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
    value = torch.randn_like(key)

    # prepare the slot mapping.
    # each token is assigned a unique slot in the KV-cache.
    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)

    key_caches, value_caches = create_kv_caches_with_random_flash(
        num_blocks,
        block_size,
        1,  # num_layers
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        device=device,
        cache_layout=kv_cache_layout,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]

    # compute per-kernel scaling factors for fp8 conversion (if used).
    k_scale = (key.amax() / 64.0).to(torch.float32)
    v_scale = (value.amax() / 64.0).to(torch.float32)

    def run_cuda_benchmark(n_iters: int) -> float:
        """Launch the kernel ``n_iters`` times and return seconds per launch."""
        # Synchronize before and after so only the queued kernels are timed.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            ops.reshape_and_cache_flash(
                key,
                value,
                key_cache,
                value_cache,
                slot_mapping,
                kv_cache_dtype,
                k_scale,
                v_scale,
            )
        torch.cuda.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

    # warm-up
    run_cuda_benchmark(3)

    lat = run_cuda_benchmark(num_iters)

    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
    torch.cuda.empty_cache()

    return lat
|
||||
|
||||
|
||||
def main(args):
    """Sweep token counts over both cache layouts and print a latency table.

    For each layout in ``NHD`` and ``HND`` and each power of two from 2**1 to
    2**16, runs ``run_benchmark`` with the CLI-provided settings and records
    the latency in microseconds. The collected rows are printed via
    ``tabulate``.
    """
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
    token_counts = [2**exp for exp in range(1, 17)]

    table = []
    for layout in ("NHD", "HND"):
        for n_tok in token_counts:
            seconds = run_benchmark(
                num_tokens=n_tok,
                num_heads=args.num_heads,
                head_size=args.head_size,
                block_size=args.block_size,
                num_blocks=args.num_blocks,
                dtype=torch_dtype,
                kv_cache_dtype=args.kv_cache_dtype,
                kv_cache_layout=layout,
                num_iters=args.iters,
                device="cuda",
            )
            # Report in microseconds with three decimals.
            table.append([n_tok, layout, f"{seconds * 1e6:.3f}"])

    print(tabulate(table, headers=["num_tokens", "layout", "latency (µs)"]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line options, registered in this order. Each entry is the flag
    # followed by the keyword arguments handed to ``add_argument``.
    option_specs = (
        ("--num-heads", {"type": int, "default": 128}),
        (
            "--head-size",
            {
                "type": int,
                "choices": [64, 80, 96, 112, 120, 128, 192, 256],
                "default": 128,
            },
        ),
        ("--block-size", {"type": int, "choices": [16, 32], "default": 16}),
        ("--num-blocks", {"type": int, "default": 128 * 512}),
        (
            "--dtype",
            {
                "type": str,
                "choices": ["half", "bfloat16", "float"],
                "default": "bfloat16",
            },
        ),
        (
            "--kv-cache-dtype",
            {"type": str, "choices": ["auto", "fp8"], "default": "auto"},
        ),
        ("--iters", {"type": int, "default": 100}),
    )

    parser = FlexibleArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    main(args)
|
||||
@ -4,16 +4,49 @@
|
||||
# ruff: noqa: E501
|
||||
import time
|
||||
|
||||
# Import DeepGEMM functions
|
||||
import deep_gemm
|
||||
import torch
|
||||
from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
|
||||
|
||||
# Import vLLM functions
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_token_group_quant_fp8,
|
||||
w8a8_block_fp8_matmul,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
|
||||
|
||||
|
||||
# Copied from
|
||||
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
|
||||
def per_token_cast_to_fp8(
|
||||
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Convert tensor to FP8 format with per-token scaling."""
|
||||
assert x.dim() == 2 and x.size(1) % 128 == 0
|
||||
m, n = x.shape
|
||||
x_view = x.view(m, -1, 128)
|
||||
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
|
||||
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
|
||||
torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
|
||||
|
||||
|
||||
# Copied from
|
||||
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
|
||||
def per_block_cast_to_fp8(
|
||||
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Convert tensor to FP8 format with per-block scaling."""
|
||||
assert x.dim() == 2
|
||||
m, n = x.shape
|
||||
x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
|
||||
dtype=x.dtype,
|
||||
device=x.device)
|
||||
x_padded[:m, :n] = x
|
||||
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
|
||||
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
|
||||
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
|
||||
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
|
||||
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
|
||||
|
||||
|
||||
def benchmark_shape(m: int,
|
||||
@ -36,14 +69,14 @@ def benchmark_shape(m: int,
|
||||
|
||||
# Pre-quantize B for all implementations
|
||||
# (weights can be pre-quantized offline)
|
||||
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
|
||||
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
|
||||
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
|
||||
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
|
||||
|
||||
# Block size configuration
|
||||
block_size = [128, 128]
|
||||
|
||||
# Pre-quantize A for all implementations
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
|
||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
|
||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||
@ -52,7 +85,7 @@ def benchmark_shape(m: int,
|
||||
|
||||
# === DeepGEMM Implementation ===
|
||||
def deepgemm_gemm():
|
||||
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
|
||||
deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
|
||||
(B_deepgemm, B_scale_deepgemm),
|
||||
C_deepgemm)
|
||||
return C_deepgemm
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
#include "cuda_utils.h"
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include "quantization/fp8/amd/quant_utils.cuh"
|
||||
@ -262,26 +261,14 @@ __global__ void reshape_and_cache_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
// Used by vectorization_utils to copy/convert one element
|
||||
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
||||
struct CopyWithScaleOp {
|
||||
float scale;
|
||||
|
||||
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
dst = static_cast<OutT>(src);
|
||||
} else {
|
||||
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
__global__ void reshape_and_cache_flash_kernel(
|
||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
|
||||
cache_t* __restrict__ key_cache, // NHD or HND, shape see comments below
|
||||
cache_t* __restrict__ value_cache, // same above
|
||||
cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads,
|
||||
// head_size]
|
||||
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
|
||||
// head_size]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int64_t block_stride, const int64_t page_stride,
|
||||
const int64_t head_stride, const int64_t key_stride,
|
||||
@ -295,58 +282,25 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
}
|
||||
const int64_t block_idx = slot_idx / block_size;
|
||||
const int64_t block_offset = slot_idx % block_size;
|
||||
const int n_elems = num_heads * head_size;
|
||||
|
||||
// pointers to the beginning of the source row for this token.
|
||||
const scalar_t* __restrict__ key_src = key + token_idx * key_stride;
|
||||
const scalar_t* __restrict__ value_src = value + token_idx * value_stride;
|
||||
|
||||
// find the start position inside the kv-cache for this token.
|
||||
cache_t* __restrict__ key_dst =
|
||||
key_cache + block_idx * block_stride + block_offset * page_stride;
|
||||
cache_t* __restrict__ value_dst =
|
||||
value_cache + block_idx * block_stride + block_offset * page_stride;
|
||||
|
||||
// this is true for the NHD layout where `head_stride == head_size`
|
||||
const bool is_contiguous_heads = (head_stride == head_size);
|
||||
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
if (is_contiguous_heads) {
|
||||
// NHD layout
|
||||
// kv cache: [num_blocks, block_size, num_heads, head_size]
|
||||
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
|
||||
blockDim.x, k_op);
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
|
||||
threadIdx.x, blockDim.x, v_op);
|
||||
|
||||
} else {
|
||||
// HND layout: heads are strided, but each head_size segment is contiguous
|
||||
// kv cache: [num_blocks, num_heads, block_size, head_size]
|
||||
const int lane = threadIdx.x & 31; // 0..31 within warp
|
||||
const int warp_id = threadIdx.x >> 5; // warp index within block
|
||||
const int warps_per_block = blockDim.x >> 5;
|
||||
|
||||
for (int head = warp_id; head < num_heads; head += warps_per_block) {
|
||||
const scalar_t* __restrict__ k_src_h = key_src + head * head_size;
|
||||
const scalar_t* __restrict__ v_src_h = value_src + head * head_size;
|
||||
|
||||
cache_t* __restrict__ k_dst_h =
|
||||
key_dst + static_cast<int64_t>(head) * head_stride;
|
||||
cache_t* __restrict__ v_dst_h =
|
||||
value_dst + static_cast<int64_t>(head) * head_stride;
|
||||
|
||||
// within each head, let the 32 threads of the warp perform the vector
|
||||
// copy
|
||||
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
|
||||
k_op);
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(v_src_h, v_dst_h, head_size, lane, 32,
|
||||
v_op);
|
||||
const int n = num_heads * head_size;
|
||||
for (int i = threadIdx.x; i < n; i += blockDim.x) {
|
||||
const int64_t src_key_idx = token_idx * key_stride + i;
|
||||
const int64_t src_value_idx = token_idx * value_stride + i;
|
||||
const int head_idx = i / head_size;
|
||||
const int head_offset = i % head_size;
|
||||
const int64_t tgt_key_value_idx = block_idx * block_stride +
|
||||
block_offset * page_stride +
|
||||
head_idx * head_stride + head_offset;
|
||||
scalar_t tgt_key = key[src_key_idx];
|
||||
scalar_t tgt_value = value[src_value_idx];
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
key_cache[tgt_key_value_idx] = tgt_key;
|
||||
value_cache[tgt_key_value_idx] = tgt_value;
|
||||
} else {
|
||||
key_cache[tgt_key_value_idx] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
|
||||
value_cache[tgt_key_value_idx] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -24,12 +24,9 @@
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/util_type.cuh>
|
||||
#include <cub/cub.cuh>
|
||||
#include <cuda/std/functional>
|
||||
using AddOp = cuda::std::plus<float>;
|
||||
#else
|
||||
#include <hipcub/util_type.hpp>
|
||||
#include <hipcub/hipcub.hpp>
|
||||
using AddOp = cub::Sum;
|
||||
#endif
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
@ -65,6 +62,7 @@ __launch_bounds__(TPB) __global__
|
||||
|
||||
const int thread_row_offset = blockIdx.x * num_cols;
|
||||
|
||||
cub::Sum sum;
|
||||
float threadData(-FLT_MAX);
|
||||
|
||||
// Don't touch finished rows.
|
||||
@ -94,7 +92,7 @@ __launch_bounds__(TPB) __global__
|
||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||
}
|
||||
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
|
||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
@ -15,7 +16,6 @@ ARG PYTHON_VERSION=3.12
|
||||
# Example:
|
||||
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||
|
||||
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
|
||||
@ -206,16 +206,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED
|
||||
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
|
||||
ENV VLLM_USE_PRECOMPILED=""
|
||||
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
|
||||
export VLLM_USE_PRECOMPILED=1 && \
|
||||
echo "Using precompiled wheels"; \
|
||||
else \
|
||||
unset VLLM_USE_PRECOMPILED && \
|
||||
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
|
||||
fi
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -232,6 +223,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
&& sccache --show-stats; \
|
||||
@ -245,9 +238,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
# Clean any existing CMake artifacts
|
||||
rm -rf .deps && \
|
||||
mkdir -p .deps && \
|
||||
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
|
||||
RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
|
||||
echo "Cleaning up extra wheels in dist/..." && \
|
||||
# Identify the most recent manylinux1_x86_64 wheel
|
||||
KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
|
||||
if [ -n "$KEEP_WHEEL" ]; then \
|
||||
echo "Keeping wheel: $KEEP_WHEEL"; \
|
||||
find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
|
||||
fi; \
|
||||
fi
|
||||
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
# sync the default value with .buildkite/check-wheel-size.py
|
||||
@ -283,6 +289,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
# image with vLLM installed
|
||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||
FROM ${FINAL_BASE_IMAGE} AS vllm-base
|
||||
ARG CUDA_VERSION
|
||||
ARG PYTHON_VERSION
|
||||
@ -363,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
fi
|
||||
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
# !bang
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose \
|
||||
@ -427,33 +435,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
|
||||
ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||
. /etc/environment
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"
|
||||
CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
|
||||
CUDA_MINOR="${CUDA_MINOR%%.*}"
|
||||
if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
|
||||
git clone --recursive --shallow-submodules \
|
||||
${DEEPGEMM_GIT_REPO} deepgemm
|
||||
echo "🏗️ Building DeepGEMM"
|
||||
pushd deepgemm
|
||||
git checkout ${DEEPGEMM_GIT_REF}
|
||||
# Build DeepGEMM
|
||||
# (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
|
||||
rm -rf build dist
|
||||
rm -rf *.egg-info
|
||||
python3 setup.py bdist_wheel
|
||||
uv pip install --system dist/*.whl
|
||||
popd
|
||||
rm -rf deepgemm
|
||||
else
|
||||
echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
|
||||
fi
|
||||
BASH
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
#################### TEST IMAGE ####################
|
||||
|
||||
@ -172,36 +172,6 @@ Multi-image input can be extended to perform video captioning. We show this with
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
#### Custom RGBA Background Color
|
||||
|
||||
When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# Default white background (no configuration needed)
|
||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
||||
|
||||
# Custom black background for dark theme
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
|
||||
)
|
||||
|
||||
# Custom brand color background (e.g., blue)
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
|
||||
)
|
||||
```
|
||||
|
||||
!!! note
|
||||
- The `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)` where each value is 0-255
|
||||
- This setting only affects RGBA images with transparency; RGB images are unchanged
|
||||
- If not specified, the default white background `(255, 255, 255)` is used for backward compatibility
|
||||
|
||||
### Video Inputs
|
||||
|
||||
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
|
||||
@ -508,20 +478,6 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
|
||||
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
|
||||
```
|
||||
|
||||
#### Custom RGBA Background Color
|
||||
|
||||
To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`:
|
||||
|
||||
```bash
|
||||
# Example: Black background for dark theme
|
||||
vllm serve llava-hf/llava-1.5-7b-hf \
|
||||
--media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}'
|
||||
|
||||
# Example: Custom gray background
|
||||
vllm serve llava-hf/llava-1.5-7b-hf \
|
||||
--media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}'
|
||||
```
|
||||
|
||||
### Audio Inputs
|
||||
|
||||
Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
|
||||
|
||||
@ -15,10 +15,6 @@ Speculative decoding is a technique which improves inter-token latency in memory
|
||||
|
||||
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
|
||||
|
||||
!!! warning
|
||||
In vllm v0.10.0, speculative decoding with a draft model is not supported.
|
||||
If you use the following code, you will get a `NotImplementedError`.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
|
||||
@ -103,7 +103,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
|
||||
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
|
||||
}
|
||||
],
|
||||
response_format={
|
||||
"response_format": {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "car-description",
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS.
|
||||
vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS.
|
||||
|
||||
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
|
||||
|
||||
@ -23,20 +23,20 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source.
|
||||
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
uv pip install -r requirements/cpu.txt
|
||||
uv pip install -e .
|
||||
pip install -r requirements/cpu.txt
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
!!! note
|
||||
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.
|
||||
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
|
||||
|
||||
!!! example "Troubleshooting"
|
||||
If the build fails with errors like the following where standard C++ headers cannot be found, try to remove and reinstall your
|
||||
If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
|
||||
[Command Line Tools for Xcode](https://developer.apple.com/download/all/).
|
||||
|
||||
```text
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
|
||||
First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
|
||||
|
||||
```bash
|
||||
sudo apt-get update -y
|
||||
@ -6,34 +6,28 @@ sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certific
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
```
|
||||
|
||||
Second, clone the vLLM project:
|
||||
Second, clone vLLM project:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git vllm_source
|
||||
cd vllm_source
|
||||
```
|
||||
|
||||
Third, install required dependencies:
|
||||
Third, install Python packages for vLLM CPU backend building:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/cpu-build.txt --torch-backend auto
|
||||
uv pip install -r requirements/cpu.txt --torch-backend auto
|
||||
pip install --upgrade pip
|
||||
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
pip install --upgrade pip
|
||||
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
Finally, build and install vLLM:
|
||||
Finally, build and install vLLM CPU backend:
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py install
|
||||
```
|
||||
|
||||
If you want to develop vLLM, install it in editable mode instead.
|
||||
If you want to develop vllm, install it in editable mode instead.
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py develop
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
|
||||
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform.
|
||||
|
||||
Currently the CPU implementation for s390x architecture supports FP32 datatype only.
|
||||
|
||||
@ -40,32 +40,21 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \
|
||||
. "$HOME/.cargo/env"
|
||||
```
|
||||
|
||||
Execute the following commands to build and install vLLM from source.
|
||||
Execute the following commands to build and install vLLM from the source.
|
||||
|
||||
!!! tip
|
||||
Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.
|
||||
Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM.
|
||||
|
||||
```bash
|
||||
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
|
||||
uv pip install -v \
|
||||
--torch-backend auto \
|
||||
pip install -v \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
||||
-r requirements-build.txt \
|
||||
-r requirements-cpu.txt \
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
|
||||
uv pip install dist/*.whl
|
||||
pip install dist/*.whl
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
|
||||
pip install -v \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
||||
-r requirements-build.txt \
|
||||
-r requirements-cpu.txt \
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
|
||||
pip install dist/*.whl
|
||||
```
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
@ -74,19 +63,19 @@ Execute the following commands to build and install vLLM from source.
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.s390x \
|
||||
--tag vllm-cpu-env .
|
||||
--tag vllm-cpu-env .
|
||||
|
||||
# Launch OpenAI server
|
||||
# Launching OpenAI server
|
||||
docker run --rm \
|
||||
--privileged true \
|
||||
--shm-size 4g \
|
||||
-p 8000:8000 \
|
||||
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
|
||||
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
|
||||
vllm-cpu-env \
|
||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||
--dtype float \
|
||||
other vLLM OpenAI server arguments
|
||||
--privileged=true \
|
||||
--shm-size=4g \
|
||||
-p 8000:8000 \
|
||||
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
|
||||
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
|
||||
vllm-cpu-env \
|
||||
--model=meta-llama/Llama-3.2-1B-Instruct \
|
||||
--dtype=float \
|
||||
other vLLM OpenAI server arguments
|
||||
```
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
|
||||
@ -713,7 +713,6 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
|
||||
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ |
|
||||
|
||||
### Pooling Models
|
||||
|
||||
|
||||
@ -423,6 +423,51 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# SmolVLM2-2.2B-Instruct
|
||||
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
enforce_eager=True,
|
||||
mm_processor_kwargs={
|
||||
"max_image_size": {"longest_edge": 384},
|
||||
},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
prompts = [
|
||||
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# omni-research/Tarsier-7b
|
||||
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "omni-research/Tarsier-7b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Intern-S1
|
||||
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "internlm/Intern-S1"
|
||||
@ -496,6 +541,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Nemontron_VL
|
||||
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
assert modality == "image"
|
||||
placeholder = "<image>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
messages = [
|
||||
[{"role": "user", "content": f"{placeholder}\n{question}"}]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
# Stop tokens for InternVL
|
||||
# models variants may have different stop tokens
|
||||
# please refer to the model card for the correct "stop words":
|
||||
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# Keye-VL
|
||||
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "Kwai-Keye/Keye-VL-8B-Preview"
|
||||
@ -551,41 +634,6 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
tensor_parallel_size=8,
|
||||
gpu_memory_utilization=0.4,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
|
||||
}
|
||||
]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=False
|
||||
)
|
||||
stop_token_ids = None
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# LLaVA-1.5
|
||||
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -828,6 +876,41 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
tensor_parallel_size=8,
|
||||
gpu_memory_utilization=0.4,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
|
||||
}
|
||||
]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=False
|
||||
)
|
||||
stop_token_ids = None
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# Molmo
|
||||
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -853,44 +936,6 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Nemontron_VL
|
||||
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
assert modality == "image"
|
||||
placeholder = "<image>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
messages = [
|
||||
[{"role": "user", "content": f"{placeholder}\n{question}"}]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
# Stop tokens for InternVL
|
||||
# models variants may have different stop tokens
|
||||
# please refer to the model card for the correct "stop words":
|
||||
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# NVLM-D
|
||||
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1248,6 +1293,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
|
||||
)
|
||||
|
||||
|
||||
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "omni-research/Tarsier2-Recap-7b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|image_pad|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|video_pad|>"
|
||||
|
||||
prompts = [
|
||||
(
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# SkyworkR1V
|
||||
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1281,111 +1357,6 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# SmolVLM2-2.2B-Instruct
|
||||
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
enforce_eager=True,
|
||||
mm_processor_kwargs={
|
||||
"max_image_size": {"longest_edge": 384},
|
||||
},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
prompts = [
|
||||
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Step3
|
||||
def run_step3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "stepfun-ai/step3-fp8"
|
||||
|
||||
# NOTE: Below are verified configurations for step3-fp8
|
||||
# on 8xH100 GPUs.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_num_batched_tokens=4096,
|
||||
gpu_memory_utilization=0.85,
|
||||
tensor_parallel_size=8,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
reasoning_parser="step3",
|
||||
)
|
||||
|
||||
prompts = [
|
||||
"<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
|
||||
f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# omni-research/Tarsier-7b
|
||||
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "omni-research/Tarsier-7b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "omni-research/Tarsier2-Recap-7b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|image_pad|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|video_pad|>"
|
||||
|
||||
prompts = [
|
||||
(
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"aria": run_aria,
|
||||
"aya_vision": run_aya_vision,
|
||||
@ -1402,9 +1373,9 @@ model_example_map = {
|
||||
"idefics3": run_idefics3,
|
||||
"interns1": run_interns1,
|
||||
"internvl_chat": run_internvl,
|
||||
"nemotron_vl": run_nemotron_vl,
|
||||
"keye_vl": run_keye_vl,
|
||||
"kimi_vl": run_kimi_vl,
|
||||
"llama4": run_llama4,
|
||||
"llava": run_llava,
|
||||
"llava-next": run_llava_next,
|
||||
"llava-next-video": run_llava_next_video,
|
||||
@ -1414,8 +1385,8 @@ model_example_map = {
|
||||
"minicpmv": run_minicpmv,
|
||||
"mistral3": run_mistral3,
|
||||
"mllama": run_mllama,
|
||||
"llama4": run_llama4,
|
||||
"molmo": run_molmo,
|
||||
"nemotron_vl": run_nemotron_vl,
|
||||
"NVLM_D": run_nvlm_d,
|
||||
"ovis": run_ovis,
|
||||
"paligemma": run_paligemma,
|
||||
@ -1430,7 +1401,6 @@ model_example_map = {
|
||||
"qwen2_5_omni": run_qwen2_5_omni,
|
||||
"skywork_chat": run_skyworkr1v,
|
||||
"smolvlm": run_smolvlm,
|
||||
"step3": run_step3,
|
||||
"tarsier": run_tarsier,
|
||||
"tarsier2": run_tarsier2,
|
||||
}
|
||||
|
||||
@ -197,53 +197,6 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_hyperclovax_seed_vision(
|
||||
question: str, image_urls: list[str]
|
||||
) -> ModelRequestData:
|
||||
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=16384,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
message = {"role": "user", "content": list()}
|
||||
for _image_url in image_urls:
|
||||
message["content"].append(
|
||||
{
|
||||
"type": "image",
|
||||
"image": _image_url,
|
||||
"ocr": "",
|
||||
"lens_keywords": "",
|
||||
"lens_local_keywords": "",
|
||||
}
|
||||
)
|
||||
message["content"].append(
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
}
|
||||
)
|
||||
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[
|
||||
message,
|
||||
],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||
|
||||
@ -272,6 +225,34 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=16,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
mm_processor_kwargs={
|
||||
"max_image_size": {"longest_edge": 384},
|
||||
},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(
|
||||
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
|
||||
)
|
||||
prompt = (
|
||||
f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
)
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "internlm/Intern-S1"
|
||||
|
||||
@ -335,36 +316,49 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
def load_hyperclovax_seed_vision(
|
||||
question: str, image_urls: list[str]
|
||||
) -> ModelRequestData:
|
||||
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=131072,
|
||||
tensor_parallel_size=8,
|
||||
trust_remote_code=True,
|
||||
max_model_len=16384,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
message = {"role": "user", "content": list()}
|
||||
for _image_url in image_urls:
|
||||
message["content"].append(
|
||||
{
|
||||
"type": "image",
|
||||
"image": _image_url,
|
||||
"ocr": "",
|
||||
"lens_keywords": "",
|
||||
"lens_local_keywords": "",
|
||||
}
|
||||
)
|
||||
message["content"].append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
"type": "text",
|
||||
"text": question,
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[
|
||||
message,
|
||||
],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
@ -469,6 +463,40 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
|
||||
)
|
||||
|
||||
|
||||
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=131072,
|
||||
tensor_parallel_size=8,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "Kwai-Keye/Keye-VL-8B-Preview"
|
||||
|
||||
@ -926,62 +954,6 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for SmolVLM2-2.2B-Instruct.

    Args:
        question: Text of the user turn, placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each image is fetched for the request.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the hand-built
        prompt string, and the fetched images.
    """
    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    processor_kwargs = {
        "max_image_size": {"longest_edge": 384},
    }
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs=processor_kwargs,
    )

    # Placeholders are numbered from 1; each already ends in a newline and
    # they are additionally joined with a newline.
    image_tags = [
        f"Image-{i}: <image>\n" for i in range(1, len(image_urls) + 1)
    ]
    placeholders = "\n".join(image_tags)
    prompt = (
        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
|
||||
|
||||
|
||||
def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for step3-fp8.

    Args:
        question: Text of the user turn, placed after the image patches.
        image_urls: URLs of the images; one ``<im_patch>`` marker is emitted
            per URL and each image is fetched for the request.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the hand-built
        prompt string, and the fetched images.
    """
    model_name = "stepfun-ai/step3-fp8"

    # NOTE: Below are verified configurations for step3-fp8
    # on 8xH100 GPUs.
    engine_kwargs = {
        "model": model_name,
        "max_num_batched_tokens": 4096,
        "gpu_memory_utilization": 0.85,
        "tensor_parallel_size": 8,
        "limit_mm_per_prompt": {"image": len(image_urls)},
        "reasoning_parser": "step3",
    }
    engine_args = EngineArgs(**engine_kwargs)

    # The prompt is assembled by hand rather than through a chat template.
    prompt = (
        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
        f"{'<im_patch>' * len(image_urls)}{question} <|EOT|><|BOT|"
        ">assistant\n<think>\n"
    )

    image_data = []
    for url in image_urls:
        image_data.append(fetch_image(url))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
|
||||
|
||||
|
||||
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "omni-research/Tarsier-7b"
|
||||
|
||||
@ -1034,16 +1006,16 @@ model_example_map = {
|
||||
"deepseek_vl_v2": load_deepseek_vl2,
|
||||
"gemma3": load_gemma3,
|
||||
"h2ovl_chat": load_h2ovl,
|
||||
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
|
||||
"idefics3": load_idefics3,
|
||||
"interns1": load_interns1,
|
||||
"internvl_chat": load_internvl,
|
||||
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
|
||||
"keye_vl": load_keye_vl,
|
||||
"kimi_vl": load_kimi_vl,
|
||||
"llama4": load_llama4,
|
||||
"llava": load_llava,
|
||||
"llava-next": load_llava_next,
|
||||
"llava-onevision": load_llava_onevision,
|
||||
"llama4": load_llama4,
|
||||
"mistral3": load_mistral3,
|
||||
"mllama": load_mllama,
|
||||
"NVLM_D": load_nvlm_d,
|
||||
@ -1056,7 +1028,6 @@ model_example_map = {
|
||||
"qwen2_vl": load_qwen2_vl,
|
||||
"qwen2_5_vl": load_qwen2_5_vl,
|
||||
"smolvlm": load_smolvlm,
|
||||
"step3": load_step3,
|
||||
"tarsier": load_tarsier,
|
||||
"tarsier2": load_tarsier2,
|
||||
}
|
||||
|
||||
@ -293,8 +293,6 @@ class Proxy:
|
||||
# add params to request
|
||||
kv_prepare_request = request.copy()
|
||||
kv_prepare_request["max_tokens"] = 1
|
||||
if "max_completion_tokens" in kv_prepare_request:
|
||||
kv_prepare_request["max_completion_tokens"] = 1
|
||||
|
||||
# prefill stage
|
||||
prefill_instance = self.schedule(self.prefill_cycler)
|
||||
|
||||
@ -128,8 +128,6 @@ async def handle_request():
|
||||
prefill_request = original_request_data.copy()
|
||||
# change max_tokens = 1 to let it only do prefill
|
||||
prefill_request["max_tokens"] = 1
|
||||
if "max_completion_tokens" in prefill_request:
|
||||
prefill_request["max_completion_tokens"] = 1
|
||||
|
||||
global count
|
||||
global prefill_instances
|
||||
|
||||
@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
||||
protobuf # Required by LlamaTokenizer.
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
aiohttp
|
||||
openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support)
|
||||
openai >= 1.87.0, <= 1.90.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support)
|
||||
pydantic >= 2.10
|
||||
prometheus_client >= 0.18.0
|
||||
pillow # Required for image processing
|
||||
|
||||
@ -11,4 +11,6 @@ torchaudio==2.7.1
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
|
||||
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
|
||||
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer_python==0.2.9rc2
|
||||
@ -22,9 +22,7 @@ aiohttp==3.10.11
|
||||
aiohttp-cors==0.8.1
|
||||
# via ray
|
||||
aiosignal==1.3.1
|
||||
# via
|
||||
# aiohttp
|
||||
# ray
|
||||
# via aiohttp
|
||||
albucore==0.0.16
|
||||
# via terratorch
|
||||
albumentations==1.4.6
|
||||
@ -139,7 +137,7 @@ contourpy==1.3.0
|
||||
# via matplotlib
|
||||
cramjam==2.9.0
|
||||
# via fastparquet
|
||||
cupy-cuda12x==13.3.0
|
||||
cupy-cuda12x==13.5.1
|
||||
# via ray
|
||||
cycler==0.12.1
|
||||
# via matplotlib
|
||||
@ -226,7 +224,6 @@ frozenlist==1.5.0
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
# ray
|
||||
fsspec==2024.9.0
|
||||
# via
|
||||
# datasets
|
||||
@ -603,10 +600,18 @@ opencv-python-headless==4.11.0.86
|
||||
opentelemetry-api==1.35.0
|
||||
# via
|
||||
# mlflow-skinny
|
||||
# opentelemetry-exporter-prometheus
|
||||
# opentelemetry-sdk
|
||||
# opentelemetry-semantic-conventions
|
||||
opentelemetry-exporter-prometheus==0.56b0
|
||||
# via ray
|
||||
opentelemetry-proto==1.36.0
|
||||
# via ray
|
||||
opentelemetry-sdk==1.35.0
|
||||
# via mlflow-skinny
|
||||
# via
|
||||
# mlflow-skinny
|
||||
# opentelemetry-exporter-prometheus
|
||||
# ray
|
||||
opentelemetry-semantic-conventions==0.56b0
|
||||
# via opentelemetry-sdk
|
||||
packaging==24.2
|
||||
@ -697,7 +702,9 @@ pqdm==0.2.0
|
||||
pretrainedmodels==0.7.4
|
||||
# via segmentation-models-pytorch
|
||||
prometheus-client==0.22.0
|
||||
# via ray
|
||||
# via
|
||||
# opentelemetry-exporter-prometheus
|
||||
# ray
|
||||
propcache==0.2.0
|
||||
# via yarl
|
||||
proto-plus==1.26.1
|
||||
@ -707,6 +714,7 @@ protobuf==5.28.3
|
||||
# google-api-core
|
||||
# googleapis-common-protos
|
||||
# mlflow-skinny
|
||||
# opentelemetry-proto
|
||||
# proto-plus
|
||||
# ray
|
||||
# tensorboardx
|
||||
@ -854,7 +862,7 @@ rasterio==1.4.3
|
||||
# rioxarray
|
||||
# terratorch
|
||||
# torchgeo
|
||||
ray==2.43.0
|
||||
ray==2.48.0
|
||||
# via -r requirements/test.in
|
||||
redis==5.2.0
|
||||
# via tensorizer
|
||||
|
||||
186
setup.py
186
setup.py
@ -7,6 +7,7 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@ -281,10 +282,69 @@ class cmake_build_ext(build_ext):
|
||||
self.copy_file(file, dst_file)
|
||||
|
||||
|
||||
class repackage_wheel(build_ext):
|
||||
class precompiled_wheel_utils:
|
||||
"""Extracts libraries and other files from an existing wheel."""
|
||||
|
||||
def get_base_commit_in_main_branch(self) -> str:
|
||||
@staticmethod
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
    """Copy precompiled artifacts out of a vLLM wheel into the source tree.

    Args:
        wheel_url_or_path: Path of an existing wheel file, or otherwise a
            URL that is downloaded into a temporary directory first.

    Returns:
        A mapping from dotted package name to the list of file names that
        were extracted for that package, suitable for merging into
        ``package_data``.

    The selected members are the listed ``.so`` libraries plus every
    ``.py`` file matched by the ``vllm/vllm_flash_attn`` pattern; each is
    written below the current working directory. A temporary download
    directory, if one was created, is removed on exit.
    """
    import tempfile
    import zipfile

    temp_dir = None
    try:
        if os.path.isfile(wheel_url_or_path):
            wheel_path = wheel_url_or_path
            print(f"Using existing wheel at {wheel_path}")
        else:
            # Not a local file: treat it as a URL whose last path component
            # is the wheel's file name.
            wheel_filename = wheel_url_or_path.split("/")[-1]
            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
            wheel_path = os.path.join(temp_dir, wheel_filename)
            print(f"Downloading wheel from {wheel_url_or_path} "
                  f"to {wheel_path}")
            from urllib.request import urlretrieve
            urlretrieve(wheel_url_or_path, filename=wheel_path)

        package_data_patch = {}

        with zipfile.ZipFile(wheel_path) as wheel:
            files_to_copy = [
                "vllm/_C.abi3.so",
                "vllm/_moe_C.abi3.so",
                "vllm/_flashmla_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                "vllm/cumem_allocator.abi3.so",
            ]
            compiled_regex = re.compile(
                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")

            # Explicitly listed libraries first, then the pattern matches,
            # each group in archive order.
            listed = [
                info for info in wheel.filelist
                if info.filename in files_to_copy
            ]
            matched = [
                info for info in wheel.filelist
                if compiled_regex.match(info.filename)
            ]

            for file in listed + matched:
                print(f"[extract] {file.filename}")
                target_path = os.path.join(".", file.filename)
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                with wheel.open(file.filename) as src:
                    with open(target_path, "wb") as dst:
                        shutil.copyfileobj(src, dst)

                package_dir, base_name = os.path.split(file.filename)
                pkg = package_dir.replace("/", ".")
                package_data_patch.setdefault(pkg, []).append(base_name)

        return package_data_patch
    finally:
        if temp_dir is not None:
            print(f"Removing temporary directory {temp_dir}")
            shutil.rmtree(temp_dir)
|
||||
|
||||
@staticmethod
|
||||
def get_base_commit_in_main_branch() -> str:
|
||||
# Force to use the nightly wheel. This is mainly used for CI testing.
|
||||
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
|
||||
return "nightly"
|
||||
@ -297,6 +357,10 @@ class repackage_wheel(build_ext):
|
||||
]).decode("utf-8")
|
||||
upstream_main_commit = json.loads(resp_json)["sha"]
|
||||
|
||||
# In Docker build context, .git may be immutable or missing.
|
||||
if envs.VLLM_DOCKER_BUILD_CONTEXT:
|
||||
return upstream_main_commit
|
||||
|
||||
# Check if the upstream_main_commit exists in the local repo
|
||||
try:
|
||||
subprocess.check_output(
|
||||
@ -329,92 +393,15 @@ class repackage_wheel(build_ext):
|
||||
"wheel may not be compatible with your dev branch: %s", err)
|
||||
return "nightly"
|
||||
|
||||
def run(self) -> None:
|
||||
assert _is_cuda(
|
||||
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
|
||||
|
||||
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
|
||||
if wheel_location is None:
|
||||
base_commit = self.get_base_commit_in_main_branch()
|
||||
wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
# Fallback to nightly wheel if latest commit wheel is unavailable,
|
||||
# in this rare case, the nightly release CI hasn't finished on main.
|
||||
if not is_url_available(wheel_location):
|
||||
wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
|
||||
import zipfile
|
||||
|
||||
if os.path.isfile(wheel_location):
|
||||
wheel_path = wheel_location
|
||||
print(f"Using existing wheel={wheel_path}")
|
||||
else:
|
||||
# Download the wheel from a given URL, assume
|
||||
# the filename is the last part of the URL
|
||||
wheel_filename = wheel_location.split("/")[-1]
|
||||
|
||||
import tempfile
|
||||
|
||||
# create a temporary directory to store the wheel
|
||||
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
|
||||
wheel_path = os.path.join(temp_dir, wheel_filename)
|
||||
|
||||
print(f"Downloading wheel from {wheel_location} to {wheel_path}")
|
||||
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
try:
|
||||
urlretrieve(wheel_location, filename=wheel_path)
|
||||
except Exception as e:
|
||||
from setuptools.errors import SetupError
|
||||
|
||||
raise SetupError(
|
||||
f"Failed to get vLLM wheel from {wheel_location}") from e
|
||||
|
||||
with zipfile.ZipFile(wheel_path) as wheel:
|
||||
files_to_copy = [
|
||||
"vllm/_C.abi3.so",
|
||||
"vllm/_moe_C.abi3.so",
|
||||
"vllm/_flashmla_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
# "vllm/_version.py", # not available in nightly wheels yet
|
||||
]
|
||||
|
||||
file_members = list(
|
||||
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
|
||||
|
||||
# vllm_flash_attn python code:
|
||||
# Regex from
|
||||
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
|
||||
compiled_regex = re.compile(
|
||||
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
|
||||
file_members += list(
|
||||
filter(lambda x: compiled_regex.match(x.filename),
|
||||
wheel.filelist))
|
||||
|
||||
for file in file_members:
|
||||
print(f"Extracting and including {file.filename} "
|
||||
"from existing wheel")
|
||||
package_name = os.path.dirname(file.filename).replace("/", ".")
|
||||
file_name = os.path.basename(file.filename)
|
||||
|
||||
if package_name not in package_data:
|
||||
package_data[package_name] = []
|
||||
|
||||
wheel.extract(file)
|
||||
if file_name.endswith(".py"):
|
||||
# python files shouldn't be added to package_data
|
||||
continue
|
||||
|
||||
package_data[package_name].append(file_name)
|
||||
|
||||
|
||||
def _no_device() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is set to ``"empty"``."""
    target = VLLM_TARGET_DEVICE
    return target == "empty"
|
||||
|
||||
|
||||
def _is_cuda() -> bool:
    """Report whether this build targets CUDA.

    Returns True unconditionally when both ``VLLM_USE_PRECOMPILED`` and
    ``VLLM_DOCKER_BUILD_CONTEXT`` are set. Otherwise the target device must
    be ``"cuda"``, torch must report a CUDA version, and neither the Neuron
    nor the TPU check may succeed.
    """
    # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
    forced = envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT
    if forced:
        return True

    cuda_runtime_present = torch.version.cuda is not None
    if VLLM_TARGET_DEVICE != "cuda" or not cuda_runtime_present:
        return False
    return not _is_neuron() and not _is_tpu()
|
||||
@ -639,16 +626,37 @@ package_data = {
|
||||
]
|
||||
}
|
||||
|
||||
# If using precompiled, extract and patch package_data (in advance of setup)
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
|
||||
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
|
||||
if wheel_location is not None:
|
||||
wheel_url = wheel_location
|
||||
else:
|
||||
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
|
||||
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
from urllib.request import urlopen
|
||||
try:
|
||||
with urlopen(wheel_url) as resp:
|
||||
if resp.status != 200:
|
||||
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
except Exception as e:
|
||||
print(f"[warn] Falling back to nightly wheel: {e}")
|
||||
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
|
||||
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
|
||||
wheel_url)
|
||||
for pkg, files in patch.items():
|
||||
package_data.setdefault(pkg, []).extend(files)
|
||||
|
||||
if _no_device():
|
||||
ext_modules = []
|
||||
|
||||
if not ext_modules:
|
||||
if not ext_modules or envs.VLLM_USE_PRECOMPILED:
|
||||
# Disable build_ext when using precompiled wheel
|
||||
cmdclass = {}
|
||||
else:
|
||||
cmdclass = {
|
||||
"build_ext":
|
||||
repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
|
||||
}
|
||||
cmdclass = {"build_ext": cmake_build_ext}
|
||||
|
||||
setup(
|
||||
# static metadata should rather go in pyproject.toml
|
||||
@ -663,9 +671,7 @@ setup(
|
||||
["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"],
|
||||
"audio": ["librosa", "soundfile",
|
||||
"mistral_common[audio]"], # Required for audio processing
|
||||
"video": [], # Kept for backwards compatibility
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
"flashinfer": ["flashinfer-python==0.2.9rc2"],
|
||||
"video": [] # Kept for backwards compatibility
|
||||
},
|
||||
cmdclass=cmdclass,
|
||||
package_data=package_data,
|
||||
|
||||
@ -136,15 +136,12 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"test_model",
|
||||
[
|
||||
TestAllReduceRMSNormModel,
|
||||
TestAllReduceFusedAddRMSNormModel,
|
||||
TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
|
||||
# TODO: Enable with torch==2.8.0
|
||||
# TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
|
||||
])
|
||||
@pytest.mark.parametrize("test_model", [
|
||||
TestAllReduceRMSNormModel,
|
||||
TestAllReduceFusedAddRMSNormModel,
|
||||
TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
|
||||
TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize("seq_len", [8])
|
||||
@pytest.mark.parametrize("hidden_size", [16])
|
||||
|
||||
@ -28,6 +28,7 @@ def test_mp_reducer(monkeypatch):
|
||||
max_model_len=32,
|
||||
gpu_memory_utilization=0.1,
|
||||
disable_log_stats=True,
|
||||
disable_log_requests=True,
|
||||
)
|
||||
|
||||
async_llm = AsyncLLM.from_engine_args(
|
||||
|
||||
@ -5,7 +5,7 @@ import json
|
||||
from argparse import ArgumentError
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Annotated, Literal, Optional, Union
|
||||
from typing import Annotated, Literal, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
@ -136,8 +136,6 @@ class DummyConfig:
|
||||
"""List with variable length"""
|
||||
list_literal: list[Literal[1, 2]] = field(default_factory=list)
|
||||
"""List with literal choices"""
|
||||
list_union: list[Union[str, type[object]]] = field(default_factory=list)
|
||||
"""List with union type"""
|
||||
literal_literal: Literal[Literal[1], Literal[2]] = 1
|
||||
"""Literal of literals with default 1"""
|
||||
json_tip: dict = field(default_factory=dict)
|
||||
@ -189,9 +187,6 @@ def test_get_kwargs():
|
||||
assert kwargs["list_literal"]["type"] is int
|
||||
assert kwargs["list_literal"]["nargs"] == "+"
|
||||
assert kwargs["list_literal"]["choices"] == [1, 2]
|
||||
# lists with unions should become str type.
|
||||
# If not, we cannot know which type to use for parsing
|
||||
assert kwargs["list_union"]["type"] is str
|
||||
# literals of literals should have merged choices
|
||||
assert kwargs["literal_literal"]["choices"] == [1, 2]
|
||||
# dict should have json tip in help
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.utils.deep_gemm import per_block_cast_to_fp8
|
||||
|
||||
|
||||
def per_token_cast_to_fp8(
|
||||
@ -20,6 +20,29 @@ def per_token_cast_to_fp8(
|
||||
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
|
||||
|
||||
|
||||
def per_block_cast_to_fp8(
|
||||
x: torch.Tensor, block_size_k: int,
|
||||
block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
assert x.dim() == 2
|
||||
m, n = x.shape
|
||||
x_padded = torch.zeros(
|
||||
(
|
||||
int(math.ceil(m / block_size_k)) * block_size_k,
|
||||
int(math.ceil(n / block_size_n)) * block_size_n,
|
||||
),
|
||||
dtype=x.dtype,
|
||||
device=x.device,
|
||||
)
|
||||
x_padded[:m, :n] = x
|
||||
x_view = x_padded.view(-1, block_size_k,
|
||||
x_padded.size(1) // block_size_k, block_size_n)
|
||||
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
|
||||
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
|
||||
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
|
||||
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
|
||||
return x_scaled_sub, scales
|
||||
|
||||
|
||||
def make_non_quant_weights(
|
||||
e: int,
|
||||
n: int,
|
||||
@ -76,9 +99,11 @@ def make_block_quant_fp8_weights(
|
||||
|
||||
for i in range(e):
|
||||
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
|
||||
block_size=[block_k, block_n])
|
||||
block_size_k=block_k,
|
||||
block_size_n=block_n)
|
||||
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
|
||||
block_size=[block_k, block_n])
|
||||
block_size_k=block_k,
|
||||
block_size_n=block_n)
|
||||
|
||||
return w1, w2, w1_s, w2_s
|
||||
|
||||
|
||||
@ -1,103 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
||||
BatchedDeepGemmExperts)
|
||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||
BatchedPrepareAndFinalize, BatchedTritonExperts)
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
from vllm.utils.deep_gemm import calc_diff, is_deep_gemm_supported
|
||||
|
||||
from .test_deepgemm import make_block_quant_fp8_weights
|
||||
|
||||
BLOCK_SIZE = [128, 128]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_deep_gemm_supported(),
                    reason="Requires deep_gemm kernels")
@pytest.mark.parametrize("E", [16, 32])  # number of experts
@pytest.mark.parametrize("T", [256, 512])  # tokens per expert
@pytest.mark.parametrize("K", [128, 256])  # hidden dim
@pytest.mark.parametrize("N", [512, 1024])  # intermediate dim per expert
@pytest.mark.parametrize("topk", [2, 4])
def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
                                    monkeypatch):
    """Check that batched DeepGEMM experts agree with batched Triton experts.

    Both expert implementations are wrapped in a ``FusedMoEModularKernel``
    that shares one ``BatchedPrepareAndFinalize`` and are called with the
    same block-quantized FP8 weights, activations and routing. The test
    passes when ``calc_diff`` of the two outputs is below 1e-3.
    """
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")

    device = "cuda"
    w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(E, N, K, BLOCK_SIZE)

    # Activations for E * T tokens, scaled down and clamped to the FP8 range.
    total_tokens = E * T
    hidden = torch.randn(
        total_tokens, K, device=device, dtype=torch.bfloat16) / 10.0
    fp8_limits = torch.finfo(torch.float8_e4m3fn)
    hidden.clamp_(fp8_limits.min, fp8_limits.max)

    # Random router outputs give the top-k expert ids and softmaxed weights.
    logits = torch.randn(total_tokens, E, device=device, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(logits, k=topk, dim=-1)
    topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)

    # Size the per-expert batch to the next power of two at or above the
    # busiest expert's token count.
    tokens_per_expert = torch.bincount(topk_ids.flatten(), minlength=E)
    busiest = int(tokens_per_expert.max().item())
    max_num_tokens = 1 << (busiest - 1).bit_length()

    prep_finalize = BatchedPrepareAndFinalize(
        max_num_tokens=max_num_tokens,
        num_local_experts=E,
        num_dispatchers=1,
        rank=0,
    )

    # Identical call arguments for both kernels.
    moe_inputs = {
        "hidden_states": hidden,
        "w1": w1,
        "w2": w2,
        "topk_weights": topk_weights,
        "topk_ids": topk_ids,
        "inplace": False,
        "w1_scale": w1_s,
        "w2_scale": w2_s,
        "global_num_experts": E,
    }

    # triton (reference)
    triton_experts = BatchedTritonExperts(
        max_num_tokens=max_num_tokens,
        num_dispatchers=1,
        use_fp8_w8a8=True,
        per_act_token_quant=False,
        block_shape=BLOCK_SIZE,
    )
    mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts)
    out_triton = mk_triton(**moe_inputs)

    # deepgemm
    deepgemm_experts = BatchedDeepGemmExperts(
        max_num_tokens=max_num_tokens,
        num_dispatchers=1,
        block_shape=BLOCK_SIZE,
        per_act_token_quant=False,
    )
    mk_deepgemm = FusedMoEModularKernel(prep_finalize, deepgemm_experts)
    out_deepgemm = mk_deepgemm(**moe_inputs)

    diff = calc_diff(out_deepgemm, out_triton)
    assert diff < 1e-3, f"Output diff too large: {diff}"
|
||||
@ -12,8 +12,10 @@ import torch
|
||||
from tests.kernels.utils import baseline_scaled_mm
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import cdiv
|
||||
from vllm.utils.deep_gemm import per_block_cast_to_fp8
|
||||
|
||||
|
||||
def cdiv(a, b):
    """Return ``a`` divided by ``b`` rounded up, via ``(a + b - 1) // b``."""
    bumped = a + b - 1
    return bumped // b
|
||||
|
||||
|
||||
def per_token_cast_to_fp8(
|
||||
@ -30,6 +32,21 @@ def per_token_cast_to_fp8(
|
||||
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
|
||||
|
||||
|
||||
def per_block_cast_to_fp8(
        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize a 2-D tensor to float8_e4m3fn using 128x128 blocks.

    The input is zero-padded up to multiples of 128 in both dimensions,
    each 128x128 block is scaled so its absolute maximum maps to 448.0, and
    the result is cast to ``torch.float8_e4m3fn``.

    Returns:
        ``(x_fp8, scales)``: the quantized tensor cropped back to the shape
        of ``x``, and one scale (block amax / 448.0) per block.
    """
    assert x.dim() == 2
    rows, cols = x.shape

    padded_shape = (cdiv(rows, 128) * 128, cdiv(cols, 128) * 128)
    x_padded = torch.zeros(padded_shape, device=x.device, dtype=x.dtype)
    x_padded[:rows, :cols] = x

    blocks = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
    # Clamp so an all-zero block does not produce a division by zero.
    block_amax = blocks.abs().float().amax(dim=(1, 3),
                                           keepdim=True).clamp(1e-4)
    quantized = (blocks * (448.0 / block_amax)).to(dtype=torch.float8_e4m3fn)

    cropped = quantized.view_as(x_padded)[:rows, :cols].contiguous()
    scales = (block_amax / 448.0).view(blocks.size(0), blocks.size(2))
    return cropped, scales
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
|
||||
(4, 8192, 7168, 4096),
|
||||
(4, 8192, 2048, 7168),
|
||||
|
||||
@ -20,8 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_ep, has_deep_gemm
|
||||
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
|
||||
is_deep_gemm_supported)
|
||||
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
|
||||
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
from .utils import make_test_weights
|
||||
@ -47,7 +46,7 @@ requires_deep_ep = pytest.mark.skipif(
|
||||
)
|
||||
|
||||
requires_deep_gemm = pytest.mark.skipif(
|
||||
not is_deep_gemm_supported(),
|
||||
not has_deep_gemm(),
|
||||
reason="Requires deep_gemm kernels",
|
||||
)
|
||||
|
||||
|
||||
@ -15,11 +15,16 @@ import torch
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8)
|
||||
from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
|
||||
per_block_cast_to_fp8)
|
||||
from vllm.utils import has_deep_gemm
|
||||
from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
|
||||
|
||||
BLOCK_SIZE = [128, 128]
|
||||
|
||||
requires_deep_gemm = pytest.mark.skipif(
|
||||
not has_deep_gemm(),
|
||||
reason="Requires deep_gemm kernels",
|
||||
)
|
||||
|
||||
|
||||
def make_block_quant_fp8_weights(
|
||||
e: int,
|
||||
@ -64,12 +69,8 @@ def make_block_quant_fp8_weights(
|
||||
dtype=torch.float32)
|
||||
|
||||
for i in range(e):
|
||||
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
|
||||
block_size=block_size,
|
||||
use_ue8m0=True)
|
||||
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
|
||||
block_size=block_size,
|
||||
use_ue8m0=True)
|
||||
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
|
||||
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
|
||||
|
||||
return w1, w2, w1_s, w2_s
|
||||
|
||||
@ -147,8 +148,7 @@ NUM_EXPERTS = [32]
|
||||
@pytest.mark.parametrize("mnk", MNKs)
|
||||
@pytest.mark.parametrize("topk", TOPKS)
|
||||
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
|
||||
@pytest.mark.skipif(not is_deep_gemm_supported(),
|
||||
reason="Requires deep_gemm kernels")
|
||||
@requires_deep_gemm
|
||||
def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch):
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
|
||||
@ -5,7 +5,8 @@ from typing import Optional
|
||||
import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from tests.kernels.quant_utils import per_block_cast_to_int8
|
||||
from tests.kernels.quant_utils import (per_block_cast_to_fp8,
|
||||
per_block_cast_to_int8)
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||
BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
|
||||
@ -14,7 +15,6 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
from vllm.model_executor.layers.fused_moe.utils import (
|
||||
moe_kernel_quantize_input)
|
||||
from vllm.utils import round_up
|
||||
from vllm.utils.deep_gemm import per_block_cast_to_fp8
|
||||
|
||||
|
||||
def triton_moe(
|
||||
|
||||
@ -222,6 +222,25 @@ def native_per_token_group_quant_int8(x,
|
||||
DEFAULT_BLOCK_SHAPE = [128, 128]
|
||||
|
||||
|
||||
def per_block_cast_to_fp8(
    x: torch.Tensor,
    block_shape: list[int] = DEFAULT_BLOCK_SHAPE,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize a 2-D tensor to float8_e4m3fn with one scale per block.

    Args:
        x: 2-D tensor to quantize.
        block_shape: ``[block_m, block_n]`` block height and width; defaults
            to ``DEFAULT_BLOCK_SHAPE``.

    Returns:
        ``(x_fp8, scales)``: the quantized tensor with the shape of ``x``,
        and one scale (block amax / 448.0) per block.
    """
    block_m, block_n = block_shape
    assert x.dim() == 2
    rows, cols = x.shape

    # Zero-pad up to whole blocks in both dimensions.
    padded_shape = (round_up(rows, block_m), round_up(cols, block_n))
    x_padded = torch.zeros(padded_shape, dtype=x.dtype, device=x.device)
    x_padded[:rows, :cols] = x

    blocks = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
    # Clamp so an all-zero block does not produce a division by zero.
    block_amax = blocks.abs().float().amax(dim=(1, 3),
                                           keepdim=True).clamp(1e-4)
    quantized = (blocks * (448.0 / block_amax)).to(torch.float8_e4m3fn)

    x_scaled_sub = quantized.view_as(x_padded)[:rows, :cols].contiguous()
    scales = (block_amax / 448.0).view(blocks.size(0), blocks.size(2))
    return x_scaled_sub, scales
|
||||
|
||||
|
||||
def per_block_cast_to_int8(
|
||||
x: torch.Tensor,
|
||||
block_shape: list[int] = DEFAULT_BLOCK_SHAPE,
|
||||
|
||||
@ -117,7 +117,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
|
||||
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
||||
|
||||
A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
|
||||
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size)
|
||||
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)
|
||||
|
||||
As = As_fp8.to(torch.float32)
|
||||
Bs = Bs_fp8.to(torch.float32)
|
||||
|
||||
@ -559,6 +559,8 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
|
||||
m_a_scales = m_g if per_act_token else 1
|
||||
n_b_scales = n_g if per_out_ch else 1
|
||||
|
||||
print("shape:", m_g, n_g, k_g)
|
||||
|
||||
# Create group-specific A and B (FP8) and output (FP16/FP32)
|
||||
a_g = to_fp8(torch.randn((m_g, k_g), device=device))
|
||||
b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())
|
||||
@ -637,4 +639,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
|
||||
for g in range(num_experts):
|
||||
baseline = baseline_tensors[g]
|
||||
c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
|
||||
print(baseline)
|
||||
print(c)
|
||||
print("*")
|
||||
torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)
|
||||
|
||||
@ -1,158 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Script to test multi loras service with tp >= 2
|
||||
"""
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
LORA_NAME_PATH_MAP = {
|
||||
"Alice": "charent/self_cognition_Alice",
|
||||
"Bob": "charent/self_cognition_Bob",
|
||||
"Cat": "charent/self_cognition_Bob", # same as Bob
|
||||
}
|
||||
|
||||
LORA_NAME_ID_MAP = {}
|
||||
INCREASE_LORA_ID = 0
|
||||
LORA_RANK = 8
|
||||
|
||||
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
|
||||
LORA_TEST_EXPECTED = [
|
||||
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
|
||||
"I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501
|
||||
]
|
||||
|
||||
|
||||
def format_chatml_messages(prompt: str):
    """Build a two-message chat: a fixed system message, then *prompt* as the user turn."""
    system_turn = dict(role="system", content="You are a helpful assistant.")
    user_turn = dict(role="user", content=prompt)
    return [system_turn, user_turn]
|
||||
|
||||
|
||||
def make_add_lora_request(name: str, path: str):
    """Allocate the next LoRA id for *name* and build its ``LoRARequest``.

    Increments the module-level ``INCREASE_LORA_ID`` counter and stores the
    new id under *name* in ``LORA_NAME_ID_MAP`` (replacing any id previously
    stored for that name) before constructing the request.
    """
    global INCREASE_LORA_ID, LORA_NAME_ID_MAP

    next_id = INCREASE_LORA_ID + 1
    INCREASE_LORA_ID = next_id
    LORA_NAME_ID_MAP[name] = next_id

    return LoRARequest(lora_name=name, lora_int_id=next_id, lora_path=path)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multi_loras_with_tp_sync():
    """Add, use and reload several LoRAs on a tensor-parallel-size-2 engine.

    Three adapters are registered, then for every test prompt the greedy
    output produced with the "Alice" adapter is asserted to equal the
    expected text: initially, after a request with "Bob", after "Bob" is
    removed and re-added, and after "Alice" itself is removed and re-added.
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,  # ensure max_loras < max_cpu_loras
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
        tensor_parallel_size=2,  # ensure tp >= 2
        max_cpu_loras=4,  # ensure max_cpu_loras >= 2
    )

    def run_check_lora(fn, args, expected: list):
        """Call ``fn(args)`` and assert the engine lists exactly *expected* ids."""
        fn(args)
        assert set(llm.llm_engine.list_loras()) == set(expected)

    # Simulate adding LoRAs with CLI args,
    # like: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob`
    for lora_name, expected_ids in (
        ("Alice", [1]),
        ("Bob", [1, 2]),
        ("Cat", [1, 2, 3]),
    ):
        run_check_lora(
            llm.llm_engine.add_lora,
            make_add_lora_request(lora_name, LORA_NAME_PATH_MAP[lora_name]),
            expected_ids,
        )

    # temperature = 0 selects greedy search
    sampling_params = SamplingParams(temperature=0, max_tokens=64)

    def call_llm_get_outputs(prompt: str, lora_name: str):
        """Run one chat request with the named LoRA and return its text."""
        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_int_id=LORA_NAME_ID_MAP[lora_name],
            lora_path=LORA_NAME_PATH_MAP[lora_name],
        )
        messages = format_chatml_messages(prompt)
        outputs = llm.chat(
            [messages],
            sampling_params,
            # for those loras, ensure enable_thinking=False
            chat_template_kwargs={"enable_thinking": False},
            lora_request=lora_request,
            use_tqdm=False,
        )
        return outputs[0].outputs[0].text

    def reload_lora(name: str):
        """
        reload a lora to simulate the case:
        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
        for dynamic lora loading and unloading
        """
        remove_lora_response = llm.llm_engine.remove_lora(
            lora_id=LORA_NAME_ID_MAP[name])

        add_lora_response = llm.llm_engine.add_lora(
            make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))

        print(f"{remove_lora_response=}, {add_lora_response=}")

    def check_outputs(prompt: str, output_text: str, expected_output: str):
        """Print the comparison and assert the output matches.

        Everything printed comes from the arguments rather than from
        variables of the enclosing loop, so the helper no longer depends
        on how the caller names its locals.
        """
        print(f"{prompt=}.\n{expected_output=}\n{output_text=}")
        print("\n----------------------------\n")
        assert output_text == expected_output

    def check_alice(prompt: str, expected_output: str):
        """Query the "Alice" LoRA and verify its output for *prompt*."""
        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(prompt, output_text, expected_output)

    for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
        check_alice(prompt, expected_output)

        # call Bob, ignore what it outputs
        call_llm_get_outputs(prompt, "Bob")
        print("After call Bob:")
        check_alice(prompt, expected_output)

        # reload Bob LoRA
        reload_lora("Bob")
        print("After reload Bob:")
        check_alice(prompt, expected_output)

        # reload Alice LoRA
        reload_lora("Alice")
        print("After reload Alice:")
        check_alice(prompt, expected_output)
|
||||
@ -4,6 +4,8 @@ from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
import vllm
|
||||
from vllm.assets.image import ImageAsset
|
||||
@ -183,6 +185,10 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
|
||||
current_platform.is_rocm(),
|
||||
reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
|
||||
reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
|
||||
)
|
||||
def test_qwen25vl_lora(qwen25vl_lora_files):
|
||||
"""Test Qwen 2.5 VL model with LoRA"""
|
||||
config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
|
||||
|
||||
@ -702,38 +702,13 @@ VLM_TEST_SETTINGS = {
|
||||
"smolvlm": VLMTestInfo(
|
||||
models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
|
||||
),
|
||||
"tarsier": VLMTestInfo(
|
||||
models=["omni-research/Tarsier-7b"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
patch_hf_runner=model_utils.tarsier_patch_hf_runner,
|
||||
),
|
||||
"tarsier2": VLMTestInfo(
|
||||
models=["omni-research/Tarsier2-Recap-7b"],
|
||||
test_type=(
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
VLMTestType.VIDEO,
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[pytest.mark.skip("Model initialization hangs")],
|
||||
),
|
||||
### Tensor parallel / multi-gpu broadcast tests
|
||||
"chameleon-broadcast": VLMTestInfo(
|
||||
models=["facebook/chameleon-7b"],
|
||||
|
||||
@ -818,15 +818,3 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
thinker.get_output_embeddings = lambda: thinker.lm_head
|
||||
hf_model.model = thinker
|
||||
return hf_model
|
||||
|
||||
|
||||
def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Fill in the HF processor's ``patch_size`` when it is unset.

    The value comes from the vision-encoder info derived from
    ``hf_model.config``. The same ``hf_model`` object is returned.
    """
    from vllm.model_executor.models.tarsier import get_vision_encoder_info

    # Looked up unconditionally, before the processor is inspected.
    encoder_info = get_vision_encoder_info(hf_model.config)
    processor = hf_model.processor

    if processor.patch_size is not None:
        return hf_model

    processor.patch_size = encoder_info.get_patch_size()
    return hf_model
|
||||
|
||||
@ -1,51 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize("num_frames", [32, 128])
@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
def test_processor_override(
    model_id: str,
    expected_toks_per_frame: int,
    expected_grid_t: int,
    fps: int,
    num_frames: int,
):
    """Ensure GLM4vMultiModalProcessor can handle video frames properly."""
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    tokenizer = mm_processor.info.get_tokenizer()
    mm_kwargs = {"fps": fps}

    # One video asset with the requested number of frames; its metadata is
    # overridden with the parametrized fps before processing.
    asset = VideoAsset(name="baby_reading", num_frames=num_frames)
    frames = asset.np_ndarrays
    metadata = asset.metadata
    metadata["fps"] = fps

    processed_inputs = mm_processor.apply(
        "<|begin_of_video|><|video|><|end_of_video|>",
        {"video": [(frames, metadata)]},
        mm_kwargs,
    )

    # Count the video placeholder tokens in the processed prompt.
    hf_processor = mm_processor.info.get_hf_processor(**mm_kwargs)
    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    video_tok_count = prompt_token_ids.count(video_token_id)

    # First entry of the first video's (t, h, w) grid.
    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]

    assert grid_t == expected_grid_t
    assert video_tok_count == expected_toks_per_frame * grid_t
|
||||
@ -16,7 +16,7 @@ def test_multimodal_processor(model_id):
|
||||
model_impl="transformers",
|
||||
)
|
||||
|
||||
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
|
||||
|
||||
image_pil = ImageAsset('cherry_blossom').pil_image
|
||||
mm_data = {"image": image_pil}
|
||||
|
||||
@ -465,7 +465,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
is_available_online=False),
|
||||
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501
|
||||
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501
|
||||
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
|
||||
"Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501
|
||||
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501
|
||||
"VoxtralForConditionalGeneration": _HfExamplesInfo(
|
||||
@ -525,7 +526,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
}
|
||||
|
||||
_TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"),
|
||||
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
|
||||
"TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
|
||||
}
|
||||
|
||||
@ -34,7 +34,8 @@ def check_implementation(
|
||||
|
||||
with runner_test(model, **kwargs_test, **kwargs) as model_test:
|
||||
model_config = model_test.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
assert model_config.architecture == (
|
||||
model_config._get_transformers_backend_cls())
|
||||
|
||||
outputs_test = model_test.generate_greedy_logprobs(*args)
|
||||
|
||||
@ -134,7 +135,8 @@ def test_quantization(
|
||||
enforce_eager=True,
|
||||
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
assert model_config.architecture == (
|
||||
model_config._get_transformers_backend_cls())
|
||||
|
||||
transformers_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
|
||||
@ -147,25 +149,6 @@ def test_quantization(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "Qwen/Qwen3-Embedding-0.6B",  # Layers live in `layers`
        "meta-llama/Llama-3.2-1B-Instruct",  # Layers live in `model.layers`
    ],
)
def test_embed_loading(vllm_runner, model):
    """Load *model* as a pooling runner and assert the Transformers backend is used."""
    runner_kwargs = dict(
        max_model_len=1024,
        enforce_eager=True,
        runner="pooling",
        model_impl="transformers",
    )
    with vllm_runner(model, **runner_kwargs) as loaded:
        config = loaded.llm.llm_engine.model_config
        assert config.using_transformers_backend()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
@ -186,7 +169,8 @@ def test_classify(
|
||||
dtype=dtype,
|
||||
model_impl="transformers") as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
assert model_config.architecture == (
|
||||
model_config._get_transformers_backend_cls())
|
||||
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ NUM_EXPECTED_TOKENS = 10
|
||||
NUM_REQUESTS = 10000
|
||||
|
||||
# Scenarios to test for num generated token.
|
||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
|
||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
|
||||
@ -3,10 +3,9 @@
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from PIL import Image, ImageChops
|
||||
|
||||
from vllm.multimodal.image import ImageMediaIO, convert_image_mode
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent / "assets"
|
||||
assert ASSETS_DIR.exists()
|
||||
@ -36,115 +35,3 @@ def test_rgba_to_rgb():
|
||||
assert converted_image_numpy[i][j][0] == 255
|
||||
assert converted_image_numpy[i][j][1] == 255
|
||||
assert converted_image_numpy[i][j][2] == 255
|
||||
|
||||
|
||||
def test_rgba_to_rgb_custom_background(tmp_path):
    """Test RGBA to RGB conversion with custom background colors."""

    def assert_pixel(pixels, row, col, expected_rgb):
        # Compare one channel at a time (R, then G, then B).
        for channel, want in enumerate(expected_rgb):
            assert pixels[row][col][channel] == want

    opaque_red = (255, 0, 0)

    # Fully opaque red 10x10 image whose top-left 5x5 quadrant is then made
    # fully transparent.
    rgba_image = Image.new("RGBA", (10, 10), (255, 0, 0, 255))
    for x, y in ((x, y) for x in range(5) for y in range(5)):
        rgba_image.putpixel((x, y), (0, 0, 0, 0))

    image_path = tmp_path / "test_rgba.png"
    rgba_image.save(image_path)

    # Test 1: Default white background (backward compatibility)
    default_pixels = np.array(ImageMediaIO().load_file(image_path))
    assert_pixel(default_pixels, 0, 0, (255, 255, 255))  # transparent -> white
    assert_pixel(default_pixels, 5, 5, opaque_red)  # opaque stays red

    # Test 2: Custom black background via kwargs
    black_io = ImageMediaIO(rgba_background_color=(0, 0, 0))
    black_pixels = np.array(black_io.load_file(image_path))
    assert_pixel(black_pixels, 0, 0, (0, 0, 0))  # transparent -> black
    assert_pixel(black_pixels, 5, 5, opaque_red)  # opaque stays red

    # Test 3: Custom blue background via kwargs (as list)
    blue_io = ImageMediaIO(rgba_background_color=[0, 0, 255])
    blue_pixels = np.array(blue_io.load_file(image_path))
    assert_pixel(blue_pixels, 0, 0, (0, 0, 255))  # transparent -> blue

    # Test 4: Test with load_bytes method
    image_data = image_path.read_bytes()
    green_io = ImageMediaIO(rgba_background_color=(0, 255, 0))
    green_pixels = np.array(green_io.load_bytes(image_data))
    assert_pixel(green_pixels, 0, 0, (0, 255, 0))  # transparent -> green
|
||||
|
||||
|
||||
def test_rgba_background_color_validation():
    """Test that invalid rgba_background_color values are properly rejected."""
    rejected_values = [
        # invalid types
        "255,255,255",
        255,
        # wrong number of elements
        (255, 255),
        (255, 255, 255, 255),
        # non-integer values
        (255.0, 255.0, 255.0),
        (255, "255", 255),
        # out of range values
        (256, 255, 255),
        (255, -1, 255),
    ]
    for bad_color in rejected_values:
        with pytest.raises(
                ValueError,
                match="rgba_background_color must be a list or tuple"):
            ImageMediaIO(rgba_background_color=bad_color)

    # Valid values must construct without raising.
    accepted_values = [(0, 0, 0), [255, 255, 255], (128, 128, 128)]
    for good_color in accepted_values:
        ImageMediaIO(rgba_background_color=good_color)
|
||||
|
||||
@ -2,15 +2,16 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from contextlib import nullcontext
|
||||
from typing import Optional, cast
|
||||
from types import MethodType
|
||||
from typing import cast
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import ProcessorMixin
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs import InputProcessingContext
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
|
||||
MultiModalKwargsItem,
|
||||
@ -1012,91 +1013,57 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||
)
|
||||
|
||||
|
||||
class DummyProcessor:
|
||||
class _ProcessorProxy:
|
||||
|
||||
def __init__(self, a: int = 0, b: int = 0) -> None:
|
||||
def __init__(self, processor: ProcessorMixin) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.a = a
|
||||
self.b = b
|
||||
self.__processor = processor
|
||||
|
||||
def __getattr__(self, key: str):
|
||||
return getattr(self.__processor, key)
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
a: int = 0,
|
||||
c: int = 0,
|
||||
return_tensors: Optional[str] = None,
|
||||
) -> dict[str, int]:
|
||||
return dict(a=a, c=c)
|
||||
text=None,
|
||||
images=None,
|
||||
videos=None,
|
||||
exists=None,
|
||||
return_tensors=None,
|
||||
):
|
||||
return dict(exists=exists)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
||||
("call_kwargs", "expected_kwargs"),
|
||||
[
|
||||
({"a": 1}, {}, {"a": 1, "b": 0}),
|
||||
({}, {"a": 1}, {"a": 1, "b": 0}),
|
||||
# inference_kwargs should take precedence
|
||||
({"a": 1}, {"a": 2}, {"a": 2, "b": 0}),
|
||||
# Should ignore extra kwargs
|
||||
({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}),
|
||||
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
|
||||
# Should ignore invalid kwargs
|
||||
({"does_not_exist": 100}, {"exists": None}),
|
||||
({"exists": 1}, {"exists": 1}),
|
||||
({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_hf_processor_init_kwargs(
|
||||
model_id,
|
||||
config_kwargs,
|
||||
inference_kwargs,
|
||||
expected_kwargs,
|
||||
):
|
||||
# Should not be used since there is nothing to convert to tokens
|
||||
mock_tokenizer = cast(AnyTokenizer, object())
|
||||
def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
ctx = InputProcessingContext(
|
||||
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
|
||||
tokenizer=mock_tokenizer,
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||
orig_get_hf_processor = processor.info.get_hf_processor
|
||||
|
||||
def get_hf_processor(self, **kwargs):
|
||||
assert kwargs == call_kwargs
|
||||
return _ProcessorProxy(orig_get_hf_processor())
|
||||
|
||||
processor.info.get_hf_processor = MethodType(get_hf_processor,
|
||||
processor.info)
|
||||
|
||||
out_kwargs = processor._call_hf_processor(
|
||||
prompt="",
|
||||
mm_data={},
|
||||
mm_kwargs=call_kwargs,
|
||||
tok_kwargs={},
|
||||
)
|
||||
|
||||
processor = ctx.get_hf_processor(
|
||||
DummyProcessor, # type: ignore[arg-type]
|
||||
**inference_kwargs,
|
||||
)
|
||||
|
||||
for k, v in expected_kwargs.items():
|
||||
assert getattr(processor, k) == v
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
|
||||
@pytest.mark.parametrize(
|
||||
("config_kwargs", "inference_kwargs", "expected_kwargs"),
|
||||
[
|
||||
({"a": 1}, {}, {"a": 1, "c": 0}),
|
||||
({}, {"a": 1}, {"a": 1, "c": 0}),
|
||||
# inference_kwargs should take precedence
|
||||
({"a": 1}, {"a": 2}, {"a": 2, "c": 0}),
|
||||
# Should ignore extra kwargs
|
||||
({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}),
|
||||
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_hf_processor_call_kwargs(
|
||||
model_id,
|
||||
config_kwargs,
|
||||
inference_kwargs,
|
||||
expected_kwargs,
|
||||
):
|
||||
# Should not be used since there is nothing to convert to tokens
|
||||
mock_tokenizer = cast(AnyTokenizer, object())
|
||||
|
||||
ctx = InputProcessingContext(
|
||||
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
|
||||
tokenizer=mock_tokenizer,
|
||||
)
|
||||
|
||||
processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type]
|
||||
|
||||
result = ctx.call_hf_processor(processor, {}, inference_kwargs)
|
||||
assert result == expected_kwargs
|
||||
assert out_kwargs == expected_kwargs
|
||||
|
||||
@ -1,26 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_path",
    ["nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized"],
)
def test_llama(vllm_runner, example_prompts, model_path):
    """Greedy-generate 20 tokens per prompt in bfloat16 and require a non-empty result."""
    with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
        generated = vllm_model.generate_greedy(example_prompts, max_tokens=20)
        print(generated)
        assert generated
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_path",
    ["nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized"],
)
def test_qwen(vllm_runner, example_prompts, model_path):
    """Greedy-generate 20 tokens per prompt in bfloat16 and require a non-empty result."""
    with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
        generated = vllm_model.generate_greedy(example_prompts, max_tokens=20)
        print(generated)
        assert generated
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from torch_xla._internal import tpu
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
@ -28,31 +27,25 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch):
|
||||
yield
|
||||
|
||||
|
||||
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
|
||||
def setup_vllm(num_loras: int) -> vllm.LLM:
|
||||
return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
|
||||
num_scheduler_steps=1,
|
||||
max_model_len=256,
|
||||
max_seq_len_to_capture=256,
|
||||
max_num_seqs=8,
|
||||
tensor_parallel_size=tp,
|
||||
enable_lora=True,
|
||||
max_loras=num_loras,
|
||||
max_lora_rank=8)
|
||||
|
||||
|
||||
TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips()
|
||||
] if tpu.num_available_chips() > 1 else [1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
|
||||
def test_single_lora(tp: int):
|
||||
def test_single_lora():
|
||||
"""
|
||||
This test ensures we can run a single LoRA adapter on the TPU backend.
|
||||
We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
|
||||
will force Qwen2.5-3B-Instruct to claim 1+1=1.
|
||||
"""
|
||||
|
||||
llm = setup_vllm(1, tp)
|
||||
llm = setup_vllm(1)
|
||||
|
||||
prompt = "What is 1+1? \n"
|
||||
|
||||
@ -70,8 +63,7 @@ def test_single_lora(tp: int):
|
||||
assert int(answer) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
|
||||
def test_lora_hotswapping(tp: int):
|
||||
def test_lora_hotswapping():
|
||||
"""
|
||||
This test ensures we can run multiple LoRA adapters on the TPU backend, even
|
||||
if we only have space to store 1.
|
||||
@ -87,7 +79,7 @@ def test_lora_hotswapping(tp: int):
|
||||
for i in range(1, 5)
|
||||
]
|
||||
|
||||
llm = setup_vllm(1, tp)
|
||||
llm = setup_vllm(1)
|
||||
|
||||
prompt = "What is 1+1? \n"
|
||||
|
||||
@ -102,8 +94,7 @@ def test_lora_hotswapping(tp: int):
|
||||
assert int(answer) == i + 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
|
||||
def test_multi_lora(tp: int):
|
||||
def test_multi_lora():
|
||||
"""
|
||||
This test ensures we can run multiple LoRA adapters on the TPU backend, when
|
||||
we have enough space to store all of them.
|
||||
@ -118,7 +109,7 @@ def test_multi_lora(tp: int):
|
||||
for i in range(1, 5)
|
||||
]
|
||||
|
||||
llm = setup_vllm(4, tp)
|
||||
llm = setup_vllm(4)
|
||||
|
||||
prompt = "What is 1+1? \n"
|
||||
|
||||
|
||||
@ -26,10 +26,12 @@ if not current_platform.is_cuda():
|
||||
TEXT_ENGINE_ARGS = AsyncEngineArgs(
|
||||
model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
disable_log_requests=True,
|
||||
)
|
||||
|
||||
VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
|
||||
enforce_eager=True)
|
||||
enforce_eager=True,
|
||||
disable_log_requests=True)
|
||||
|
||||
TEXT_PROMPT = "Hello my name is Robert and"
|
||||
|
||||
|
||||
@ -213,29 +213,3 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
||||
assert len(num_accepted_tokens_per_pos) == 1
|
||||
assert isinstance(num_accepted_tokens_per_pos[0], Vector)
|
||||
assert len(num_accepted_tokens_per_pos[0].values) == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str,
                                       monkeypatch: pytest.MonkeyPatch):
    """Check that ``skip_tokenizer_init`` skips tokenizer/detokenizer setup.

    With the flag set, text prompts must be rejected, while token-id prompts
    must yield completions that carry token ids and an empty text.
    """
    monkeypatch.setenv("VLLM_USE_V1", "1")

    engine = LLM(
        model=model,
        skip_tokenizer_init=True,
        enforce_eager=True,
    )
    params = SamplingParams(prompt_logprobs=True, detokenize=True)

    # A plain-text prompt cannot be handled without a tokenizer.
    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        engine.generate("abc", params)

    token_prompt = {"prompt_token_ids": [1, 2, 3]}
    results = engine.generate(token_prompt, sampling_params=params)
    assert len(results) > 0

    completions = results[0].outputs
    assert len(completions) > 0

    first = completions[0]
    assert first.text == ""
    assert first.token_ids
|
||||
|
||||
@ -25,6 +25,7 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
|
||||
engine_args = AsyncEngineArgs(
|
||||
model="ibm-research/PowerMoE-3b",
|
||||
enforce_eager=True,
|
||||
disable_log_requests=True,
|
||||
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
|
||||
data_parallel_size=DP_SIZE,
|
||||
)
|
||||
|
||||
@ -44,9 +44,9 @@ from vllm.attention.layer import Attention
|
||||
from vllm.attention.ops.paged_attn import PagedAttention
|
||||
from vllm.config import VllmConfig, get_layers_from_vllm_config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
|
||||
make_tensor_with_pad)
|
||||
from vllm.utils.flashinfer import use_trtllm_decode_attention
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -56,6 +56,7 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class FlashInferBackend(AttentionBackend):
|
||||
cached_sm100a_supported: Optional[bool] = None
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
@ -122,6 +123,47 @@ class FlashInferBackend(AttentionBackend):
|
||||
else:
|
||||
raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
|
||||
|
||||
@staticmethod
def use_trtllm_decode_attention(
    batch_size: int,
    max_seq_len: int,
    kv_cache_dtype: str,
    num_qo_heads: Optional[int],
    num_kv_heads: Optional[int],
    attn_head_size: Optional[int],
) -> bool:
    """Decide whether the TRTLLM decode attention path should be used.

    Returns False when the platform lacks device capability 100 (the probe
    result is cached on ``FlashInferBackend``) or when the head layout is
    unsupported. Otherwise ``VLLM_USE_TRTLLM_DECODE_ATTENTION`` decides if
    it is set; if it is unset, the path is chosen automatically from the
    batch size, maximum sequence length and KV-cache dtype.
    """
    backend = FlashInferBackend

    # Probe the platform once and remember the answer on the class.
    if backend.cached_sm100a_supported is None:
        backend.cached_sm100a_supported = (
            current_platform.has_device_capability(100))
    if not backend.cached_sm100a_supported:
        return False

    # Check if the dimensions are supported by TRTLLM decode attention
    if (attn_head_size is None or num_qo_heads is None
            or num_kv_heads is None):
        return False
    if (num_qo_heads // num_kv_heads > 8
            or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128):
        return False

    env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION

    if env_value is None:
        # Environment variable not set - use auto-detection
        auto_enabled = (batch_size <= 256 and max_seq_len < 131072
                        and kv_cache_dtype == "auto")
        if auto_enabled:
            logger.warning_once(
                "Using TRTLLM decode attention (auto-detected).")
        return auto_enabled

    # Environment variable is set - respect it. Only the literal "0"
    # disables the path; any other value enables it.
    logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s",
                     env_value)
    forced_on = env_value != "0"
    if forced_on:
        logger.info_once("Using TRTLLM decode attention.")
    return forced_on
|
||||
|
||||
|
||||
@dataclass
|
||||
class PerLayerParameters:
|
||||
@ -1114,7 +1156,7 @@ class FlashInferImpl(AttentionImpl):
|
||||
assert decode_meta.decode_wrapper._sm_scale == softmax_scale
|
||||
# TODO: @pavanimajety Remove this once the switch happens
|
||||
# inside flashinfer.
|
||||
if not use_trtllm_decode_attention(
|
||||
if not FlashInferBackend.use_trtllm_decode_attention(
|
||||
num_decode_tokens, attn_metadata.max_decode_seq_len,
|
||||
kv_cache_dtype, attn_metadata.num_qo_heads,
|
||||
attn_metadata.num_kv_heads, attn_metadata.head_dim):
|
||||
|
||||
@ -267,15 +267,8 @@ def _support_torch_compile(
|
||||
code.co_filename)
|
||||
return inline_call(parent, func, args, kwargs)
|
||||
|
||||
# Disable the C++ compilation of symbolic shape guards. C++-fication
|
||||
# of symbolic shape guards can improve guard overhead. But, since
|
||||
# vllm skip guards anyways, setting this flag to False can improve
|
||||
# compile time.
|
||||
with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards",
|
||||
False), patch.object(
|
||||
InliningInstructionTranslator,
|
||||
'inline_call',
|
||||
patched_inline_call):
|
||||
with patch.object(InliningInstructionTranslator, 'inline_call',
|
||||
patched_inline_call):
|
||||
output = self.compiled_callable(*args, **kwargs)
|
||||
return output
|
||||
|
||||
|
||||
@ -164,6 +164,3 @@ class AttnFusionPass(VllmInductorPass):
|
||||
logger.debug("Fused quantization onto %s attention nodes", count)
|
||||
self.dump_graph(graph, "after_attn_fusion")
|
||||
self.end_and_log()
|
||||
|
||||
def uuid(self):
    """Return the identifier for this pass.

    The value is whatever ``VllmInductorPass.hash_source`` produces for this
    pass instance together with ``AttentionStaticQuantPattern``.
    """
    hashed_sources = (self, AttentionStaticQuantPattern)
    return VllmInductorPass.hash_source(*hashed_sources)
|
||||
|
||||
@ -76,10 +76,9 @@ class InductorPass(CustomGraphPass):
|
||||
for src in srcs:
|
||||
if isinstance(src, str):
|
||||
src_str = src
|
||||
elif isinstance(src, (types.FunctionType, type)):
|
||||
elif isinstance(src, types.FunctionType):
|
||||
src_str = inspect.getsource(src)
|
||||
else:
|
||||
# object instance
|
||||
src_str = inspect.getsource(src.__class__)
|
||||
hasher.update(src_str.encode("utf-8"))
|
||||
return hasher.hexdigest()
|
||||
|
||||
@ -7,7 +7,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_cuda():
|
||||
if current_platform.is_cuda_alike():
|
||||
from .fusion import FusionPass
|
||||
from .collective_fusion import AllReduceFusionPass, AsyncTPPass
|
||||
from .fusion_attn import AttnFusionPass
|
||||
|
||||
@ -11,7 +11,6 @@ import textwrap
|
||||
import uuid
|
||||
import warnings
|
||||
from collections import Counter
|
||||
from collections.abc import Mapping
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
|
||||
replace)
|
||||
@ -39,8 +38,8 @@ from vllm.transformers_utils.config import (
|
||||
ConfigFormat, get_config, get_hf_image_processor_config,
|
||||
get_hf_text_config, get_pooling_config,
|
||||
get_sentence_transformer_tokenizer_config, is_encoder_decoder,
|
||||
maybe_override_with_speculators_target_model, try_get_generation_config,
|
||||
try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope)
|
||||
try_get_generation_config, try_get_safetensors_metadata,
|
||||
try_get_tokenizer_config, uses_mrope)
|
||||
from vllm.transformers_utils.s3_utils import S3Model
|
||||
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
|
||||
# yapf conflicts with isort for this block
|
||||
@ -535,15 +534,6 @@ class ModelConfig:
|
||||
"affect the random state of the Python process that "
|
||||
"launched vLLM.", self.seed)
|
||||
|
||||
if self.runner != "draft":
|
||||
# If we're not running the draft model, check for speculators config
|
||||
# If speculators config, set model / tokenizer to be target model
|
||||
self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501
|
||||
model=self.model,
|
||||
tokenizer=self.tokenizer,
|
||||
revision=self.revision,
|
||||
trust_remote_code=self.trust_remote_code)
|
||||
|
||||
# Keep set served_model_name before maybe_model_redirect(self.model)
|
||||
self.served_model_name = get_served_model_name(self.model,
|
||||
self.served_model_name)
|
||||
@ -615,8 +605,8 @@ class ModelConfig:
|
||||
self.config_format,
|
||||
hf_overrides_kw=hf_overrides_kw,
|
||||
hf_overrides_fn=hf_overrides_fn)
|
||||
|
||||
self.hf_config = hf_config
|
||||
|
||||
self.hf_text_config = get_hf_text_config(self.hf_config)
|
||||
self.attention_chunk_size = getattr(self.hf_text_config,
|
||||
"attention_chunk_size", None)
|
||||
@ -812,17 +802,12 @@ class ModelConfig:
|
||||
def _get_transformers_backend_cls(self) -> str:
|
||||
"""Determine which Transformers backend class will be used if
|
||||
`model_impl` is set to `transformers` or `auto`."""
|
||||
if getattr(self, "runner_type", self.runner) == "pooling":
|
||||
return "TransformersModel"
|
||||
if self.hf_config != self.hf_text_config:
|
||||
# If 'hf_text_config' is the same as 'hf_config'. If not, it is
|
||||
# probably a composite config, i.e. multimodal
|
||||
return "TransformersForMultimodalLM"
|
||||
return "TransformersForCausalLM"
|
||||
|
||||
def using_transformers_backend(self) -> bool:
|
||||
"""Check if the model is using the Transformers backend class."""
|
||||
return self.architecture == self._get_transformers_backend_cls()
|
||||
else:
|
||||
return "TransformersForCausalLM"
|
||||
|
||||
@property
|
||||
def registry(self):
|
||||
@ -885,12 +870,6 @@ class ModelConfig:
|
||||
|
||||
return None
|
||||
|
||||
def set_disable_mm_preprocessor_cache(self, value: bool) -> None:
    """Set ``disable_mm_preprocessor_cache`` here and on the multimodal config.

    The multimodal config is fetched before anything is written, so if
    ``get_multimodal_config()`` raises, neither object is modified.

    Args:
        value: New value for the ``disable_mm_preprocessor_cache`` flag.
    """
    multimodal_config = self.get_multimodal_config()

    # Keep both copies of the flag in sync: this config first, then the
    # multimodal config it owns.
    for target in (self, multimodal_config):
        target.disable_mm_preprocessor_cache = value
|
||||
|
||||
def _get_encoder_config(self):
|
||||
return get_sentence_transformer_tokenizer_config(
|
||||
self.model, self.revision)
|
||||
@ -1580,18 +1559,7 @@ class ModelConfig:
|
||||
return self.multimodal_config
|
||||
|
||||
def try_get_generation_config(self) -> dict[str, Any]:
|
||||
"""
|
||||
This method attempts to retrieve the non-default values of the
|
||||
generation config for this model.
|
||||
|
||||
The generation config can contain information about special tokens, as
|
||||
well as sampling parameters. Which is why this method exists separately
|
||||
to `get_diff_sampling_param`.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the non-default generation config.
|
||||
"""
|
||||
if self.generation_config in {"auto", "vllm"}:
|
||||
if self.generation_config in ("auto", "vllm"):
|
||||
config = try_get_generation_config(
|
||||
self.hf_config_path or self.model,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
@ -1610,18 +1578,13 @@ class ModelConfig:
|
||||
|
||||
def get_diff_sampling_param(self) -> dict[str, Any]:
|
||||
"""
|
||||
This method returns a dictionary containing the non-default sampling
|
||||
parameters with `override_generation_config` applied.
|
||||
|
||||
The default sampling parameters are:
|
||||
|
||||
- vLLM's neutral defaults if `self.generation_config="vllm"`
|
||||
- the model's defaults if `self.generation_config="auto"`
|
||||
- as defined in `generation_config.json` if
|
||||
`self.generation_config="path/to/generation_config/dir"`
|
||||
This method returns a dictionary containing the parameters
|
||||
that differ from the default sampling parameters. If
|
||||
`generation_config` is `"vllm"`, an empty dictionary is returned.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the non-default sampling parameters.
|
||||
dict[str, Any]: A dictionary with the differing sampling
|
||||
parameters, if `generation_config` is `"vllm"` an empty dictionary.
|
||||
"""
|
||||
if self.generation_config == "vllm":
|
||||
config = {}
|
||||
@ -3021,13 +2984,10 @@ class SpeculativeConfig:
|
||||
"Chunked prefill and EAGLE are not compatible "
|
||||
"when using V0.")
|
||||
|
||||
from vllm.transformers_utils.configs import (
|
||||
SpeculatorsConfig)
|
||||
from vllm.transformers_utils.configs.eagle import (
|
||||
EAGLEConfig)
|
||||
|
||||
if isinstance(self.draft_model_config.hf_config,
|
||||
(EAGLEConfig, SpeculatorsConfig)):
|
||||
EAGLEConfig):
|
||||
pass
|
||||
else:
|
||||
eagle_config = EAGLEConfig(
|
||||
@ -3186,19 +3146,10 @@ class SpeculativeConfig:
|
||||
"speculative decoding is > 1, but got "
|
||||
f"{self.disable_by_batch_size=}")
|
||||
|
||||
from vllm.transformers_utils.configs import SpeculatorsConfig
|
||||
|
||||
eagle3_target_supported = ["llama"]
|
||||
if self.draft_model_config and isinstance(
|
||||
self.draft_model_config.hf_config, SpeculatorsConfig):
|
||||
eagle3_target_supported.append("qwen")
|
||||
|
||||
if self.method == "eagle3" and self.target_model_config and not any(
|
||||
supported_model in
|
||||
self.target_model_config.hf_text_config.model_type
|
||||
for supported_model in eagle3_target_supported):
|
||||
if self.method == "eagle3" and self.target_model_config and \
|
||||
"llama" not in self.target_model_config.hf_text_config.model_type:
|
||||
raise ValueError(
|
||||
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
|
||||
"Eagle3 is only supported for Llama models. "
|
||||
f"Got {self.target_model_config.hf_text_config.model_type=}")
|
||||
|
||||
return self
|
||||
@ -3392,16 +3343,7 @@ class MultiModalConfig:
|
||||
999 if envs.VLLM_USE_V1 else 1,
|
||||
)
|
||||
|
||||
def merge_mm_processor_kwargs(
    self,
    inference_kwargs: Mapping[str, object],
) -> dict[str, object]:
    """
    Build the keyword arguments for the multi-modal processor.

    The configured ``mm_processor_kwargs`` (treated as empty when unset or
    empty) are combined with the extra arguments passed during inference;
    on a key clash the inference-time value wins. The configured mapping
    itself is not modified.
    """
    configured = self.mm_processor_kwargs
    if not configured:
        configured = {}
    overrides = dict(inference_kwargs)
    return configured | overrides
|
||||
# TODO: Add configs to init vision tower or not.
|
||||
|
||||
|
||||
@config
|
||||
|
||||
@ -1,257 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import uuid
|
||||
from typing import Any, Optional
|
||||
|
||||
import ray
|
||||
import torch
|
||||
from ray.exceptions import RayChannelError
|
||||
from ray.experimental.channel.communicator import (Communicator,
|
||||
TorchTensorAllocator)
|
||||
from torch.distributed import ReduceOp
|
||||
|
||||
from vllm.distributed.device_communicators.base_device_communicator import (
|
||||
DeviceCommunicatorBase)
|
||||
from vllm.distributed.parallel_state import get_pp_group
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import current_stream
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RayPPCommunicator(Communicator):
    """
    Communicator to be used for pipeline parallelism in Ray Compiled Graph.
    This wraps around the vLLM _PP GroupCoordinator.

    This class is not thread-safe.
    """

    # Device communicator of the vLLM pipeline-parallel group; None when this
    # object was created without a rank (the Ray driver).
    _comm: Optional[DeviceCommunicatorBase]

    def __init__(
        self,
        world_size: int,
        comm_id: Any,
        rank: Optional[int],
        actor_handles: list["ray.actor.ActorHandle"],
        cuda_stream: Optional[torch.cuda.Stream],
        use_communication_streams: bool = False,
    ):
        """
        Initialize a RayPPCommunicator that can be used to communicate with
        other Ray Compiled Graph actors for pipeline parallelism.

        Args:
            world_size: The number of participating actors.
            comm_id: A unique communicator ID. This is just to conform with
                the Ray Communicator API and is not used.
            rank: The rank of this actor. If None, then the caller is not a
                participant of the RayPPCommunicator group (e.g., the Ray
                driver).
            actor_handles: A list of actor handles.
            cuda_stream: A CUDA stream to dispatch communication ops to. This
                is not supported.
            use_communication_streams: Whether to use communication streams.
                This is not supported.

        Raises:
            NotImplementedError: If `use_communication_streams` is set.
            ValueError: If `cuda_stream` is given and is not the current
                stream.
        """
        self._world_size = world_size
        self._rank: Optional[int] = None
        self._actor_handles = actor_handles
        if use_communication_streams:
            raise NotImplementedError(
                "use_communication_streams is not supported")
        if cuda_stream is not None and cuda_stream != current_stream():
            raise ValueError(
                "cuda_stream other than the current stream is not supported")

        if rank is not None:
            # Rank is not None, this is Ray worker
            assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned"

            self._comm = get_pp_group().device_communicator

            # Since we wrap around the vLLM _PP communicator, we use
            # the rank from the vLLM communicator, and ignore the rank
            # passed in from Ray.
            # TODO(rui): refactor the Ray Communicator API so that
            # it also supports no rank passed in.
            self._rank = self._comm.rank_in_group

            self._build_actor_rank_mapping()
        else:
            # Rank is None, this is Ray driver
            self._comm = None

        self._closed = False

    def _build_actor_rank_mapping(self) -> None:
        """
        Use collective communication to build a mapping from actor IDs to ranks.
        This should be called once during initialization.

        The result is stored in `self._actor_id_to_rank`. Nothing is built
        when there is no device communicator.
        """
        if self._comm is None:
            return

        current_actor = ray.get_runtime_context().current_actor
        actor_id_str = current_actor._actor_id.hex()

        # Ray actor IDs are 32-character hex strings (128 bits)
        ACTOR_ID_LEN = 32
        actor_id_bytes = actor_id_str.encode('utf-8')
        assert len(
            actor_id_bytes
        ) == ACTOR_ID_LEN, f"Unexpected actor ID length: {len(actor_id_bytes)}"

        # torch.frombuffer warns on read-only buffers such as `bytes`, so hand
        # it a writable copy before moving the data to the device.
        actor_id_tensor = torch.frombuffer(
            bytearray(actor_id_bytes),
            dtype=torch.uint8).to(self._comm.device)

        # All-gather full actor IDs from all actors
        gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0)

        # Copy the gathered IDs to the host once instead of once per rank.
        gathered_bytes = gathered_ids.cpu().numpy().tobytes()

        # Build mapping: actor_id -> device_comm_rank
        self._actor_id_to_rank = {}
        for rank in range(self._world_size):
            start_idx = rank * ACTOR_ID_LEN
            end_idx = start_idx + ACTOR_ID_LEN
            actor_id = gathered_bytes[start_idx:end_idx].decode('utf-8')
            self._actor_id_to_rank[actor_id] = rank

    def initialize(self, rank: int) -> None:
        # No additional initialization is needed.
        pass

    def get_actor_handles(self) -> list["ray.actor.ActorHandle"]:
        """
        Return the actor handles this communicator was created with.
        """
        return self._actor_handles

    def get_rank(self, actor: ray.actor.ActorHandle) -> int:
        """
        Return the given actor's rank, looked up in the actor-ID-to-rank
        mapping built during initialization.

        Raises:
            ValueError: If the actor is not part of the mapping.
        """
        assert hasattr(self, '_actor_id_to_rank'), (
            "Actor rank mapping not built. "
            "This should have been done during initialization.")

        actor_id_str = actor._actor_id.hex()

        if actor_id_str in self._actor_id_to_rank:
            return self._actor_id_to_rank[actor_id_str]  # type: ignore
        else:
            raise ValueError(f"Actor {actor} not found in communicator group")

    def get_self_rank(self) -> Optional[int]:
        """
        Return this actor's rank, or None if no rank was assigned.
        """
        return self._rank

    def get_world_size(self) -> int:
        """
        Return the number of ranks in the RayPPCommunicator group.
        """
        return self._world_size

    def send(self, buf: "torch.Tensor", peer_rank: int) -> None:
        """
        Send a torch.Tensor to a peer.

        This returns when the send kernel has been queued, but the kernel may
        not have completed. Therefore, the caller should ensure that there are
        no concurrent writes to the sent `buf` until the send has finished.
        That is, either all writes should be submitted on the current stream
        or, if on a different stream, that stream should synchronize with the
        current stream.

        Args:
            buf: The torch.Tensor to send. It should already be on this
                actor's default device.
            peer_rank: The rank of the actor to send to.

        Raises:
            RayChannelError: If the communicator has been destroyed.
        """
        if self._closed:
            raise RayChannelError("RayPPCommunicator has been destroyed.")

        assert self._comm is not None
        self._comm.send(buf, peer_rank)

    def recv(
        self,
        shape: tuple[int, ...],
        dtype: "torch.dtype",
        peer_rank: int,
        allocator: TorchTensorAllocator,
    ) -> "torch.Tensor":
        """
        Receive a torch.Tensor from a peer and synchronize the current stream.

        After this call returns, the receive buffer is safe to read from
        any stream. A RayChannelError will be raised if an error occurred
        (e.g., remote actor died), and the buffer is not safe to read.

        Args:
            shape: The shape of the tensor to receive.
            dtype: The dtype of the tensor to receive.
            peer_rank: The rank of the actor to receive from.
            allocator: The allocator to use to create the received tensor.
                This is ignored for this implementation.

        Raises:
            RayChannelError: If the communicator was destroyed before or
                while receiving.
        """
        if self._closed:
            raise RayChannelError("RayPPCommunicator has been destroyed.")

        assert self._comm is not None
        size = torch.Size(shape)
        buf = self._comm.recv(size, dtype, src=peer_rank)

        # Buffer values are undefined if NCCL ops are aborted. Therefore, we
        # need to synchronize here and check that the channel is still
        # open to ensure that the receive buffer is valid.
        # TODO(swang): Avoid CUDA synchronization.
        current_stream().synchronize()

        if self._closed:
            raise RayChannelError("RayPPCommunicator has been destroyed.")
        return buf

    def allgather(
        self,
        send_buf: "torch.Tensor",
        recv_buf: "torch.Tensor",
    ):
        raise NotImplementedError("allgather is not supported")

    def allreduce(
        self,
        send_buf: "torch.Tensor",
        recv_buf: "torch.Tensor",
        op: ReduceOp = ReduceOp.SUM,
    ):
        raise NotImplementedError("allreduce is not supported")

    def reducescatter(
        self,
        send_buf: "torch.Tensor",
        recv_buf: "torch.Tensor",
        op: ReduceOp = ReduceOp.SUM,
    ):
        raise NotImplementedError("reducescatter is not supported")

    @property
    def recv_stream(self):
        # Receives run on the current stream; no dedicated stream is kept.
        return torch.cuda.StreamContext(current_stream())

    @property
    def send_stream(self):
        # Sends run on the current stream; no dedicated stream is kept.
        return torch.cuda.StreamContext(current_stream())

    def destroy(self) -> None:
        # Just sets a flag, vLLM manages the lifecycle of the underlying
        # _PP GroupCoordinator.
        self._closed = True

    def get_transport_name(self) -> str:
        return "nccl"

    @classmethod
    def generate_communicator_id(cls) -> Any:
        return uuid.uuid4()
|
||||
@ -18,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
|
||||
import regex as re
|
||||
import torch
|
||||
from pydantic import TypeAdapter, ValidationError
|
||||
from typing_extensions import TypeIs, deprecated
|
||||
from typing_extensions import TypeIs
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
@ -217,12 +217,10 @@ Additionally, list elements can be passed individually using `+`:
|
||||
elif contains_type(type_hints, list):
|
||||
type_hint = get_type(type_hints, list)
|
||||
types = get_args(type_hint)
|
||||
list_type = types[0]
|
||||
if get_origin(list_type) is Union:
|
||||
msg = "List type must contain str if it is a Union."
|
||||
assert str in get_args(list_type), msg
|
||||
list_type = str
|
||||
kwargs[name]["type"] = list_type
|
||||
assert len(types) == 1, (
|
||||
"List type must have exactly one type. Got "
|
||||
f"{type_hint} with types {types}")
|
||||
kwargs[name]["type"] = types[0]
|
||||
kwargs[name]["nargs"] = "+"
|
||||
elif contains_type(type_hints, int):
|
||||
kwargs[name]["type"] = int
|
||||
@ -986,28 +984,8 @@ class EngineArgs:
|
||||
provided as a JSON string input via CLI arguments or directly as a
|
||||
dictionary from the engine.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.transformers_utils.configs.speculators.base import (
|
||||
SpeculatorsConfig)
|
||||
|
||||
if self.speculative_config is None:
|
||||
hf_config = get_config(self.hf_config_path or self.model,
|
||||
self.trust_remote_code, self.revision,
|
||||
self.code_revision, self.config_format)
|
||||
|
||||
# if loading a SpeculatorsConfig, load the specualtive_config
|
||||
# details from the config directly
|
||||
# no user input required / expected
|
||||
if isinstance(hf_config, SpeculatorsConfig):
|
||||
# We create one since we dont create one
|
||||
self.speculative_config = {}
|
||||
self.speculative_config[
|
||||
"num_speculative_tokens"] = hf_config.num_lookahead_tokens
|
||||
self.speculative_config["model"] = self.model
|
||||
self.speculative_config["method"] = hf_config.method
|
||||
else:
|
||||
return None
|
||||
return None
|
||||
|
||||
# Note(Shangming): These parameters are not obtained from the cli arg
|
||||
# '--speculative-config' and must be passed in when creating the engine
|
||||
@ -1227,18 +1205,6 @@ class EngineArgs:
|
||||
enable_multimodal_encoder_data_parallel,
|
||||
)
|
||||
|
||||
supports_mm_preprocessor_cache = (self.data_parallel_size == 1
|
||||
or data_parallel_external_lb)
|
||||
if (not supports_mm_preprocessor_cache
|
||||
and model_config.is_multimodal_model
|
||||
and not model_config.disable_mm_preprocessor_cache):
|
||||
logger.warning(
|
||||
"Multi-modal preprocessor cache is not compatible "
|
||||
"with data parallelism when there does not exist a "
|
||||
"one-to-one correspondance between API process and "
|
||||
"EngineCore process, so the cache will be disabled.")
|
||||
model_config.set_disable_mm_preprocessor_cache(True)
|
||||
|
||||
speculative_config = self.create_speculative_config(
|
||||
target_model_config=model_config,
|
||||
target_parallel_config=parallel_config,
|
||||
@ -1712,23 +1678,7 @@ class EngineArgs:
|
||||
@dataclass
|
||||
class AsyncEngineArgs(EngineArgs):
|
||||
"""Arguments for asynchronous vLLM engine."""
|
||||
enable_log_requests: bool = False
|
||||
|
||||
@property
|
||||
@deprecated(
|
||||
"`disable_log_requests` is deprecated and has been replaced with "
|
||||
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
|
||||
"`enable_log_requests` instead.")
|
||||
def disable_log_requests(self) -> bool:
|
||||
return not self.enable_log_requests
|
||||
|
||||
@disable_log_requests.setter
|
||||
@deprecated(
|
||||
"`disable_log_requests` is deprecated and has been replaced with "
|
||||
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
|
||||
"`enable_log_requests` instead.")
|
||||
def disable_log_requests(self, value: bool):
|
||||
self.enable_log_requests = not value
|
||||
disable_log_requests: bool = False
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser,
|
||||
@ -1739,15 +1689,9 @@ class AsyncEngineArgs(EngineArgs):
|
||||
load_general_plugins()
|
||||
if not async_args_only:
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
parser.add_argument('--enable-log-requests',
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=AsyncEngineArgs.enable_log_requests,
|
||||
help='Enable logging requests.')
|
||||
parser.add_argument('--disable-log-requests',
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=not AsyncEngineArgs.enable_log_requests,
|
||||
help='[DEPRECATED] Disable logging requests.',
|
||||
deprecated=True)
|
||||
action='store_true',
|
||||
help='Disable logging requests.')
|
||||
current_platform.pre_register_and_update(parser)
|
||||
return parser
|
||||
|
||||
|
||||
@ -30,7 +30,7 @@ from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import Device, deprecate_kwargs, weak_bind
|
||||
from vllm.utils import Device, weak_bind
|
||||
|
||||
logger = init_logger(__name__)
|
||||
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
|
||||
@ -554,20 +554,14 @@ class AsyncLLMEngine(EngineClient):
|
||||
return LLMEngine._get_executor_cls(engine_config)
|
||||
|
||||
@classmethod
|
||||
@deprecate_kwargs(
|
||||
"disable_log_requests",
|
||||
additional_message=("This argument will have no effect. "
|
||||
"Use `enable_log_requests` instead."),
|
||||
)
|
||||
def from_vllm_config(
|
||||
cls,
|
||||
vllm_config: VllmConfig,
|
||||
start_engine_loop: bool = True,
|
||||
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
||||
stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
|
||||
enable_log_requests: bool = False,
|
||||
disable_log_stats: bool = False,
|
||||
disable_log_requests: bool = True, # Deprecated, will be removed
|
||||
cls,
|
||||
vllm_config: VllmConfig,
|
||||
start_engine_loop: bool = True,
|
||||
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
||||
stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
|
||||
disable_log_requests: bool = False,
|
||||
disable_log_stats: bool = False,
|
||||
) -> "AsyncLLMEngine":
|
||||
"""Create an AsyncLLMEngine from the EngineArgs."""
|
||||
|
||||
@ -575,7 +569,7 @@ class AsyncLLMEngine(EngineClient):
|
||||
vllm_config=vllm_config,
|
||||
executor_class=cls._get_executor_cls(vllm_config),
|
||||
start_engine_loop=start_engine_loop,
|
||||
log_requests=enable_log_requests,
|
||||
log_requests=not disable_log_requests,
|
||||
log_stats=not disable_log_stats,
|
||||
usage_context=usage_context,
|
||||
stat_loggers=stat_loggers,
|
||||
@ -604,7 +598,7 @@ class AsyncLLMEngine(EngineClient):
|
||||
usage_context=usage_context,
|
||||
stat_loggers=stat_loggers,
|
||||
disable_log_stats=engine_args.disable_log_stats,
|
||||
enable_log_requests=engine_args.enable_log_requests,
|
||||
disable_log_requests=engine_args.disable_log_requests,
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
@ -34,7 +34,6 @@ from vllm.outputs import RequestOutput
|
||||
from vllm.transformers_utils.config import (
|
||||
maybe_register_config_serialize_by_value)
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import deprecate_kwargs
|
||||
from vllm.worker.model_runner_base import InputProcessingError
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -121,20 +120,10 @@ class MQLLMEngine:
|
||||
return ENGINE_DEAD_ERROR()
|
||||
|
||||
@classmethod
|
||||
@deprecate_kwargs(
|
||||
"disable_log_requests",
|
||||
additional_message=("This argument will have no effect. "
|
||||
"Use `enable_log_requests` instead."),
|
||||
)
|
||||
def from_vllm_config(
|
||||
cls,
|
||||
vllm_config: VllmConfig,
|
||||
usage_context: UsageContext,
|
||||
enable_log_requests: bool,
|
||||
disable_log_stats: bool,
|
||||
ipc_path: str,
|
||||
disable_log_requests: bool = True, # Deprecated, will be removed
|
||||
) -> "MQLLMEngine":
|
||||
def from_vllm_config(cls, vllm_config: VllmConfig,
|
||||
usage_context: UsageContext,
|
||||
disable_log_requests: bool, disable_log_stats: bool,
|
||||
ipc_path: str) -> "MQLLMEngine":
|
||||
# Setup plugins for each process
|
||||
from vllm.plugins import load_general_plugins
|
||||
load_general_plugins()
|
||||
@ -147,7 +136,7 @@ class MQLLMEngine:
|
||||
ipc_path=ipc_path,
|
||||
usage_context=usage_context,
|
||||
use_async_sockets=use_async_sockets,
|
||||
log_requests=enable_log_requests,
|
||||
log_requests=(not disable_log_requests),
|
||||
log_stats=(not disable_log_stats),
|
||||
)
|
||||
|
||||
@ -161,7 +150,7 @@ class MQLLMEngine:
|
||||
ipc_path=ipc_path,
|
||||
vllm_config=vllm_config,
|
||||
usage_context=usage_context,
|
||||
enable_log_requests=engine_args.enable_log_requests,
|
||||
disable_log_requests=engine_args.disable_log_requests,
|
||||
disable_log_stats=engine_args.disable_log_stats,
|
||||
)
|
||||
|
||||
@ -447,7 +436,7 @@ def signal_handler(*_) -> None:
|
||||
|
||||
def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
|
||||
ipc_path: str, disable_log_stats: bool,
|
||||
enable_log_requests: bool, engine_alive):
|
||||
disable_log_requests: bool, engine_alive):
|
||||
try:
|
||||
# Ensure we can serialize transformer config before spawning
|
||||
maybe_register_config_serialize_by_value()
|
||||
@ -456,7 +445,7 @@ def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
|
||||
vllm_config=vllm_config,
|
||||
usage_context=usage_context,
|
||||
disable_log_stats=disable_log_stats,
|
||||
enable_log_requests=enable_log_requests,
|
||||
disable_log_requests=disable_log_requests,
|
||||
ipc_path=ipc_path)
|
||||
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
@ -2,7 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import uvloop
|
||||
@ -16,10 +18,10 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser,
|
||||
validate_parsed_serve_args)
|
||||
from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
|
||||
show_filtered_argument_or_group_from_help)
|
||||
from vllm.executor.multiproc_worker_utils import _add_prefix
|
||||
from vllm.logger import init_logger
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri,
|
||||
set_process_title)
|
||||
from vllm.utils import FlexibleArgumentParser, get_tcp_uri
|
||||
from vllm.v1.engine.core import EngineCoreProc
|
||||
from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
@ -75,7 +77,7 @@ def run_headless(args: argparse.Namespace):
|
||||
|
||||
if args.api_server_count > 1:
|
||||
raise ValueError("api_server_count can't be set in headless mode")
|
||||
|
||||
# set_process_title("Headless_ProcManager")
|
||||
# Create the EngineConfig.
|
||||
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
|
||||
usage_context = UsageContext.OPENAI_API_SERVER
|
||||
@ -140,6 +142,8 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
|
||||
orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
|
||||
|
||||
# set_process_title("ProcManager")
|
||||
|
||||
if num_api_servers > 1:
|
||||
setup_multiprocess_prometheus()
|
||||
|
||||
@ -163,9 +167,8 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
|
||||
if model_config.is_multimodal_model and not (
|
||||
orig_disable_mm_preprocessor_cache):
|
||||
logger.warning(
|
||||
"Multi-modal preprocessor cache is not compatible "
|
||||
"with api_server_count > 1, so the cache will be disabled.")
|
||||
logger.warning("Multi-model preprocessor cache will be disabled "
|
||||
"for api_server_count > 1")
|
||||
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
log_stats = not engine_args.disable_log_stats
|
||||
@ -224,10 +227,12 @@ def run_api_server_worker_proc(listen_address,
|
||||
**uvicorn_kwargs) -> None:
|
||||
"""Entrypoint for individual API server worker processes."""
|
||||
|
||||
# Set process title and add process-specific prefix to stdout and stderr.
|
||||
server_index = client_config.get("client_index", 0) if client_config else 0
|
||||
set_process_title("APIServer", str(server_index))
|
||||
decorate_logs()
|
||||
# Add process-specific prefix to stdout and stderr.
|
||||
from multiprocessing import current_process
|
||||
process_name = current_process().name
|
||||
pid = os.getpid()
|
||||
_add_prefix(sys.stdout, process_name, pid)
|
||||
_add_prefix(sys.stderr, process_name, pid)
|
||||
|
||||
uvloop.run(
|
||||
run_server_worker(listen_address, sock, args, client_config,
|
||||
|
||||
@ -11,6 +11,7 @@ import multiprocessing
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
from argparse import Namespace
|
||||
@ -94,15 +95,15 @@ from vllm.entrypoints.openai.serving_transcription import (
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
|
||||
from vllm.entrypoints.utils import (cli_env_setup, load_aware_call,
|
||||
log_non_default_args, with_cancellation)
|
||||
from vllm.executor.multiproc_worker_utils import _add_prefix
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.transformers_utils.config import (
|
||||
maybe_register_config_serialize_by_value)
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
|
||||
get_open_zmq_ipc_path, is_valid_ipv6_address,
|
||||
set_ulimit)
|
||||
from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
|
||||
is_valid_ipv6_address, set_process_title, set_ulimit)
|
||||
from vllm.v1.metrics.prometheus import get_prometheus_registry
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
@ -199,18 +200,15 @@ async def build_async_engine_client_from_engine_args(
|
||||
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
async_llm: Optional[AsyncLLM] = None
|
||||
client_count = client_config.pop(
|
||||
"client_count") if client_config else 1
|
||||
client_index = client_config.pop(
|
||||
"client_index") if client_config else 0
|
||||
try:
|
||||
async_llm = AsyncLLM.from_vllm_config(
|
||||
vllm_config=vllm_config,
|
||||
usage_context=usage_context,
|
||||
enable_log_requests=engine_args.enable_log_requests,
|
||||
disable_log_requests=engine_args.disable_log_requests,
|
||||
disable_log_stats=engine_args.disable_log_stats,
|
||||
client_addresses=client_config,
|
||||
client_count=client_count,
|
||||
client_index=client_index)
|
||||
|
||||
# Don't keep the dummy data in memory
|
||||
@ -230,7 +228,7 @@ async def build_async_engine_client_from_engine_args(
|
||||
engine_client = AsyncLLMEngine.from_vllm_config(
|
||||
vllm_config=vllm_config,
|
||||
usage_context=usage_context,
|
||||
enable_log_requests=engine_args.enable_log_requests,
|
||||
disable_log_requests=engine_args.disable_log_requests,
|
||||
disable_log_stats=engine_args.disable_log_stats)
|
||||
yield engine_client
|
||||
finally:
|
||||
@ -275,7 +273,7 @@ async def build_async_engine_client_from_engine_args(
|
||||
target=run_mp_engine,
|
||||
args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path,
|
||||
engine_args.disable_log_stats,
|
||||
engine_args.enable_log_requests, engine_alive))
|
||||
engine_args.disable_log_requests, engine_alive))
|
||||
engine_process.start()
|
||||
engine_pid = engine_process.pid
|
||||
assert engine_pid is not None, "Engine process failed to start."
|
||||
@ -1573,10 +1571,10 @@ async def init_app_state(
|
||||
else:
|
||||
served_model_names = [args.model]
|
||||
|
||||
if args.enable_log_requests:
|
||||
request_logger = RequestLogger(max_log_len=args.max_log_len)
|
||||
else:
|
||||
if args.disable_log_requests:
|
||||
request_logger = None
|
||||
else:
|
||||
request_logger = RequestLogger(max_log_len=args.max_log_len)
|
||||
|
||||
base_model_paths = [
|
||||
BaseModelPath(name=name, model_path=args.model)
|
||||
@ -1810,7 +1808,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
|
||||
"""Run a single-worker API server."""
|
||||
|
||||
# Add process-specific prefix to stdout and stderr.
|
||||
decorate_logs("APIServer")
|
||||
process_name = "APIServer"
|
||||
pid = os.getpid()
|
||||
_add_prefix(sys.stdout, process_name, pid)
|
||||
_add_prefix(sys.stderr, process_name, pid)
|
||||
|
||||
listen_address, sock = setup_server(args)
|
||||
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
|
||||
@ -1827,7 +1828,7 @@ async def run_server_worker(listen_address,
|
||||
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
|
||||
|
||||
server_index = client_config.get("client_index", 0) if client_config else 0
|
||||
|
||||
set_process_title("APIServer", str(server_index))
|
||||
# Load logging config for uvicorn if specified
|
||||
log_config = load_log_config(args.log_config_file)
|
||||
if log_config is not None:
|
||||
|
||||
@ -859,15 +859,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
'are supported.'
|
||||
)
|
||||
|
||||
# if tool_choice is "required" but the "tools" list is empty,
|
||||
# override the data to behave like "none" to align with
|
||||
# OpenAI’s behavior.
|
||||
if data["tool_choice"] == "required" and isinstance(
|
||||
data["tools"], list) and len(data["tools"]) == 0:
|
||||
data["tool_choice"] = "none"
|
||||
del data["tools"]
|
||||
return data
|
||||
|
||||
# ensure that if "tool_choice" is specified as an object,
|
||||
# it matches a valid tool
|
||||
correct_usage_message = 'Correct usage: `{"type": "function",' \
|
||||
|
||||
@ -324,10 +324,10 @@ async def run_batch(
|
||||
else:
|
||||
served_model_names = [args.model]
|
||||
|
||||
if args.enable_log_requests:
|
||||
request_logger = RequestLogger(max_log_len=args.max_log_len)
|
||||
else:
|
||||
if args.disable_log_requests:
|
||||
request_logger = None
|
||||
else:
|
||||
request_logger = RequestLogger(max_log_len=args.max_log_len)
|
||||
|
||||
base_model_paths = [
|
||||
BaseModelPath(name=name, model_path=args.model)
|
||||
|
||||
30
vllm/envs.py
30
vllm/envs.py
@ -55,7 +55,6 @@ if TYPE_CHECKING:
|
||||
VLLM_USE_RAY_COMPILED_DAG: bool = False
|
||||
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
|
||||
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
|
||||
VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
|
||||
VLLM_XLA_USE_SPMD: bool = False
|
||||
VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
|
||||
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
|
||||
@ -69,6 +68,7 @@ if TYPE_CHECKING:
|
||||
MAX_JOBS: Optional[str] = None
|
||||
NVCC_THREADS: Optional[str] = None
|
||||
VLLM_USE_PRECOMPILED: bool = False
|
||||
VLLM_DOCKER_BUILD_CONTEXT: bool = False
|
||||
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
|
||||
VLLM_NO_DEPRECATION_WARNING: bool = False
|
||||
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
|
||||
@ -126,7 +126,6 @@ if TYPE_CHECKING:
|
||||
VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
|
||||
VLLM_TPU_USING_PATHWAYS: bool = False
|
||||
VLLM_USE_DEEP_GEMM: bool = False
|
||||
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
|
||||
VLLM_USE_FLASHINFER_MOE_FP8: bool = False
|
||||
VLLM_USE_FLASHINFER_MOE_FP4: bool = False
|
||||
VLLM_XGRAMMAR_CACHE_MB: int = 0
|
||||
@ -214,7 +213,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# Target device of vLLM, supporting [cuda (by default),
|
||||
# rocm, neuron, cpu]
|
||||
"VLLM_TARGET_DEVICE":
|
||||
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
|
||||
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
|
||||
|
||||
# Maximum number of compilation jobs to run in parallel.
|
||||
# By default this is the number of CPUs
|
||||
@ -229,8 +228,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
|
||||
# If set, vllm will use precompiled binaries (*.so)
|
||||
"VLLM_USE_PRECOMPILED":
|
||||
lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
|
||||
os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
|
||||
lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
|
||||
("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
|
||||
|
||||
# Used to mark that setup.py is running in a Docker build context,
|
||||
# in order to force the use of precompiled binaries.
|
||||
"VLLM_DOCKER_BUILD_CONTEXT":
|
||||
lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
|
||||
("1", "true"),
|
||||
|
||||
# Whether to force using nightly wheel in python build.
|
||||
# This is used for testing the nightly wheel in python build.
|
||||
@ -500,13 +505,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
|
||||
),
|
||||
|
||||
# If the env var is set, it uses a Ray Communicator wrapping
|
||||
# vLLM's pipeline parallelism communicator to interact with Ray's
|
||||
# Compiled Graph. Otherwise, it uses Ray's NCCL communicator.
|
||||
# This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
|
||||
"VLLM_USE_RAY_WRAPPED_PP_COMM":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))),
|
||||
|
||||
# Use dedicated multiprocess context for workers.
|
||||
# Both spawn and fork work
|
||||
"VLLM_WORKER_MULTIPROC_METHOD":
|
||||
@ -911,14 +909,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_DEEP_GEMM":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
|
||||
|
||||
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
|
||||
# JIT all the required kernels before model execution so there is no
|
||||
# JIT'ing in the hot-path. However, this warmup increases the engine
|
||||
# startup time by a couple of minutes.
|
||||
# Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup.
|
||||
"VLLM_SKIP_DEEP_GEMM_WARMUP":
|
||||
lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))),
|
||||
|
||||
# Allow use of FlashInfer MoE kernels for fused moe ops.
|
||||
"VLLM_USE_FLASHINFER_MOE_FP8":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),
|
||||
|
||||
@ -3,20 +3,21 @@
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing import Queue
|
||||
from multiprocessing.connection import wait
|
||||
from multiprocessing.process import BaseProcess
|
||||
from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union
|
||||
from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
|
||||
TypeVar, Union)
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context,
|
||||
run_method)
|
||||
from vllm.utils import _maybe_force_spawn, get_mp_context, run_method
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -24,6 +25,10 @@ T = TypeVar('T')
|
||||
|
||||
_TERMINATE = "TERMINATE" # sentinel
|
||||
|
||||
# ANSI color codes
|
||||
CYAN = '\033[1;36m'
|
||||
RESET = '\033[0;0m'
|
||||
|
||||
JOIN_TIMEOUT_S = 2
|
||||
|
||||
|
||||
@ -208,7 +213,9 @@ def _run_worker_process(
|
||||
|
||||
# Add process-specific prefix to stdout and stderr
|
||||
process_name = get_mp_context().current_process().name
|
||||
decorate_logs(process_name)
|
||||
pid = os.getpid()
|
||||
_add_prefix(sys.stdout, process_name, pid)
|
||||
_add_prefix(sys.stderr, process_name, pid)
|
||||
|
||||
# Initialize worker
|
||||
worker = worker_factory(vllm_config, rank)
|
||||
@ -253,6 +260,33 @@ def _run_worker_process(
|
||||
logger.info("Worker exiting")
|
||||
|
||||
|
||||
def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
|
||||
"""Prepend each output line with process-specific prefix"""
|
||||
|
||||
prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
|
||||
file_write = file.write
|
||||
|
||||
def write_with_prefix(s: str):
|
||||
if not s:
|
||||
return
|
||||
if file.start_new_line: # type: ignore[attr-defined]
|
||||
file_write(prefix)
|
||||
idx = 0
|
||||
while (next_idx := s.find('\n', idx)) != -1:
|
||||
next_idx += 1
|
||||
file_write(s[idx:next_idx])
|
||||
if next_idx == len(s):
|
||||
file.start_new_line = True # type: ignore[attr-defined]
|
||||
return
|
||||
file_write(prefix)
|
||||
idx = next_idx
|
||||
file_write(s[idx:])
|
||||
file.start_new_line = False # type: ignore[attr-defined]
|
||||
|
||||
file.start_new_line = True # type: ignore[attr-defined]
|
||||
file.write = write_with_prefix # type: ignore[method-assign]
|
||||
|
||||
|
||||
def set_multiprocessing_worker_envs(parallel_config):
|
||||
""" Set up environment variables that should be used when there are workers
|
||||
in a multiprocessing environment. This should be called by the parent
|
||||
|
||||
@ -608,21 +608,6 @@ class RayDistributedExecutor(DistributedExecutorBase):
|
||||
|
||||
forward_dag = MultiOutputNode(outputs)
|
||||
|
||||
if envs.VLLM_USE_RAY_WRAPPED_PP_COMM:
|
||||
from ray.experimental.channel.accelerator_context import (
|
||||
register_accelerator_context)
|
||||
|
||||
from vllm.distributed.device_communicators.ray_communicator import (
|
||||
RayPPCommunicator)
|
||||
register_accelerator_context(torch_module_name="cuda",
|
||||
communicator_cls=RayPPCommunicator)
|
||||
logger.info("Using RayPPCommunicator "
|
||||
"(which wraps vLLM _PP GroupCoordinator) "
|
||||
"for Ray Compiled Graph communication.")
|
||||
else:
|
||||
logger.info("Using Ray's NCCL communicator for "
|
||||
"Ray Compiled Graph communication.")
|
||||
|
||||
return forward_dag.experimental_compile(
|
||||
enable_asyncio=enable_asyncio,
|
||||
_overlap_gpu_communication=envs.
|
||||
|
||||
@ -11,7 +11,7 @@ from typing_extensions import TypeVar
|
||||
from vllm.jsontree import JSONTree, json_map_leaves
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||
from vllm.utils import get_allowed_kwarg_only_overrides
|
||||
from vllm.utils import resolve_mm_processor_kwargs
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
@ -154,11 +154,14 @@ class InputProcessingContext(InputContext):
|
||||
assert callable(hf_processor)
|
||||
|
||||
mm_config = self.model_config.get_multimodal_config()
|
||||
merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
|
||||
base_kwargs = mm_config.mm_processor_kwargs
|
||||
if base_kwargs is None:
|
||||
base_kwargs = {}
|
||||
|
||||
allowed_kwargs = get_allowed_kwarg_only_overrides(
|
||||
merged_kwargs = resolve_mm_processor_kwargs(
|
||||
base_kwargs,
|
||||
kwargs,
|
||||
hf_processor,
|
||||
merged_kwargs,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
@ -170,9 +173,7 @@ class InputProcessingContext(InputContext):
|
||||
return x
|
||||
|
||||
try:
|
||||
output = hf_processor(**data,
|
||||
**allowed_kwargs,
|
||||
return_tensors="pt")
|
||||
output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
|
||||
# this emulates output.to(dtype=self.model_config.dtype)
|
||||
if isinstance(output, BatchFeature):
|
||||
cast_output = json_map_leaves(maybe_cast_dtype, output.data)
|
||||
@ -188,7 +189,7 @@ class InputProcessingContext(InputContext):
|
||||
|
||||
except Exception as exc:
|
||||
msg = (f"Failed to apply {type(hf_processor).__name__} "
|
||||
f"on data={data} with kwargs={allowed_kwargs}")
|
||||
f"on data={data} with kwargs={merged_kwargs}")
|
||||
|
||||
raise ValueError(msg) from exc
|
||||
|
||||
|
||||
@ -682,14 +682,12 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
|
||||
def slice_lora_b(
|
||||
self, lora_b: list[Union[torch.Tensor, None]]
|
||||
) -> list[Union[torch.Tensor, None]]:
|
||||
sliced_lora_b = [None] * self.n_slices
|
||||
for i, (shard_id, shard_size) in enumerate(
|
||||
zip(self.output_ids, self.output_slices)):
|
||||
if (lora_b_i := lora_b[i]) is not None:
|
||||
sliced_lora_b[i] = lora_b_i[:,
|
||||
shard_size * shard_id:shard_size *
|
||||
(shard_id + 1)]
|
||||
return sliced_lora_b
|
||||
lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size *
|
||||
(shard_id + 1)]
|
||||
return lora_b
|
||||
|
||||
def slice_bias(
|
||||
self, bias: list[Union[torch.Tensor,
|
||||
|
||||
@ -4,9 +4,7 @@ import functools
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
import vllm.envs as env
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||
@ -19,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
|
||||
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8)
|
||||
from vllm.utils import has_deep_gemm, run_once
|
||||
from vllm.utils import has_deep_gemm
|
||||
from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -84,65 +82,6 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
|
||||
return True
|
||||
|
||||
|
||||
@run_once
|
||||
def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor,
|
||||
w1_scale: torch.Tensor,
|
||||
w2_scale: torch.Tensor,
|
||||
num_topk: int):
|
||||
"""
|
||||
DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the
|
||||
input tensor shapes. In this function, we construct all possible input
|
||||
tensor shapes so all the kernels are JIT'ed and cached.
|
||||
Note that this warmup is expected to happen during the model profile
|
||||
call and not during actual model inference.
|
||||
"""
|
||||
|
||||
assert w1.size(0) == w2.size(0), (
|
||||
"w1 and w2 must have the same number of experts")
|
||||
|
||||
block_m = deep_gemm_block_shape()[0]
|
||||
num_experts = w1.size(0)
|
||||
device = w1.device
|
||||
|
||||
# This is the maximum GroupedGemm M size that we expect to run
|
||||
# the grouped_gemm with.
|
||||
MAX_M = compute_aligned_M(env.VLLM_FUSED_MOE_CHUNK_SIZE,
|
||||
num_topk,
|
||||
num_experts,
|
||||
block_m,
|
||||
expert_tokens_meta=None)
|
||||
# Distribute expert-ids evenly.
|
||||
MAX_BLOCKS = MAX_M // block_m
|
||||
expert_ids_block = torch.randint(low=0,
|
||||
high=num_experts,
|
||||
size=(MAX_BLOCKS, ),
|
||||
device=device,
|
||||
dtype=torch.int32)
|
||||
expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0)
|
||||
|
||||
def _warmup(w: torch.Tensor, w_scale: torch.Tensor):
|
||||
|
||||
_, n, k = w.size()
|
||||
a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn)
|
||||
a1q_scales = torch.empty((MAX_M, k // block_m),
|
||||
device=device,
|
||||
dtype=torch.float32)
|
||||
out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16)
|
||||
|
||||
pbar = tqdm(total=MAX_BLOCKS,
|
||||
desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})")
|
||||
num_tokens = MAX_M
|
||||
while num_tokens > 0:
|
||||
m_grouped_fp8_gemm_nt_contiguous(
|
||||
(a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale),
|
||||
out[:num_tokens], expert_ids[:num_tokens])
|
||||
pbar.update(1)
|
||||
num_tokens = num_tokens - block_m
|
||||
|
||||
_warmup(w1, w1_scale)
|
||||
_warmup(w2, w2_scale)
|
||||
|
||||
|
||||
class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
|
||||
def __init__(self):
|
||||
@ -217,20 +156,6 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
):
|
||||
assert self.block_shape is not None
|
||||
assert a1q_scale is not None
|
||||
assert w1_scale is not None
|
||||
assert w2_scale is not None
|
||||
|
||||
if not env.VLLM_SKIP_DEEP_GEMM_WARMUP:
|
||||
# DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing
|
||||
# to happen during actual model-inference. The
|
||||
# `warmup_deepgemm_kernels` function is a `run_once` decorated
|
||||
# function that executes during the model profile run. This warmup
|
||||
# should create all the required JITs for the current model.
|
||||
warmup_deepgemm_gg_contiguous_kernels(w1,
|
||||
w2,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
num_topk=topk_ids.size(1))
|
||||
|
||||
a1q = hidden_states
|
||||
_, N, K = w1.size()
|
||||
|
||||
@ -144,13 +144,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
"apply_router_weight_on_input is only implemented for topk=1")
|
||||
a1 = a1 * topk_weights.to(a1.dtype)
|
||||
|
||||
if quant_config.is_block_quantized:
|
||||
# Quant and Dispatch
|
||||
if quant_config.per_act_token_quant:
|
||||
a1q, a1q_scale = moe_kernel_quantize_input(
|
||||
a1,
|
||||
a1_scale,
|
||||
quant_dtype=quant_config.quant_dtype,
|
||||
per_act_token_quant=quant_config.per_act_token_quant,
|
||||
per_act_token_quant=True,
|
||||
block_shape=quant_config.block_shape,
|
||||
)
|
||||
if a1q_scale is not None and a1q_scale.numel() == 1:
|
||||
@ -163,10 +162,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
rank_topk_weights=topk_weights,
|
||||
num_experts=num_experts)
|
||||
else:
|
||||
# Dispatch and Quant
|
||||
# DeepEP kernels only support dispatching block-quantized
|
||||
# activation scales.
|
||||
# Dispatch in bfloat16
|
||||
# DeepEP kernels only support dispatching per-token-quant
|
||||
# quantization. dispatch in bfloat16.
|
||||
(expert_x, _, expert_tokens_meta, expert_topk_ids,
|
||||
expert_topk_weights) = self._do_dispatch(
|
||||
tokens=a1,
|
||||
@ -174,7 +171,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
rank_topk_ids=topk_ids,
|
||||
rank_topk_weights=topk_weights,
|
||||
num_experts=num_experts)
|
||||
# Quantize after dispatch.
|
||||
# quantize now
|
||||
expert_x_scale = None
|
||||
if expert_x.numel() != 0:
|
||||
expert_x, expert_x_scale = moe_kernel_quantize_input(
|
||||
|
||||
@ -761,8 +761,8 @@ def get_moe_wna16_block_config(config: dict[str,
|
||||
|
||||
def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
|
||||
num_experts: int, bit: int):
|
||||
return current_platform.is_cuda() and bit == 4 and \
|
||||
group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6
|
||||
return bit == 4 and group_size in [32, 64, 128] and \
|
||||
num_valid_tokens / num_experts <= 6
|
||||
|
||||
|
||||
def get_default_config(
|
||||
|
||||
@ -17,14 +17,9 @@ from vllm.model_executor.layers.fused_moe import (
|
||||
FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
|
||||
FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
|
||||
FusedMoeWeightScaleSupported)
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa
|
||||
FlashInferCutlassMoEPrepareAndFinalize)
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa
|
||||
WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
|
||||
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
|
||||
build_flashinfer_fp4_cutlass_moe_kernel,
|
||||
flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
check_moe_marlin_supports_layer, marlin_make_workspace_new,
|
||||
marlin_moe_permute_scales)
|
||||
@ -33,7 +28,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
|
||||
prepare_moe_fp8_layer_for_marlin)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
swizzle_blockscale)
|
||||
cutlass_fp4_supported, swizzle_blockscale)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
@ -101,14 +96,8 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
|
||||
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
|
||||
def __init__(self):
|
||||
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
|
||||
detect_nvfp4_moe_support)
|
||||
_nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
|
||||
self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
|
||||
self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
|
||||
self.use_marlin = _nvfp4.use_marlin
|
||||
self.use_marlin = not cutlass_fp4_supported()
|
||||
self.group_size = 16
|
||||
self.fused_experts = None # type: ignore[assignment]
|
||||
|
||||
def create_weights(self, layer: torch.nn.Module, num_experts: int,
|
||||
hidden_size: int, intermediate_size_per_partition: int,
|
||||
@ -211,14 +200,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
layer.w2_weight = torch.nn.Parameter(layer.w2_weight_packed.data,
|
||||
requires_grad=False)
|
||||
|
||||
# reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
|
||||
if self.allow_flashinfer_cutlass:
|
||||
w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data,
|
||||
layer.w13_weight_scale.data,
|
||||
dim=-2)
|
||||
layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
|
||||
layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)
|
||||
|
||||
if not torch.allclose(layer.w13_weight_global_scale[:, 0],
|
||||
layer.w13_weight_global_scale[:, 1]):
|
||||
logger.warning_once(
|
||||
@ -265,21 +246,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
layer.w2_input_scale_quant = torch.nn.Parameter(
|
||||
(layer.w2_input_global_scale), requires_grad=False)
|
||||
|
||||
def maybe_swap_experts_impl(self, moe_parallel_config):
|
||||
if not self.allow_flashinfer_cutlass:
|
||||
return
|
||||
self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
|
||||
moe_parallel_config)
|
||||
|
||||
def select_gemm_impl(self, prepare_finalize, moe):
|
||||
"""Return the appropriate GEMM experts implementation."""
|
||||
assert moe is not None and prepare_finalize is not None
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501
|
||||
select_nvfp4_gemm_impl)
|
||||
|
||||
return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
|
||||
logger)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
@ -337,23 +303,10 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
global_num_experts=global_num_experts,
|
||||
expert_map=expert_map)
|
||||
|
||||
# FlashInfer fused experts path
|
||||
if self.fused_experts is not None:
|
||||
return flashinfer_fp4_cutlass_moe_forward(
|
||||
self.fused_experts,
|
||||
layer,
|
||||
x,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
activation=activation,
|
||||
global_num_experts=global_num_experts,
|
||||
expert_map=expert_map,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
)
|
||||
|
||||
assert expert_map is None, ("Expert Parallelism / expert_map "
|
||||
"is currently not supported for "
|
||||
"CompressedTensorsW4A4MoeMethod.")
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
|
||||
cutlass_moe_fp4)
|
||||
|
||||
|
||||
@ -10,11 +10,10 @@ import torch
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||
from vllm.model_executor.layers.linear import LinearMethodBase
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig, QuantizeMethodBase)
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
|
||||
get_linear_quant_method)
|
||||
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
|
||||
@ -111,23 +110,8 @@ class GPTQConfig(QuantizationConfig):
|
||||
return cls(weight_bits, group_size, desc_act, lm_head_quantized,
|
||||
dynamic)
|
||||
|
||||
def get_quant_method(
|
||||
self, layer: torch.nn.Module, prefix: str
|
||||
) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]:
|
||||
if isinstance(layer, FusedMoE):
|
||||
# GPTQ MoE support: fall back to MoeWNA16 for broad compatibility
|
||||
from .moe_wna16 import MoeWNA16Config
|
||||
|
||||
config = {
|
||||
"quant_method": "gptq",
|
||||
"bits": self.weight_bits,
|
||||
"group_size": self.group_size,
|
||||
"sym": True, # GPTQ typically uses symmetric quantization
|
||||
"lm_head": False,
|
||||
}
|
||||
return MoeWNA16Config.from_config(config).get_quant_method(
|
||||
layer, prefix)
|
||||
|
||||
def get_quant_method(self, layer: torch.nn.Module,
|
||||
prefix: str) -> Optional["GPTQLinearMethod"]:
|
||||
return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)
|
||||
|
||||
|
||||
|
||||
@ -10,8 +10,11 @@ from torch.nn.parameter import Parameter
|
||||
import vllm.envs as envs
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
from vllm.distributed import get_ep_group
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
|
||||
FlashInferCutlassMoEPrepareAndFinalize)
|
||||
from vllm.model_executor.layers.fused_moe.layer import (
|
||||
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
|
||||
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
||||
@ -20,9 +23,6 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig, QuantizeMethodBase)
|
||||
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
|
||||
build_flashinfer_fp4_cutlass_moe_kernel,
|
||||
flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1)
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
|
||||
apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights,
|
||||
swap_w13_to_w31)
|
||||
@ -35,6 +35,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
Fp8LinearOp, requantize_with_max_scale)
|
||||
from vllm.model_executor.parameter import (ModelWeightParameter,
|
||||
PerTensorScaleParameter)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import scalar_types
|
||||
from vllm.utils.flashinfer import has_flashinfer_moe
|
||||
|
||||
@ -868,12 +869,28 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
|
||||
def __init__(self, quant_config: ModelOptNvFp4Config):
|
||||
self.quant_config = quant_config
|
||||
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
|
||||
detect_nvfp4_moe_support)
|
||||
_nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
|
||||
self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
|
||||
self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
|
||||
self.use_marlin = _nvfp4.use_marlin
|
||||
self.cutlass_nvfp4_supported = cutlass_fp4_supported()
|
||||
self.use_marlin = False
|
||||
self.allow_flashinfer_cutlass = False
|
||||
|
||||
if envs.VLLM_USE_FLASHINFER_MOE_FP4:
|
||||
if self.cutlass_nvfp4_supported and current_platform.is_cuda() \
|
||||
and current_platform.is_device_capability(100):
|
||||
logger.info_once(
|
||||
"Using FlashInfer kernels for ModelOptNvFp4FusedMoE.")
|
||||
self.allow_flashinfer_cutlass = True
|
||||
else:
|
||||
logger.warning_once(
|
||||
"Flashinfer CUTLASS Fused MoE not supported "
|
||||
"or found on the current platform.")
|
||||
|
||||
if not self.cutlass_nvfp4_supported:
|
||||
if is_fp4_marlin_supported():
|
||||
self.use_marlin = True
|
||||
else:
|
||||
raise ValueError("Current platform does not support NVFP4"
|
||||
" quantization. Please use Blackwell and"
|
||||
" above.")
|
||||
|
||||
self.fused_experts = None # type: ignore
|
||||
|
||||
@ -883,8 +900,29 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
):
|
||||
if not self.allow_flashinfer_cutlass:
|
||||
return
|
||||
self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
|
||||
moe_parallel_config)
|
||||
|
||||
logger.debug_once("FlashInferExperts")
|
||||
# default to TP/EP case only
|
||||
|
||||
experts_kwargs: dict[str, Any] = {
|
||||
"use_nvfp4_w4a4": True,
|
||||
"use_dp": moe_parallel_config.dp_size > 1,
|
||||
"ep_rank": moe_parallel_config.ep_rank,
|
||||
"ep_size": moe_parallel_config.ep_size,
|
||||
"tp_rank": moe_parallel_config.tp_rank,
|
||||
"tp_size": moe_parallel_config.tp_size,
|
||||
}
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
|
||||
FlashInferExperts)
|
||||
experts = FlashInferExperts(**experts_kwargs)
|
||||
self.fused_experts = mk.FusedMoEModularKernel(
|
||||
FlashInferCutlassMoEPrepareAndFinalize(
|
||||
quant_dtype=torch.uint8,
|
||||
#meaning 2x e2m1 packed in one, kernel requirement
|
||||
),
|
||||
experts,
|
||||
)
|
||||
|
||||
# This method updates self.fused_experts.
|
||||
# select_gemm_impl is called only when prepare_finalize is not None.
|
||||
@ -893,12 +931,32 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
def select_gemm_impl(self, prepare_finalize,
|
||||
moe) -> mk.FusedMoEPermuteExpertsUnpermute:
|
||||
|
||||
assert moe is not None and prepare_finalize is not None
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501
|
||||
select_nvfp4_gemm_impl)
|
||||
assert moe is not None
|
||||
assert prepare_finalize is not None
|
||||
experts = None
|
||||
all2all_manager = get_ep_group().device_communicator.all2all_manager
|
||||
assert all2all_manager is not None
|
||||
if self.allow_flashinfer_cutlass:
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
|
||||
FlashInferExperts)
|
||||
logger.debug_once("Using FlashInferExperts")
|
||||
experts = FlashInferExperts(
|
||||
use_nvfp4_w4a4=True,
|
||||
use_dp=moe.moe_parallel_config.dp_size > 1,
|
||||
ep_rank=moe.moe_parallel_config.ep_rank,
|
||||
ep_size=moe.moe_parallel_config.ep_size,
|
||||
tp_rank=moe.moe_parallel_config.tp_rank,
|
||||
tp_size=moe.moe_parallel_config.tp_size,
|
||||
)
|
||||
else:
|
||||
assert moe.dp_size > 1
|
||||
logger.debug_once("Using CutlassExpertsFp4")
|
||||
# Currently CutlassExpertsFp4 doesn't support DP
|
||||
raise ValueError("CutlassExpertsFp4 doesn't support DP. "
|
||||
"Use flashinfer CUTLASS FusedMoE backend instead "
|
||||
"(set VLLM_USE_FLASHINFER_MOE_FP4=1)")
|
||||
|
||||
return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
|
||||
logger)
|
||||
return experts
|
||||
|
||||
def uses_weight_scale_2_pattern(self) -> bool:
|
||||
"""
|
||||
@ -1004,8 +1062,18 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
gemm1_weight_scale = layer.w13_weight_scale.data
|
||||
|
||||
if self.allow_flashinfer_cutlass:
|
||||
gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1(
|
||||
gemm1_weight, gemm1_weight_scale, dim=-2)
|
||||
dim = -2
|
||||
size = gemm1_weight.size(dim)
|
||||
assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
|
||||
half = size // 2
|
||||
|
||||
# Reorder weight
|
||||
w1, w3 = gemm1_weight.split(half, dim=dim)
|
||||
gemm1_weight = torch.cat([w3, w1], dim=dim).contiguous()
|
||||
|
||||
# Reorder scale
|
||||
s1, s3 = gemm1_weight_scale.split(half, dim=dim)
|
||||
gemm1_weight_scale = torch.cat([s3, s1], dim=dim).contiguous()
|
||||
|
||||
layer.w13_weight = Parameter(gemm1_weight, requires_grad=False)
|
||||
layer.w13_weight_scale = Parameter(gemm1_weight_scale,
|
||||
@ -1149,15 +1217,49 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
expert_map=expert_map,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input)
|
||||
else:
|
||||
out = flashinfer_fp4_cutlass_moe_forward(
|
||||
self.fused_experts,
|
||||
layer,
|
||||
x,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
# TP or DP case
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
|
||||
is_valid_flashinfer_cutlass_fused_moe)
|
||||
assert is_valid_flashinfer_cutlass_fused_moe(
|
||||
x, layer.w13_weight, layer.w2_weight), (
|
||||
"Flashinfer CUTLASS Fused MoE not applicable!")
|
||||
|
||||
a1_gscale = layer.w13_input_scale_quant
|
||||
a2_gscale = layer.w2_input_scale_quant
|
||||
extra_expert_args = {
|
||||
'g1_alphas': layer.g1_alphas,
|
||||
'g2_alphas': layer.g2_alphas,
|
||||
'out_dtype': x.dtype,
|
||||
# Avoid confusion with a1_scale and a2_scale
|
||||
# where are batch size related.
|
||||
'a1_gscale': a1_gscale,
|
||||
'a2_gscale': a2_gscale,
|
||||
}
|
||||
extra_prepare_args = {
|
||||
'use_dp': layer.dp_size > 1,
|
||||
'local_tokens': x.shape[0],
|
||||
'a1_gscale': a1_gscale,
|
||||
}
|
||||
extra_finalize_args = {
|
||||
'use_dp': layer.dp_size > 1,
|
||||
'local_tokens': x.shape[0],
|
||||
}
|
||||
|
||||
out = self.fused_experts(
|
||||
hidden_states=x,
|
||||
w1=layer.w13_weight,
|
||||
w2=layer.w2_weight,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
inplace=False, # TODO(shuw): fix later, now output is high prec
|
||||
activation=activation,
|
||||
global_num_experts=global_num_experts,
|
||||
expert_map=expert_map,
|
||||
w1_scale=layer.w13_blockscale_swizzled,
|
||||
w2_scale=layer.w2_blockscale_swizzled,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
extra_expert_args=extra_expert_args,
|
||||
extra_prepare_args=extra_prepare_args,
|
||||
extra_finalize_args=extra_finalize_args,
|
||||
)
|
||||
return out
|
||||
|
||||
@ -1,154 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Utility helpers for NVFP4 + FlashInfer fused-MoE path"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
|
||||
FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe)
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
|
||||
FlashInferCutlassMoEPrepareAndFinalize)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
__all__ = [
|
||||
"is_flashinfer_fp4_cutlass_moe_available",
|
||||
"reorder_w1w3_to_w3w1",
|
||||
"build_flashinfer_fp4_cutlass_moe_kernel",
|
||||
"flashinfer_fp4_cutlass_moe_forward",
|
||||
]
|
||||
|
||||
|
||||
def is_flashinfer_fp4_cutlass_moe_available() -> bool:
    """Tell whether the FlashInfer CUTLASS NV-FP4 MoE path may be used.

    The result is truthy only when all three checks pass, evaluated in this
    order and stopping at the first failure:

    1. ``envs.VLLM_USE_FLASHINFER_MOE_FP4`` is set,
    2. ``current_platform.is_cuda()`` holds,
    3. ``current_platform.is_device_capability(100)`` holds.
    """
    opted_in = envs.VLLM_USE_FLASHINFER_MOE_FP4
    if not opted_in:
        return opted_in

    on_cuda = current_platform.is_cuda()
    if not on_cuda:
        return on_cuda

    return current_platform.is_device_capability(100)
|
||||
|
||||
|
||||
def reorder_w1w3_to_w3w1(weight: torch.Tensor,
                         scale: torch.Tensor,
                         dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]:
    """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`.

    Both `weight` and `scale` are cut into two equal halves along `dim`
    and the halves are swapped.

    Args:
        weight: Tensor holding `[w1, w3]` concatenated along `dim`.
        scale: Tensor holding the matching `[s1, s3]` concatenated along
            `dim`; it must have the same size as `weight` along `dim`.
        dim: Dimension along which the two halves are concatenated.

    Returns:
        A `(weight, scale)` tuple of new contiguous tensors with the two
        halves swapped along `dim`.

    Raises:
        AssertionError: If the size of `weight` along `dim` is odd, or if
            `scale` has a different size along `dim` than `weight`.
    """
    size = weight.size(dim)
    assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"

    # `scale` is split with the half-size derived from `weight`; without this
    # check a mismatched scale could be silently cut into unequal pieces.
    scale_size = scale.size(dim)
    assert scale_size == size, (
        f"Expected scale size {size} in dim {dim}, got {scale_size}")

    half = size // 2

    w1, w3 = weight.split(half, dim=dim)
    s1, s3 = scale.split(half, dim=dim)

    reordered_weight = torch.cat([w3, w1], dim=dim).contiguous()
    reordered_scale = torch.cat([s3, s1], dim=dim).contiguous()
    return reordered_weight, reordered_scale
|
||||
|
||||
|
||||
def build_flashinfer_fp4_cutlass_moe_kernel(
    moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel:
    """Assemble the FlashInfer CUTLASS fused-MoE modular kernel.

    A `FlashInferExperts` instance is configured from the DP/EP/TP fields of
    `moe_parallel_config` and paired with a
    `FlashInferCutlassMoEPrepareAndFinalize` created with
    `quant_dtype=torch.uint8`.

    Args:
        moe_parallel_config: Source of `dp_size`, `ep_rank`, `ep_size`,
            `tp_rank` and `tp_size`.

    Returns:
        The `mk.FusedMoEModularKernel` wrapping both components.
    """
    cfg = moe_parallel_config
    flashinfer_experts = FlashInferExperts(
        use_nvfp4_w4a4=True,
        use_dp=cfg.dp_size > 1,
        ep_rank=cfg.ep_rank,
        ep_size=cfg.ep_size,
        tp_rank=cfg.tp_rank,
        tp_size=cfg.tp_size,
    )
    logger.debug_once("FlashInferExperts (util)")

    prepare_finalize = FlashInferCutlassMoEPrepareAndFinalize(
        quant_dtype=torch.uint8)
    return mk.FusedMoEModularKernel(prepare_finalize, flashinfer_experts)
|
||||
|
||||
|
||||
def flashinfer_fp4_cutlass_moe_forward(
    fused_experts: mk.FusedMoEModularKernel,
    layer: torch.nn.Module,
    x: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: Optional[torch.Tensor],
    apply_router_weight_on_input: bool,
) -> torch.Tensor:
    """Run the FlashInfer NV-FP4 fused-MoE kernel for one layer.

    The call is first checked with `is_valid_flashinfer_cutlass_fused_moe`.
    The weights, block scales, alphas and input scales are then read from
    attributes of `layer` and handed to `fused_experts` together with the
    routing arguments.

    Args:
        fused_experts: Modular kernel that performs the computation.
        layer: Module providing `w13_weight`, `w2_weight`,
            `w13_blockscale_swizzled`, `w2_blockscale_swizzled`,
            `g1_alphas`, `g2_alphas`, `w13_input_scale_quant`,
            `w2_input_scale_quant` and `dp_size`.
        x: Hidden states passed to the kernel.
        topk_weights: Forwarded unchanged to `fused_experts`.
        topk_ids: Forwarded unchanged to `fused_experts`.
        activation: Forwarded unchanged to `fused_experts`.
        global_num_experts: Forwarded unchanged to `fused_experts`.
        expert_map: Forwarded unchanged to `fused_experts`.
        apply_router_weight_on_input: Forwarded unchanged to
            `fused_experts`.

    Returns:
        Whatever `fused_experts` returns.

    Raises:
        AssertionError: If the validity check rejects the inputs.
    """
    applicable = is_valid_flashinfer_cutlass_fused_moe(
        x, layer.w13_weight, layer.w2_weight)
    assert applicable, "FlashInfer CUTLASS fused-MoE not applicable!"

    a1_gscale = layer.w13_input_scale_quant
    a2_gscale = layer.w2_input_scale_quant

    # Named *_gscale to avoid confusion with a1_scale and a2_scale, which
    # are batch-size related.
    extra_expert_args = dict(
        g1_alphas=layer.g1_alphas,
        g2_alphas=layer.g2_alphas,
        a1_gscale=a1_gscale,
        a2_gscale=a2_gscale,
        out_dtype=x.dtype,
    )

    # The prepare step takes the same two entries as finalize, plus the
    # first input scale.
    extra_finalize_args = dict(
        use_dp=layer.dp_size > 1,
        local_tokens=x.shape[0],
    )
    extra_prepare_args = {**extra_finalize_args, "a1_gscale": a1_gscale}

    result = fused_experts(
        hidden_states=x,
        w1=layer.w13_weight,
        w2=layer.w2_weight,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=False,  # TODO(shuw): fix later, now output is high prec
        activation=activation,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        w1_scale=layer.w13_blockscale_swizzled,
        w2_scale=layer.w2_blockscale_swizzled,
        apply_router_weight_on_input=apply_router_weight_on_input,
        extra_expert_args=extra_expert_args,
        extra_prepare_args=extra_prepare_args,
        extra_finalize_args=extra_finalize_args,
    )
    return result
|
||||
|
||||
|
||||
def select_nvfp4_gemm_impl(
        allow_flashinfer_cutlass: bool,
        moe,  # FusedMoEConfig
        logger):
    """Pick the GEMM experts implementation for NV-FP4 fused-MoE layers.

    Args:
        allow_flashinfer_cutlass: Whether the FlashInfer CUTLASS backend may
            be used.
        moe: Config object exposing `moe_parallel_config`.
        logger: Logger used for the debug message.

    Returns:
        A `FlashInferExperts` instance built from `moe.moe_parallel_config`.

    Raises:
        AssertionError: If the EP group's device communicator has no
            all2all manager.
        ValueError: If `allow_flashinfer_cutlass` is false.
    """
    # Imported here rather than at module level (lazy import).
    from vllm.distributed import get_ep_group

    ep_communicator = get_ep_group().device_communicator
    assert ep_communicator.all2all_manager is not None

    if not allow_flashinfer_cutlass:
        # Native cutlass experts currently don't support DP; the TP case
        # won't call this.
        raise ValueError(
            "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS "
            "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)")

    logger.debug_once("Using FlashInferExperts")
    parallel_cfg = moe.moe_parallel_config
    return FlashInferExperts(
        use_nvfp4_w4a4=True,
        use_dp=parallel_cfg.dp_size > 1,
        ep_rank=parallel_cfg.ep_rank,
        ep_size=parallel_cfg.ep_size,
        tp_rank=parallel_cfg.tp_rank,
        tp_size=parallel_cfg.tp_size,
    )
|
||||
@ -1,59 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
|
||||
is_flashinfer_fp4_cutlass_moe_available)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
|
||||
is_fp4_marlin_supported)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
cutlass_fp4_supported)
|
||||
|
||||
__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"]
|
||||
|
||||
_logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class NvFp4Support:
    """Immutable record of the NV-FP4 fused-MoE capability probe."""

    # Whether the CUTLASS FP4 path was reported as supported.
    cutlass_supported: bool
    # Whether the FlashInfer CUTLASS backend may be used.
    allow_flashinfer_cutlass: bool
    # Whether the Marlin FP4 kernel should be used.
    use_marlin: bool
|
||||
|
||||
|
||||
def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
    """Probe which NV-FP4 fused-MoE backends can be used.

    Args:
        class_name: Name used in log messages; "NVFP4 path" is used when it
            is empty.

    Returns:
        An `NvFp4Support` carrying the CUTLASS, FlashInfer and Marlin
        decisions.

    Raises:
        ValueError: If CUTLASS FP4 is unsupported and the Marlin FP4 check
            also fails.
    """
    label = class_name or "NVFP4 path"

    has_cutlass = cutlass_fp4_supported()
    # FlashInfer is only considered when CUTLASS FP4 itself is supported.
    flashinfer_ok = (has_cutlass
                     and is_flashinfer_fp4_cutlass_moe_available())

    if flashinfer_ok:
        _logger.info_once("Using FlashInfer kernels for %s.", label)
    elif envs.VLLM_USE_FLASHINFER_MOE_FP4:
        # FlashInfer was requested through the env flag but cannot be used.
        _logger.warning_once(
            "FlashInfer kernels unavailable for %s on current platform.",
            label,
        )

    marlin_fallback = False
    if not has_cutlass:
        if not is_fp4_marlin_supported():
            raise ValueError(
                "Current platform does not support NVFP4 quantization. "
                "Please use Blackwell GPUs or enable FlashInfer.")
        marlin_fallback = True
        _logger.info_once("Falling back to Marlin FP4 MoE kernel.")

    return NvFp4Support(
        cutlass_supported=has_cutlass,
        allow_flashinfer_cutlass=flashinfer_ok,
        use_marlin=marlin_fallback,
    )
|
||||
@ -34,8 +34,7 @@ from vllm.model_executor.model_loader.weight_utils import (
|
||||
filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
|
||||
pt_weights_iterator, safetensors_weights_iterator)
|
||||
from vllm.model_executor.models import is_pooling_model
|
||||
from vllm.model_executor.utils import (get_moe_expert_mapping,
|
||||
get_packed_modules_mapping,
|
||||
from vllm.model_executor.utils import (get_packed_modules_mapping,
|
||||
set_weight_attrs)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -44,12 +43,6 @@ from vllm.platforms import current_platform
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def is_moe_model(model: torch.nn.Module) -> bool:
    """Return True when at least one module of `model` is a FusedMoE."""
    for submodule in model.modules():
        if isinstance(submodule, FusedMoE):
            return True
    return False
|
||||
|
||||
|
||||
class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
"""Model loader to load model weights with BitAndBytes quantization."""
|
||||
|
||||
@ -68,8 +61,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
# Store all module names (from transformers) that support
|
||||
# BNB quantization.
|
||||
self.target_modules: list[str] = []
|
||||
# Store the mapping of expert parameters for MoE models.
|
||||
self.expert_params_mapping: list[tuple[str, str, int, str]] = []
|
||||
# mapping weight names from transformers to vllm.
|
||||
self.weight_mapper: Callable = lambda name: name
|
||||
self.pre_quant: bool = False
|
||||
@ -422,8 +413,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
# in case model has a mixture of disk-merged and disk-split
|
||||
# weights with same last name.
|
||||
self.target_modules.append(name)
|
||||
elif isinstance(module, FusedMoE) and hasattr(
|
||||
module.quant_method, "quant_config"):
|
||||
elif (isinstance(module, FusedMoE)
|
||||
and hasattr(module.quant_method, "quant_config")):
|
||||
if not hasattr(model, "get_expert_mapping"):
|
||||
raise AttributeError(
|
||||
f"MoE Model {type(model).__name__} does not support "
|
||||
"BitsAndBytes quantization yet. Ensure this model has "
|
||||
"'get_expert_mapping' method.")
|
||||
# TODO: support FusedMoE with prequant and 8bit.
|
||||
if self.pre_quant:
|
||||
raise ValueError(
|
||||
@ -434,9 +430,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
"BitsAndBytes 8bit quantization with FusedMoE is not "
|
||||
"supported yet.")
|
||||
# Get the corresponding weight name using module name and
|
||||
# expert_params_mapping.
|
||||
|
||||
for exp in self.expert_params_mapping:
|
||||
# get_expert_mapping.
|
||||
expert_mapping = model.get_expert_mapping()
|
||||
for exp in expert_mapping:
|
||||
weight_name = exp[1]
|
||||
rep_name = name.replace("experts",
|
||||
"") + weight_name.removesuffix(".")
|
||||
@ -468,7 +464,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
elif isinstance(module, (RowParallelLinear, )):
|
||||
self.column_sharded_weights_modules.append(name)
|
||||
elif isinstance(module, FusedMoE):
|
||||
expert_mapping = self.expert_params_mapping
|
||||
expert_mapping = model.get_expert_mapping()
|
||||
for exp in expert_mapping:
|
||||
if exp[-1] == "w2":
|
||||
weight_name = exp[1]
|
||||
@ -520,13 +516,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
self.is_pool_model = is_pooling_model(model)
|
||||
self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))
|
||||
|
||||
if is_moe_model(model):
|
||||
self.expert_params_mapping = get_moe_expert_mapping(model)
|
||||
if not self.expert_params_mapping:
|
||||
raise AttributeError(
|
||||
f"MoE Model {type(model).__name__} does not support "
|
||||
"BitsAndBytes quantization yet. Ensure this model has "
|
||||
"'get_expert_mapping' method.")
|
||||
# For some models like Molmo, we need to use hf_to_vllm_mapper
|
||||
# to ensure correct loading of weights.
|
||||
if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
|
||||
@ -580,10 +569,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
||||
"""
|
||||
from bitsandbytes.functional import QuantState
|
||||
|
||||
if not self.expert_params_mapping:
|
||||
if not hasattr(model, "get_expert_mapping"):
|
||||
return dict()
|
||||
|
||||
expert_mapping = self.expert_params_mapping
|
||||
expert_mapping = model.get_expert_mapping()
|
||||
expert_qs_dict = {}
|
||||
for name, module in model.named_modules():
|
||||
if not isinstance(module, FusedMoE):
|
||||
|
||||
@ -123,10 +123,16 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
|
||||
return self.ctx.get_hf_config(AyaVisionConfig)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
|
||||
return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
|
||||
processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
|
||||
|
||||
def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
|
||||
return self.get_hf_processor(**kwargs).image_processor
|
||||
# Temporary workaround since this processor has multiple image tokens
|
||||
# See https://github.com/huggingface/transformers/issues/38350
|
||||
processor._check_special_mm_tokens = lambda *args, **kwargs: None
|
||||
|
||||
return processor
|
||||
|
||||
def get_image_processor(self) -> GotOcr2ImageProcessor:
|
||||
return self.get_hf_processor().image_processor
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor(
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if not mm_data:
|
||||
if mm_data:
|
||||
processed_outputs = self.info.ctx.call_hf_processor(
|
||||
self.info.get_hf_processor(**mm_kwargs),
|
||||
dict(prompt=prompt, **mm_data),
|
||||
dict(**mm_kwargs, **tok_kwargs),
|
||||
)
|
||||
pixel_values = processed_outputs["pixel_values"]
|
||||
# split pixel values into patches corresponding to each image
|
||||
images_spatial_crop = processed_outputs["images_spatial_crop"]
|
||||
patches_per_image = [
|
||||
x.prod().item() + 1 for x in images_spatial_crop
|
||||
]
|
||||
pixel_values = pixel_values.split(patches_per_image)
|
||||
processed_outputs["pixel_values"] = pixel_values
|
||||
else:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
return tokenizer(prompt,
|
||||
add_special_tokens=True,
|
||||
return_tensors="pt")
|
||||
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
pixel_values = processed_outputs["pixel_values"]
|
||||
# split pixel values into patches corresponding to each image
|
||||
images_spatial_crop = processed_outputs["images_spatial_crop"]
|
||||
patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop]
|
||||
pixel_values = pixel_values.split(patches_per_image)
|
||||
processed_outputs["pixel_values"] = pixel_values
|
||||
processed_outputs = tokenizer(prompt,
|
||||
add_special_tokens=True,
|
||||
return_tensors="pt")
|
||||
|
||||
return processed_outputs
|
||||
|
||||
|
||||
@ -761,6 +761,12 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
|
||||
|
||||
class Florence2ProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config()
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor()
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
|
||||
@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(FuyuProcessor, **kwargs)
|
||||
|
||||
def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor:
|
||||
return self.get_hf_processor(**kwargs).image_processor
|
||||
def get_image_processor(self) -> FuyuImageProcessor:
|
||||
return self.get_hf_processor().image_processor
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
@ -46,7 +46,6 @@ from vllm.model_executor.model_loader.weight_utils import (
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .interfaces import SupportsQuant
|
||||
from .utils import (AutoWeightsLoader, extract_layer_index,
|
||||
is_pp_missing_parameter, make_layers, maybe_prefix)
|
||||
|
||||
@ -69,7 +68,6 @@ class Gemma3nAltUp(nn.Module):
|
||||
altup_num_inputs: int,
|
||||
altup_coef_clip: float,
|
||||
altup_active_idx: int,
|
||||
quant_config: QuantizationConfig,
|
||||
prefix: str,
|
||||
):
|
||||
super().__init__()
|
||||
@ -82,7 +80,6 @@ class Gemma3nAltUp(nn.Module):
|
||||
altup_num_inputs,
|
||||
altup_num_inputs,
|
||||
bias=False,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.correction_coefs",
|
||||
return_bias=False,
|
||||
)
|
||||
@ -90,7 +87,6 @@ class Gemma3nAltUp(nn.Module):
|
||||
altup_num_inputs,
|
||||
altup_num_inputs**2,
|
||||
bias=False,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.prediction_coefs",
|
||||
return_bias=False,
|
||||
)
|
||||
@ -98,7 +94,6 @@ class Gemma3nAltUp(nn.Module):
|
||||
hidden_size,
|
||||
altup_num_inputs,
|
||||
bias=False,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.modality_router",
|
||||
return_bias=False,
|
||||
)
|
||||
@ -405,7 +400,6 @@ class Gemma3nDecoderLayer(nn.Module):
|
||||
altup_num_inputs=config.altup_num_inputs,
|
||||
altup_coef_clip=config.altup_coef_clip,
|
||||
altup_active_idx=config.altup_active_idx,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.altup",
|
||||
)
|
||||
self.self_attn = Gemma3nAttention(
|
||||
@ -533,7 +527,7 @@ class Gemma3nDecoderLayer(nn.Module):
|
||||
|
||||
|
||||
@support_torch_compile
|
||||
class Gemma3nTextModel(nn.Module, SupportsQuant):
|
||||
class Gemma3nTextModel(nn.Module):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
@ -546,7 +540,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.embed_tokens",
|
||||
)
|
||||
self.embed_scale = torch.tensor(
|
||||
@ -556,7 +549,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
|
||||
self.embed_tokens_per_layer = VocabParallelEmbedding(
|
||||
config.vocab_size_per_layer_input,
|
||||
config.num_hidden_layers * config.hidden_size_per_layer_input,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.per_layer_embed_tokens",
|
||||
)
|
||||
self.embed_scale_per_layer = torch.tensor(
|
||||
@ -590,7 +582,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
|
||||
gather_output=True,
|
||||
return_bias=False,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.altup_projections.{idx-1}",
|
||||
prefix=f"{prefix}.{idx-1}.altup_projections",
|
||||
) for idx in range(1, self.config.altup_num_inputs)
|
||||
])
|
||||
self.altup_unembed_projections = nn.ModuleList([
|
||||
@ -601,7 +593,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
|
||||
gather_output=True,
|
||||
return_bias=False,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.altup_unembed_projections.{idx-1}",
|
||||
prefix=f"{prefix}.{idx-1}.altup_unembed_projections",
|
||||
) for idx in range(1, self.config.altup_num_inputs)
|
||||
])
|
||||
|
||||
@ -782,7 +774,7 @@ class Gemma3nModel(nn.Module):
|
||||
**kwargs)
|
||||
|
||||
|
||||
class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant):
|
||||
class Gemma3nForConditionalGeneration(nn.Module):
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
|
||||
@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": 1}
|
||||
|
||||
def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor:
|
||||
return self.get_hf_processor(**kwargs).image_processor
|
||||
def get_image_processor(self) -> Glm4vImageProcessor:
|
||||
return self.get_hf_processor().image_processor
|
||||
|
||||
def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor:
|
||||
return self.get_hf_processor(**kwargs).video_processor
|
||||
def get_video_processor(self) -> Glm4vVideoProcessor:
|
||||
return self.get_hf_processor().video_processor
|
||||
|
||||
def _get_vision_info(
|
||||
self,
|
||||
@ -937,7 +937,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
|
||||
total_frames: int) -> list[int]:
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
video_fps = metadata.get("fps", video_processor.fps)
|
||||
video_fps = metadata.get("fps", 2.0)
|
||||
meta_frames = metadata.get("total_num_frames", total_frames)
|
||||
max_frame_idx = meta_frames - 1
|
||||
duration = metadata.get("duration",
|
||||
@ -1120,7 +1120,11 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
|
||||
video_placeholder,
|
||||
)
|
||||
|
||||
video_grid_thw_lst.append(video_outputs["video_grid_thw"])
|
||||
grid_t = len(video_outputs["video_grid_thw"])
|
||||
_, grid_h, grid_w = video_outputs["video_grid_thw"][0]
|
||||
grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
|
||||
|
||||
video_grid_thw_lst.append(grid_thw)
|
||||
pixel_values_videos_lst.append(
|
||||
video_outputs["pixel_values_videos"])
|
||||
video_outputs = dict(
|
||||
|
||||
@ -392,7 +392,21 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
|
||||
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
**kwargs: object,
|
||||
) -> H2OVLProcessor:
|
||||
if min_dynamic_patch is not None:
|
||||
kwargs["min_dynamic_patch"] = min_dynamic_patch
|
||||
if max_dynamic_patch is not None:
|
||||
kwargs["max_dynamic_patch"] = max_dynamic_patch
|
||||
if dynamic_image_size is not None:
|
||||
kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
|
||||
return self.ctx.init_processor(
|
||||
H2OVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
|
||||
@ -25,7 +25,8 @@ import torch
|
||||
import torch.nn as nn
|
||||
from timm.layers import LayerNorm, LayerNorm2d
|
||||
from timm.models.regnet import RegStage
|
||||
from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
|
||||
from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig,
|
||||
SiglipVisionConfig)
|
||||
from transformers.modeling_utils import no_init_weights
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
@ -79,9 +80,26 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs]
|
||||
|
||||
class HCXVisionProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config()
|
||||
|
||||
def get_vision_encoder_info(self):
|
||||
return get_vision_encoder_info(self.get_hf_config())
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
**kwargs: object,
|
||||
):
|
||||
processor_cls = type(
|
||||
AutoProcessor.from_pretrained(
|
||||
self.ctx.model_config.model,
|
||||
trust_remote_code=self.ctx.model_config.trust_remote_code,
|
||||
))
|
||||
return self.ctx.get_hf_processor(
|
||||
processor_cls,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": None}
|
||||
|
||||
|
||||
@ -88,7 +88,15 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
|
||||
|
||||
class Idefics3ProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> Idefics3Processor:
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
) -> Idefics3Processor:
|
||||
if size is not None:
|
||||
kwargs["size"] = size
|
||||
|
||||
return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
|
||||
@ -665,7 +665,14 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
"""Basic image-only ProcessingInfo for InternVL-style models."""
|
||||
|
||||
@abstractmethod
|
||||
def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
**kwargs: object,
|
||||
) -> BaseInternVLProcessor:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
@ -875,12 +882,27 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
|
||||
return max(max_frames_per_video, 1)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
**kwargs: object,
|
||||
) -> InternVLProcessor:
|
||||
if min_dynamic_patch is not None:
|
||||
kwargs["min_dynamic_patch"] = min_dynamic_patch
|
||||
if max_dynamic_patch is not None:
|
||||
kwargs["max_dynamic_patch"] = max_dynamic_patch
|
||||
if dynamic_image_size is not None:
|
||||
kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
|
||||
kwargs["video_token"] = self.get_video_token()
|
||||
|
||||
return self.ctx.init_processor(
|
||||
InternVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
video_token=self.get_video_token(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@ -44,6 +44,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.platforms import _Backend
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.config import uses_mrope
|
||||
from vllm.transformers_utils.processor import (
|
||||
cached_image_processor_from_config)
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||
@ -978,8 +980,72 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
|
||||
|
||||
class KeyeProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_image_processor(self, **kwargs: object):
|
||||
return self.get_hf_processor(**kwargs).image_processor
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
):
|
||||
return self.ctx.get_hf_processor(
|
||||
image_processor=self.get_image_processor(
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
size=size,
|
||||
),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _get_image_processor_kwargs(
|
||||
self,
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
):
|
||||
if self.ctx.model_config.mm_processor_kwargs:
|
||||
kwargs.update(self.ctx.model_config.mm_processor_kwargs)
|
||||
|
||||
if min_pixels is not None:
|
||||
kwargs["min_pixels"] = min_pixels
|
||||
|
||||
if size is None:
|
||||
size = {"shortest_edge": min_pixels}
|
||||
else:
|
||||
size["shortest_edge"] = min_pixels
|
||||
|
||||
if max_pixels is not None:
|
||||
kwargs["max_pixels"] = max_pixels
|
||||
|
||||
if size is None:
|
||||
size = {"longest_edge": max_pixels}
|
||||
else:
|
||||
size["longest_edge"] = max_pixels
|
||||
|
||||
if size is not None:
|
||||
kwargs["size"] = size
|
||||
|
||||
return kwargs
|
||||
|
||||
def get_image_processor(
|
||||
self,
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
):
|
||||
return cached_image_processor_from_config(
|
||||
self.ctx.model_config,
|
||||
**self._get_image_processor_kwargs(
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
size=size,
|
||||
**kwargs,
|
||||
),
|
||||
)
|
||||
|
||||
def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": None}
|
||||
@ -1180,6 +1246,20 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
|
||||
def _get_data_parser(self) -> MultiModalDataParser:
|
||||
return KeyeMultiModalDataParser()
|
||||
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs)
|
||||
return self.info.ctx.call_hf_processor(
|
||||
self.info.get_hf_processor(**mm_kwargs),
|
||||
dict(text=prompt, **mm_data),
|
||||
dict(**mm_kwargs, **tok_kwargs),
|
||||
)
|
||||
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
|
||||
@ -51,25 +51,6 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
|
||||
|
||||
self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
if getattr(config, "norm_before_residual", False):
|
||||
self._residual_norm = self._norm_before_residual
|
||||
else:
|
||||
self._residual_norm = self._norm_after_residual
|
||||
|
||||
def _norm_before_residual(
|
||||
self,
|
||||
hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
hidden_states = self.hidden_norm(hidden_states)
|
||||
residual = hidden_states
|
||||
return hidden_states, residual
|
||||
|
||||
def _norm_after_residual(
|
||||
self,
|
||||
hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
residual = hidden_states
|
||||
hidden_states = self.hidden_norm(hidden_states)
|
||||
return hidden_states, residual
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
@ -78,10 +59,9 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
|
||||
residual = hidden_states
|
||||
embeds = self.input_layernorm(embeds)
|
||||
|
||||
hidden_states, residual = self._residual_norm(
|
||||
hidden_states=hidden_states)
|
||||
hidden_states = self.hidden_norm(hidden_states)
|
||||
|
||||
hidden_states = torch.cat([embeds, hidden_states], dim=-1)
|
||||
# Self Attention
|
||||
@ -122,7 +102,7 @@ class LlamaModel(nn.Module):
|
||||
|
||||
self.layers = nn.ModuleList([
|
||||
LlamaDecoderLayer(
|
||||
config=self.config,
|
||||
self.config,
|
||||
prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"),
|
||||
)
|
||||
])
|
||||
|
||||
@ -8,9 +8,11 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from packaging.version import Version
|
||||
from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
|
||||
PixtralVisionConfig, PretrainedConfig,
|
||||
SiglipVisionConfig)
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
from transformers.models.llava import LlavaProcessor
|
||||
from transformers.models.pixtral import PixtralProcessor
|
||||
|
||||
@ -305,14 +307,29 @@ class PixtralHFMultiModalProcessor(
|
||||
|
||||
pixel_values = processed_outputs.get("pixel_values")
|
||||
if pixel_values is not None:
|
||||
# Avoid padding since we need the output for each image to be
|
||||
# independent of other images for the cache to work correctly
|
||||
image_sizes = processed_outputs["image_sizes"]
|
||||
assert len(pixel_values) == len(image_sizes)
|
||||
# Before/after https://github.com/huggingface/transformers/pull/35122
|
||||
if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"):
|
||||
images = mm_data["images"]
|
||||
assert isinstance(images, list)
|
||||
|
||||
processed_outputs["pixel_values"] = [
|
||||
p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes)
|
||||
]
|
||||
# Original output: (1, num_images, C, H, W)
|
||||
# New output: (num_images, C, H, W)
|
||||
assert (isinstance(pixel_values, list)
|
||||
and len(pixel_values) == 1)
|
||||
assert (isinstance(pixel_values[0], list)
|
||||
and len(pixel_values[0]) == len(images))
|
||||
|
||||
processed_outputs["pixel_values"] = pixel_values[0]
|
||||
else:
|
||||
# Avoid padding since we need the output for each image to be
|
||||
# independent of other images for the cache to work correctly
|
||||
image_sizes = processed_outputs["image_sizes"]
|
||||
assert len(pixel_values) == len(image_sizes)
|
||||
|
||||
processed_outputs["pixel_values"] = [
|
||||
p[:, :h, :w]
|
||||
for p, (h, w) in zip(pixel_values, image_sizes)
|
||||
]
|
||||
|
||||
return processed_outputs
|
||||
|
||||
@ -767,10 +784,17 @@ class MantisProcessingInfo(LlavaProcessingInfo):
|
||||
vision_info = self.get_vision_encoder_info()
|
||||
|
||||
kwargs.setdefault("patch_size", vision_info.get_patch_size())
|
||||
kwargs.setdefault(
|
||||
"vision_feature_select_strategy",
|
||||
hf_config.vision_feature_select_strategy,
|
||||
)
|
||||
|
||||
if Version(TRANSFORMERS_VERSION) < Version("4.48"):
|
||||
# BUG: num_additional_image_tokens = 0 but treated as 1,
|
||||
# so we set vision_feature_select_strategy to None to offset this
|
||||
kwargs.setdefault("vision_feature_select_strategy", None)
|
||||
else:
|
||||
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
|
||||
kwargs.setdefault(
|
||||
"vision_feature_select_strategy",
|
||||
hf_config.vision_feature_select_strategy,
|
||||
)
|
||||
|
||||
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
|
||||
|
||||
|
||||
@ -331,8 +331,10 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
return hf_processor
|
||||
|
||||
def get_image_processor(self, **kwargs: object):
|
||||
return self.get_hf_processor(**kwargs).image_processor
|
||||
def get_image_processor(self):
|
||||
hf_processor = self.get_hf_processor()
|
||||
image_processor = hf_processor.image_processor # type: ignore
|
||||
return image_processor
|
||||
|
||||
def get_model_version(self):
|
||||
return get_version_by_config(self.get_hf_config())
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user