Merge branch 'main' into rename_file_info_to_pkg/file

This commit is contained in:
Ning Xie 2025-11-19 01:04:47 +08:00 committed by GitHub
commit 0da34283f7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
255 changed files with 9441 additions and 2518 deletions

View File

@ -1068,7 +1068,7 @@ steps:
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
- label: Blackwell Fusion E2E Tests # 30 min
@ -1095,10 +1095,11 @@ steps:
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell GPT-OSS Eval
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
agent_pool: mi325_1
mirror_hardwares: [amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@ -1107,7 +1108,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60

View File

@ -478,10 +478,11 @@ steps:
- vllm/
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test
timeout_in_minutes: 20
@ -925,7 +926,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- label: Blackwell Fusion Tests # 30 min
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
@ -946,7 +947,9 @@ steps:
- pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@ -969,8 +972,6 @@ steps:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@ -1266,7 +1267,8 @@ steps:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py

2
.github/CODEOWNERS vendored
View File

@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
# Transformers backend
# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor

81
.github/workflows/macos-smoke-test.yml vendored Normal file
View File

@ -0,0 +1,81 @@
# Smoke test on the macos-latest runner: installs vLLM from source using the
# CPU requirements, then checks that `vllm serve` starts and answers requests.
name: macOS Apple Silicon Smoke Test
# Triggered by pushes to main and by manual dispatch.
on:
push:
branches:
- main
workflow_dispatch: # Manual trigger
jobs:
macos-m1-smoke-test:
runs-on: macos-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: |
requirements/**/*.txt
pyproject.toml
python-version: '3.12'
- name: Create virtual environment
run: |
uv venv
# Expose the venv's executables to all subsequent steps
echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
- name: Install dependencies and build vLLM
run: |
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
uv pip install -e .
env:
CMAKE_BUILD_PARALLEL_LEVEL: 4
- name: Verify installation
run: |
python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
- name: Smoke test vllm serve
timeout-minutes: 10
run: |
# Start server in background
vllm serve Qwen/Qwen3-0.6B \
--max-model-len=2048 \
--load-format=dummy \
--enforce-eager \
--port 8000 &
SERVER_PID=$!
# Wait for server to start: poll /health up to 30 times, 2 seconds apart
for i in {1..30}; do
if curl -s http://localhost:8000/health > /dev/null; then
echo "Server started successfully"
break
fi
if [ "$i" -eq 30 ]; then
echo "Server failed to start"
kill "$SERVER_PID"
exit 1
fi
sleep 2
done
# Test health endpoint (-f makes curl exit non-zero on HTTP error responses)
curl -f http://localhost:8000/health
# Test completion
curl -f http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"prompt": "Hello",
"max_tokens": 5
}'
# Cleanup
kill "$SERVER_PID"

View File

@ -3,10 +3,9 @@ MD007:
MD013: false
MD024:
siblings_only: true
MD031:
list_items: false
MD033: false
MD045: false
MD046: false
MD051: false
MD052: false
MD053: false
MD059: false

View File

@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# Hadacore kernels
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs(

View File

@ -0,0 +1,380 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
This benchmark runs the same workload twice:
1. With VLLM_BATCH_INVARIANT=0 (baseline)
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
And reports the timing and throughput metrics for comparison.
Environment variables:
VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
Example usage:
# Benchmark qwen3 (default)
python benchmarks/benchmark_batch_invariance.py
# Benchmark deepseek with 8 GPUs
VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
python benchmarks/benchmark_batch_invariance.py
# Quick test with fewer trials
VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
python benchmarks/benchmark_batch_invariance.py
"""
import contextlib
import os
import random
import time
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
"""Generate a random prompt for benchmarking."""
prompt_templates = [
"Question: What is the capital of France?\nAnswer: The capital of France is",
"Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
"User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
"Once upon a time in a distant galaxy, there lived",
"The old man walked slowly down the street, remembering",
"In the year 2157, humanity finally discovered",
"To implement a binary search tree in Python, first we need to",
"The algorithm works by iterating through the array and",
"Here's how to optimize database queries using indexing:",
"The Renaissance was a period in European history that",
"Climate change is caused by several factors including",
"The human brain contains approximately 86 billion neurons which",
"I've been thinking about getting a new laptop because",
"Yesterday I went to the store and bought",
"My favorite thing about summer is definitely",
]
base_prompt = random.choice(prompt_templates)
if max_words < min_words:
max_words = min_words
target_words = random.randint(min_words, max_words)
if target_words > 50:
padding_text = (
" This is an interesting topic that deserves more explanation. "
* (target_words // 50)
)
base_prompt = base_prompt + padding_text
return base_prompt
def run_benchmark_with_batch_invariant(
    model: str,
    tp_size: int,
    max_batch_size: int,
    num_trials: int,
    min_prompt: int,
    max_prompt: int,
    max_tokens: int,
    temperature: float,
    gpu_mem_util: float,
    max_model_len: int,
    backend: str,
    batch_invariant: bool,
    seed: int = 12345,
) -> dict:
    """Run the benchmark once with batch invariance switched on or off.

    An engine is created, a single warmup request is generated, and then
    ``num_trials`` randomly sized batches are timed. Each batch contains one
    fixed "needle" prompt at a random position; the rest are random prompts.

    Args:
        model: Model name passed to ``LLM``.
        tp_size: Tensor parallel size passed to ``LLM``.
        max_batch_size: Upper bound for the batch size of a trial and the
            engine's ``max_num_seqs``. Must be at least 1.
        num_trials: Number of timed batches. Must be at least 1.
        min_prompt: Minimum random prompt length, in words.
        max_prompt: Maximum random prompt length, in words.
        max_tokens: Maximum number of tokens to generate per prompt.
        temperature: Sampling temperature.
        gpu_mem_util: ``gpu_memory_utilization`` passed to ``LLM``.
        max_model_len: ``max_model_len`` passed to ``LLM``.
        backend: Value written to ``VLLM_ATTENTION_BACKEND``.
        batch_invariant: Whether ``VLLM_BATCH_INVARIANT`` is set to 1 or 0.
        seed: Seed for the ``random`` module, so both phases see the same
            sequence of batch sizes and prompts.

    Returns:
        A dict with the keys ``init_time``, ``avg_time``, ``min_time``,
        ``max_time``, ``total_tokens``, ``total_prompts``, ``throughput``,
        ``prompts_per_sec`` and ``trial_times``.

    Raises:
        ValueError: If ``num_trials`` or ``max_batch_size`` is below 1.
        AssertionError: If a sanity check on the engine outputs fails.
    """
    # Fail fast, before the (expensive) engine start-up, on inputs that would
    # otherwise end in a division by zero or an empty batch.
    if num_trials < 1:
        raise ValueError(f"num_trials must be >= 1, got {num_trials}")
    if max_batch_size < 1:
        raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")

    random.seed(seed)

    # Set environment variables before the engine is created.
    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    os.environ["VLLM_BATCH_INVARIANT"] = "1" if batch_invariant else "0"

    print(f"\n{'=' * 80}")
    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
    print(f" Model: {model}")
    print(f" TP Size: {tp_size}")
    print(f" Backend: {backend}")
    print(f" Max Batch Size: {max_batch_size}")
    print(f" Trials: {num_trials}")
    print(f" Max Tokens: {max_tokens}")
    print(f"{'=' * 80}\n")

    sampling = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=max_tokens,
        seed=20240919,
    )
    needle_prompt = "There once was a "

    # A batch always holds at least the needle prompt, even when
    # max_batch_size == 1 (where max_batch_size // 2 would be 0).
    min_batch_size = max(1, max_batch_size // 2)

    llm = None
    try:
        # Create LLM engine
        start_init = time.perf_counter()
        llm = LLM(
            model=model,
            max_num_seqs=max_batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
            dtype="bfloat16",
            tensor_parallel_size=tp_size,
            enable_prefix_caching=False,
        )
        init_time = time.perf_counter() - start_init
        print(f"Engine initialization time: {init_time:.2f}s\n")

        # Generate baseline; this untimed request also serves as warmup.
        print("Generating baseline (warmup)...")
        baseline_out = llm.generate([needle_prompt], sampling)
        assert len(baseline_out) == 1
        baseline_text = baseline_out[0].outputs[0].text
        print(f"Baseline output: '{baseline_text[:50]}...'\n")

        # Run trials and measure timing
        trial_times: list[float] = []
        total_tokens = 0
        total_prompts = 0
        for trial in range(num_trials):
            # Create a batch with the needle prompt at a random position.
            batch_size = random.randint(min_batch_size, max_batch_size)
            needle_pos = random.randint(0, batch_size - 1)
            prompts: list[str] = [
                needle_prompt
                if i == needle_pos
                else _random_prompt(min_prompt, max_prompt)
                for i in range(batch_size)
            ]

            # Measure time for this trial
            start_time = time.perf_counter()
            outputs = llm.generate(prompts, sampling)
            trial_time = time.perf_counter() - start_time

            trial_times.append(trial_time)
            total_prompts += len(prompts)

            # Count generated tokens of the first completion of each request.
            total_tokens += sum(
                len(output.outputs[0].token_ids)
                for output in outputs
                if output.outputs
            )

            print(
                f"Trial {trial + 1}/{num_trials}: "
                f"batch_size={batch_size}, "
                f"time={trial_time:.2f}s"
            )

            # Sanity check: the output at the needle position belongs to the
            # needle prompt. The generated text itself is not compared.
            needle_output = outputs[needle_pos]
            assert needle_output.prompt == needle_prompt

        # Compute statistics
        total_time = sum(trial_times)
        avg_time = total_time / len(trial_times)
        min_time = min(trial_times)
        max_time = max(trial_times)
        throughput = total_tokens / total_time
        prompts_per_sec = total_prompts / total_time

        print(f"\n{'=' * 80}")
        print("RESULTS:")
        print(f" Average time per trial: {avg_time:.2f}s")
        print(f" Min time: {min_time:.2f}s")
        print(f" Max time: {max_time:.2f}s")
        print(f" Total tokens generated: {total_tokens}")
        print(f" Total prompts processed: {total_prompts}")
        print(f" Throughput: {throughput:.2f} tokens/s")
        print(f" Prompts/s: {prompts_per_sec:.2f}")
        print(f"{'=' * 80}\n")

        return {
            "init_time": init_time,
            "avg_time": avg_time,
            "min_time": min_time,
            "max_time": max_time,
            "total_tokens": total_tokens,
            "total_prompts": total_prompts,
            "throughput": throughput,
            "prompts_per_sec": prompts_per_sec,
            "trial_times": trial_times,
        }
    finally:
        # Cleanup is best-effort: a failing shutdown must not mask the
        # benchmark result or the original exception.
        if llm is not None:
            with contextlib.suppress(Exception):
                llm.shutdown()
def main():
    """Benchmark baseline vs. batch-invariant mode and print a comparison.

    The configuration is read from the ``VLLM_BENCH_*`` environment
    variables. The same workload is run twice, first with batch invariance
    disabled and then enabled, and the relative differences are reported.

    Returns:
        1 if the platform check fails (nothing is benchmarked), 0 otherwise.
    """
    rule = "=" * 80

    # Check platform support
    supported = (
        current_platform.is_cuda() and current_platform.has_device_capability(90)
    )
    if not supported:
        print("ERROR: Requires CUDA and >= Hopper (SM90)")
        print(f"Current platform: {current_platform.device_type}")
        if current_platform.is_cuda():
            print(f"Device capability: {current_platform.get_device_capability()}")
        return 1

    # Read configuration from environment
    env = os.getenv
    model = env("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
    tp_size = int(env("VLLM_BENCH_TP_SIZE", "1"))
    max_batch_size = int(env("VLLM_BENCH_BATCH_SIZE", "128"))
    num_trials = int(env("VLLM_BENCH_NUM_TRIALS", "5"))
    min_prompt = int(env("VLLM_BENCH_MIN_PROMPT", "1024"))
    max_prompt = int(env("VLLM_BENCH_MAX_PROMPT", "2048"))
    max_tokens = int(env("VLLM_BENCH_MAX_TOKENS", "128"))
    temperature = float(env("VLLM_BENCH_TEMPERATURE", "0.0"))
    gpu_mem_util = float(env("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
    max_model_len = int(env("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
    backend = env("VLLM_BENCH_BACKEND", "FLASH_ATTN")

    print("\n" + rule)
    print("VLLM BATCH INVARIANCE BENCHMARK")
    print(rule)
    print("\nConfiguration:")
    print(f" Model: {model}")
    print(f" Tensor Parallel Size: {tp_size}")
    print(f" Attention Backend: {backend}")
    print(f" Max Batch Size: {max_batch_size}")
    print(f" Number of Trials: {num_trials}")
    print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
    print(f" Max Tokens to Generate: {max_tokens}")
    print(f" Temperature: {temperature}")
    print(f" GPU Memory Utilization: {gpu_mem_util}")
    print(f" Max Model Length: {max_model_len}")
    print(rule)

    # Both phases share every argument except the batch-invariance switch.
    shared_kwargs = {
        "model": model,
        "tp_size": tp_size,
        "max_batch_size": max_batch_size,
        "num_trials": num_trials,
        "min_prompt": min_prompt,
        "max_prompt": max_prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "gpu_mem_util": gpu_mem_util,
        "max_model_len": max_model_len,
        "backend": backend,
    }
    phases = (
        ("PHASE 1: Running WITHOUT batch invariance (baseline)", False),
        ("PHASE 2: Running WITH batch invariance", True),
    )
    phase_results = []
    for heading, invariant in phases:
        print("\n" + rule)
        print(heading)
        print(rule)
        phase_results.append(
            run_benchmark_with_batch_invariant(
                **shared_kwargs, batch_invariant=invariant
            )
        )
    baseline_results, batch_inv_results = phase_results

    # Compare results
    print("\n" + rule)
    print("COMPARISON: Batch Invariance vs Baseline")
    print(rule)

    def relative_change_pct(key):
        """Return the change of ``key`` relative to the baseline, in percent."""
        before = baseline_results[key]
        return (batch_inv_results[key] - before) / before * 100

    def print_pair(heading, key, unit=""):
        """Print the baseline and batch-invariant values of one metric."""
        print(heading)
        print(f" Baseline: {baseline_results[key]:.2f}{unit}")
        print(f" Batch Invariant: {batch_inv_results[key]:.2f}{unit}")

    # All ratios are computed before anything is reported.
    init_overhead_pct = relative_change_pct("init_time")
    time_overhead_pct = relative_change_pct("avg_time")
    throughput_change_pct = relative_change_pct("throughput")

    print_pair("\nInitialization Time:", "init_time", "s")
    print(f" Overhead: {init_overhead_pct:+.2f}%")

    print_pair("\nAverage Trial Time:", "avg_time", "s")
    print(f" Overhead: {time_overhead_pct:+.2f}%")

    print_pair("\nThroughput (tokens/s):", "throughput")
    print(f" Change: {throughput_change_pct:+.2f}%")

    print_pair("\nPrompts/s:", "prompts_per_sec")

    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    if time_overhead_pct > 0:
        time_summary = (
            f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
            "overhead"
        )
    else:
        time_summary = (
            f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
            "faster (unexpected!)"
        )
    print(time_summary)

    if abs(throughput_change_pct) < 1.0:
        throughput_summary = "Throughput difference is negligible (< 1%)"
    elif throughput_change_pct < 0:
        throughput_summary = (
            f"Throughput decreased by {-throughput_change_pct:.1f}% "
            "with batch invariance"
        )
    else:
        throughput_summary = (
            f"Throughput increased by {throughput_change_pct:.1f}% "
            "with batch invariance (unexpected!)"
        )
    print(throughput_summary)
    print(rule + "\n")
    return 0
if __name__ == "__main__":
    # ``exit`` is a helper that the ``site`` module injects for interactive
    # use; raising SystemExit propagates main()'s status code even when
    # ``site`` is not loaded (e.g. ``python -S``).
    raise SystemExit(main())

View File

@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
benchmark-only runtime so the reported throughput stays comparable).
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>

View File

@ -561,8 +561,11 @@ async def client_main(
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
)
random.seed(args.seed)
np.random.seed(args.seed)
# Set unique seed per client (each client runs in its own process)
# Add 1 to ensure no client uses the same seed as the main process
client_seed = args.seed + client_id + 1
random.seed(client_seed)
np.random.seed(client_seed)
# Active conversations
active_convs: ConversationsMap = {}
@ -1073,6 +1076,7 @@ def process_statistics(
verbose: bool,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
warmup_runtime_sec: float | None = None,
) -> None:
if len(client_metrics) == 0:
logger.info("No samples to process")
@ -1166,8 +1170,13 @@ def process_statistics(
# Convert milliseconds to seconds
runtime_sec = runtime_sec / 1000.0
requests_per_sec = float(len(df)) / runtime_sec
params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
params = {
"runtime_sec": runtime_sec,
"requests_per_sec": requests_per_sec,
}
if warmup_runtime_sec is not None:
params["warmup_runtime_sec"] = warmup_runtime_sec
params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data)
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@ -1490,6 +1499,7 @@ async def main() -> None:
f"Invalid --warmup-percentage={args.warmup_percentage}"
) from None
# Set global seeds for main process
random.seed(args.seed)
np.random.seed(args.seed)
@ -1548,6 +1558,8 @@ async def main() -> None:
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
)
warmup_runtime_sec: float | None = None
# Warm-up step
if args.warmup_step:
# Only send a single user prompt from every conversation.
@ -1562,26 +1574,56 @@ async def main() -> None:
# all clients should finish their work before exiting
warmup_bench_args = bench_args._replace(early_stop=False)
logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
warmup_start_ns = time.perf_counter_ns()
conversations, _ = await main_mp(
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
)
logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.PURPLE,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
# Run the benchmark
start_time = time.perf_counter_ns()
benchmark_start_ns = time.perf_counter_ns()
client_convs, client_metrics = await main_mp(
client_args, req_args, bench_args, tokenizer, conversations
)
total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
# Calculate requests per second
total_runtime_sec = total_runtime_ms / 1000.0
rps = len(client_metrics) / total_runtime_sec
requests_per_sec = len(client_metrics) / benchmark_runtime_sec
benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
logger.info(
f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
"%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
"requests per second: %.3f%s",
Color.GREEN,
benchmark_runtime_sec,
benchmark_runtime_ms,
requests_per_sec,
Color.RESET,
)
if warmup_runtime_sec is not None:
total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.GREEN,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info(
"%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
Color.GREEN,
total_runtime_sec,
total_runtime_sec * 1000,
Color.RESET,
)
# Benchmark parameters
params = {
@ -1606,6 +1648,7 @@ async def main() -> None:
verbose=args.verbose,
gen_conv_args=gen_conv_args,
excel_output=args.excel_output,
warmup_runtime_sec=warmup_runtime_sec,
)
if args.output_file is not None:

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -754,7 +754,7 @@ class AttentionScheduler {
return l2_cache_size >> 1; // use 50% of L2 cache
}
// Fallback if sysctlbyname fails
return 128 * 1024 >> 1; // use 50% of 128KB
return 128LL * 1024 >> 1; // use 50% of 128KB
#else
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
TORCH_CHECK_NE(l2_cache_size, -1);

View File

@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache(
const torch::Tensor& scheduler_metadata,
const std::optional<torch::Tensor>& s_aux);
// Note: placeholder that exists only to avoid import errors; calling it always
// fails with "Unimplemented".
void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
&cpu_attention_with_kv_cache);
// placeholders
ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {

View File

@ -37,6 +37,16 @@
#ifdef USE_ROCM
#define FINAL_MASK 0xffffffffffffffffULL
#if defined(HIP_VERSION) && HIP_VERSION < 70000000
// ROCm versions before 7.0 do not define __syncwarp, so a fallback is provided
// here. The implementation below is copied from the one in ROCm 7.0.
__device__ inline void __syncwarp() {
// Release fence, wave barrier, then acquire fence, all at "wavefront" scope.
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
__builtin_amdgcn_wave_barrier();
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
}
#endif
#else
#define FINAL_MASK 0xffffffff
#endif

View File

@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
});
if (numel % 256 != 0) {
out = out.index({torch::indexing::Slice(0, numel / had_size)});
out = out.narrow(0, 0, numel / had_size);
}
if (inplace && out.data_ptr() != x.data_ptr()) {

View File

@ -116,6 +116,26 @@ struct sm90_fp8_config_default {
ClusterShape, KernelSchedule, EpilogueSchedule>>;
};
// SM90 FP8 GEMM configuration intended for large problems (see the shape note
// below). Exposes the selected kernel type as `Cutlass3xGemm`.
template <typename InType, typename OutType, bool EnableBias>
struct sm90_fp8_config_M8192_K6144 {
// M >= 8192, K >= 6144
// Only e4m3 FP8 inputs are accepted.
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule =
cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
using EpilogueSchedule =
typename cutlass::epilogue::TmaWarpSpecializedCooperative;
using TileShape = Shape<_256, _128, _128>;
using ClusterShape = Shape<_2, _1, _1>;
// EnableBias selects the epilogue: ScaledEpilogueBias when true,
// ScaledEpilogue otherwise. All other parameters are identical.
using Cutlass3xGemm = conditional_t<
EnableBias,
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
TileShape, ClusterShape, KernelSchedule,
EpilogueSchedule>,
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
ClusterShape, KernelSchedule, EpilogueSchedule>>;
};
template <typename InType, typename OutType, bool EnableBias>
struct sm90_fp8_config_M128 {
// M in (64, 128]
@ -273,6 +293,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
using Cutlass3xGemmDefault =
typename sm90_fp8_config_default<InType, OutType,
EnableBias>::Cutlass3xGemm;
using Cutlass3xGemmM8192_K6144 =
typename sm90_fp8_config_M8192_K6144<InType, OutType,
EnableBias>::Cutlass3xGemm;
using Cutlass3xGemmM128 =
typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
@ -291,6 +314,7 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
uint32_t const m = a.size(0);
uint32_t const n = b.size(1);
uint32_t const k = a.size(1);
if (m <= 16) {
// m in [1, 16]
@ -312,6 +336,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
// m in (64, 128]
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
} else if (m >= 8192 && k >= 6144) {
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM8192_K6144>(
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
} else {
// m in (128, inf)
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(

View File

@ -17,10 +17,7 @@ RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install UV
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# Activate virtual environment and add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694

View File

@ -1,4 +1,4 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d"
@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="9716b1b8"
ARG AITER_BRANCH="59bd8ff2"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950
# Required for RCCL in ROCm7.1
ENV HSA_NO_SCRATCH_RECLAIM=1
ARG PYTHON_VERSION=3.12
RUN mkdir -p /app

View File

@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \
libxext6 \
libgl1 \
lsb-release \
libaio-dev \
numactl \
wget \
vim \
@ -68,8 +69,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN python3 -m pip install -e tests/vllm_test_utils
# install nixl from source code
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y

View File

@ -46,7 +46,10 @@ nav:
- contributing/model/multimodal.md
- contributing/model/transcription.md
- CI: contributing/ci
- Design Documents: design
- Design Documents:
- Plugins:
- design/*plugin*.md
- design/*
- API Reference:
- api/README.md
- api/vllm

View File

@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
Where to get started with vLLM depends on the type of user. If you are looking to:
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
- Build vLLM, we recommend starting with [Developer Guide](./contributing)
- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
- Build vLLM, we recommend starting with the [Developer Guide](./contributing/README.md)
For information about the development of vLLM, see:

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_latency.md"
--8<-- "docs/argparse/bench_latency.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_serve.md"
--8<-- "docs/argparse/bench_serve.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_sweep_plot.md"
--8<-- "docs/argparse/bench_sweep_plot.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_sweep_serve.md"
--8<-- "docs/argparse/bench_sweep_serve.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_sweep_serve_sla.md"
--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_throughput.md"
--8<-- "docs/argparse/bench_throughput.inc.md"

View File

@ -1,5 +1,5 @@
# vllm chat
## Options
## Arguments
--8<-- "docs/argparse/chat.md"
--8<-- "docs/argparse/chat.inc.md"

View File

@ -1,5 +1,5 @@
# vllm complete
## Options
## Arguments
--8<-- "docs/argparse/complete.md"
--8<-- "docs/argparse/complete.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/run-batch.md"
--8<-- "docs/argparse/run-batch.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/serve.md"
--8<-- "docs/argparse/serve.inc.md"

View File

@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
## CLI Arguments
The `vllm serve` command is used to launch the OpenAI-compatible server.
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
To see the available options, take a look at the [CLI Reference](../cli/README.md)!
## Configuration file

View File

@ -10,8 +10,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
[Benchmark CLI]: #benchmark-cli
## Benchmark CLI
This section guides you through running benchmark tests with the extensive
@ -985,7 +983,7 @@ each document has close to 512 tokens.
Please note that the `/v1/rerank` is also supported by embedding models. So if you're running
with an embedding model, also set `--no_reranker`. Because in this case the query is
treated as a individual prompt by the server, here we send `random_batch_size - 1` documents
treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
to account for the extra prompt which is the query. The token accounting to report the
throughput numbers correctly is also adjusted.

View File

@ -95,7 +95,7 @@ when manually triggering a build on Buildkite. This branch accomplishes two thin
to warm it up so that future builds are faster.
<p align="center" width="100%">
<img width="60%" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
</p>
## Update dependencies

View File

@ -1,7 +1,7 @@
# Summary
!!! important
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.

View File

@ -249,7 +249,7 @@ No extra registration is required beyond having your model class available via t
## Examples in-tree
- Whisper encoder-decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
## Test with the API

View File

@ -224,6 +224,6 @@ snakeviz expensive_function.prof
Leverage VLLM_GC_DEBUG environment variable to debug GC costs.
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times
- VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5
collected objects for each gc.collect

View File

@ -29,8 +29,8 @@ pip install vllm
- API Path: `/chat/completions`
- Model: `qwen/Qwen1.5-0.5B-Chat`
![](../../assets/deployment/chatbox-settings.png)
![Chatbox settings screen](../../assets/deployment/chatbox-settings.png)
1. Go to `Just chat`, and start to chat:
![](../../assets/deployment/chatbox-chat.png)
![Chatbox chat screen](../../assets/deployment/chatbox-chat.png)

View File

@ -46,12 +46,12 @@ And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compos
- **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
- **Completion Mode**: `Completion`
![](../../assets/deployment/dify-settings.png)
![Dify settings screen](../../assets/deployment/dify-settings.png)
1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
![](../../assets/deployment/dify-create-chatbot.png)
![Dify create chatbot screen](../../assets/deployment/dify-create-chatbot.png)
1. Click the chatbot you just created to open the chat interface and start interacting with the model:
![](../../assets/deployment/dify-chat.png)
![Dify chat screen](../../assets/deployment/dify-chat.png)

View File

@ -156,7 +156,7 @@ In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.o
## Advanced Deployment Details
With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLMs optimized inference without additional backend modifications.
With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM's optimized inference without additional backend modifications.
Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.
@ -167,4 +167,4 @@ The platform integrates seamlessly with the Hugging Face Hub, allowing you to de
- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)

View File

@ -128,7 +128,7 @@ A [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper] instance wrap
3. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, the wrapper will perform CUDA Graphs capture (if key does not exist, create
a new entry and cache it) or replay (if key exists in the cache).
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust whats in the forward context (controlled by the dispatcher). This lets us simplify and cenralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what's in the forward context (controlled by the dispatcher). This lets us simplify and centralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
#### Nested Wrapper design

View File

@ -19,9 +19,9 @@ The input activation format completely depends on the All2All Dispatch being use
The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
![](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png "FusedMoE Non-Batched")
![FusedMoE Non-Batched](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png)
![](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png "FusedMoE Batched")
![FusedMoE Batched](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png)
!!! note
The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain.
@ -57,7 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions.
The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, the `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. Instead it returns a "receiver" callback that must be invoked to wait for the final results of the worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")
![FusedMoEPrepareAndFinalize Blocks](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png)
### FusedMoEPermuteExpertsUnpermute
@ -88,7 +88,7 @@ The core FusedMoE implementation performs a series of operations. It would be in
It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section.
`FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use.
![](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png "FusedMoEPermuteExpertsUnpermute Blocks")
![FusedMoEPermuteExpertsUnpermute Blocks](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png)
### FusedMoEModularKernel

View File

@ -1,6 +1,6 @@
# IO Processor Plugins
IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
IO Processor plugins are a feature that allows pre- and post-processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.

View File

@ -411,7 +411,7 @@ Logits processor `update_state()` implementations should assume the following mo
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
* **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
@ -548,7 +548,7 @@ Built-in logits processors are always loaded when the vLLM engine starts. See th
Review these logits processor implementations for guidance on writing built-in logits processors.
Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforemented logits processor programming model.
Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model.
* Allowed token IDs

View File

@ -0,0 +1,220 @@
# LoRA Resolver Plugins
This directory contains vLLM's LoRA resolver plugins built on the `LoRAResolver` framework.
They automatically discover and load LoRA adapters from a specified local storage path, eliminating the need for manual configuration or server restarts.
## Overview
LoRA Resolver Plugins provide a flexible way to dynamically load LoRA adapters at runtime. When vLLM
receives a request for a LoRA adapter that hasn't been loaded yet, the resolver plugins will attempt
to locate and load the adapter from their configured storage locations. This enables:
- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source.
- **Automatic Discovery**: Seamless integration with existing LoRA workflows
- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
## Prerequisites
Before using LoRA Resolver Plugins, ensure the following environment variables are configured:
### Required Environment Variables
1. **`VLLM_ALLOW_RUNTIME_LORA_UPDATING`**: Must be set to `true` or `1` to enable dynamic LoRA loading
```bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
```
2. **`VLLM_PLUGINS`**: Must include the desired resolver plugins (comma-separated list)
```bash
export VLLM_PLUGINS=lora_filesystem_resolver
```
3. **`VLLM_LORA_RESOLVER_CACHE_DIR`**: Must be set to a valid directory path for filesystem resolver
```bash
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
```
### Optional Environment Variables
- **`VLLM_PLUGINS`**: If not set, all available plugins will be loaded. If set to an empty string, no plugins will be loaded.
## Available Resolvers
### lora_filesystem_resolver
The filesystem resolver is installed with vLLM by default and enables loading LoRA adapters from a local directory structure.
#### Setup Steps
1. **Create the LoRA adapter storage directory**:
```bash
mkdir -p /path/to/lora/adapters
```
2. **Set environment variables**:
```bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
export VLLM_PLUGINS=lora_filesystem_resolver
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
```
3. **Start vLLM server**:
Your base model can be `meta-llama/Llama-2-7b-hf`. Please make sure you set up the Hugging Face token in your env var `export HF_TOKEN=xxx235`.
```bash
python -m vllm.entrypoints.openai.api_server \
--model your-base-model \
--enable-lora
```
#### Directory Structure Requirements
The filesystem resolver expects LoRA adapters to be organized in the following structure:
```text
/path/to/lora/adapters/
├── adapter1/
│ ├── adapter_config.json
│ ├── adapter_model.bin
│ └── tokenizer files (if applicable)
├── adapter2/
│ ├── adapter_config.json
│ ├── adapter_model.bin
│ └── tokenizer files (if applicable)
└── ...
```
Each adapter directory must contain:
- **`adapter_config.json`**: Required configuration file with the following structure:
```json
{
"peft_type": "LORA",
"base_model_name_or_path": "your-base-model-name",
"r": 16,
"lora_alpha": 32,
"target_modules": ["q_proj", "v_proj"],
"bias": "none",
"modules_to_save": null,
"use_rslora": false,
"use_dora": false
}
```
- **`adapter_model.bin`**: The LoRA adapter weights file
#### Usage Example
1. **Prepare your LoRA adapter**:
```bash
# Assuming you have a LoRA adapter in /tmp/my_lora_adapter
cp -r /tmp/my_lora_adapter /path/to/lora/adapters/my_sql_adapter
```
2. **Verify the directory structure**:
```bash
ls -la /path/to/lora/adapters/my_sql_adapter/
# Should show: adapter_config.json, adapter_model.bin, etc.
```
3. **Make a request using the adapter**:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "my_sql_adapter",
"prompt": "Generate a SQL query for:",
"max_tokens": 50,
"temperature": 0.1
}'
```
#### How It Works
1. When vLLM receives a request for a LoRA adapter named `my_sql_adapter`
2. The filesystem resolver checks if `/path/to/lora/adapters/my_sql_adapter/` exists
3. If found, it validates the `adapter_config.json` file
4. If the configuration matches the base model and is valid, the adapter is loaded
5. The request is processed normally with the newly loaded adapter
6. The adapter remains available for future requests
## Advanced Configuration
### Multiple Resolvers
You can configure multiple resolver plugins to load adapters from different sources:
'lora_s3_resolver' is an example of a custom resolver you would need to implement
```bash
export VLLM_PLUGINS=lora_filesystem_resolver,lora_s3_resolver
```
All listed resolvers are enabled; at request time, vLLM tries them in order until one succeeds.
### Custom Resolver Implementation
To implement your own resolver plugin:
1. **Create a new resolver class**:
```python
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.lora.request import LoRARequest
class CustomResolver(LoRAResolver):
async def resolve_lora(self, base_model_name: str, lora_name: str) -> Optional[LoRARequest]:
# Your custom resolution logic here
pass
```
2. **Register the resolver**:
```python
def register_custom_resolver():
resolver = CustomResolver()
LoRAResolverRegistry.register_resolver("Custom Resolver", resolver)
```
## Troubleshooting
### Common Issues
1. **"VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory"**
- Ensure the directory exists and is accessible
- Check file permissions on the directory
2. **"LoRA adapter not found"**
- Verify the adapter directory name matches the requested model name
- Check that `adapter_config.json` exists and is valid JSON
- Ensure `adapter_model.bin` exists in the directory
3. **"Invalid adapter configuration"**
- Verify `peft_type` is set to "LORA"
- Check that `base_model_name_or_path` matches your base model
- Ensure `target_modules` is properly configured
4. **"LoRA rank exceeds maximum"**
- Check that `r` value in `adapter_config.json` doesn't exceed `max_lora_rank` setting
### Debugging Tips
1. **Enable debug logging**:
```bash
export VLLM_LOGGING_LEVEL=DEBUG
```
2. **Verify environment variables**:
```bash
echo $VLLM_ALLOW_RUNTIME_LORA_UPDATING
echo $VLLM_PLUGINS
echo $VLLM_LORA_RESOLVER_CACHE_DIR
```
3. **Test adapter configuration**:
```bash
python -c "
import json
with open('/path/to/lora/adapters/my_adapter/adapter_config.json') as f:
config = json.load(f)
print('Config valid:', config)
"
```

View File

@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
## Fused MoE Experts Kernels
The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.

View File

@ -5,7 +5,7 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t
Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
!!! note
Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise invalid custom arguments can cause unexpected behaviour.
Make sure your custom logits processor has implemented `validate_params` for custom arguments. Otherwise, invalid custom arguments can cause unexpected behaviour.
## Offline Custom Arguments

View File

@ -71,7 +71,7 @@ Logits processor `update_state()` implementations should assume the following mo
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
* **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
@ -286,7 +286,7 @@ Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) wh
## Ways to Load Your Custom Logits Processor in vLLM
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests.
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests.
This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
@ -438,7 +438,7 @@ The examples below show how a user would pass a custom argument (`target_token`)
## Best Practices for Writing Custom Logits Processors
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently.
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus, it is important to implement these methods efficiently.
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
@ -465,4 +465,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method

View File

@ -91,6 +91,6 @@ Disaggregated prefilling is highly related to infrastructure, so vLLM relies on
We recommend three ways of implementations:
- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc.). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
- **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL.
- **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`.

View File

@ -4,7 +4,7 @@ This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09
LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA].
Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save
Adapters can be efficiently served on a per-request basis with minimal overhead. First we download the adapter(s) and save
them locally with
```python

View File

@ -298,7 +298,7 @@ There are two steps to generate and deploy a mixed precision model quantized wit
Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later.
As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benifits. They are:
As examples, we provide some ready-to-use quantized mixed precision models to show the usage in vLLM and the accuracy benefits. They are:
- amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
- amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8

View File

@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels.
### Which `dtype` should be used?
- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.
- Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problems.
### How to launch a vLLM service on CPU?

View File

@ -2,7 +2,7 @@
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
Currently the CPU implementation for s390x architecture supports FP32 datatype only.
Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.

View File

@ -83,7 +83,7 @@ uv pip install dist/*.whl
!!! example "Troubleshooting"
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
- `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
```toml title="pyproject.toml"
[build-system]

View File

@ -1,12 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.metadata
import importlib.util
import logging
import sys
import traceback
from argparse import SUPPRESS, HelpFormatter
from argparse import SUPPRESS, Action, HelpFormatter
from collections.abc import Iterable
from importlib.machinery import ModuleSpec
from pathlib import Path
from typing import Literal
from typing import TYPE_CHECKING, Literal
from unittest.mock import MagicMock, patch
from pydantic_core import core_schema
@ -19,6 +22,11 @@ ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
sys.path.insert(0, str(ROOT_DIR))
def mock_if_no_torch(mock_module: str, mock: MagicMock):
    """Install `mock` in `sys.modules` under `mock_module` when torch is absent.

    If `importlib.util.find_spec("torch")` locates torch, `sys.modules` is
    left untouched.

    Args:
        mock_module: Dotted module name to register the mock under.
        mock: Object stored as `sys.modules[mock_module]`.
    """
    # A truthy spec means torch is importable, so no stand-in is needed.
    if importlib.util.find_spec("torch"):
        return
    sys.modules[mock_module] = mock
# Mock custom op code
class MockCustomOp:
@staticmethod
@ -29,18 +37,21 @@ class MockCustomOp:
return decorator
noop = lambda *a, **k: None
sys.modules["vllm._C"] = MagicMock()
sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp)
sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop)
mock_if_no_torch("vllm._C", MagicMock())
mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp))
mock_if_no_torch(
"vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None)
)
# Mock any version checks by reading from compiled CI requirements
with open(ROOT_DIR / "requirements/test.txt") as f:
VERSIONS = dict(line.strip().split("==") for line in f if "==" in line)
importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0"
# Make torch.nn.Parameter safe to inherit from
sys.modules["torch.nn"] = MagicMock(Parameter=object)
mock_if_no_torch("torch.nn", MagicMock(Parameter=object))
class PydanticMagicMock(MagicMock):
@ -49,31 +60,34 @@ class PydanticMagicMock(MagicMock):
def __init__(self, *args, **kwargs):
name = kwargs.pop("name", None)
super().__init__(*args, **kwargs)
self.__spec__ = importlib.machinery.ModuleSpec(name, None)
self.__spec__ = ModuleSpec(name, None)
def __get_pydantic_core_schema__(self, source_type, handler):
return core_schema.any_schema()
def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
    """Import `attr` from `module_name`, mocking missing modules on the way.

    Each time the import raises `ModuleNotFoundError`, the missing module is
    replaced in `sys.modules` by a `PydanticMagicMock` and the import is
    retried, up to `max_mocks` attempts.

    Args:
        module_name: Dotted name of the module to import.
        attr: Name to fetch from the module; if the module has no such
            attribute, `attr` is imported as a submodule instead.
        max_mocks: Maximum number of import attempts.

    Returns:
        The attribute `module_name.attr`, or the submodule of that name.

    Raises:
        ImportError: If the import still fails after `max_mocks` attempts.
    """
    logger.info("Importing %s from %s", attr, module_name)
    for _ in range(max_mocks):
        try:
            module = importlib.import_module(module_name)
            # First treat attr as an attr, then as a submodule
            if hasattr(module, attr):
                return getattr(module, attr)
            return importlib.import_module(f"{module_name}.{attr}")
        except ModuleNotFoundError as e:
            assert e.name is not None
            logger.info("Mocking %s for argparse doc generation", e.name)
            sys.modules[e.name] = PydanticMagicMock(name=e.name)
        except Exception:
            # logger.exception appends the active traceback itself, so the
            # message only needs placeholders for the two arguments passed.
            # Any other failure is logged and the import is retried.
            logger.exception("Failed to import %s.%s", module_name, attr)

    raise ImportError(
        f"Failed to import {module_name}.{attr} after mocking {max_mocks} imports"
    )
@ -91,21 +105,26 @@ ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
FlexibleArgumentParser = auto_mock(
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
)
if TYPE_CHECKING:
from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
FlexibleArgumentParser = auto_mock(
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
)
class MarkdownFormatter(HelpFormatter):
"""Custom formatter that generates markdown for argument groups."""
def __init__(self, prog, starting_heading_level=3):
super().__init__(prog, max_help_position=float("inf"), width=float("inf"))
def __init__(self, prog: str, starting_heading_level: int = 3):
super().__init__(prog, max_help_position=sys.maxsize, width=sys.maxsize)
self._section_heading_prefix = "#" * starting_heading_level
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
self._markdown_output = []
def start_section(self, heading):
def start_section(self, heading: str):
if heading not in {"positional arguments", "options"}:
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
self._markdown_output.append(heading_md)
@ -113,14 +132,14 @@ class MarkdownFormatter(HelpFormatter):
def end_section(self):
pass
def add_text(self, text):
def add_text(self, text: str):
if text:
self._markdown_output.append(f"{text.strip()}\n\n")
def add_usage(self, usage, actions, groups, prefix=None):
pass
def add_arguments(self, actions):
def add_arguments(self, actions: Iterable[Action]):
for action in actions:
if len(action.option_strings) == 0 or "--help" in action.option_strings:
continue
@ -169,7 +188,7 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
# Auto-mock runtime imports
if tb_list := traceback.extract_tb(e.__traceback__):
path = Path(tb_list[-1].filename).relative_to(ROOT_DIR)
auto_mock(module=".".join(path.parent.parts), attr=path.stem)
auto_mock(module_name=".".join(path.parent.parts), attr=path.stem)
return create_parser(add_cli_args, **kwargs)
else:
raise e
@ -209,7 +228,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
# Generate documentation for each parser
for stem, parser in parsers.items():
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
doc_path = ARGPARSE_DOC_DIR / f"{stem}.inc.md"
# Specify encoding for building on Windows
with open(doc_path, "w", encoding="utf-8") as f:
f.write(super(type(parser), parser).format_help())

View File

@ -15,9 +15,9 @@ These models are what we list in [supported text models](#list-of-text-only-lang
### Transformers
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend".
Currently, the Transformers backend works for the following:
Currently, the Transformers modeling backend works for the following:
- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only, mixture-of-experts
@ -25,7 +25,7 @@ Currently, the Transformers backend works for the following:
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM:
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model)))
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
!!! note
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with the config's `dtype` if it exists. In contrast, the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer
If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
For a model to be compatible with the Transformers backend for vLLM it must:
For a model to be compatible with the Transformers modeling backend for vLLM it must:
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
- The model directory must have the correct structure (e.g. `config.json` is present).
- `config.json` must contain `auto_map.AutoModel`.
- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
- Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
If the compatible model is:
@ -66,13 +66,13 @@ If the compatible model is:
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md).
- in a local directory, simply pass the directory path to `model=<MODEL_DIR>` for [offline-inference](../serving/offline_inference.md) or `vllm serve <MODEL_DIR>` for the [openai-compatible-server](../serving/openai_compatible_server.md).
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
#### Writing custom models
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
To make your model compatible with the Transformers backend, it needs:
To make your model compatible with the Transformers modeling backend, it needs:
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
- If your model is encoder-only:
@ -134,7 +134,7 @@ Here is what happens in the background when this model is loaded:
1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
3. `MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
That's it!
@ -182,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
Models do not _need_ to be natively supported to be used in vLLM.
The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
!!! tip
The easiest way to check if your model is really supported at runtime is to run the program below:
@ -351,6 +351,7 @@ th {
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|-------------------|----------------------|---------------------------|
| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ |
| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
@ -451,7 +452,7 @@ th {
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|-------------------|----------------------|---------------------------|
@ -669,7 +670,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ |
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>E+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
@ -684,7 +685,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc. | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ |
@ -720,7 +721,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
@ -785,6 +786,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
!!! note
`VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
### Pooling Models
See [this page](./pooling_models.md) for more information on how to use pooling models.

View File

@ -1,6 +1,6 @@
# Using vLLM
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
First, vLLM must be [installed](../getting_started/installation/README.md) for your chosen device in either a Python or Docker environment.
Then, vLLM supports the following usage patterns:

View File

@ -43,6 +43,7 @@ class ModelRequestData(NamedTuple):
# Voxtral
# Make sure to install mistral-common[audio].
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import (

View File

@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
from transformers import AutoTokenizer
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
DUMMY_API_KEY = "empty"
MODEL_NAME = "Qwen/Qwen3-0.6B"
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
client = httpx.Client(
transport=transport,
base_url=GEN_ENDPOINT,
timeout=600,
headers=headers,
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "How many countries are in the EU?"},
]
def main(client):
    """Send a tokenized chat prompt to the generate endpoint and print the reply.

    The chat `messages` are turned into token ids with the model's chat
    template, posted to `GEN_ENDPOINT` together with the sampling parameters,
    and the token ids of the first returned choice are decoded and printed.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    prompt_token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    sampling_params = {"max_tokens": 24, "temperature": 0.2, "detokenize": False}
    request_body = {
        "model": MODEL_NAME,
        "token_ids": prompt_token_ids,
        "sampling_params": sampling_params,
        "stream": False,
    }

    response = client.post(GEN_ENDPOINT, json=request_body)
    # Fail on HTTP error statuses before touching the response body.
    response.raise_for_status()
    result = response.json()

    separator = "-" * 50
    print(result)
    print(separator)
    print("Token generation results:")
    # Decode the generated token ids of the first choice on the client side.
    generated_text = tokenizer.decode(result["choices"][0]["token_ids"])
    print(generated_text)
    print(separator)
if __name__ == "__main__":
main(client)

View File

@ -1,6 +0,0 @@
#!/bin/bash
echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
echo "Please run 'pip install -r requirements/lint.txt', followed by"
echo "'pre-commit install' to install the pre-commit hooks."
echo "Then linters will run automatically before each commit."

View File

@ -30,8 +30,8 @@ filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/31
partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
mistral_common[image,audio] >= 1.8.5
gguf >= 0.17.0
mistral_common[image] >= 1.8.5
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12

View File

@ -4,8 +4,9 @@ packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
wheel
jinja2>=3.1.6

View File

@ -10,3 +10,7 @@ mkdocs-minify-plugin
regex
ruff
pydantic
# For generating argparse docs.
# Adding requirements here should only be used as a last resort.
msgspec # Needed for multiple inheritance involving msgspec.Struct

View File

@ -22,6 +22,8 @@ from vllm.config import (
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
@ -193,7 +195,14 @@ def run_model(
@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@create_new_process_for_each_test("spawn")
def test_multi_graph_piecewise_compile(
use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch
):
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

View File

@ -21,6 +21,8 @@ from vllm.config import (
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from ..silly_attention import get_global_counter, reset_global_counter
@ -124,6 +126,7 @@ def _run_simple_model(
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
@create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor):
_run_simple_model(
splitting_ops=["silly::attention"],

View File

@ -29,6 +29,8 @@ from vllm.config import (
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
@ -334,6 +336,7 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
("inductor", True), # Inductor, Inductor partition
],
)
@create_new_process_for_each_test("spawn")
def test_toy_llama(
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
):
@ -513,4 +516,8 @@ def benchmark():
if __name__ == "__main__":
benchmark()
# Protect against subprocess reimport when using spawn_new_process_for_each_test
import os
if os.environ.get("RUNNING_IN_SUBPROCESS") != "1":
benchmark()

View File

@ -20,13 +20,22 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..utils import flat_product, multi_gpu_test
is_blackwell = lambda: current_platform.is_device_capability(100)
"""Are we running on Blackwell, a lot of tests depend on it"""
class Matches(NamedTuple):
    """Expected match counts for a test case; every count defaults to 0."""

    # Compared against the "Fused quant onto N attention nodes" log line.
    attention_fusion: int = 0
    # Compared against the collective fusion "Replaced N patterns" log line.
    allreduce_fusion: int = 0
    # NOTE(review): presumably the expected count for the sequence-parallel
    # pass; its use is not visible here, confirm against the tests.
    sequence_parallel: int = 0
    # NOTE(review): presumably the expected count for the AsyncTP pass;
    # its use is not visible here, confirm against the tests.
    async_tp: int = 0
class ModelBackendTestCase(NamedTuple):
model_name: str
model_kwargs: dict[str, Any]
backend: AttentionBackendEnum
attention_fusions: int
allreduce_fusions: int | None = None
matches: Matches
MODELS_FP8: list[ModelBackendTestCase] = []
@ -38,17 +47,33 @@ if current_platform.is_cuda():
ModelBackendTestCase(
# Use smaller model for L40s in CI
model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=32,
allreduce_fusions=65,
# TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
# so FI attention+fp8_quant is at least tested once
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER
if is_blackwell()
else AttentionBackendEnum.TRITON_ATTN,
matches=Matches(
attention_fusion=32,
allreduce_fusion=65,
sequence_parallel=65,
async_tp=128,
),
),
ModelBackendTestCase(
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER,
attention_fusions=48,
allreduce_fusions=96,
# TODO FlashInfer attn broken on Hopper with kvcache=fp8:
# https://github.com/vllm-project/vllm/issues/28568
# TODO FlashInfer attn broken on Blackwell for llama4:
# https://github.com/vllm-project/vllm/issues/28604
backend=AttentionBackendEnum.TRITON_ATTN,
matches=Matches(
attention_fusion=48,
allreduce_fusion=96,
sequence_parallel=96,
async_tp=95, # mlp is moe, no fusion there
),
),
]
@ -57,8 +82,12 @@ if current_platform.is_cuda():
model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER,
attention_fusions=32,
allreduce_fusions=65,
matches=Matches(
attention_fusion=32,
allreduce_fusion=65,
sequence_parallel=65,
async_tp=128,
),
),
]
@ -68,15 +97,23 @@ if current_platform.is_cuda():
model_name="meta-llama/Llama-3.1-8B-Instruct",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=0,
allreduce_fusions=65,
matches=Matches(
attention_fusion=0,
allreduce_fusion=65,
sequence_parallel=65,
async_tp=128,
),
),
ModelBackendTestCase(
model_name="Qwen/Qwen3-30B-A3B",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=0,
allreduce_fusions=97,
matches=Matches(
attention_fusion=0,
allreduce_fusion=97,
sequence_parallel=97,
async_tp=96, # MLP is MoE, half the fusions of dense
),
),
]
@ -86,19 +123,19 @@ elif current_platform.is_rocm():
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=32,
matches=Matches(attention_fusion=32),
),
ModelBackendTestCase(
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.ROCM_ATTN,
attention_fusions=32,
matches=Matches(attention_fusion=32),
),
ModelBackendTestCase(
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
attention_fusions=32,
matches=Matches(attention_fusion=32),
),
]
@ -106,8 +143,7 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
@pytest.mark.parametrize(
"model_name, model_kwargs, backend, "
"attention_fusions, allreduce_fusions, custom_ops",
"model_name, model_kwargs, backend, matches, custom_ops",
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
# quant_fp4 only has the custom impl
@ -118,15 +154,14 @@ def test_attn_quant(
model_name: str,
model_kwargs: dict[str, Any],
backend: AttentionBackendEnum,
attention_fusions: int,
allreduce_fusions: int,
matches: Matches,
custom_ops: str,
inductor_graph_partition: bool,
caplog_mp_spawn,
monkeypatch,
):
if backend == AttentionBackendEnum.FLASHINFER and (
not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
not is_blackwell() or not has_flashinfer()
):
pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
@ -169,12 +204,12 @@ def test_attn_quant(
with caplog_mp_spawn(logging.DEBUG) as log_holder:
run_model(compilation_config, model_name, **model_kwargs)
matches = re.findall(
log_matches = re.findall(
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
log_holder.text,
)
assert len(matches) == 1, log_holder.text
assert int(matches[0]) == attention_fusions
assert len(log_matches) == 1, log_holder.text
assert int(log_matches[0]) == matches.attention_fusion
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
@ -187,8 +222,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model_name, model_kwargs, backend, "
"attention_fusions, allreduce_fusions, custom_ops",
"model_name, model_kwargs, backend, matches, custom_ops",
# Toggle RMSNorm and QuantFP8 for FP8 models
list(
flat_product(
@ -209,8 +243,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
model_name: str,
model_kwargs: dict,
backend: AttentionBackendEnum,
attention_fusions: int,
allreduce_fusions: int,
matches: Matches,
custom_ops: str,
inductor_graph_partition: bool,
caplog_mp_spawn,
@ -219,6 +252,13 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("Inductor graph partition requires torch>=2.9")
if "fp4" in model_name.lower() and not is_blackwell():
pytest.skip("NVFP4 quant requires Blackwell")
if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
# FlashInfer attn fusion requires Blackwell
matches = matches._replace(attention_fusion=0)
custom_ops_list = custom_ops.split(",") if custom_ops else []
if inductor_graph_partition:
@ -258,23 +298,135 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
run_model(
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
)
matches = re.findall(
log_matches = re.findall(
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
log_holder.text,
)
assert len(matches) == 2, log_holder.text
assert len(log_matches) == 2, log_holder.text
assert int(matches[0]) == attention_fusions
assert int(matches[1]) == attention_fusions
assert int(log_matches[0]) == matches.attention_fusion
assert int(log_matches[1]) == matches.attention_fusion
matches = re.findall(
log_matches = re.findall(
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
log_holder.text,
)
assert len(matches) == 2, log_holder.text
assert len(log_matches) == 2, log_holder.text
assert int(matches[0]) == allreduce_fusions
assert int(matches[1]) == allreduce_fusions
assert int(log_matches[0]) == matches.allreduce_fusion
assert int(log_matches[1]) == matches.allreduce_fusion
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Toggle RMSNorm and QuantFP8 for FP8 models
    list(
        flat_product(
            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
        )
    )
    # Toggle RMSNorm for FP4 models and unquant models
    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="sequence parallel only tested on CUDA",
)
def test_tp2_attn_quant_async_tp(
    model_name: str,
    model_kwargs: dict,
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Run a TP=2 model with attention fusion, sequence parallelism and
    async TP enabled, then verify the fusion counts reported in the logs.

    The model is run with ``tensor_parallel_size=2`` while debug logs are
    captured. For each of the three passes checked, exactly two matching
    log lines are expected (presumably one per TP rank - TODO confirm), and
    each must report the count stored in the corresponding field of
    ``matches``.
    """
    # --- Skip conditions -------------------------------------------------
    if is_blackwell():
        # TODO: https://github.com/vllm-project/vllm/issues/27893
        pytest.skip("Blackwell is not supported for AsyncTP pass")
    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")
    if "fp4" in model_name.lower() and not is_blackwell():
        pytest.skip("NVFP4 quant requires Blackwell")

    uses_flashinfer = backend == AttentionBackendEnum.FLASHINFER
    if uses_flashinfer and not has_flashinfer():
        pytest.skip("FlashInfer backend requires flashinfer installed")
    if uses_flashinfer and not is_blackwell():
        # FlashInfer attention fusion needs Blackwell, so expect no fusions.
        matches = matches._replace(attention_fusion=0)

    # --- Compilation settings derived from the parameters ----------------
    if custom_ops:
        custom_ops_list = custom_ops.split(",")
    else:
        custom_ops_list = []

    splitting_ops: list[str] | None
    if inductor_graph_partition:
        mode, splitting_ops = CUDAGraphMode.FULL_AND_PIECEWISE, None
    else:
        mode, splitting_ops = CUDAGraphMode.FULL_DECODE_ONLY, []

    # The compile cache must be off so the custom passes actually run;
    # otherwise the logs cannot prove that fusion happened.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # Capturing subprocess logs requires knowing the start method;
    # spawn is forced because it is the more general one.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)

    pass_config = PassConfig(
        enable_attn_fusion=True,
        enable_noop=True,
        enable_sequence_parallelism=True,
        enable_async_tp=True,
    )
    compilation_config = CompilationConfig(
        # Testing properties
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        custom_ops=custom_ops_list,
        splitting_ops=splitting_ops,
        # Common
        level=CompilationMode.VLLM_COMPILE,
        pass_config=pass_config,
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(
            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
        )

    # (log line pattern, expected count) for every pass under test,
    # checked in the same order as the passes are listed here.
    expected_counts = (
        (
            r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
            matches.attention_fusion,
        ),
        (
            r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
            matches.sequence_parallel,
        ),
        (
            r"collective_fusion.py:\d+] Replaced (\d+) patterns",
            matches.async_tp,
        ),
    )
    for pattern, expected in expected_counts:
        log_matches = re.findall(pattern, log_holder.text)
        assert len(log_matches) == 2, log_holder.text
        assert int(log_matches[0]) == expected
        assert int(log_matches[1]) == expected
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):

View File

@ -5,15 +5,15 @@ import pytest
import torch
import vllm.envs as envs
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import RMSNormQuantFusionPass
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.fx_utils import find_auto_fn
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.config import (
CompilationConfig,
CUDAGraphMode,
DeviceConfig,
ModelConfig,
PassConfig,
@ -27,6 +27,7 @@ from vllm.distributed.parallel_state import (
initialize_model_parallel,
)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
@ -43,172 +44,157 @@ prompts = [
]
class TestModel(torch.nn.Module):
def __init__(self, hidden_size=16, intermediate_size=32):
class TestAllReduceRMSNormModel(torch.nn.Module):
def __init__(self, hidden_size=16, eps=1e-6):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.gate_proj = torch.nn.Parameter(
torch.empty((intermediate_size, hidden_size))
)
self.norm = RMSNorm(intermediate_size, 1e-05)
# Initialize weights
torch.nn.init.normal_(self.gate_proj, std=0.02)
self.eps = eps
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
def forward(self, hidden_states, residual):
"""
Forward pass implementing the operations in the FX graph
def forward(self, x):
z = torch.relu(x)
x = resid = tensor_model_parallel_all_reduce(z)
y = self.norm[0](x)
Args:
hidden_states: Input tensor
residual: Residual tensor from previous layer
z2 = torch.mm(y, self.w[0])
x2 = tensor_model_parallel_all_reduce(z2)
Returns:
Tuple containing the output tensor
"""
# Reshape input
view = hidden_states.reshape(-1, self.hidden_size)
y2, resid = self.norm[1](x2, resid)
# matrix multiplication
permute = self.gate_proj.permute(1, 0)
mm = torch.mm(view, permute)
z3 = torch.mm(y2, self.w[1])
x3 = tensor_model_parallel_all_reduce(z3)
# Tensor parallel all-reduce
all_reduce = tensor_model_parallel_all_reduce(mm)
y3, resid = self.norm[2](x3, resid)
# layer normalization
norm_output, residual_output = self.norm(all_reduce, residual)
z4 = torch.mm(y3, self.w[2])
x4 = tensor_model_parallel_all_reduce(z4)
return norm_output, residual_output
y4, resid = self.norm[3](x4, resid)
return y4
def ops_in_model_before(self):
return [torch.ops.vllm.all_reduce.default]
def ops_in_model_after(self):
return [
torch.ops.vllm.reduce_scatter.default,
torch.ops.vllm.all_gather.default,
torch.ops.vllm.reduce_scatter.default,
]
def ops_in_model(self):
return [torch.ops._C.fused_add_rms_norm.default]
if RMSNorm.enabled():
return [
torch.ops._C.rms_norm.default,
torch.ops._C.fused_add_rms_norm.default,
]
else:
return []
class TestQuantModel(torch.nn.Module):
def __init__(self, hidden_size=16, intermediate_size=32):
class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
def __init__(self, hidden_size=16, eps=1e-6):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.vllm_config = get_current_vllm_config()
self.gate_proj = torch.nn.Parameter(
torch.empty((intermediate_size, hidden_size)), requires_grad=False
)
self.norm = RMSNorm(intermediate_size, 1e-05)
# Initialize weights
torch.nn.init.normal_(self.gate_proj, std=0.02)
self.hidden_size = hidden_size
self.eps = eps
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
self.w = [
torch.rand(hidden_size, hidden_size)
.to(dtype=current_platform.fp8_dtype())
.t()
for _ in range(3)
]
self.fp8_linear = Fp8LinearOp(act_quant_static=True)
self.scale = torch.rand(1, dtype=torch.float32)
# Create a weight that is compatible with torch._scaled_mm,
# which expects a column-major layout.
self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t()
self.wscale = torch.rand(1, dtype=torch.float32)
def forward(self, hidden_states, residual):
"""
Forward pass implementing the operations in the FX graph
Args:
hidden_states: Input tensor
residual: Residual tensor from previous layer
Returns:
Tuple containing the output tensor
"""
# Reshape input
view = hidden_states.reshape(-1, self.hidden_size)
# matrix multiplication
permute = self.gate_proj.permute(1, 0)
mm = torch.mm(view, permute)
# Tensor parallel all-reduce
all_reduce = tensor_model_parallel_all_reduce(mm)
# layer normalization
norm_output, residual_output = self.norm(all_reduce, residual)
# scaled_mm with static input quantization
fp8_linear_result = self.fp8_linear.apply(
norm_output,
self.w,
self.wscale,
input_scale=self.scale.to(norm_output.device),
self.fp8_linear = Fp8LinearOp(
act_quant_static=True,
act_quant_group_shape=GroupShape.PER_TENSOR,
)
return fp8_linear_result, residual_output
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
def ops_in_model_before(self):
ops_to_remove = [torch.ops.vllm.all_reduce.default] # Always removed by SP
# The following are only removed if fusion happens
if (
self.vllm_config
and self.vllm_config.compilation_config.pass_config.enable_fusion
):
ops_to_remove.extend(
[
torch.ops._C.fused_add_rms_norm.default,
torch.ops._C.static_scaled_fp8_quant.default,
]
)
return ops_to_remove
def forward(self, hidden_states):
# avoid having graph input be an arg to a pattern directly
z = torch.relu(hidden_states)
x = resid = tensor_model_parallel_all_reduce(z)
y = self.norm[0](x)
z2 = self.fp8_linear.apply(
y, self.w[0], self.wscale[0], input_scale=self.scale[0]
)
x2 = tensor_model_parallel_all_reduce(z2)
y2, resid = self.norm[1](x2, resid)
z3 = self.fp8_linear.apply(
y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
)
x3 = tensor_model_parallel_all_reduce(z3)
y3, resid = self.norm[2](x3, resid) # use resid here
z4 = self.fp8_linear.apply(
y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
)
x4 = tensor_model_parallel_all_reduce(z4)
y4, resid = self.norm[3](x4, resid) # use resid here
return y4
def ops_in_model_after(self):
ops_to_add = [
torch.ops.vllm.reduce_scatter.default,
return [
torch.ops.vllm.all_gather.default,
torch.ops.vllm.reduce_scatter.default,
]
def ops_in_model_before(self):
return [
torch.ops.vllm.all_reduce.default,
]
# The following is only added if fusion happens
if (
self.vllm_config
and self.vllm_config.compilation_config.pass_config.enable_fusion
):
ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default)
return ops_to_add
def ops_in_model(self):
if (
self.vllm_config
and self.vllm_config.compilation_config.pass_config.enable_fusion
):
# If fusion happens, the fused op is the one
# we check for (de)functionalization
if self.vllm_config.compilation_config.pass_config.enable_fusion:
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
else:
# If no fusion, the original ops are checked
elif RMSNorm.enabled():
return [
torch.ops._C.fused_add_rms_norm.default,
# TODO functionalization pass does not handle this yet
# torch.ops._C.static_scaled_fp8_quant.default,
]
elif self.fp8_linear.quant_fp8.enabled():
return [
torch.ops._C.static_scaled_fp8_quant.default,
]
else:
return []
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("test_model_cls", [TestModel, TestQuantModel])
@pytest.mark.parametrize(
"test_model_cls, custom_ops",
[
(TestAllReduceRMSNormModel, "+rms_norm"),
(TestAllReduceRMSNormModel, "-rms_norm"),
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"),
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"),
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"),
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"),
],
)
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [16])
@pytest.mark.parametrize("hidden_size", [16])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("enable_fusion", [True, False])
@pytest.mark.parametrize("dynamic", [False, True])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_sequence_parallelism_pass(
test_model_cls: type[torch.nn.Module],
custom_ops: str,
batch_size: int,
seq_len: int,
hidden_size: int,
dtype: torch.dtype,
enable_fusion: bool,
dynamic: bool,
):
num_processes = 2
@ -220,11 +206,13 @@ def test_sequence_parallelism_pass(
args=(
num_processes,
test_model_cls,
custom_ops,
batch_size,
seq_len,
hidden_size,
dtype,
enable_fusion,
dynamic,
),
nprocs=nprocs,
)
@ -236,11 +224,13 @@ def sequence_parallelism_pass_on_test_model(
local_rank: int,
world_size: int,
test_model_cls: type[torch.nn.Module],
custom_ops: str,
batch_size: int,
seq_len: int,
hidden_size: int,
dtype: torch.dtype,
enable_fusion: bool,
dynamic: bool,
):
current_platform.seed_everything(0)
@ -264,12 +254,16 @@ def sequence_parallelism_pass_on_test_model(
initialize_model_parallel(tensor_model_parallel_size=world_size)
# configure vllm config for SequenceParallelismPass
custom_ops_list = custom_ops.split(",") if custom_ops else []
compilation_config = CompilationConfig(
splitting_ops=[], # avoid automatic rms_norm enablement
cudagraph_mode=CUDAGraphMode.NONE, # avoid piecewise warnings
custom_ops=custom_ops_list,
pass_config=PassConfig(
enable_sequence_parallelism=True,
enable_fusion=enable_fusion,
enable_noop=True,
)
),
) # NoOp needed for fusion
device_config = DeviceConfig(device=torch.device("cuda"))
@ -289,7 +283,6 @@ def sequence_parallelism_pass_on_test_model(
with set_current_vllm_config(vllm_config):
noop_pass = NoOpEliminationPass(vllm_config)
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
func_pass = FixFunctionalizationPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
assert (
sequence_parallelism_pass.compilation_config.splitting_ops
@ -310,38 +303,29 @@ def sequence_parallelism_pass_on_test_model(
passes_for_backend.append(cleanup_pass)
backend_no_func = TestBackend(*passes_for_backend)
backend_func = TestBackend(*passes_for_backend, func_pass)
backend = TestBackend(*passes_for_backend)
model = test_model_cls(hidden_size, hidden_size * 2)
model = test_model_cls(hidden_size)
hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
compiled_model_no_func = torch.compile(model, backend=backend_no_func)
compiled_model_no_func(hidden_states, residual)
compiled_model_func = torch.compile(model, backend=backend_func)
compiled_model_func(hidden_states, residual)
if dynamic:
torch._dynamo.mark_dynamic(hidden_states, 0)
assert sequence_parallelism_pass.matched_count == 1
compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states)
assert sequence_parallelism_pass.matched_count == 4
# In pre-nodes, all reduce should be there,
# reduce scatter and all gather should not
backend_no_func.check_before_ops(model.ops_in_model_before())
for op in model.ops_in_model_before():
assert backend.op_count(op, before=True) == 4
# In post-nodes, reduce scatter and all gather should be there,
# all reduce should not
backend_no_func.check_after_ops(model.ops_in_model_after())
for op in model.ops_in_model_after():
assert backend.op_count(op, before=False) == 4
# check if the functionalization pass is applied
for op in model.ops_in_model():
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
# make sure the ops were all de-functionalized
found = dict()
for node in backend_func.graph_post_pass.nodes:
for op in model.ops_in_model():
if is_func(node, op):
found[op] = True
assert all(found[op] for op in model.ops_in_model())
find_auto_fn(backend.graph_post_pass.nodes, op)

View File

@ -2,59 +2,134 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
import torch
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
from vllm.config import CompilationMode
from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
from vllm.config import (
CompilationConfig,
CompilationMode,
VllmConfig,
set_current_vllm_config,
)
class MyMod(torch.nn.Module):
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
if cache is not None:
return x + cache
return x * 2
if x.size()[0] >= 4:
return x * 2
else:
return x * 100
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
class MyWrapper(TorchCompileWithNoGuardsWrapper):
def __init__(self, model):
self.model = model
compiled_callable = torch.compile(self.forward, backend="eager")
super().__init__(
compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
super().__init__()
def forward(self, x: torch.Tensor): # type: ignore[override]
# this is the function to be compiled
return self.model(x)
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
"""Test basic functionality of TorchCompileWithNoGuardsWrapper."""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Create a proper vLLM config instead of mocking
vllm_config = VllmConfig()
vllm_config.compilation_config = CompilationConfig()
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
vllm_config.compilation_config.backend = "inductor"
# Test DYNAMO_TRACE_ONCE
with set_current_vllm_config(vllm_config):
torch._dynamo.reset()
mod = MyMod()
wrapper = MyWrapper(mod)
# First call should trigger compilation
x = torch.tensor([1, 2, 3, 4])
torch._dynamo.mark_dynamic(x, 0)
result1 = wrapper(x)
expected1 = torch.tensor([2, 4, 6, 8])
assert torch.allclose(result1, expected1), (
f"Expected {expected1}, got {result1}"
)
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# this is the function to be compiled
return self.model(x, cache)
# Second call should use compiled code
x2 = torch.tensor([1, 2, 3])
result2 = wrapper(x2)
expected2 = torch.tensor([2, 4, 6])
assert torch.allclose(result2, expected2), (
f"Expected {expected2}, got {result2}"
)
def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# let torch.compile compile twice
if len(self.compiled_codes) == 2:
dispatch_id = 0 if cache is None else 1
with self.dispatch_to_code(dispatch_id):
return self.forward(x, cache)
else:
return self.compiled_callable(x, cache)
# without the wrapper result would be different.
result3 = mod(x2)
expected3 = torch.tensor([100, 200, 300])
assert torch.allclose(result3, expected3), (
f"Expected {result3}, got {expected3}"
)
def test_torch_compile_wrapper():
mod = MyMod()
wrappers = []
for i in range(3):
torch._dynamo.reset()
# with STOCK_TORCH_COMPILE we do not remove guards.
vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE
torch._dynamo.reset()
with set_current_vllm_config(vllm_config):
mod = MyMod()
wrapper = MyWrapper(mod)
wrappers.append(wrapper)
x = torch.tensor([1])
wrapper(x, None) # profile run, compile
# create a cache tensor
cache = torch.tensor([2])
wrapper(x, cache) # warm up with cache, recompile
# for new input, dispatch to the compiled code directly
new_x = torch.tensor([3])
assert wrapper(new_x, None).item() == 6 # dispatch to the first compiled code
assert wrapper(new_x, cache).item() == 5 # dispatch to the second compiled code
# First call should trigger compilation
x = torch.tensor([1, 2, 3, 4])
torch._dynamo.mark_dynamic(x, 0)
for wrapper in wrappers:
# make sure they have independent compiled codes
assert len(wrapper.compiled_codes) == 2
result1 = wrapper(x)
expected1 = torch.tensor([2, 4, 6, 8])
assert torch.allclose(result1, expected1), (
f"Expected {expected1}, got {result1}"
)
# Second call should trigger another compilation
x2 = torch.tensor([1, 2, 3])
result2 = wrapper(x2)
expected2 = torch.tensor([100, 200, 300])
assert torch.allclose(result2, expected2), (
f"Expected {expected2}, got {result2}"
)
# NO_COMPILATION level not supported.
vllm_config.compilation_config.mode = None
torch._dynamo.reset()
with set_current_vllm_config(vllm_config):
torch._dynamo.reset()
mod = MyMod()
try:
wrapper = MyWrapper(mod)
except Exception:
return
raise AssertionError("expected an exception to be raised")
if __name__ == "__main__":
# Run with both parameter values
class MockMonkeypatch:
def setenv(self, name, value):
os.environ[name] = value
mp = MockMonkeypatch()
print("Testing with VLLM_USE_BYTECODE_HOOK=False")
test_torch_compile_wrapper(False, mp)
print("Testing with VLLM_USE_BYTECODE_HOOK=True")
test_torch_compile_wrapper(True, mp)
print("All tests passed!")

View File

@ -0,0 +1,437 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Integration tests for MultiprocExecutor at the executor level.
This test directly tests the executor without going through the LLM interface,
focusing on executor initialization, RPC calls, and distributed execution.
"""
import multiprocessing
import os
from tests.utils import multi_gpu_test
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_open_port
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
MODEL = "facebook/opt-125m"
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    distributed_executor_backend: str = "mp",
    nnodes: int = 1,
    node_rank: int = 0,
    master_port: int = 0,
) -> VllmConfig:
    """Build a ``VllmConfig`` for ``MODEL`` through ``EngineArgs``.

    The engine is always configured with ``enforce_eager=True``. The
    multi-node fields (``nnodes``, ``node_rank``, ``master_port``) are
    written onto ``parallel_config`` only when the caller asks for more
    than one node or a non-zero node rank; custom all-reduce is disabled
    whenever more than one node is requested.
    """
    config = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    ).create_engine_config()

    is_multi_node = nnodes > 1

    # Only touch the distributed node settings when they differ from the
    # single-node defaults.
    if is_multi_node or node_rank > 0:
        parallel = config.parallel_config
        parallel.nnodes = nnodes
        parallel.node_rank = node_rank
        parallel.master_port = master_port

    if is_multi_node:
        config.parallel_config.disable_custom_all_reduce = True

    return config
def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
    """Return an empty ``SchedulerOutput`` placeholder for tests.

    This is a simplified stand-in: every request list is empty and no
    tokens are scheduled. ``num_requests`` is accepted but currently not
    used. A realistic ``SchedulerOutput`` would have to be constructed
    according to the actual vLLM v1 API.
    """
    empty_schedule = dict(
        scheduled_new_reqs=[],
        scheduled_resumed_reqs=[],
        scheduled_running_reqs=[],
        num_scheduled_tokens={},
        total_num_scheduled_tokens=0,
    )
    return SchedulerOutput(**empty_schedule)
def test_multiproc_executor_initialization():
    """Test that MultiprocExecutor can be initialized with proper config."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )

    # Create executor - this should initialize workers
    executor = MultiprocExecutor(vllm_config=vllm_config)

    try:
        # Verify executor properties
        assert executor.world_size == 1, "World size should be 1 for single GPU"
        assert executor.local_world_size == 1, "Local world size should be 1"
        assert hasattr(executor, "workers"), "Executor should have workers"
        assert len(executor.workers) == 1, "Should have 1 worker for single GPU"
    finally:
        # Shut down even when an assertion above fails, so a failing test
        # does not leave worker processes behind for the tests that follow.
        executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_initialization_tensor_parallel():
    """Test MultiprocExecutor initialization with tensor parallelism."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=1,
    )

    # Create executor
    executor = MultiprocExecutor(vllm_config=vllm_config)

    try:
        # Verify executor properties
        assert executor.world_size == 2, "World size should be 2 for TP=2"
        assert executor.local_world_size == 2, "Local world size should be 2"
        assert len(executor.workers) == 2, "Should have 2 workers for TP=2"

        # Verify output rank calculation
        output_rank = executor._get_output_rank()
        assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"
    finally:
        # Shut down even when an assertion above fails, so a failing test
        # does not leave worker processes behind for the tests that follow.
        executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_collective_rpc():
    """Check that a health-check RPC succeeds on a TP=2 executor."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # The health check must complete without raising.
        executor.check_health()

        # Only the RPC mechanism is under test here; no model execution
        # is performed.
        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        executor.shutdown()
def test_multiproc_executor_failure_callback():
    """Check when a registered failure callback gets invoked.

    A callback registered on a healthy executor must not fire, while one
    registered after ``is_failed`` has been set must fire right away.
    """
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    try:
        invocations = []

        def record_failure():
            invocations.append(True)

        # Registration on a healthy executor: nothing should happen yet.
        executor.register_failure_callback(record_failure)
        assert len(invocations) == 0, "Callback should not be invoked immediately"

        # Mark the executor as failed by hand to simulate a failure.
        executor.is_failed = True

        # Registration on a failed executor: the callback fires immediately.
        executor.register_failure_callback(record_failure)
        assert len(invocations) == 1, (
            "Callback should be invoked when executor is failed"
        )
    finally:
        executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_worker_monitor():
    """Check worker liveness before and after shutting a TP=2 executor down."""
    import time

    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # While the executor is running, every worker process is alive ...
        for running in executor.workers:
            assert running.proc.is_alive(), (
                f"Worker rank {running.rank} should be alive"
            )

        # ... and the executor has not flagged a failure.
        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        executor.shutdown()

    # Allow the processes a short moment to exit before checking them.
    time.sleep(0.5)
    for stopped in executor.workers:
        assert not stopped.proc.is_alive(), (
            f"Worker rank {stopped.rank} should terminate after shutdown"
        )
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_get_response_message_queues():
    """Check ``get_response_mqs`` with and without a specific reply rank."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # Without a rank filter there is one queue per worker.
        every_queue = executor.get_response_mqs()
        assert len(every_queue) == 2, "Should have 2 message queues for 2 workers"

        # With a rank filter exactly one queue comes back, for either rank.
        for rank in (0, 1):
            rank_queues = executor.get_response_mqs(unique_reply_rank=rank)
            assert len(rank_queues) == 1, (
                f"Should have 1 message queue for rank {rank}"
            )
    finally:
        executor.shutdown()
def test_multiproc_executor_shutdown_cleanup():
    """Test that shutdown properly cleans up resources."""
    import time

    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )

    executor = MultiprocExecutor(vllm_config=vllm_config)

    try:
        # Verify executor is set up
        assert hasattr(executor, "workers"), "Executor should have workers"
        assert len(executor.workers) > 0, "Should have at least one worker"

        # Shutdown
        executor.shutdown()

        # Verify cleanup
        time.sleep(0.5)  # Give processes time to terminate
        for worker in executor.workers:
            assert not worker.proc.is_alive(), (
                "Worker processes should be terminated"
            )

        # Verify shutdown event is set
        assert executor.shutdown_event.is_set(), "Shutdown event should be set"

        # Multiple shutdowns should be safe (idempotent)
        executor.shutdown()
        executor.shutdown()
    finally:
        # Safety net: if one of the set-up assertions fails before the first
        # explicit shutdown, the workers must still be torn down. This relies
        # on the same idempotence of shutdown() that the test verifies above.
        executor.shutdown()
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_pipeline_parallel():
    """Check a TP=2, PP=2 executor: sizes, output rank and batch concurrency."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=2,
        )
    )

    try:
        # Four ranks in total, one worker each.
        assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
        assert len(executor.workers) == 4, "Should have 4 workers"

        # With TP=2, PP=2 the last pipeline stage holds ranks 2-3 and the
        # output is expected from its first rank, i.e. rank 2.
        reporting_rank = executor._get_output_rank()
        assert reporting_rank == 2, (
            "Output rank should be 2 (first rank of last PP stage)"
        )

        # The number of concurrent batches is expected to follow the PP size.
        assert executor.max_concurrent_batches == 2, (
            "Max concurrent batches should equal PP size"
        )
    finally:
        executor.shutdown()
def test_multiproc_executor_properties():
    """Check derived executor properties against the parallel config."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    try:
        # Pipeline parallelism is advertised at class level.
        assert MultiprocExecutor.supports_pp is True, (
            "MultiprocExecutor should support pipeline parallelism"
        )

        # world_size is the product of the TP and PP sizes.
        tp_times_pp = (
            executor.parallel_config.tensor_parallel_size
            * executor.parallel_config.pipeline_parallel_size
        )
        assert executor.world_size == tp_times_pp, "World size should equal TP * PP"

        # local_world_size is this node's share of the world size.
        ranks_per_node = (
            executor.parallel_config.world_size // executor.parallel_config.nnodes
        )
        assert executor.local_world_size == ranks_per_node, (
            "Local world size should be world_size / nnodes"
        )
    finally:
        executor.shutdown()
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_multi_node():
    """
    Test MultiprocExecutor with multi-node configuration.

    This simulates 2 nodes with TP=4:
    - Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
    - Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
    Total world_size = 4, nnodes = 2

    Each simulated node runs in its own process and reports its outcome
    through a shared queue; the parent joins both processes (terminating
    them on timeout) and then checks the per-node reports.
    """
    import queue

    port = get_open_port()

    # symm_mem does not work for simulating multi instance in single node.
    # The variable is set in the parent so both child processes inherit it,
    # and the previous value is restored afterwards so the override does not
    # leak into other tests running in the same process.
    symm_mem_var = "VLLM_ALLREDUCE_USE_SYMM_MEM"
    previous_symm_mem = os.environ.get(symm_mem_var)
    os.environ[symm_mem_var] = "0"

    def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
        """Run a single node's executor."""
        import time

        executor = None
        try:
            # Set CUDA_VISIBLE_DEVICES for this node
            os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" if node_rank == 0 else "2,3"

            # Create config for this node
            vllm_config = create_vllm_config(
                tensor_parallel_size=4,  # Total TP across all nodes
                pipeline_parallel_size=1,
                nnodes=2,  # 2 nodes
                node_rank=node_rank,
                master_port=port,  # same port
            )

            # Create executor for this node
            executor = MultiprocExecutor(vllm_config=vllm_config)

            # Verify node-specific properties
            assert executor.world_size == 4, (
                f"World size should be 4 on node {node_rank}"
            )
            assert executor.local_world_size == 2, (
                f"Local world size should be 2 on node {node_rank}"
            )
            assert len(executor.workers) == 2, (
                f"Should have 2 local workers on node {node_rank}"
            )

            # Verify worker ranks are correct for this node
            expected_ranks = [node_rank * 2, node_rank * 2 + 1]
            actual_ranks = sorted(w.rank for w in executor.workers)
            assert actual_ranks == expected_ranks, (
                f"Node {node_rank} should have workers "
                f"with ranks {expected_ranks}, got {actual_ranks}"
            )

            # Verify all workers are alive
            for worker in executor.workers:
                assert worker.proc.is_alive(), (
                    f"Worker rank {worker.rank} should be alive on node {node_rank}"
                )

            # Put success result in queue BEFORE shutdown to avoid hanging
            result_queue.put({"node": node_rank, "success": True})
            time.sleep(2)
            executor.shutdown()
        except Exception as e:
            # Put failure result in queue
            result_queue.put({"node": node_rank, "success": False, "error": str(e)})
            raise
        finally:
            # Safety net for the failure path; a second shutdown after the
            # one in the try body is expected to be harmless.
            if executor is not None:
                executor.shutdown()

    try:
        # Create a queue to collect results from both processes
        result_queue: multiprocessing.Queue[dict[str, int | bool | str]] = (
            multiprocessing.Queue()
        )

        # Start both node processes
        processes = []
        for node_rank in range(2):
            p = multiprocessing.Process(
                target=run_node,
                args=(node_rank, result_queue, port),
                name=f"Node{node_rank}",
            )
            p.start()
            processes.append(p)

        # Wait for both processes to complete, escalating from terminate()
        # to kill() for a process that does not exit in time.
        all_completed = True
        for p in processes:
            p.join(timeout=60)
            if p.is_alive():
                p.terminate()
                p.join(timeout=20)
                if p.is_alive():
                    p.kill()
                    p.join()
                all_completed = False

        # Collect the reports, keyed by node so that each assertion below
        # refers to the right node regardless of arrival order. Stop as soon
        # as the queue stays empty: a process that was killed never posts a
        # result, and waiting for it would hang the test forever.
        results_by_node: dict[int, dict[str, int | bool | str]] = {}
        while len(results_by_node) < len(processes):
            try:
                result = result_queue.get(timeout=1)
            except queue.Empty:
                break
            results_by_node[result["node"]] = result
    finally:
        # Restore the parent's environment.
        if previous_symm_mem is None:
            os.environ.pop(symm_mem_var, None)
        else:
            os.environ[symm_mem_var] = previous_symm_mem

    assert all_completed, "Not all processes completed successfully"
    assert len(results_by_node) == 2, (
        f"Expected 2 results, got {len(results_by_node)}"
    )
    for node_rank in range(2):
        node_result = results_by_node[node_rank]
        assert node_result["success"], f"Node {node_rank} failed: {node_result}"

View File

@ -18,6 +18,7 @@ import pytest
from vllm.config.compilation import CompilationMode
from vllm.config.model import RunnerOption
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..models.registry import HF_EXAMPLE_MODELS
@ -161,6 +162,7 @@ def _compare_sp(
test_options: SPTestOptions,
num_gpus_available: int,
use_inductor_graph_partition: bool,
enable_async_tp: bool,
*,
method: Literal["generate", "encode"],
is_multimodal: bool,
@ -244,10 +246,10 @@ def _compare_sp(
compilation_config = {
"mode": CompilationMode.VLLM_COMPILE,
"custom_ops": ["+rms_norm"],
"compile_sizes": [4, 8],
"pass_config": {
"enable_sequence_parallelism": True,
"enable_async_tp": enable_async_tp,
"enable_fusion": enable_fusion,
"enable_noop": True,
},
@ -307,6 +309,7 @@ SP_TEST_MODELS = [
],
)
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
@pytest.mark.parametrize("enable_async_tp", [False]) # TODO: enable async TP
@create_new_process_for_each_test()
def test_tp_sp_generation(
model_id: str,
@ -316,10 +319,19 @@ def test_tp_sp_generation(
test_options: SPTestOptions,
num_gpus_available,
use_inductor_graph_partition: bool,
enable_async_tp: bool,
):
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
# Skip FP8 SP-only test on sm89 (compute capability 8.9)
if (
"fp8" in model_id.lower()
and current_platform.get_device_capability() < (9, 0)
and (not enable_async_tp)
):
pytest.skip("FP8 reduction support begins with sm90 capable devices.")
_compare_sp(
model_id,
parallel_setup,
@ -328,6 +340,7 @@ def test_tp_sp_generation(
test_options,
num_gpus_available,
use_inductor_graph_partition,
enable_async_tp=enable_async_tp,
method="generate",
is_multimodal=False,
)

View File

@ -17,7 +17,7 @@ def chat_server_with_force_include_usage(request): # noqa: F811
"128",
"--enforce-eager",
"--max-num-seqs",
"1",
"4",
"--enable-force-include-usage",
"--port",
"55857",
@ -78,7 +78,7 @@ def transcription_server_with_force_include_usage():
"--dtype",
"bfloat16",
"--max-num-seqs",
"1",
"4",
"--enforce-eager",
"--enable-force-include-usage",
"--gpu-memory-utilization",

View File

@ -16,6 +16,7 @@ from transformers import AutoTokenizer
from vllm import version
from ...conftest import LocalAssetServer
from ...utils import RemoteOpenAIServer
MODELS = {
@ -69,7 +70,6 @@ async def client(server):
_PROMPT = "Hello my name is Robert and I love magic"
_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
@ -250,6 +250,7 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
@pytest.mark.asyncio
async def test_metrics_exist(
local_asset_server: LocalAssetServer,
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
@ -265,13 +266,21 @@ async def test_metrics_exist(
temperature=0.0,
)
else:
# https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
await client.chat.completions.create(
model=model_name,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": _IMAGE_URL}},
{
"type": "image_url",
"image_url": {
"url": local_asset_server.url_for(
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
),
},
},
{"type": "text", "text": "What's in this image?"},
],
}

View File

@ -0,0 +1,262 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
import pytest
import pytest_asyncio
from transformers import AutoTokenizer
from vllm.config import ModelConfig
from vllm.v1.engine.detokenizer import check_stop_strings
from ...utils import RemoteOpenAIServer
# Model served by the test server and loaded by the tokenizer fixture.
MODEL_NAME = "Qwen/Qwen3-0.6B"
# Path of the token-in/token-out generate endpoint exercised by these tests.
GEN_ENDPOINT = "/inference/v1/generate"
def get_vocab_size(model_name):
    """Return the vocabulary size reported by a ``ModelConfig`` for *model_name*."""
    model_config = ModelConfig(model=model_name, seed=0, dtype="bfloat16")
    vocab_size = model_config.get_vocab_size()
    return vocab_size
@pytest.fixture(scope="module")
def tokenizer():
    """Module-scoped fixture: the Hugging Face tokenizer for ``MODEL_NAME``."""
    hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return hf_tokenizer
@pytest.fixture(scope="module")
def messages():
    """Module-scoped fixture: a two-turn (system + user) chat conversation."""
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": "How many countries are in the EU?"}
    return [system_turn, user_turn]
@pytest.fixture(scope="module")
def server(request):
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    Extra CLI arguments can be supplied through indirect parametrization
    (``request.param``): a list/tuple is appended element-wise, any other
    value is appended as a single stringified argument.
    """
    base_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]

    param = getattr(request, "param", None)
    if param is None:
        extra = []
    elif isinstance(param, (list, tuple)):
        extra = list(param)
    else:
        extra = [str(param)]
    cli_args = base_args + extra

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Yield an ``httpx.AsyncClient`` bound to the test server.

    Uses a Unix-domain-socket transport when ``server.uds`` is set, and
    sends the server's dummy API key as a bearer token on every request.
    """
    if server.uds:
        transport = httpx.AsyncHTTPTransport(uds=server.uds)
    else:
        transport = None
    auth_headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}

    http_client = httpx.AsyncClient(
        transport=transport,
        base_url=server.url_root,
        timeout=600,
        headers=auth_headers,
    )
    async with http_client as c:
        yield c
@pytest.mark.asyncio
async def test_generate_endpoint(client):
    """Smoke test: a minimal token-id request succeeds and returns ``choices``."""
    request_body = {
        "model": MODEL_NAME,
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }
    response = await client.post(GEN_ENDPOINT, json=request_body)
    response.raise_for_status()
    body = response.json()
    assert "choices" in body
@pytest.mark.asyncio
async def test_same_response_as_chat_completions(client, tokenizer, messages):
    """Check the generate endpoint agrees with ``/v1/chat/completions``.

    The chat template is applied client-side, the resulting token ids are
    sent to the generate endpoint, and the decoded output is compared with
    the chat completions text for the same messages. Runs once with
    ``ignore_eos`` enabled and once with it disabled.
    """
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    for ignore_eos in [True, False]:
        payload = {
            "model": MODEL_NAME,
            "token_ids": token_ids,
            "sampling_params": {
                "max_tokens": 24,
                "temperature": 0.0,
                # NOTE coordinator will set this to skip detokenization
                "detokenize": False,
                "ignore_eos": ignore_eos,
            },
            "stream": False,
        }
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        # Surface HTTP failures directly rather than as a KeyError below.
        generate_resp.raise_for_status()
        generate_data = generate_resp.json()
        generate_res = tokenizer.decode(
            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
        )

        payload = {
            "model": MODEL_NAME,
            "messages": messages,
            "max_tokens": 24,
            "temperature": 0.0,
            "stream": False,
            "ignore_eos": ignore_eos,
            "chat_template_kwargs": dict(enable_thinking=False),
        }
        completions_resp = await client.post("/v1/chat/completions", json=payload)
        completions_resp.raise_for_status()
        completions_data = completions_resp.json()
        completions_res = completions_data["choices"][0]["message"]["content"]

        assert generate_res == completions_res
@pytest.mark.asyncio
async def test_stop_string_workflow(client, tokenizer, messages):
    """Exercise client-side stop-string handling on top of the generate endpoint.

    First checks that the generate endpoint rejects a request that combines
    ``detokenize=False`` with stop strings. It then repeats the request
    without stop strings, applies ``check_stop_strings`` to the decoded text
    locally, and compares the truncated text with the chat completions
    output for the same stop string.
    """
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    payload = {
        "model": MODEL_NAME,
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
            # stop strings are only supported when detokenize is True.
            "stop": ["27 member"],
        },
        # TODO stream test is much more interesting
        "stream": False,
    }
    with pytest.raises(httpx.HTTPStatusError):
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        generate_resp.raise_for_status()

    payload["sampling_params"]["stop"] = None
    generate_resp = await client.post(
        GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
    )
    # Surface HTTP failures directly rather than as a KeyError below.
    generate_resp.raise_for_status()
    generate_data = generate_resp.json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )

    # NOTE This is under the responsibility of the coordinator
    # stop_checker = StopChecker(
    #     max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
    # )
    stop_match = check_stop_strings(
        generate_res, len(generate_res), ["27 member"], False
    )
    # Presumably None is returned when no stop string matches; fail with a
    # readable message instead of an unpacking error in that case.
    assert stop_match is not None, (
        f"stop string not found in generated text: {generate_res!r}"
    )
    stop_str, truncate_to = stop_match
    assert stop_str == "27 member"
    # abort request that hit stop string (requires tokens-only mode)
    # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]})  # noqa: E501
    # res.raise_for_status()
    generate_res = generate_res[:truncate_to]

    # Get stop_str response from chat completions
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "stop": ["27 member"],
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=payload)
    completions_resp.raise_for_status()
    completions_data = completions_resp.json()
    completions_res = completions_data["choices"][0]["message"]["content"]
    assert generate_res == completions_res
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "server",
    [
        [
            "--enable-lora",
            "--lora-modules",
            "Alice=charent/self_cognition_Alice",
            "Bob=charent/self_cognition_Bob",
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]
    ],
    indirect=True,
)
async def test_generate_with_lora_adapter(client, tokenizer, messages):
    """Check the generate endpoint works when a LoRA adapter is selected.

    The server is started with two LoRA modules ("Alice" and "Bob"). The
    test verifies both are listed by ``/v1/models``, runs a smoke request
    against "Alice", and then checks that generate and chat completions
    produce the same text for the same adapter and messages.
    """
    # Verify adapters are listed
    models_resp = await client.get("/v1/models")
    models_resp.raise_for_status()
    models = {m["id"] for m in models_resp.json().get("data", [])}
    assert {"Alice", "Bob"}.issubset(models)

    # Generate using a LoRA adapter by specifying its name as the model
    payload = {
        "model": "Alice",
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }
    resp = await client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    data = resp.json()
    assert "choices" in data

    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    payload = {
        "model": "Alice",
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
        },
        "stream": False,
    }
    generate_resp = await client.post(GEN_ENDPOINT, json=payload)
    # Surface HTTP failures directly rather than as a KeyError below,
    # matching the status checks on the earlier requests in this test.
    generate_resp.raise_for_status()
    generate_data = generate_resp.json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )

    payload = {
        "model": "Alice",
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=payload)
    completions_resp.raise_for_status()
    completions_data = completions_resp.json()
    completions_res = completions_data["choices"][0]["message"]["content"]

    assert generate_res == completions_res

View File

@ -40,8 +40,6 @@ NUM_EXPERTS = [8, 64]
TOP_KS = [1, 2, 6]
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
@dataclass

View File

@ -33,8 +33,6 @@ if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
# Test configurations
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]

View File

@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (7, 0):
pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
DTYPES = [torch.bfloat16]

View File

@ -42,8 +42,6 @@ MNK_FACTORS = [
]
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
@dataclasses.dataclass

View File

@ -7,6 +7,7 @@ fp8 block-quantized case.
"""
import dataclasses
from contextlib import contextmanager
import pytest
import torch.distributed
@ -14,6 +15,7 @@ from torch.distributed import ProcessGroup
from typing_extensions import ParamSpec
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
fp8_w8a8_moe_quant_config,
@ -61,6 +63,23 @@ requires_deep_gemm = pytest.mark.skipif(
P = ParamSpec("P")
@contextmanager
def with_dp_metadata(M: int, world_size: int):
    """Run the enclosed code inside a forward context carrying DP metadata.

    Builds a fresh ``VllmConfig`` with ``data_parallel_size=world_size`` and
    expert parallelism enabled, and enters ``set_forward_context`` with
    ``M`` tokens reported for every one of the ``world_size`` ranks.
    """
    # One entry per rank, each equal to M.
    per_rank_tokens = torch.tensor(
        [M] * world_size,
        device="cpu",
        dtype=torch.int,
    )

    config = VllmConfig()
    parallel = config.parallel_config
    parallel.data_parallel_size = world_size
    parallel.enable_expert_parallel = True

    forward_ctx = set_forward_context(
        None,
        config,
        num_tokens=M,
        num_tokens_across_dp=per_rank_tokens,
    )
    with forward_ctx:
        yield
def next_power_of_2(x):
import math
@ -285,18 +304,21 @@ def deepep_deepgemm_moe_impl(
quant_config=quant_config,
)
out = mk.forward(
hidden_states=test_tensors.rank_tokens,
w1=w1,
w2=w2,
topk_weights=test_tensors.topk_weights,
topk_ids=test_tensors.topk,
inplace=False,
activation="silu",
global_num_experts=num_experts,
expert_map=build_expert_map(),
apply_router_weight_on_input=False,
)
with with_dp_metadata(
M=test_tensors.rank_tokens.size(0), world_size=pgi.world_size
):
out = mk.forward(
hidden_states=test_tensors.rank_tokens,
w1=w1,
w2=w2,
topk_weights=test_tensors.topk_weights,
topk_ids=test_tensors.topk,
inplace=False,
activation="silu",
global_num_experts=num_experts,
expert_map=build_expert_map(),
apply_router_weight_on_input=False,
)
return out

View File

@ -45,8 +45,6 @@ MNK_FACTORS = [
]
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def quant_fp8_per_tensor_batches(a):
@ -79,10 +77,14 @@ class TestData:
@staticmethod
def make_moe_tensors_8bit(
m: int, k: int, n: int, e: int, reorder: bool
m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu"
) -> "TestData":
is_gated = activation != "relu2_no_mul"
hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16)
w13 = torch.randn(
(e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16
)
w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16)
# Scale to fp8
@ -192,18 +194,22 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"])
def test_flashinfer_cutlass_moe_fp8_no_graph(
m: int,
n: int,
k: int,
e: int,
topk: int,
activation: str,
monkeypatch,
):
current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config):
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False)
td = TestData.make_moe_tensors_8bit(
m, k, n, e, reorder=False, activation=activation
)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
@ -235,7 +241,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False,
activation="silu",
activation=activation,
global_num_experts=e,
expert_map=None,
apply_router_weight_on_input=True,
@ -255,7 +261,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
td.layer,
topk_weights,
topk_ids,
activation="silu",
activation=activation,
global_num_experts=e,
expert_map=None,
apply_router_weight_on_input=True,

View File

@ -81,8 +81,6 @@ FUSED_MOE_WN16_MNK_FACTORS = [
]
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def run_moe_test(

View File

@ -192,8 +192,6 @@ def pplx_cutlass_moe(
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def _pplx_moe(

View File

@ -81,8 +81,6 @@ TOP_KS = [1, 2, 6]
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def torch_prepare(

View File

@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):

View File

@ -29,8 +29,6 @@ if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
# Test configurations
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]

View File

@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (7, 0):
pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
DTYPES = [torch.half, torch.bfloat16]
M = [1, 33, 64, 222]

View File

@ -0,0 +1,169 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import Mock, patch
import pytest
import torch
from vllm.config import LoadConfig, ModelConfig, SpeculativeConfig, VllmConfig
from vllm.model_executor.models.utils import get_draft_quant_config
from vllm.platforms import current_platform
# Devices to parametrize over: only "cuda:0" when exactly one CUDA device is
# reported, otherwise "cuda:0" and "cuda:1"; falls back to "cpu" when the
# current platform is not CUDA-alike.
DEVICES = (
[f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
if current_platform.is_cuda_alike()
else ["cpu"]
)
def test_get_draft_quant_config_with_draft_model():
    """``get_draft_quant_config`` forwards the draft model and load configs.

    With ``VllmConfig.get_quantization_config`` patched, the helper must
    call it exactly once with the draft model config and the load config,
    and return whatever it returns.
    """
    draft_model_cfg = Mock(spec=ModelConfig)
    load_cfg = Mock(spec=LoadConfig)

    spec_cfg = Mock(spec=SpeculativeConfig)
    spec_cfg.draft_model_config = draft_model_cfg

    vllm_cfg = Mock(spec=VllmConfig)
    vllm_cfg.speculative_config = spec_cfg
    vllm_cfg.load_config = load_cfg

    expected = Mock()

    with patch.object(
        VllmConfig, "get_quantization_config", return_value=expected
    ) as patched_get:
        result = get_draft_quant_config(vllm_cfg)

        # Verify the function calls get_quantization_config with draft model config
        patched_get.assert_called_once_with(draft_model_cfg, load_cfg)
        assert result == expected
def test_get_draft_quant_config_without_draft_model():
    """``get_draft_quant_config`` returns ``None`` when no draft model is set."""
    spec_cfg = Mock(spec=SpeculativeConfig)
    spec_cfg.draft_model_config = None

    vllm_cfg = Mock(spec=VllmConfig)
    vllm_cfg.speculative_config = spec_cfg
    vllm_cfg.load_config = Mock(spec=LoadConfig)

    assert get_draft_quant_config(vllm_cfg) is None
@torch.inference_mode()
@pytest.mark.parametrize("device", DEVICES)
def test_fc_layer_quant_config_usage(dist_init, device) -> None:
    """``ReplicatedLinear`` stores the ``quant_config`` it is constructed with.

    Builds one layer without and one with a (mock) quant config, checks the
    stored attributes, and runs a forward pass through the unquantized layer.
    """
    from vllm.model_executor.layers.linear import ReplicatedLinear

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)

    torch.set_default_device(device)

    input_size = 256
    output_size = 128

    # Constructor arguments shared by both layers under test.
    shared_kwargs = dict(
        input_size=input_size,
        output_size=output_size,
        bias=False,
        params_dtype=torch.float16,
        prefix="fc",
    )

    fc_no_quant = ReplicatedLinear(quant_config=None, **shared_kwargs)

    assert fc_no_quant.quant_config is None
    assert fc_no_quant.input_size == input_size
    assert fc_no_quant.output_size == output_size

    quant_cfg = Mock()
    fc_with_quant = ReplicatedLinear(quant_config=quant_cfg, **shared_kwargs)

    assert fc_with_quant.quant_config == quant_cfg

    # Check forward pass
    batch = torch.randn(2, input_size, dtype=torch.float16)
    output, _ = fc_no_quant(batch)
    assert output.shape == (2, output_size)
def test_kv_cache_scale_name_handling():
    """A mocked ``get_cache_scale`` is called once and its value is returned."""
    expected_scale = "layers.0.self_attn.kv_scale"
    weight_name = "layers.0.self_attn.k_proj.weight"

    # Mock a quant config that supports cache scales
    quant_cfg = Mock()
    quant_cfg.get_cache_scale = Mock(return_value=expected_scale)

    # Condition check in load_weights
    resolved = quant_cfg.get_cache_scale(weight_name)

    # Check if get_cache_scale is called and returns expected value
    quant_cfg.get_cache_scale.assert_called_once_with(weight_name)
    assert resolved == expected_scale
def test_kv_cache_scale_name_no_scale():
    """A mocked ``get_cache_scale`` returning ``None`` yields ``None``."""
    weight_name = "layers.0.mlp.gate_proj.weight"

    # Mock a quant config that returns None for get_cache_scale
    quant_cfg = Mock()
    quant_cfg.get_cache_scale = Mock(return_value=None)

    resolved = quant_cfg.get_cache_scale(weight_name)

    # Should return None for weights that don't have cache scales
    assert resolved is None
def test_maybe_remap_kv_scale_name():
    """``maybe_remap_kv_scale_name`` returns a known key, the input, or ``None``."""
    from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name

    known_params = {
        f"layers.{layer}.self_attn.kv_scale": Mock() for layer in (0, 1)
    }
    name = "layers.0.self_attn.some_scale"

    remapped = maybe_remap_kv_scale_name(name, known_params)

    acceptable = remapped is None or remapped == name or remapped in known_params
    assert acceptable
def test_load_weights_kv_scale_handling():
    """Replay the KV-cache-scale branch of a ``load_weights`` loop on mocks.

    Looks up the scale name via a mocked quant config, resolves the matching
    parameter, and checks that a non-scalar loaded tensor is reduced to its
    first element.
    """
    expected_scale = "layers.0.self_attn.kv_scale"

    kv_scale_param = Mock()
    kv_scale_param.weight_loader = Mock()
    params_dict = {expected_scale: kv_scale_param}

    quant_cfg = Mock()
    quant_cfg.get_cache_scale = Mock(return_value=expected_scale)

    # Load_weights logic for KV cache scales
    name = "layers.0.self_attn.k_proj.weight"
    loaded = torch.tensor([1.0, 2.0])

    scale_name = quant_cfg.get_cache_scale(name) if quant_cfg is not None else None
    if scale_name:
        param = params_dict[scale_name]
        assert param is kv_scale_param

        # A 0-dim tensor is used as-is; otherwise take the first element.
        if loaded.dim() == 0:
            weight_to_load = loaded
        else:
            weight_to_load = loaded[0]

        assert scale_name == "layers.0.self_attn.kv_scale"
        assert weight_to_load == loaded[0]

View File

@ -348,9 +348,14 @@ def test_fp32_cache_state(
# Helper functions for the APC tests
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
def _get_vllm_runner_params(
model: str,
max_model_len: int,
tensor_parallel_size: int = 1,
):
return {
"model_name": model,
"enable_chunked_prefill": True,
"enable_prefix_caching": False,
"max_model_len": max_model_len,
"tensor_parallel_size": tensor_parallel_size,

View File

@ -11,7 +11,7 @@ from vllm import TokensPrompt
["Qwen/Qwen3-0.6B"],
)
@torch.inference_mode
def test_embed_models(hf_runner, vllm_runner, model: str):
def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
n_prompt_tokens = [55, 56, 57]
token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]
@ -21,7 +21,7 @@ def test_embed_models(hf_runner, vllm_runner, model: str):
enforce_eager=True,
runner="pooling",
enable_chunked_prefill=False,
enable_prefix_caching=False,
enable_prefix_caching=True,
) as vllm_model:
pooling_outputs = vllm_model.llm.encode(
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
@ -30,4 +30,29 @@ def test_embed_models(hf_runner, vllm_runner, model: str):
for n, output in zip(n_prompt_tokens, pooling_outputs):
assert len(output.prompt_token_ids) == n
assert len(output.outputs.data) == n
assert output.num_cached_tokens == 0
# test enable_prefix_caching plus all pooling
# we need to skip reading cache at this request by
# request.skip_reading_prefix_cache
pooling_outputs = vllm_model.llm.encode(
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
pooling_task="token_embed",
)
for n, output in zip(n_prompt_tokens, pooling_outputs):
assert len(output.prompt_token_ids) == n
assert len(output.outputs.data) == n
assert output.num_cached_tokens == 0
# skip_reading_prefix_cache can still write to cache
# to accelerate following requests
pooling_outputs = vllm_model.llm.encode(
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
pooling_task="embed",
)
for n, output in zip(n_prompt_tokens, pooling_outputs):
assert len(output.prompt_token_ids) == n
assert output.num_cached_tokens > 0

View File

@ -12,6 +12,7 @@ import pytest
from packaging.version import Version
from transformers import (
AutoModel,
AutoModelForCausalLM,
AutoModelForImageTextToText,
AutoModelForTextToWaveform,
)
@ -691,6 +692,23 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
hf_model_kwargs={"revision": "refs/pr/5"},
),
"paddleocr_vl": VLMTestInfo(
models=["PaddlePaddle/PaddleOCR-VL"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
img_idx_to_prompt=lambda idx: (
"<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
),
multi_image_prompt=(
"Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
"Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
"Describe these two images separately."
),
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForCausalLM,
image_size_factors=[(), (0.25,)],
),
"phi3v": VLMTestInfo(
models=["microsoft/Phi-3.5-vision-instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),

View File

@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Literal, NamedTuple
import pytest
from huggingface_hub import hf_hub_download
from pytest import MarkDecorator
from tests.quantization.utils import is_quant_method_supported
from vllm.assets.image import ImageAsset
from vllm.utils.torch_utils import set_default_torch_num_threads
from ....conftest import PromptImageInput, VllmRunner
from ...utils import check_logprobs_close
class GGUFMMTestConfig(NamedTuple):
    """Description of one multimodal GGUF model comparison test case."""

    # Model id used for the unquantized run and as the GGUF run's tokenizer.
    original_model: str
    # Hub repository the GGUF files are downloaded from.
    gguf_repo: str
    # File name of the GGUF backbone weights inside ``gguf_repo``.
    gguf_backbone: str
    # File name of the multimodal projector GGUF file inside ``gguf_repo``.
    gguf_mmproj: str
    # Prompts passed to both runs.
    prompt: list[str]
    # Multimodal inputs, expanded as keyword arguments into the generate call.
    mm_data: dict[Literal["images"], PromptImageInput]
    max_model_len: int = 4096
    # pytest marks applied to the parametrized case.
    # NOTE(review): this default list is shared by every instance that does
    # not pass ``marks`` - do not mutate it.
    marks: list[MarkDecorator] = []

    @property
    def gguf_model(self):
        """Download the mmproj and backbone files; return the backbone's path."""
        repo = self.gguf_repo
        # The projector file is fetched first; its returned path is not used here.
        hf_hub_download(repo, filename=self.gguf_mmproj)
        backbone_path = hf_hub_download(repo, filename=self.gguf_backbone)
        return backbone_path
# Gemma 3 4B case: the q4_0 GGUF backbone plus its f16 mmproj file, compared
# against the original model on a single stop-sign image prompt.
GEMMA3_CONFIG = GGUFMMTestConfig(
original_model="google/gemma-3-4b-it",
gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
gguf_backbone="gemma-3-4b-it-q4_0.gguf",
gguf_mmproj="mmproj-model-f16-4B.gguf",
prompt=["<start_of_image>Describe this image in detail:"],
mm_data={"images": [ImageAsset("stop_sign").pil_image]},
marks=[pytest.mark.core_model],
)
# Test cases that test_models is parametrized over.
MODELS_TO_TEST = [GEMMA3_CONFIG]
def run_multimodal_gguf_test(
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Compare greedy logprobs of a GGUF model against its original model.

    Runs the GGUF checkpoint first (with the default torch thread count set
    to 1), then the unquantized model, using the same prompts and multimodal
    inputs, and checks the two outputs with ``check_logprobs_close``.
    """
    # Identical generation arguments for both runs.
    generate_kwargs = dict(
        prompts=model.prompt,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **model.mm_data,
    )

    # Run gguf model.
    with set_default_torch_num_threads(1):
        with vllm_runner(
            model_name=model.gguf_model,
            enforce_eager=True,
            tokenizer_name=model.original_model,
            dtype=dtype,
            max_model_len=model.max_model_len,
        ) as gguf_runner:
            gguf_outputs = gguf_runner.generate_greedy_logprobs(**generate_kwargs)

    # Run unquantized model.
    with vllm_runner(
        model_name=model.original_model,
        enforce_eager=True,  # faster tests
        dtype=dtype,
        max_model_len=model.max_model_len,
    ) as reference_runner:
        original_outputs = reference_runner.generate_greedy_logprobs(
            **generate_kwargs
        )

    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )
@pytest.mark.skipif(
    not is_quant_method_supported("gguf"),
    reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
    "model",
    [pytest.param(cfg, marks=cfg.marks) for cfg in MODELS_TO_TEST],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Run the GGUF-vs-original logprob comparison for each configured model."""
    run_multimodal_gguf_test(
        vllm_runner=vllm_runner,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
    )

View File

@ -34,6 +34,7 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_functionality(
vllm_runner,
video_assets,
@ -42,10 +43,14 @@ def test_qwen2_5_vl_evs_functionality(
num_frames: int,
dtype: str,
max_tokens: int,
use_bytecode_hook: bool,
monkeypatch,
) -> None:
"""Test EVS (Efficient Video Sampling) functionality with different
pruning rates.
"""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Sample frames from video assets
sampled_vids = [
@ -86,6 +91,7 @@ def test_qwen2_5_vl_evs_functionality(
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_batched_videos(
vllm_runner,
video_assets,
@ -94,6 +100,8 @@ def test_qwen2_5_vl_evs_batched_videos(
num_frames: int,
dtype: str,
max_tokens: int,
use_bytecode_hook: bool,
monkeypatch,
) -> None:
"""Test EVS functionality with batched videos.
@ -102,6 +110,8 @@ def test_qwen2_5_vl_evs_batched_videos(
2. Both pruning configurations work with multiple videos
3. The model doesn't crash when processing multiple videos simultaneously
"""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Sample frames from video assets
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)

View File

@ -78,6 +78,12 @@ DOLPHIN_CONFIG = GGUFTestConfig(
gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
)
GEMMA3_CONFIG = GGUFTestConfig(
original_model="google/gemma-3-270m-it",
gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF",
gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf",
)
MODELS = [
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
QWEN2_CONFIG,
@ -85,6 +91,7 @@ MODELS = [
GPT2_CONFIG,
STABLELM_CONFIG,
DOLPHIN_CONFIG,
GEMMA3_CONFIG,
# STARCODER_CONFIG, # broken
]
@ -148,7 +155,7 @@ def check_model_outputs(
"model",
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1])

View File

@ -173,6 +173,10 @@ class _HfExamplesInfo:
_TEXT_GENERATION_EXAMPLE_MODELS = {
# [Decoder-only]
"AfmoeForCausalLM": _HfExamplesInfo(
"arcee-ai/Trinity-Nano",
is_available_online=False,
),
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"),
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True),
"AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True),

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test the functionality of the Transformers backend."""
"""Test the functionality of the Transformers modeling backend."""
from typing import Any
@ -85,7 +85,7 @@ def test_models(
required = Version("5.0.0.dev")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip(
"MoE models with the Transformers backend require "
"MoE models with the Transformers modeling backend require "
f"transformers>={required}, but got {installed}"
)

View File

@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
],
)
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False]
@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_path, dtype=dtype) as vllm_model:
with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)

View File

@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
# Test loading a quantized checkpoint
compare_two_settings(
"neuralmagic/Qwen2-1.5B-Instruct-FP8",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)
@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
# Test GPTQ Marlin
compare_two_settings(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)
@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
# Test AWQ Marlin
compare_two_settings(
"Qwen/Qwen2-1.5B-Instruct-AWQ",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)
@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
# Test wNa16
compare_two_settings(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
[],
["--cpu-offload-gb", "1"],
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
)

View File

@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("max_tokens", [4])
def test_model_experts_int8_startup(
hf_runner,
vllm_runner,
@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
with vllm_runner(
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@ -45,10 +45,10 @@ def test_model_load_and_run(
if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner(model_id) as llm:
with vllm_runner(model_id, enforce_eager=True) as llm:
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
print(outputs[0][1])
@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(
# `LLM.apply_model` requires pickling a function.
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:
def check_model(model):
attn = model.model.layers[0].self_attn.attn
@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run(
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
print(outputs[0][1])
@ -142,7 +142,10 @@ def test_load_fp16_model(
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner(
"facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
"facebook/opt-125m",
quantization="fp8",
enforce_eager=True,
kv_cache_dtype=kv_cache_dtype,
) as llm:
def check_model(model):

View File

@ -26,7 +26,7 @@ DTYPE = ["bfloat16"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
with vllm_runner(model, dtype=dtype) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
assert output
print(output)

View File

@ -49,4 +49,4 @@ def test_lm_head(
vllm_model.apply_model(check_model)
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])

Some files were not shown because too many files have changed in this diff Show More