mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-25 20:04:26 +08:00
Merge branch 'main' into woosuk/model-runner-v2
This commit is contained in:
commit
913b8e9569
38
csrc/launch_bounds_utils.h
Normal file
38
csrc/launch_bounds_utils.h
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cuda_runtime_api.h>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
// Upper bound on the blocks-per-SM value requested through __launch_bounds__.
// May be overridden on the compiler command line
// (-DVLLM_LAUNCH_BLOCKS_CAP=<n>).
#ifndef VLLM_LAUNCH_BLOCKS_CAP
  #define VLLM_LAUNCH_BLOCKS_CAP 4
#endif

// Compile-time estimate of the maximum resident threads per SM, used only to
// derive the second argument of __launch_bounds__.
//
// The limit depends on the compute capability being compiled for (values per
// the CUDA C++ Programming Guide compute-capability table):
//   1024 threads/SM : sm_75
//   1536 threads/SM : sm_86, sm_87, sm_89, sm_12x (and legacy < sm_30)
//   2048 threads/SM : everything else handled here
// During the host compilation pass __CUDA_ARCH__ is undefined, so the neutral
// default of 2048 is used. Architectures not listed above can be handled by
// defining VLLM_MAX_THREADS_PER_SM explicitly.
#ifndef VLLM_MAX_THREADS_PER_SM
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 750)
    #define VLLM_MAX_THREADS_PER_SM 1024
  #elif defined(__CUDA_ARCH__) &&                                   \
      (__CUDA_ARCH__ < 300 || __CUDA_ARCH__ == 860 ||               \
       __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 ||              \
       (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300))
    #define VLLM_MAX_THREADS_PER_SM 1536
  #else
    #define VLLM_MAX_THREADS_PER_SM 2048
  #endif
#endif

// Compute the number of blocks per SM to request in __launch_bounds__.
//
// VLLM_BLOCKS_DIV(VAL)          : how many blocks of VAL threads fit in one SM
//                                 according to VLLM_MAX_THREADS_PER_SM.
// VLLM_CLAMP_BLOCKS_PER_SM(VAL) : clamps VAL to [1, VLLM_LAUNCH_BLOCKS_CAP].
//                                 VAL is expanded more than once, so pass an
//                                 expression without side effects.
// VLLM_BLOCKS_PER_SM(THREADS)   : the clamped quotient, suitable as the
//                                 minBlocksPerSM argument of __launch_bounds__.
#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
  (((VAL) <= 0)                       \
       ? 1                            \
       : (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
  VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
|
||||||
|
|
||||||
|
// runtime-time helper to compute blocks/SM
|
||||||
|
static inline int vllm_runtime_blocks_per_sm(int block_threads) {
|
||||||
|
int device = -1;
|
||||||
|
cudaGetDevice(&device);
|
||||||
|
int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM;
|
||||||
|
cudaDeviceGetAttribute(&max_threads_per_sm,
|
||||||
|
cudaDevAttrMaxThreadsPerMultiProcessor, device);
|
||||||
|
int blocks = (block_threads > 0) ? (max_threads_per_sm / block_threads) : 1;
|
||||||
|
return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
|
||||||
|
}
|
||||||
@ -26,6 +26,7 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "cuda_utils.h"
|
#include "cuda_utils.h"
|
||||||
|
#include "launch_bounds_utils.h"
|
||||||
#include "nvfp4_utils.cuh"
|
#include "nvfp4_utils.cuh"
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
@ -63,7 +64,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
|
|||||||
|
|
||||||
// Use UE4M3 by default.
|
// Use UE4M3 by default.
|
||||||
template <class Type, bool UE8M0_SF = false>
|
template <class Type, bool UE8M0_SF = false>
|
||||||
__global__ void __launch_bounds__(1024, 4)
|
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||||
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out,
|
float const* SFScale, uint32_t* out,
|
||||||
uint32_t* SFout) {
|
uint32_t* SFout) {
|
||||||
@ -131,7 +132,8 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
|||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
||||||
int const numBlocksPerSM = 2048 / block.x;
|
int const numBlocksPerSM =
|
||||||
|
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||||
|
|
||||||
VLLM_DISPATCH_HALF_TYPES(
|
VLLM_DISPATCH_HALF_TYPES(
|
||||||
|
|||||||
@ -26,12 +26,13 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "nvfp4_utils.cuh"
|
#include "nvfp4_utils.cuh"
|
||||||
|
#include "launch_bounds_utils.h"
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Use UE4M3 by default.
|
// Use UE4M3 by default.
|
||||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||||
__global__ void __launch_bounds__(512, 4)
|
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||||
uint32_t* input_offset_by_experts,
|
uint32_t* input_offset_by_experts,
|
||||||
@ -129,7 +130,7 @@ __global__ void __launch_bounds__(512, 4)
|
|||||||
|
|
||||||
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
|
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
|
||||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||||
__global__ void __launch_bounds__(1024, 4)
|
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||||
uint32_t* input_offset_by_experts,
|
uint32_t* input_offset_by_experts,
|
||||||
@ -233,8 +234,9 @@ void quant_impl(void* output, void* output_scale, void* input,
|
|||||||
int const workSizePerRow = k / ELTS_PER_THREAD;
|
int const workSizePerRow = k / ELTS_PER_THREAD;
|
||||||
int const totalWorkSize = m_topk * workSizePerRow;
|
int const totalWorkSize = m_topk * workSizePerRow;
|
||||||
dim3 block(std::min(workSizePerRow, 512));
|
dim3 block(std::min(workSizePerRow, 512));
|
||||||
// Get number of blocks per SM (assume we can fully utilize the SM).
|
// Get number of blocks per SM
|
||||||
int const numBlocksPerSM = 2048 / block.x;
|
int const numBlocksPerSM =
|
||||||
|
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||||
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
|
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
|
||||||
multiProcessorCount * numBlocksPerSM));
|
multiProcessorCount * numBlocksPerSM));
|
||||||
while (grid.x <= multiProcessorCount && block.x > 64) {
|
while (grid.x <= multiProcessorCount && block.x > 64) {
|
||||||
|
|||||||
@ -26,13 +26,14 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "cuda_utils.h"
|
#include "cuda_utils.h"
|
||||||
|
#include "launch_bounds_utils.h"
|
||||||
#include "nvfp4_utils.cuh"
|
#include "nvfp4_utils.cuh"
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Use UE4M3 by default.
|
// Use UE4M3 by default.
|
||||||
template <class Type, bool UE8M0_SF = false>
|
template <class Type, bool UE8M0_SF = false>
|
||||||
__global__ void __launch_bounds__(512, 4)
|
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||||
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
||||||
using PackedVec = PackedVec<Type>;
|
using PackedVec = PackedVec<Type>;
|
||||||
@ -75,8 +76,9 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
|
|||||||
// Grid, Block size.
|
// Grid, Block size.
|
||||||
// Each thread converts 8 values.
|
// Each thread converts 8 values.
|
||||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
||||||
// Get number of blocks per SM (assume we can fully utilize the SM).
|
// Get number of blocks per SM
|
||||||
int const numBlocksPerSM = 2048 / block.x;
|
int const numBlocksPerSM =
|
||||||
|
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||||
|
|
||||||
// Launch the cvt kernel.
|
// Launch the cvt kernel.
|
||||||
|
|||||||
@ -36,22 +36,23 @@ th:not(:first-child) {
|
|||||||
}
|
}
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | |
|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
||||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | |
|
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||||
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | |
|
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
||||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
|
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
||||||
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | |
|
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
||||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
|
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
||||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
|
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
||||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
||||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
|
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||||
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
|
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
|
||||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
|
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
|
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
|
||||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
|
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
|
||||||
|
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ❔ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||||
|
|
||||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||||
@ -76,3 +77,4 @@ th:not(:first-child) {
|
|||||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ |
|
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ |
|
||||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
|
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:25097) |
|
||||||
|
|||||||
@ -6,9 +6,6 @@ This page teaches you how to pass prompt embedding inputs to vLLM.
|
|||||||
|
|
||||||
The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.
|
The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.
|
||||||
|
|
||||||
!!! note
|
|
||||||
Prompt embeddings are currently only supported in the v0 engine.
|
|
||||||
|
|
||||||
## Offline Inference
|
## Offline Inference
|
||||||
|
|
||||||
To pass prompt embeddings, follow this schema in [vllm.inputs.EmbedsPrompt][]:
|
To pass prompt embeddings, follow this schema in [vllm.inputs.EmbedsPrompt][]:
|
||||||
|
|||||||
@ -15,6 +15,7 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
|
|||||||
VllmConfig, set_current_vllm_config)
|
VllmConfig, set_current_vllm_config)
|
||||||
from vllm.envs import VLLM_USE_V1
|
from vllm.envs import VLLM_USE_V1
|
||||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||||
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
# This import automatically registers `torch.ops.silly.attention`
|
# This import automatically registers `torch.ops.silly.attention`
|
||||||
from ..silly_attention import get_global_counter, reset_global_counter
|
from ..silly_attention import get_global_counter, reset_global_counter
|
||||||
@ -50,16 +51,21 @@ class SillyModel(nn.Module):
|
|||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
def _run_simple_model(
|
||||||
@torch.inference_mode()
|
splitting_ops,
|
||||||
def test_simple_piecewise_compile(use_inductor):
|
use_inductor_graph_partition,
|
||||||
assert VLLM_USE_V1
|
use_inductor,
|
||||||
|
expected_num_piecewise_graphs_seen,
|
||||||
|
expected_num_piecewise_capturable_graphs_seen,
|
||||||
|
expected_num_backend_compilations,
|
||||||
|
expected_num_cudagraph_captured,
|
||||||
|
):
|
||||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
level=CompilationLevel.PIECEWISE,
|
||||||
use_cudagraph=True,
|
use_cudagraph=True,
|
||||||
use_inductor=use_inductor,
|
use_inductor=use_inductor,
|
||||||
splitting_ops=["silly.attention"],
|
splitting_ops=splitting_ops,
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
cudagraph_copy_inputs=True,
|
cudagraph_copy_inputs=True,
|
||||||
cudagraph_capture_sizes=[1, 2],
|
cudagraph_capture_sizes=[1, 2],
|
||||||
))
|
))
|
||||||
@ -70,11 +76,11 @@ def test_simple_piecewise_compile(use_inductor):
|
|||||||
|
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=1, # one graph for the model
|
num_graphs_seen=1, # one graph for the model
|
||||||
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
|
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||||
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
|
num_piecewise_capturable_graphs_seen=
|
||||||
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
|
expected_num_piecewise_capturable_graphs_seen,
|
||||||
num_cudagraph_captured=
|
num_backend_compilations=expected_num_backend_compilations,
|
||||||
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
num_cudagraph_captured=expected_num_cudagraph_captured,
|
||||||
), set_forward_context(None,
|
), set_forward_context(None,
|
||||||
vllm_config=vllm_config): # background context
|
vllm_config=vllm_config): # background context
|
||||||
# warm up with background context
|
# warm up with background context
|
||||||
@ -104,3 +110,46 @@ def test_simple_piecewise_compile(use_inductor):
|
|||||||
output = model(input)
|
output = model(input)
|
||||||
assert get_global_counter() == 2
|
assert get_global_counter() == 2
|
||||||
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
|
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
def test_simple_piecewise_compile(use_inductor):
    """Run the simple model with ``silly.attention`` as the splitting op.

    Inductor graph partition is disabled; the expected compilation-counter
    values are forwarded to ``_run_simple_model``.
    """
    assert VLLM_USE_V1

    # Counter values expected for this configuration.
    expected_counts = {
        # 2 * num_layers + 1
        "expected_num_piecewise_graphs_seen": 5,
        # 1 + num_layers
        "expected_num_piecewise_capturable_graphs_seen": 3,
        # num_piecewise_capturable_graphs_seen
        "expected_num_backend_compilations": 3,
        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
        "expected_num_cudagraph_captured": 6,
    }

    _run_simple_model(
        splitting_ops=["silly.attention"],
        use_inductor_graph_partition=False,
        use_inductor=use_inductor,
        **expected_counts,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@torch.inference_mode()
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
def test_simple_inductor_graph_partition(splitting_ops):
    """Run the simple model with inductor graph partition enabled.

    Skipped when the installed PyTorch is older than 2.9.0.dev.
    """
    assert VLLM_USE_V1

    partition_available = is_torch_equal_or_newer("2.9.0.dev")
    if not partition_available:
        pytest.skip("inductor graph partition is only available "
                    "in PyTorch 2.9+")

    # The graph is not split at the fx graph level, so each of the
    # piecewise/backend counters below is expected to be 1.
    unsplit_count = 1

    _run_simple_model(
        # inductor graph partition automatically resets splitting_ops
        # to be an empty list
        splitting_ops=splitting_ops,
        use_inductor_graph_partition=True,
        use_inductor=True,
        expected_num_piecewise_graphs_seen=unsplit_count,
        expected_num_piecewise_capturable_graphs_seen=unsplit_count,
        expected_num_backend_compilations=unsplit_count,
        # inductor graph partition still captures 6 graphs, same as
        # fx graph partition.
        expected_num_cudagraph_captured=6,
    )
|
||||||
|
|||||||
@ -60,4 +60,5 @@ direct_register_custom_op(
|
|||||||
mutates_args=["out"],
|
mutates_args=["out"],
|
||||||
fake_impl=silly_attention_fake,
|
fake_impl=silly_attention_fake,
|
||||||
target_lib=silly_lib,
|
target_lib=silly_lib,
|
||||||
|
tags=(torch._C.Tag.cudagraph_unsafe, ),
|
||||||
)
|
)
|
||||||
|
|||||||
@ -20,7 +20,6 @@ class TestSetting:
|
|||||||
tp_size: int
|
tp_size: int
|
||||||
attn_backend: str
|
attn_backend: str
|
||||||
method: str
|
method: str
|
||||||
fullgraph: bool
|
|
||||||
|
|
||||||
|
|
||||||
# we cannot afford testing the full Cartesian product
|
# we cannot afford testing the full Cartesian product
|
||||||
@ -36,7 +35,6 @@ class TestSetting:
|
|||||||
tp_size=2,
|
tp_size=2,
|
||||||
attn_backend="FLASH_ATTN",
|
attn_backend="FLASH_ATTN",
|
||||||
method="generate",
|
method="generate",
|
||||||
fullgraph=True,
|
|
||||||
),
|
),
|
||||||
# llama model with quantization
|
# llama model with quantization
|
||||||
TestSetting(
|
TestSetting(
|
||||||
@ -46,7 +44,6 @@ class TestSetting:
|
|||||||
tp_size=1,
|
tp_size=1,
|
||||||
attn_backend="FLASH_ATTN",
|
attn_backend="FLASH_ATTN",
|
||||||
method="generate",
|
method="generate",
|
||||||
fullgraph=True,
|
|
||||||
),
|
),
|
||||||
# MoE model
|
# MoE model
|
||||||
TestSetting(
|
TestSetting(
|
||||||
@ -56,7 +53,6 @@ class TestSetting:
|
|||||||
tp_size=2,
|
tp_size=2,
|
||||||
attn_backend="FLASH_ATTN",
|
attn_backend="FLASH_ATTN",
|
||||||
method="generate",
|
method="generate",
|
||||||
fullgraph=True,
|
|
||||||
),
|
),
|
||||||
# embedding model
|
# embedding model
|
||||||
TestSetting(
|
TestSetting(
|
||||||
@ -73,7 +69,6 @@ class TestSetting:
|
|||||||
tp_size=1,
|
tp_size=1,
|
||||||
attn_backend="FLASH_ATTN",
|
attn_backend="FLASH_ATTN",
|
||||||
method="encode",
|
method="encode",
|
||||||
fullgraph=True,
|
|
||||||
),
|
),
|
||||||
TestSetting(
|
TestSetting(
|
||||||
model="BAAI/bge-base-en-v1.5",
|
model="BAAI/bge-base-en-v1.5",
|
||||||
@ -82,7 +77,6 @@ class TestSetting:
|
|||||||
tp_size=1,
|
tp_size=1,
|
||||||
attn_backend="FLASH_ATTN",
|
attn_backend="FLASH_ATTN",
|
||||||
method="encode",
|
method="encode",
|
||||||
fullgraph=True,
|
|
||||||
),
|
),
|
||||||
# vision language model
|
# vision language model
|
||||||
TestSetting(
|
TestSetting(
|
||||||
@ -92,7 +86,6 @@ class TestSetting:
|
|||||||
tp_size=1,
|
tp_size=1,
|
||||||
attn_backend="FLASH_ATTN",
|
attn_backend="FLASH_ATTN",
|
||||||
method="generate_with_image",
|
method="generate_with_image",
|
||||||
fullgraph=False,
|
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -109,9 +102,8 @@ def test_compile_correctness(
|
|||||||
tp_size = test_setting.tp_size
|
tp_size = test_setting.tp_size
|
||||||
attn_backend = test_setting.attn_backend
|
attn_backend = test_setting.attn_backend
|
||||||
method = test_setting.method
|
method = test_setting.method
|
||||||
fullgraph = test_setting.fullgraph
|
if cuda_device_count_stateless() < pp_size * tp_size:
|
||||||
if cuda_device_count_stateless() != pp_size * tp_size:
|
pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
|
||||||
pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got "
|
|
||||||
f"{cuda_device_count_stateless()}")
|
f"{cuda_device_count_stateless()}")
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
@ -149,9 +141,5 @@ def test_compile_correctness(
|
|||||||
]:
|
]:
|
||||||
all_args.append(final_args + [f"-O{level}"])
|
all_args.append(final_args + [f"-O{level}"])
|
||||||
all_envs.append({})
|
all_envs.append({})
|
||||||
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
|
|
||||||
# "DYNAMO_ONCE" will always use fullgraph
|
|
||||||
all_envs[-1][
|
|
||||||
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
|
|
||||||
|
|
||||||
compare_all_settings(model, all_args * 3, all_envs, method=method)
|
compare_all_settings(model, all_args * 3, all_envs, method=method)
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Any, Optional, Union
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
@ -10,9 +11,13 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
from tests.v1.attention.utils import _Backend
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.config import CompilationConfig, CompilationLevel, PassConfig
|
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||||
|
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
|
||||||
|
PassConfig)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
from ..utils import create_new_process_for_each_test
|
from ..utils import create_new_process_for_each_test
|
||||||
|
|
||||||
@ -79,9 +84,7 @@ def test_full_graph(
|
|||||||
):
|
):
|
||||||
model, model_kwargs = model_info
|
model, model_kwargs = model_info
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context():
|
||||||
# make sure these models can be captured in full graph mode
|
|
||||||
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
|
|
||||||
print(f"MODEL={model}")
|
print(f"MODEL={model}")
|
||||||
|
|
||||||
run_model(optimization_level, model, model_kwargs)
|
run_model(optimization_level, model, model_kwargs)
|
||||||
@ -107,6 +110,18 @@ def test_full_graph(
|
|||||||
(CompilationConfig(level=CompilationLevel.PIECEWISE,
|
(CompilationConfig(level=CompilationLevel.PIECEWISE,
|
||||||
debug_dump_path=tempfile.gettempdir()),
|
debug_dump_path=tempfile.gettempdir()),
|
||||||
("facebook/opt-125m", {})),
|
("facebook/opt-125m", {})),
|
||||||
|
] + [
|
||||||
|
# graph inductor partition
|
||||||
|
(
|
||||||
|
CompilationConfig(
|
||||||
|
level=CompilationLevel.PIECEWISE,
|
||||||
|
# inductor graph partition uses
|
||||||
|
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
|
||||||
|
use_inductor_graph_partition=True,
|
||||||
|
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||||
|
compile_sizes=[1, 2]),
|
||||||
|
model) for model in models_list(all=False)
|
||||||
|
if is_torch_equal_or_newer("2.9.0.dev")
|
||||||
])
|
])
|
||||||
# only test some of the models
|
# only test some of the models
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
@ -114,11 +129,51 @@ def test_custom_compile_config(
|
|||||||
compilation_config: CompilationConfig,
|
compilation_config: CompilationConfig,
|
||||||
model_info: tuple[str, dict[str, Any]],
|
model_info: tuple[str, dict[str, Any]],
|
||||||
):
|
):
|
||||||
|
if (compilation_config.use_inductor_graph_partition
|
||||||
|
and not is_torch_equal_or_newer("2.9.0.dev")):
|
||||||
|
pytest.skip("inductor graph partition is only available "
|
||||||
|
"in PyTorch 2.9+")
|
||||||
|
|
||||||
model, model_kwargs = model_info
|
model, model_kwargs = model_info
|
||||||
print(f"MODEL={model}")
|
print(f"MODEL={model}")
|
||||||
run_model(compilation_config, model, model_kwargs)
|
run_model(compilation_config, model, model_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
    """Check attention+quant fusion logging under inductor graph partition.

    Runs an FP8 model with the FLASHINFER attention backend forced and DEBUG
    logging captured, then inspects the captured log text for the fusion
    message. Skipped when the installed PyTorch is older than 2.9.0.dev.
    """
    if not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available "
                    "in PyTorch 2.9+")

    model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
    pass_config = PassConfig(enable_attn_fusion=True, enable_noop=True)
    compilation_config = CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_inductor_graph_partition=True,
        cudagraph_mode=CUDAGraphMode.PIECEWISE,
        custom_ops=["+quant_fp8"],
        pass_config=pass_config,
    )
    model_kwargs = {"kv_cache_dtype": "fp8", "max_model_len": 1024}

    capture_debug_logs = caplog_vllm.at_level(logging.DEBUG)
    force_flashinfer = global_force_attn_backend_context_manager(
        _Backend.FLASHINFER)
    with capture_debug_logs, force_flashinfer:
        run_model(compilation_config, model, model_kwargs)

    fused_all_nodes = ("Fused quantization onto 48 attention nodes"
                       in caplog_vllm.text)
    if not fused_all_nodes:
        # Note: the message is only emitted when the compilation goes
        # through the custom pass. Due to multiple layers of cache on the
        # PyTorch side, the compilation of a graph may be cached such that
        # the custom pass is served directly from cache. In that case we
        # assert that the pass was not triggered at all.
        assert "Fused quantization" not in caplog_vllm.text
|
||||||
|
|
||||||
|
|
||||||
def run_model(compile_config: Union[int, CompilationConfig], model: str,
|
def run_model(compile_config: Union[int, CompilationConfig], model: str,
|
||||||
model_kwargs: dict[str, Any]):
|
model_kwargs: dict[str, Any]):
|
||||||
prompts = [
|
prompts = [
|
||||||
|
|||||||
@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
|||||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||||
Fp8LinearOp)
|
Fp8LinearOp)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
FP8_DTYPE = current_platform.fp8_dtype()
|
||||||
@ -339,6 +340,10 @@ else:
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"split_attention",
|
"split_attention",
|
||||||
[False, True] if current_platform.is_rocm() else [False])
|
[False, True] if current_platform.is_rocm() else [False])
|
||||||
|
# TODO(boyuan): test inductor graph partition on rocm
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"use_inductor_graph_partition",
|
||||||
|
[False] if current_platform.is_rocm() else [False, True])
|
||||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||||
reason="Only test ROCm or CUDA")
|
reason="Only test ROCm or CUDA")
|
||||||
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
|
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
|
||||||
@ -352,9 +357,15 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
|||||||
dtype: torch.dtype, model_name: str,
|
dtype: torch.dtype, model_name: str,
|
||||||
model_class: type[AttentionQuantPatternModel],
|
model_class: type[AttentionQuantPatternModel],
|
||||||
backend: _Backend, split_attention: bool,
|
backend: _Backend, split_attention: bool,
|
||||||
monkeypatch, dist_init):
|
use_inductor_graph_partition: bool,
|
||||||
|
monkeypatch, dist_init, caplog_vllm):
|
||||||
"""Test AttentionStaticQuantPattern fusion pass"""
|
"""Test AttentionStaticQuantPattern fusion pass"""
|
||||||
|
|
||||||
|
if use_inductor_graph_partition and not is_torch_equal_or_newer(
|
||||||
|
"2.9.0.dev"):
|
||||||
|
pytest.skip("inductor graph partition is only available "
|
||||||
|
"in PyTorch 2.9+")
|
||||||
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||||
if split_attention:
|
if split_attention:
|
||||||
monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
|
monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
|
||||||
@ -372,6 +383,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
|||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
level=CompilationLevel.PIECEWISE,
|
||||||
custom_ops=["+quant_fp8"],
|
custom_ops=["+quant_fp8"],
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
),
|
),
|
||||||
cache_config=CacheConfig(cache_dtype="fp8"))
|
cache_config=CacheConfig(cache_dtype="fp8"))
|
||||||
|
|
||||||
@ -444,6 +456,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
|||||||
backend=test_backend,
|
backend=test_backend,
|
||||||
fullgraph=True)
|
fullgraph=True)
|
||||||
assert model_compiled.attn._o_scale_float is None
|
assert model_compiled.attn._o_scale_float is None
|
||||||
|
|
||||||
result_fused_1 = model_compiled(q, k, v)
|
result_fused_1 = model_compiled(q, k, v)
|
||||||
|
|
||||||
if backend == _Backend.FLASHINFER:
|
if backend == _Backend.FLASHINFER:
|
||||||
@ -453,6 +466,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
|||||||
# _o_scale_float
|
# _o_scale_float
|
||||||
assert model_compiled.attn._o_scale_float is not None
|
assert model_compiled.attn._o_scale_float is not None
|
||||||
result_fused_2 = model_compiled(q, k, v)
|
result_fused_2 = model_compiled(q, k, v)
|
||||||
|
|
||||||
assert model_compiled.attn._o_scale_float is not None
|
assert model_compiled.attn._o_scale_float is not None
|
||||||
|
|
||||||
torch.testing.assert_close(result_unfused,
|
torch.testing.assert_close(result_unfused,
|
||||||
|
|||||||
@ -987,17 +987,7 @@ class VllmRunner:
|
|||||||
return [req_output.outputs.score for req_output in req_outputs]
|
return [req_output.outputs.score for req_output in req_outputs]
|
||||||
|
|
||||||
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||||
if hasattr(self.llm.llm_engine, "model_executor"):
|
return self.llm.apply_model(func)
|
||||||
# This works either in V0 or in V1 with
|
|
||||||
# VLLM_ENABLE_V1_MULTIPROCESSING=0
|
|
||||||
executor = self.llm.llm_engine.model_executor
|
|
||||||
return executor.apply_model(func)
|
|
||||||
|
|
||||||
# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
|
||||||
def _apply_model(self):
|
|
||||||
return func(self.get_model())
|
|
||||||
|
|
||||||
return self.llm.llm_engine.collective_rpc(_apply_model)
|
|
||||||
|
|
||||||
def get_llm(self) -> LLM:
|
def get_llm(self) -> LLM:
|
||||||
return self.llm
|
return self.llm
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import datetime
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
import openai # use the official client for correctness check
|
import openai # use the official client for correctness check
|
||||||
@ -284,3 +285,62 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str,
|
|||||||
output.extend(chunk.choices[0].delta.tool_calls)
|
output.extend(chunk.choices[0].delta.tool_calls)
|
||||||
for o in output:
|
for o in output:
|
||||||
assert o.id is None or o.id == 'functions.get_current_weather:0'
|
assert o.id is None or o.id == 'functions.get_current_weather:0'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("arguments", ["{}", ''])
async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str,
                                 arguments: str):
    """Round-trip a tool call for a tool that declares no parameters.

    If the model calls ``get_current_time``, the assistant message is sent
    back with the call's ``arguments`` overwritten by the parametrized value
    (``"{}"`` or the empty string) together with a tool result, and the
    follow-up completion must return content. If the model answers directly,
    only that answer's content is checked.
    """
    # A single tool whose JSON schema has no properties and no required keys.
    time_tool = {
        "type": "function",
        "function": {
            "name": "get_current_time",
            "description":
            "Get the current date and time. No parameters needed.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        },
    }
    messages = [{"role": "user", "content": "What time is it now?"}]

    # First turn: the model may either answer directly or call the tool.
    response = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        tools=[time_tool],
        tool_choice="auto",
    )
    message = response.choices[0].message

    if not message.tool_calls:
        # No tool called: the direct reply must carry content.
        assert message.content is not None
        return

    # Only the first tool call is inspected.
    tool_call = message.tool_calls[0]
    if tool_call.function.name != "get_current_time":
        # A call to any other name is not checked further.
        return

    # Replace whatever arguments the model produced with the value under
    # test, so both "{}" and "" are exercised for a no-argument call.
    tool_call.function.arguments = arguments
    messages.append(message)
    messages.append({
        "role": "tool",
        "tool_call_id": tool_call.id,
        "content": datetime.datetime.now().isoformat(),
    })

    # Second turn: hand the tool result back and require a final answer.
    final_response = await client.chat.completions.create(
        model=model_name,
        messages=messages,
    )
    assert final_response.choices[0].message.content is not None
|
||||||
|
|||||||
@ -14,9 +14,6 @@ from transformers import AutoConfig
|
|||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
pytest.skip("Skipping prompt_embeds test until V1 supports it.",
|
|
||||||
allow_module_level=True)
|
|
||||||
|
|
||||||
# any model with a chat template should work here
|
# any model with a chat template should work here
|
||||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||||
|
|
||||||
|
|||||||
@ -744,3 +744,18 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
|
|||||||
assert response_2 is not None
|
assert response_2 is not None
|
||||||
assert response_2.status == "completed"
|
assert response_2.status == "completed"
|
||||||
assert response_2.output_text is not None
|
assert response_2.output_text is not None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str,
                                       server):
    """Request a response with ``enable_response_messages`` turned on.

    The response must complete and expose non-empty ``input_messages`` and
    ``output_messages``.
    """
    # The flag is passed through the request's extra body.
    request_kwargs = {
        "model": model_name,
        "input": "What is the capital of South Korea?",
        "extra_body": {
            "enable_response_messages": True
        },
    }
    response = await client.responses.create(**request_kwargs)

    assert response is not None
    assert response.status == "completed"
    # Both message lists must be populated when the flag is enabled.
    assert len(response.input_messages) > 0
    assert len(response.output_messages) > 0
|
||||||
|
|||||||
@ -11,7 +11,8 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||||
from vllm.config import VllmConfig, current_platform, set_current_vllm_config
|
from vllm.config import VllmConfig, set_current_vllm_config
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
||||||
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
||||||
|
|
||||||
|
|||||||
@ -1,21 +1,24 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import importlib
|
|
||||||
import importlib.metadata
|
import importlib.metadata
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from importlib.util import find_spec
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from packaging import version
|
from packaging import version
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
|
||||||
|
QuarkLinearMethod, QuarkW4A4MXFP4)
|
||||||
|
from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501
|
||||||
|
QuarkW4A4MXFp4MoEMethod)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.flashinfer import has_flashinfer
|
from vllm.utils.flashinfer import has_flashinfer
|
||||||
|
|
||||||
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
|
QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
|
||||||
"quark") is not None and version.parse(
|
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
|
||||||
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
|
|
||||||
|
|
||||||
TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda(
|
TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda(
|
||||||
) and current_platform.is_device_capability(100)
|
) and current_platform.is_device_capability(100)
|
||||||
@ -39,6 +42,12 @@ class ModelCase:
|
|||||||
tp: int
|
tp: int
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """Allow insecure serialization for every test in this module.

    `LLM.apply_model` requires pickling a function, so the environment
    variable that permits it is set through ``monkeypatch`` before each
    test function runs.
    """
    env_name, env_value = "VLLM_ALLOW_INSECURE_SERIALIZATION", "1"
    monkeypatch.setenv(env_name, env_value)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('model_case', [
|
@pytest.mark.parametrize('model_case', [
|
||||||
ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
|
ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
|
||||||
ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
|
ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
|
||||||
@ -55,21 +64,19 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
|
|||||||
tensor_parallel_size=model_case.tp,
|
tensor_parallel_size=model_case.tp,
|
||||||
load_format="dummy") as llm:
|
load_format="dummy") as llm:
|
||||||
|
|
||||||
# TODO: llm.apply_model(check_model) currently relies on V0 internals.
|
def check_model(model):
|
||||||
# Re-enable this later.
|
layer = model.model.layers[0]
|
||||||
# def check_model(model):
|
|
||||||
# layer = model.model.layers[0]
|
|
||||||
|
|
||||||
# qkv_proj = layer.self_attn.qkv_proj
|
qkv_proj = layer.self_attn.qkv_proj
|
||||||
|
|
||||||
# assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
|
assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
|
||||||
# assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
|
assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
|
||||||
|
|
||||||
# assert isinstance(layer.mlp.experts.quant_method,
|
assert isinstance(layer.mlp.experts.quant_method,
|
||||||
# QuarkW4A4MXFp4MoEMethod)
|
QuarkW4A4MXFp4MoEMethod)
|
||||||
|
|
||||||
# if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
|
if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
|
||||||
# llm.apply_model(check_model)
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Today I am in the French Alps and",
|
output = llm.generate_greedy("Today I am in the French Alps and",
|
||||||
max_tokens=20)
|
max_tokens=20)
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from PIL import Image
|
|||||||
|
|
||||||
from vllm.multimodal.image import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
||||||
|
from vllm.utils import set_default_torch_num_threads
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
||||||
PromptVideoInput, VllmRunner)
|
PromptVideoInput, VllmRunner)
|
||||||
@ -17,11 +18,9 @@ from ...utils import check_logprobs_close
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def use_v0_only(monkeypatch):
|
def enable_pickle(monkeypatch):
|
||||||
"""
|
"""`LLM.apply_model` requires pickling a function."""
|
||||||
V1 Test: batch_make_xxxxx_embeddings calls a V0 internal
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
"""
|
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
|
||||||
|
|
||||||
|
|
||||||
models = ["Qwen/Qwen2-VL-2B-Instruct"]
|
models = ["Qwen/Qwen2-VL-2B-Instruct"]
|
||||||
@ -126,9 +125,8 @@ def batch_make_image_embeddings(
|
|||||||
image_grid_thw_on_device = image_grid_thw.to(visual.device,
|
image_grid_thw_on_device = image_grid_thw.to(visual.device,
|
||||||
dtype=torch.int64)
|
dtype=torch.int64)
|
||||||
return visual(pixel_values_on_device,
|
return visual(pixel_values_on_device,
|
||||||
grid_thw=image_grid_thw_on_device)
|
grid_thw=image_grid_thw_on_device).cpu()
|
||||||
|
|
||||||
# V1 Test: this calls a V0 internal.
|
|
||||||
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||||
|
|
||||||
# split into original batches
|
# split into original batches
|
||||||
@ -210,7 +208,7 @@ def batch_make_video_embeddings(
|
|||||||
video_grid_thw_on_device = video_grid_thw.to(visual.device,
|
video_grid_thw_on_device = video_grid_thw.to(visual.device,
|
||||||
dtype=torch.int64)
|
dtype=torch.int64)
|
||||||
return visual(pixel_values_on_device,
|
return visual(pixel_values_on_device,
|
||||||
grid_thw=video_grid_thw_on_device)
|
grid_thw=video_grid_thw_on_device).cpu()
|
||||||
|
|
||||||
# V1 Test: this calls a V0 internal.
|
# V1 Test: this calls a V0 internal.
|
||||||
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||||
@ -266,19 +264,22 @@ def run_embedding_input_test(
|
|||||||
processor = AutoProcessor.from_pretrained(model)
|
processor = AutoProcessor.from_pretrained(model)
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
# max_model_len should be greater than image_feature_size
|
||||||
with vllm_runner(model,
|
with set_default_torch_num_threads(1):
|
||||||
runner="generate",
|
vllm_model = vllm_runner(
|
||||||
max_model_len=4000,
|
model,
|
||||||
max_num_seqs=3,
|
runner="generate",
|
||||||
dtype=dtype,
|
max_model_len=4000,
|
||||||
limit_mm_per_prompt={
|
max_num_seqs=3,
|
||||||
"image": mm_limit,
|
dtype=dtype,
|
||||||
"video": mm_limit
|
limit_mm_per_prompt={
|
||||||
},
|
"image": mm_limit,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
"video": mm_limit
|
||||||
distributed_executor_backend=distributed_executor_backend
|
},
|
||||||
) as vllm_model:
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
)
|
||||||
|
|
||||||
|
with vllm_model:
|
||||||
outputs_per_case_for_original_input = [
|
outputs_per_case_for_original_input = [
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
vllm_model.generate_greedy_logprobs(prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
@ -329,9 +330,8 @@ def run_embedding_input_test(
|
|||||||
@pytest.mark.parametrize("max_tokens", [128])
|
@pytest.mark.parametrize("max_tokens", [128])
|
||||||
@pytest.mark.parametrize("num_logprobs", [10])
|
@pytest.mark.parametrize("num_logprobs", [10])
|
||||||
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
||||||
size_factors, dtype: str,
|
size_factors, dtype, max_tokens,
|
||||||
max_tokens: int,
|
num_logprobs, monkeypatch) -> None:
|
||||||
num_logprobs: int) -> None:
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
inputs_per_case: list[tuple[
|
inputs_per_case: list[tuple[
|
||||||
|
|||||||
@ -112,7 +112,7 @@ def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
|
|||||||
monkeypatch) -> None:
|
monkeypatch) -> None:
|
||||||
|
|
||||||
# Test V1: this test hangs during setup on single-scale input.
|
# Test V1: this test hangs during setup on single-scale input.
|
||||||
# TODO: fixure out why and re-enable this on V1.
|
# TODO: figure out why and re-enable this on V1.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||||
run_awq_test(
|
run_awq_test(
|
||||||
vllm_runner,
|
vllm_runner,
|
||||||
|
|||||||
@ -43,12 +43,9 @@ ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def use_v0_only(monkeypatch):
|
def enable_pickle(monkeypatch):
|
||||||
"""
|
"""`LLM.apply_model` requires pickling a function."""
|
||||||
This module relies on V0 internals, so set VLLM_USE_V1=0.
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
"""
|
|
||||||
if not current_platform.is_cpu():
|
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -176,10 +173,11 @@ def test_compressed_tensors_w8a8_logprobs(
|
|||||||
|
|
||||||
dtype = "bfloat16"
|
dtype = "bfloat16"
|
||||||
|
|
||||||
# skip language translation prompt for the static per tensor asym model
|
# skip language translation prompt for the static per tensor models
|
||||||
if (model_path ==
|
if model_path in (
|
||||||
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
|
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
|
||||||
): # noqa: E501
|
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
|
||||||
|
):
|
||||||
example_prompts = example_prompts[0:-1]
|
example_prompts = example_prompts[0:-1]
|
||||||
|
|
||||||
with hf_runner(model_path, dtype=dtype) as hf_model:
|
with hf_runner(model_path, dtype=dtype) as hf_model:
|
||||||
|
|||||||
@ -60,8 +60,8 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
|
|||||||
if use_rocm_aiter:
|
if use_rocm_aiter:
|
||||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||||
|
|
||||||
# vllm_runner.apply_model() relies on V0 internals.
|
# `LLM.apply_model` requires pickling a function.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
|
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
@ -104,8 +104,8 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
|
|||||||
if use_rocm_aiter:
|
if use_rocm_aiter:
|
||||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||||
|
|
||||||
# vllm_runner.apply_model() relies on V0 internals.
|
# `LLM.apply_model` requires pickling a function.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
if force_marlin:
|
if force_marlin:
|
||||||
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
|
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
|
||||||
|
|||||||
@ -31,41 +31,46 @@ MODEL_QUANT = [
|
|||||||
@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
|
@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
|
||||||
def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
|
def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
|
||||||
monkeypatch):
|
monkeypatch):
|
||||||
# vllm_runner.apply_model() relies on V0 internals.
|
# `LLM.apply_model` requires pickling a function.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048)
|
|
||||||
|
|
||||||
linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
|
linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
|
||||||
GPTQLinearMethod)
|
GPTQLinearMethod)
|
||||||
|
|
||||||
for name, submodule in (vllm_model.llm.llm_engine.model_executor.
|
with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm:
|
||||||
driver_worker.model_runner.model.named_modules()):
|
|
||||||
if name == "lm_head":
|
|
||||||
assert isinstance(submodule.quant_method, linear_method_cls)
|
|
||||||
elif name == 'model.layers.0.self_attn.qkv_proj':
|
|
||||||
# The first layer is quantized using bits=4, group_size=128
|
|
||||||
# desc_act=True
|
|
||||||
assert isinstance(submodule.quant_method, linear_method_cls)
|
|
||||||
config = submodule.quant_method.quant_config
|
|
||||||
assert config.weight_bits == 4
|
|
||||||
assert config.group_size == 128
|
|
||||||
assert config.desc_act
|
|
||||||
elif name == 'model.layers.1.self_attn.qkv_proj':
|
|
||||||
# The second layer is quantized using bits=8, group_size=32
|
|
||||||
# desc_act=False
|
|
||||||
assert isinstance(submodule.quant_method, linear_method_cls)
|
|
||||||
config = submodule.quant_method.quant_config
|
|
||||||
assert get_dynamic_override(config, layer_name=name,
|
|
||||||
key="bits") == 8
|
|
||||||
assert get_dynamic_override(config,
|
|
||||||
layer_name=name,
|
|
||||||
key="group_size") == 32
|
|
||||||
assert not get_dynamic_override(
|
|
||||||
config, layer_name=name, key="desc_act")
|
|
||||||
elif (name == 'model.layers.2.self_attn.qkv_proj'
|
|
||||||
or name == 'model.layers.2.mlp.gate_up_proj'):
|
|
||||||
# All other layers (layer index >= 2) are not quantized
|
|
||||||
assert isinstance(submodule.quant_method, UnquantizedLinearMethod)
|
|
||||||
|
|
||||||
del vllm_model
|
def check_model(model):
|
||||||
|
for name, submodule in model.named_modules():
|
||||||
|
if name == "lm_head":
|
||||||
|
assert isinstance(submodule.quant_method,
|
||||||
|
linear_method_cls)
|
||||||
|
elif name == 'model.layers.0.self_attn.qkv_proj':
|
||||||
|
# The first layer is quantized using bits=4, group_size=128
|
||||||
|
# desc_act=True
|
||||||
|
assert isinstance(submodule.quant_method,
|
||||||
|
linear_method_cls)
|
||||||
|
config = submodule.quant_method.quant_config
|
||||||
|
assert config.weight_bits == 4
|
||||||
|
assert config.group_size == 128
|
||||||
|
assert config.desc_act
|
||||||
|
elif name == 'model.layers.1.self_attn.qkv_proj':
|
||||||
|
# The second layer is quantized using bits=8, group_size=32
|
||||||
|
# desc_act=False
|
||||||
|
assert isinstance(submodule.quant_method,
|
||||||
|
linear_method_cls)
|
||||||
|
config = submodule.quant_method.quant_config
|
||||||
|
assert get_dynamic_override(config,
|
||||||
|
layer_name=name,
|
||||||
|
key="bits") == 8
|
||||||
|
assert get_dynamic_override(config,
|
||||||
|
layer_name=name,
|
||||||
|
key="group_size") == 32
|
||||||
|
assert not get_dynamic_override(
|
||||||
|
config, layer_name=name, key="desc_act")
|
||||||
|
elif (name == 'model.layers.2.self_attn.qkv_proj'
|
||||||
|
or name == 'model.layers.2.mlp.gate_up_proj'):
|
||||||
|
# All other layers (layer index >= 2) are not quantized
|
||||||
|
assert isinstance(submodule.quant_method,
|
||||||
|
UnquantizedLinearMethod)
|
||||||
|
|
||||||
|
llm.apply_model(check_model)
|
||||||
|
|||||||
@ -29,8 +29,8 @@ def test_lm_head(
|
|||||||
lm_head_quantized: bool,
|
lm_head_quantized: bool,
|
||||||
monkeypatch,
|
monkeypatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
# vllm_runner.apply_model() relies on V0 internals.
|
# `LLM.apply_model` requires pickling a function.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
with vllm_runner(model_id, dtype=torch.float16,
|
with vllm_runner(model_id, dtype=torch.float16,
|
||||||
max_model_len=2048) as vllm_model:
|
max_model_len=2048) as vllm_model:
|
||||||
|
|
||||||
|
|||||||
@ -11,16 +11,12 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
from vllm.platforms import current_platform
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def use_v0_only(monkeypatch):
|
def enable_pickle(monkeypatch):
|
||||||
"""
|
"""`LLM.apply_model` requires pickling a function."""
|
||||||
This module relies on V0 internals, so set VLLM_USE_V1=0.
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
"""
|
|
||||||
if not current_platform.is_cpu():
|
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_quant_method_supported("modelopt"),
|
@pytest.mark.skipif(not is_quant_method_supported("modelopt"),
|
||||||
|
|||||||
@ -13,6 +13,16 @@ from vllm.model_executor.layers.quantization.ptpc_fp8 import (
|
|||||||
PTPCFp8LinearMethod)
|
PTPCFp8LinearMethod)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
UNSUPPORTED_STR = (
|
||||||
|
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
|
||||||
|
"support output dtype of bfloat16. torch.float16 is specified.")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """Allow insecure serialization for every test in this module.

    `LLM.apply_model` requires pickling a function, so the environment
    variable that permits it is set through ``monkeypatch`` before each
    test function runs.
    """
    env_name, env_value = "VLLM_ALLOW_INSECURE_SERIALIZATION", "1"
    monkeypatch.setenv(env_name, env_value)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"),
|
@pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"),
|
||||||
reason="PTPC FP8 is not supported on this GPU type.")
|
reason="PTPC FP8 is not supported on this GPU type.")
|
||||||
@ -21,14 +31,22 @@ from vllm.platforms import current_platform
|
|||||||
@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
|
@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
|
||||||
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
|
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
|
||||||
def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
|
def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with vllm_runner("facebook/opt-125m",
|
llm = vllm_runner("facebook/opt-125m",
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
quantization="ptpc_fp8",
|
quantization="ptpc_fp8",
|
||||||
kv_cache_dtype=kv_cache_dtype) as llm:
|
kv_cache_dtype=kv_cache_dtype)
|
||||||
|
except AssertionError as e:
|
||||||
|
if str(e) == UNSUPPORTED_STR:
|
||||||
|
# If the error message matches, the test passes
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
# If the error message does not match, re-raise the exception
|
||||||
|
raise
|
||||||
|
|
||||||
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
|
with llm:
|
||||||
|
|
||||||
|
def check_model(model):
|
||||||
fc1 = model.model.decoder.layers[0].fc1
|
fc1 = model.model.decoder.layers[0].fc1
|
||||||
assert isinstance(fc1.quant_method, PTPCFp8LinearMethod)
|
assert isinstance(fc1.quant_method, PTPCFp8LinearMethod)
|
||||||
if kv_cache_dtype == "ptpc_fp8":
|
if kv_cache_dtype == "ptpc_fp8":
|
||||||
@ -40,17 +58,8 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
|
|||||||
if current_platform.has_device_capability(94):
|
if current_platform.has_device_capability(94):
|
||||||
# For GPUs with hardware support, we keep weights in fp8
|
# For GPUs with hardware support, we keep weights in fp8
|
||||||
assert fc1.weight.dtype == torch.float8_e4m3fnuz
|
assert fc1.weight.dtype == torch.float8_e4m3fnuz
|
||||||
else:
|
|
||||||
pytest.skip()
|
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
llm.apply_model(check_model)
|
||||||
assert output
|
|
||||||
except AssertionError as e:
|
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
||||||
if str(
|
assert output
|
||||||
e
|
|
||||||
) == "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. torch.float16 is specified.": # noqa: E501
|
|
||||||
# If the error message matches, the test passes
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# If the error message does not match, re-raise the exception
|
|
||||||
raise
|
|
||||||
|
|||||||
@ -7,10 +7,10 @@ Run `pytest tests/quantization/test_quark.py`.
|
|||||||
See also `tests/kernels/moe/test_mxfp4_moe.py`.
|
See also `tests/kernels/moe/test_mxfp4_moe.py`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import importlib
|
|
||||||
import importlib.metadata
|
import importlib.metadata
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from importlib.util import find_spec
|
||||||
|
|
||||||
import huggingface_hub
|
import huggingface_hub
|
||||||
import lm_eval
|
import lm_eval
|
||||||
@ -24,9 +24,8 @@ from vllm.platforms import current_platform
|
|||||||
|
|
||||||
from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
|
from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
|
||||||
|
|
||||||
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
|
QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
|
||||||
"quark") is not None and version.parse(
|
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
|
||||||
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
|
|
||||||
|
|
||||||
if QUARK_MXFP4_AVAILABLE:
|
if QUARK_MXFP4_AVAILABLE:
|
||||||
from quark.torch.export.nn.modules.realquantizer import (
|
from quark.torch.export.nn.modules.realquantizer import (
|
||||||
@ -43,11 +42,9 @@ except huggingface_hub.errors.RepositoryNotFoundError:
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def use_v0_only(monkeypatch):
|
def enable_pickle(monkeypatch):
|
||||||
"""
|
"""`LLM.apply_model` requires pickling a function."""
|
||||||
This module relies on V0 internals, so set VLLM_USE_V1=0.
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
"""
|
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8'])
|
@pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8'])
|
||||||
@ -132,13 +129,12 @@ def test_quark_fp8_parity(vllm_runner):
|
|||||||
}
|
}
|
||||||
with (vllm_runner(quark_model_id, **llm_kwargs) as
|
with (vllm_runner(quark_model_id, **llm_kwargs) as
|
||||||
quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
|
quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
|
||||||
quark_model = (quark_handle.llm.llm_engine.model_executor.
|
|
||||||
driver_worker.model_runner.model)
|
|
||||||
quark_state_dict = quark_model.state_dict()
|
|
||||||
|
|
||||||
fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker.
|
def get_state_dict(model):
|
||||||
model_runner.model)
|
return {k: v.cpu() for k, v in model.state_dict().items()}
|
||||||
fp8_state_dict = fp8_model.state_dict()
|
|
||||||
|
quark_state_dict, = quark_handle.apply_model(get_state_dict)
|
||||||
|
fp8_state_dict, = fp8_handle.apply_model(get_state_dict)
|
||||||
|
|
||||||
assert fp8_state_dict.keys() == quark_state_dict.keys()
|
assert fp8_state_dict.keys() == quark_state_dict.keys()
|
||||||
|
|
||||||
|
|||||||
@ -105,18 +105,21 @@ def test_register_quantization_config():
|
|||||||
])
|
])
|
||||||
def test_custom_quant(vllm_runner, model, monkeypatch):
|
def test_custom_quant(vllm_runner, model, monkeypatch):
|
||||||
"""Test infer with the custom quantization method."""
|
"""Test infer with the custom quantization method."""
|
||||||
# vllm_runner.apply_model() relies on V0 internals.
|
# `LLM.apply_model` requires pickling a function.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
with vllm_runner(model_name=model,
|
with vllm_runner(model_name=model,
|
||||||
quantization="custom_quant",
|
quantization="custom_quant",
|
||||||
enforce_eager=True) as llm:
|
enforce_eager=True) as llm:
|
||||||
|
|
||||||
model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
|
def check_model(model):
|
||||||
layer = model.model.layers[0]
|
layer = model.model.layers[0]
|
||||||
qkv_proj = layer.self_attn.qkv_proj
|
qkv_proj = layer.self_attn.qkv_proj
|
||||||
|
|
||||||
# Check the quantization method is FakeQuantLinearMethod
|
# Check the quantization method is FakeQuantLinearMethod
|
||||||
assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)
|
assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)
|
||||||
|
|
||||||
|
llm.apply_model(check_model)
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
||||||
assert output
|
assert output
|
||||||
|
|||||||
@ -18,12 +18,14 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager
|
|||||||
from vllm.v1.core.kv_cache_utils import (
|
from vllm.v1.core.kv_cache_utils import (
|
||||||
BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
|
BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
|
||||||
estimate_max_model_len, generate_block_hash_extra_keys,
|
estimate_max_model_len, generate_block_hash_extra_keys,
|
||||||
get_kv_cache_configs, get_max_concurrency_for_kv_cache_config,
|
generate_scheduler_kv_cache_config, get_kv_cache_configs,
|
||||||
get_request_block_hasher, hash_block_tokens, init_none_hash,
|
get_max_concurrency_for_kv_cache_config, get_request_block_hasher,
|
||||||
is_kv_cache_type_uniform, make_block_hash_with_group_id)
|
hash_block_tokens, init_none_hash, is_kv_cache_spec_uniform,
|
||||||
|
make_block_hash_with_group_id)
|
||||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||||
KVCacheGroupSpec, KVCacheSpec,
|
KVCacheGroupSpec, KVCacheSpec,
|
||||||
KVCacheTensor, SlidingWindowSpec)
|
KVCacheTensor, SlidingWindowSpec,
|
||||||
|
UniformTypeKVCacheSpecs)
|
||||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||||
from vllm.v1.request import Request
|
from vllm.v1.request import Request
|
||||||
|
|
||||||
@ -927,36 +929,36 @@ def test_merge_kv_cache_spec():
|
|||||||
assert merged_layer_spec.sliding_window == 1
|
assert merged_layer_spec.sliding_window == 1
|
||||||
|
|
||||||
|
|
||||||
def test_is_kv_cache_type_uniform():
|
def test_is_kv_cache_spec_uniform():
|
||||||
kv_cache_spec = {
|
kv_cache_spec = {
|
||||||
"layer_1": new_kv_cache_spec(num_kv_heads=32),
|
"layer_1": new_kv_cache_spec(num_kv_heads=32),
|
||||||
"layer_2": new_kv_cache_spec(num_kv_heads=32),
|
"layer_2": new_kv_cache_spec(num_kv_heads=32),
|
||||||
}
|
}
|
||||||
assert is_kv_cache_type_uniform(kv_cache_spec)
|
assert is_kv_cache_spec_uniform(kv_cache_spec)
|
||||||
|
|
||||||
kv_cache_spec = {
|
kv_cache_spec = {
|
||||||
"layer_1": new_kv_cache_spec(num_kv_heads=32),
|
"layer_1": new_kv_cache_spec(num_kv_heads=32),
|
||||||
"layer_2": new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
|
"layer_2": new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
|
||||||
}
|
}
|
||||||
assert is_kv_cache_type_uniform(kv_cache_spec)
|
assert is_kv_cache_spec_uniform(kv_cache_spec)
|
||||||
|
|
||||||
kv_cache_spec = {
|
kv_cache_spec = {
|
||||||
"layer_1": new_kv_cache_spec(num_kv_heads=32),
|
"layer_1": new_kv_cache_spec(num_kv_heads=32),
|
||||||
"layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
"layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
||||||
}
|
}
|
||||||
assert not is_kv_cache_type_uniform(kv_cache_spec)
|
assert not is_kv_cache_spec_uniform(kv_cache_spec)
|
||||||
|
|
||||||
kv_cache_spec = {
|
kv_cache_spec = {
|
||||||
"layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
"layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
||||||
"layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
"layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
||||||
}
|
}
|
||||||
assert is_kv_cache_type_uniform(kv_cache_spec)
|
assert is_kv_cache_spec_uniform(kv_cache_spec)
|
||||||
|
|
||||||
kv_cache_spec = {
|
kv_cache_spec = {
|
||||||
"layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
"layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1),
|
||||||
"layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=2),
|
"layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=2),
|
||||||
}
|
}
|
||||||
assert not is_kv_cache_type_uniform(kv_cache_spec)
|
assert not is_kv_cache_spec_uniform(kv_cache_spec)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -1286,14 +1288,28 @@ def test_get_kv_cache_config_one_worker():
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
# different hidden size, unimplemented
|
# different hidden size
|
||||||
kv_cache_specs_hybrid = {
|
kv_cache_specs_hybrid = {
|
||||||
'layer_1': new_kv_cache_spec(head_size=128),
|
'layer_1': new_kv_cache_spec(head_size=128),
|
||||||
'layer_2': new_kv_cache_spec(),
|
'layer_2': new_kv_cache_spec(head_size=64),
|
||||||
}
|
}
|
||||||
with pytest.raises(NotImplementedError):
|
kv_cache_config_hybrid = get_kv_cache_configs(
|
||||||
get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid],
|
vllm_config, [kv_cache_specs_hybrid],
|
||||||
[mem_per_block_per_layer * 2 * 32])[0]
|
[mem_per_block_per_layer * 3 * 32])[0]
|
||||||
|
assert kv_cache_config_hybrid == KVCacheConfig(
|
||||||
|
num_blocks=32,
|
||||||
|
kv_cache_tensors=[
|
||||||
|
KVCacheTensor(size=mem_per_block_per_layer * 32 * 2,
|
||||||
|
shared_by=["layer_1"]),
|
||||||
|
KVCacheTensor(size=mem_per_block_per_layer * 32,
|
||||||
|
shared_by=["layer_2"]),
|
||||||
|
],
|
||||||
|
kv_cache_groups=[
|
||||||
|
KVCacheGroupSpec(["layer_1", "layer_2"],
|
||||||
|
UniformTypeKVCacheSpecs(
|
||||||
|
block_size=16,
|
||||||
|
kv_cache_specs=kv_cache_specs_hybrid))
|
||||||
|
])
|
||||||
|
|
||||||
# Test num_gpu_blocks_override
|
# Test num_gpu_blocks_override
|
||||||
vllm_config.cache_config.num_gpu_blocks_override = 16
|
vllm_config.cache_config.num_gpu_blocks_override = 16
|
||||||
@ -1324,3 +1340,75 @@ def test_get_kv_cache_configs_attention_free():
|
|||||||
kv_cache_groups=[],
|
kv_cache_groups=[],
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_generate_uniform_type_kv_cache_specs():
|
||||||
|
# All layers are full attention, can be merged
|
||||||
|
kv_cache_specs = {
|
||||||
|
'layer_1': new_kv_cache_spec(),
|
||||||
|
'layer_2': new_kv_cache_spec(head_size=128),
|
||||||
|
}
|
||||||
|
uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
|
||||||
|
assert uniform_spec == UniformTypeKVCacheSpecs(
|
||||||
|
block_size=16, kv_cache_specs=kv_cache_specs)
|
||||||
|
|
||||||
|
# Full attention + sliding window, cannot be merged
|
||||||
|
kv_cache_specs = {
|
||||||
|
'layer_1': new_kv_cache_spec(),
|
||||||
|
'layer_2': new_sliding_window_spec(sliding_window=1),
|
||||||
|
}
|
||||||
|
uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
|
||||||
|
assert uniform_spec is None
|
||||||
|
|
||||||
|
# different order of full attention + sliding window, cannot be merged
|
||||||
|
kv_cache_specs = {
|
||||||
|
'layer_1': new_sliding_window_spec(sliding_window=1),
|
||||||
|
'layer_2': new_kv_cache_spec(),
|
||||||
|
}
|
||||||
|
uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
|
||||||
|
assert uniform_spec is None
|
||||||
|
|
||||||
|
# Same-size sliding window, can be merged
|
||||||
|
kv_cache_specs = {
|
||||||
|
'layer_1': new_sliding_window_spec(sliding_window=1),
|
||||||
|
'layer_2': new_sliding_window_spec(sliding_window=1, head_size=128),
|
||||||
|
}
|
||||||
|
uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
|
||||||
|
assert uniform_spec == UniformTypeKVCacheSpecs(
|
||||||
|
block_size=16, kv_cache_specs=kv_cache_specs)
|
||||||
|
|
||||||
|
# different block sizes, cannot be merged
|
||||||
|
kv_cache_specs = {
|
||||||
|
'layer_1': new_kv_cache_spec(block_size=16),
|
||||||
|
'layer_2': new_kv_cache_spec(block_size=32),
|
||||||
|
}
|
||||||
|
uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs)
|
||||||
|
assert uniform_spec is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_generate_scheduler_kv_cache_config():
|
||||||
|
kv_cache_specs = {
|
||||||
|
'layer_1': new_kv_cache_spec(),
|
||||||
|
'layer_2': new_kv_cache_spec(head_size=128),
|
||||||
|
}
|
||||||
|
kv_cache_configs = [
|
||||||
|
KVCacheConfig(
|
||||||
|
num_blocks=10,
|
||||||
|
kv_cache_tensors=[],
|
||||||
|
kv_cache_groups=[
|
||||||
|
KVCacheGroupSpec(['layer_1', 'layer_2'],
|
||||||
|
UniformTypeKVCacheSpecs(
|
||||||
|
block_size=16,
|
||||||
|
kv_cache_specs=kv_cache_specs)),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
|
||||||
|
kv_cache_configs)
|
||||||
|
assert scheduler_kv_cache_config == KVCacheConfig(
|
||||||
|
num_blocks=10,
|
||||||
|
kv_cache_tensors=[],
|
||||||
|
kv_cache_groups=[
|
||||||
|
KVCacheGroupSpec(['layer_1', 'layer_2'], new_kv_cache_spec())
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import time
|
|||||||
import uuid
|
import uuid
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
from typing import Optional, Union
|
from typing import Any, Optional, Union
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -331,6 +331,46 @@ def echo_dc(
|
|||||||
return [val for _ in range(3)] if return_list else val
|
return [val for _ in range(3)] if return_list else val
|
||||||
|
|
||||||
|
|
||||||
|
# Dummy utility function to test dict serialization with custom types.
|
||||||
|
def echo_dc_dict(
|
||||||
|
self,
|
||||||
|
msg: str,
|
||||||
|
return_dict: bool = False,
|
||||||
|
) -> Union[MyDataclass, dict[str, MyDataclass]]:
|
||||||
|
print(f"echo dc dict util function called: {msg}")
|
||||||
|
val = None if msg is None else MyDataclass(msg)
|
||||||
|
# Return dict of dataclasses to verify support for returning dicts
|
||||||
|
# with custom value types.
|
||||||
|
if return_dict:
|
||||||
|
return {"key1": val, "key2": val, "key3": val}
|
||||||
|
else:
|
||||||
|
return val
|
||||||
|
|
||||||
|
|
||||||
|
# Dummy utility function to test nested structures with custom types.
|
||||||
|
def echo_dc_nested(
|
||||||
|
self,
|
||||||
|
msg: str,
|
||||||
|
structure_type: str = "list_of_dicts",
|
||||||
|
) -> Any:
|
||||||
|
print(f"echo dc nested util function called: {msg}, "
|
||||||
|
f"structure: {structure_type}")
|
||||||
|
val = None if msg is None else MyDataclass(msg)
|
||||||
|
|
||||||
|
if structure_type == "list_of_dicts": # noqa
|
||||||
|
# Return list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}]
|
||||||
|
return [{"a": val, "b": val}, {"c": val, "d": val}]
|
||||||
|
elif structure_type == "dict_of_lists":
|
||||||
|
# Return dict of lists: {"list1": [val, val], "list2": [val, val]}
|
||||||
|
return {"list1": [val, val], "list2": [val, val]}
|
||||||
|
elif structure_type == "deep_nested":
|
||||||
|
# Return deeply nested: {"outer": [{"inner": [val, val]},
|
||||||
|
# {"inner": [val]}]}
|
||||||
|
return {"outer": [{"inner": [val, val]}, {"inner": [val]}]}
|
||||||
|
else:
|
||||||
|
return val
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio(loop_scope="function")
|
@pytest.mark.asyncio(loop_scope="function")
|
||||||
async def test_engine_core_client_util_method_custom_return(
|
async def test_engine_core_client_util_method_custom_return(
|
||||||
monkeypatch: pytest.MonkeyPatch):
|
monkeypatch: pytest.MonkeyPatch):
|
||||||
@ -384,6 +424,167 @@ async def test_engine_core_client_util_method_custom_return(
|
|||||||
client.shutdown()
|
client.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio(loop_scope="function")
|
||||||
|
async def test_engine_core_client_util_method_custom_dict_return(
|
||||||
|
monkeypatch: pytest.MonkeyPatch):
|
||||||
|
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
|
||||||
|
# Must set insecure serialization to allow returning custom types.
|
||||||
|
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
|
# Monkey-patch core engine utility function to test.
|
||||||
|
m.setattr(EngineCore, "echo_dc_dict", echo_dc_dict, raising=False)
|
||||||
|
|
||||||
|
engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
|
||||||
|
vllm_config = engine_args.create_engine_config(
|
||||||
|
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||||
|
executor_class = Executor.get_class(vllm_config)
|
||||||
|
|
||||||
|
with set_default_torch_num_threads(1):
|
||||||
|
client = EngineCoreClient.make_client(
|
||||||
|
multiprocess_mode=True,
|
||||||
|
asyncio_mode=True,
|
||||||
|
vllm_config=vllm_config,
|
||||||
|
executor_class=executor_class,
|
||||||
|
log_stats=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Test utility method returning custom / non-native data type.
|
||||||
|
core_client: AsyncMPClient = client
|
||||||
|
|
||||||
|
# Test single object return
|
||||||
|
result = await core_client.call_utility_async(
|
||||||
|
"echo_dc_dict", "testarg3", False)
|
||||||
|
assert isinstance(result,
|
||||||
|
MyDataclass) and result.message == "testarg3"
|
||||||
|
|
||||||
|
# Test dict return with custom value types
|
||||||
|
result = await core_client.call_utility_async(
|
||||||
|
"echo_dc_dict", "testarg3", True)
|
||||||
|
assert isinstance(result, dict) and len(result) == 3
|
||||||
|
for key, val in result.items():
|
||||||
|
assert key in ["key1", "key2", "key3"]
|
||||||
|
assert isinstance(val,
|
||||||
|
MyDataclass) and val.message == "testarg3"
|
||||||
|
|
||||||
|
# Test returning dict with None values
|
||||||
|
result = await core_client.call_utility_async(
|
||||||
|
"echo_dc_dict", None, True)
|
||||||
|
assert isinstance(result, dict) and len(result) == 3
|
||||||
|
for key, val in result.items():
|
||||||
|
assert key in ["key1", "key2", "key3"]
|
||||||
|
assert val is None
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio(loop_scope="function")
|
||||||
|
async def test_engine_core_client_util_method_nested_structures(
|
||||||
|
monkeypatch: pytest.MonkeyPatch):
|
||||||
|
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
|
||||||
|
# Must set insecure serialization to allow returning custom types.
|
||||||
|
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
|
# Monkey-patch core engine utility function to test.
|
||||||
|
m.setattr(EngineCore, "echo_dc_nested", echo_dc_nested, raising=False)
|
||||||
|
|
||||||
|
engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
|
||||||
|
vllm_config = engine_args.create_engine_config(
|
||||||
|
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||||
|
executor_class = Executor.get_class(vllm_config)
|
||||||
|
|
||||||
|
with set_default_torch_num_threads(1):
|
||||||
|
client = EngineCoreClient.make_client(
|
||||||
|
multiprocess_mode=True,
|
||||||
|
asyncio_mode=True,
|
||||||
|
vllm_config=vllm_config,
|
||||||
|
executor_class=executor_class,
|
||||||
|
log_stats=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
core_client: AsyncMPClient = client
|
||||||
|
|
||||||
|
# Test list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}]
|
||||||
|
result = await core_client.call_utility_async(
|
||||||
|
"echo_dc_nested", "nested1", "list_of_dicts")
|
||||||
|
assert isinstance(result, list) and len(result) == 2
|
||||||
|
for i, item in enumerate(result):
|
||||||
|
assert isinstance(item, dict)
|
||||||
|
if i == 0:
|
||||||
|
assert "a" in item and "b" in item
|
||||||
|
assert isinstance(
|
||||||
|
item["a"],
|
||||||
|
MyDataclass) and item["a"].message == "nested1"
|
||||||
|
assert isinstance(
|
||||||
|
item["b"],
|
||||||
|
MyDataclass) and item["b"].message == "nested1"
|
||||||
|
else:
|
||||||
|
assert "c" in item and "d" in item
|
||||||
|
assert isinstance(
|
||||||
|
item["c"],
|
||||||
|
MyDataclass) and item["c"].message == "nested1"
|
||||||
|
assert isinstance(
|
||||||
|
item["d"],
|
||||||
|
MyDataclass) and item["d"].message == "nested1"
|
||||||
|
|
||||||
|
# Test dict of lists: {"list1": [val, val], "list2": [val, val]}
|
||||||
|
result = await core_client.call_utility_async(
|
||||||
|
"echo_dc_nested", "nested2", "dict_of_lists")
|
||||||
|
assert isinstance(result, dict) and len(result) == 2
|
||||||
|
assert "list1" in result and "list2" in result
|
||||||
|
for key, lst in result.items():
|
||||||
|
assert isinstance(lst, list) and len(lst) == 2
|
||||||
|
for item in lst:
|
||||||
|
assert isinstance(
|
||||||
|
item, MyDataclass) and item.message == "nested2"
|
||||||
|
|
||||||
|
# Test deeply nested: {"outer": [{"inner": [val, val]},
|
||||||
|
# {"inner": [val]}]}
|
||||||
|
result = await core_client.call_utility_async(
|
||||||
|
"echo_dc_nested", "nested3", "deep_nested")
|
||||||
|
assert isinstance(result, dict) and "outer" in result
|
||||||
|
outer_list = result["outer"]
|
||||||
|
assert isinstance(outer_list, list) and len(outer_list) == 2
|
||||||
|
|
||||||
|
# First dict in outer list should have "inner" with 2 items
|
||||||
|
inner_dict1 = outer_list[0]
|
||||||
|
assert isinstance(inner_dict1, dict) and "inner" in inner_dict1
|
||||||
|
inner_list1 = inner_dict1["inner"]
|
||||||
|
assert isinstance(inner_list1, list) and len(inner_list1) == 2
|
||||||
|
for item in inner_list1:
|
||||||
|
assert isinstance(item,
|
||||||
|
MyDataclass) and item.message == "nested3"
|
||||||
|
|
||||||
|
# Second dict in outer list should have "inner" with 1 item
|
||||||
|
inner_dict2 = outer_list[1]
|
||||||
|
assert isinstance(inner_dict2, dict) and "inner" in inner_dict2
|
||||||
|
inner_list2 = inner_dict2["inner"]
|
||||||
|
assert isinstance(inner_list2, list) and len(inner_list2) == 1
|
||||||
|
assert isinstance(
|
||||||
|
inner_list2[0],
|
||||||
|
MyDataclass) and inner_list2[0].message == "nested3"
|
||||||
|
|
||||||
|
# Test with None values in nested structures
|
||||||
|
result = await core_client.call_utility_async(
|
||||||
|
"echo_dc_nested", None, "list_of_dicts")
|
||||||
|
assert isinstance(result, list) and len(result) == 2
|
||||||
|
for item in result:
|
||||||
|
assert isinstance(item, dict)
|
||||||
|
for val in item.values():
|
||||||
|
assert val is None
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.shutdown()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"multiprocessing_mode,publisher_config",
|
"multiprocessing_mode,publisher_config",
|
||||||
[(True, "tcp"), (False, "inproc")],
|
[(True, "tcp"), (False, "inproc")],
|
||||||
|
|||||||
@ -29,6 +29,10 @@ from vllm.utils import GiB_bytes, direct_register_custom_op
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
USE_XFORMERS_OPS = None
|
USE_XFORMERS_OPS = None
|
||||||
|
try:
|
||||||
|
tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
|
||||||
|
except AttributeError:
|
||||||
|
tag_cudagraph_unsafe = () # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
def check_xformers_availability():
|
def check_xformers_availability():
|
||||||
@ -577,6 +581,7 @@ direct_register_custom_op(
|
|||||||
mutates_args=[],
|
mutates_args=[],
|
||||||
fake_impl=unified_attention_fake,
|
fake_impl=unified_attention_fake,
|
||||||
dispatch_key=current_platform.dispatch_key,
|
dispatch_key=current_platform.dispatch_key,
|
||||||
|
tags=tag_cudagraph_unsafe,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -627,4 +632,5 @@ direct_register_custom_op(
|
|||||||
mutates_args=["output", "output_block_scale"],
|
mutates_args=["output", "output_block_scale"],
|
||||||
fake_impl=unified_attention_with_output_fake,
|
fake_impl=unified_attention_with_output_fake,
|
||||||
dispatch_key=current_platform.dispatch_key,
|
dispatch_key=current_platform.dispatch_key,
|
||||||
|
tags=tag_cudagraph_unsafe,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -31,8 +31,11 @@ logger = init_logger(__name__)
|
|||||||
|
|
||||||
def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
|
def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
|
||||||
if compilation_config.use_inductor:
|
if compilation_config.use_inductor:
|
||||||
if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer(
|
# Use standalone compile only if requested, version is new enough,
|
||||||
"2.8.0.dev"):
|
# and the symbol actually exists in this PyTorch build.
|
||||||
|
if (envs.VLLM_USE_STANDALONE_COMPILE
|
||||||
|
and is_torch_equal_or_newer("2.8.0.dev")
|
||||||
|
and hasattr(torch._inductor, "standalone_compile")):
|
||||||
logger.debug("Using InductorStandaloneAdaptor")
|
logger.debug("Using InductorStandaloneAdaptor")
|
||||||
return InductorStandaloneAdaptor()
|
return InductorStandaloneAdaptor()
|
||||||
else:
|
else:
|
||||||
@ -326,6 +329,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
|
|||||||
i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
|
i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
|
||||||
]
|
]
|
||||||
global compilation_start_time
|
global compilation_start_time
|
||||||
|
|
||||||
compiled_graph_for_dynamic_shape = self.vllm_backend.\
|
compiled_graph_for_dynamic_shape = self.vllm_backend.\
|
||||||
compiler_manager.compile(
|
compiler_manager.compile(
|
||||||
submod,
|
submod,
|
||||||
@ -336,7 +340,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
|
|||||||
num_graphs=len(self.compile_submod_names),
|
num_graphs=len(self.compile_submod_names),
|
||||||
runtime_shape=None)
|
runtime_shape=None)
|
||||||
# Lazy import here to avoid circular import
|
# Lazy import here to avoid circular import
|
||||||
from .cuda_graph import CUDAGraphOptions
|
|
||||||
from .cuda_piecewise_backend import PiecewiseBackend
|
from .cuda_piecewise_backend import PiecewiseBackend
|
||||||
|
|
||||||
piecewise_backend = PiecewiseBackend(
|
piecewise_backend = PiecewiseBackend(
|
||||||
@ -344,7 +347,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
|
|||||||
len(self.compile_submod_names), sym_shape_indices,
|
len(self.compile_submod_names), sym_shape_indices,
|
||||||
compiled_graph_for_dynamic_shape, self.vllm_backend)
|
compiled_graph_for_dynamic_shape, self.vllm_backend)
|
||||||
|
|
||||||
if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
|
if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||||
|
and
|
||||||
|
not self.compilation_config.use_inductor_graph_partition):
|
||||||
|
# We're using Dynamo-based piecewise splitting, so we wrap
|
||||||
|
# the whole subgraph with a static graph wrapper.
|
||||||
|
from .cuda_graph import CUDAGraphOptions
|
||||||
|
|
||||||
# resolve the static graph wrapper class (e.g. CUDAGraphWrapper
|
# resolve the static graph wrapper class (e.g. CUDAGraphWrapper
|
||||||
# class) as platform dependent.
|
# class) as platform dependent.
|
||||||
static_graph_wrapper_class = resolve_obj_by_qualname(
|
static_graph_wrapper_class = resolve_obj_by_qualname(
|
||||||
|
|||||||
@ -1183,7 +1183,7 @@ class AllReduceFusionPass(VllmInductorPass):
|
|||||||
self.end_and_log()
|
self.end_and_log()
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
if self.disabled:
|
if getattr(self, "disabled", True):
|
||||||
return
|
return
|
||||||
if flashinfer_comm is not None:
|
if flashinfer_comm is not None:
|
||||||
flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(
|
flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import contextlib
|
||||||
import inspect
|
import inspect
|
||||||
from typing import Callable, Optional, TypeVar, Union, overload
|
from typing import Callable, Optional, TypeVar, Union, overload
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
@ -14,7 +15,7 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
|||||||
from vllm.config import CompilationLevel, VllmConfig
|
from vllm.config import CompilationLevel, VllmConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import supports_dynamo
|
from vllm.utils import resolve_obj_by_qualname, supports_dynamo
|
||||||
|
|
||||||
from .monitor import start_monitoring_torch_compile
|
from .monitor import start_monitoring_torch_compile
|
||||||
|
|
||||||
@ -301,8 +302,11 @@ def _support_torch_compile(
|
|||||||
|
|
||||||
with patch.object(InliningInstructionTranslator, 'inline_call',
|
with patch.object(InliningInstructionTranslator, 'inline_call',
|
||||||
patched_inline_call), torch._dynamo.config.patch(
|
patched_inline_call), torch._dynamo.config.patch(
|
||||||
**dynamo_config_patches):
|
**dynamo_config_patches
|
||||||
|
), maybe_use_cudagraph_partition_wrapper(
|
||||||
|
self.vllm_config):
|
||||||
output = self.compiled_callable(*args, **kwargs)
|
output = self.compiled_callable(*args, **kwargs)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
# usually, capturing the model once is enough, and then we can
|
# usually, capturing the model once is enough, and then we can
|
||||||
@ -314,3 +318,52 @@ def _support_torch_compile(
|
|||||||
|
|
||||||
cls.__call__ = __call__
|
cls.__call__ = __call__
|
||||||
return cls
|
return cls
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
|
||||||
|
def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
|
||||||
|
"""
|
||||||
|
Context manager to set/unset customized cudagraph partition wrappers.
|
||||||
|
|
||||||
|
If we're using Inductor-based graph partitioning, we currently have the
|
||||||
|
whole `fx.Graph` before Inductor lowering and the piecewise
|
||||||
|
splitting happens after all graph passes and fusions. Here, we add
|
||||||
|
a custom hook for Inductor to wrap each partition with our static
|
||||||
|
graph wrapper class to maintain more control over static graph
|
||||||
|
capture and replay.
|
||||||
|
"""
|
||||||
|
from vllm.config import CUDAGraphMode
|
||||||
|
|
||||||
|
compilation_config = vllm_config.compilation_config
|
||||||
|
if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||||
|
and compilation_config.use_inductor_graph_partition):
|
||||||
|
from torch._inductor.utils import CUDAGraphWrapperMetadata
|
||||||
|
|
||||||
|
from vllm.compilation.cuda_graph import CUDAGraphOptions
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
static_graph_wrapper_class = resolve_obj_by_qualname(
|
||||||
|
current_platform.get_static_graph_wrapper_cls())
|
||||||
|
|
||||||
|
def customized_cudagraph_wrapper(f,
|
||||||
|
metadata: CUDAGraphWrapperMetadata):
|
||||||
|
partition_id = metadata.partition_index
|
||||||
|
num_partitions = metadata.num_partitions
|
||||||
|
return static_graph_wrapper_class(
|
||||||
|
runnable=f,
|
||||||
|
vllm_config=vllm_config,
|
||||||
|
runtime_mode=CUDAGraphMode.PIECEWISE,
|
||||||
|
cudagraph_options=CUDAGraphOptions(
|
||||||
|
debug_log_enable=partition_id == 0,
|
||||||
|
gc_disable=partition_id != 0,
|
||||||
|
weak_ref_output=partition_id == num_partitions - 1,
|
||||||
|
))
|
||||||
|
|
||||||
|
torch._inductor.utils.set_customized_partition_wrappers(
|
||||||
|
customized_cudagraph_wrapper)
|
||||||
|
|
||||||
|
yield
|
||||||
|
|
||||||
|
if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||||
|
and compilation_config.use_inductor_graph_partition):
|
||||||
|
torch._inductor.utils.set_customized_partition_wrappers(None)
|
||||||
|
|||||||
@ -10,7 +10,6 @@ from typing import Callable, Optional
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
import vllm.envs as envs
|
|
||||||
from vllm.config import (CompilationLevel, CUDAGraphMode,
|
from vllm.config import (CompilationLevel, CUDAGraphMode,
|
||||||
get_current_vllm_config)
|
get_current_vllm_config)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher:
|
|||||||
options = get_current_vllm_config(
|
options = get_current_vllm_config(
|
||||||
).compilation_config.inductor_compile_config
|
).compilation_config.inductor_compile_config
|
||||||
|
|
||||||
compiled_callable = torch.compile(
|
compiled_callable = torch.compile(self.forward,
|
||||||
self.forward,
|
fullgraph=True,
|
||||||
fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
|
backend=backend,
|
||||||
backend=backend,
|
options=options)
|
||||||
options=options)
|
|
||||||
|
|
||||||
self.compiled_callable = compiled_callable
|
self.compiled_callable = compiled_callable
|
||||||
self.original_code_object = self.__class__.forward.__code__
|
self.original_code_object = self.__class__.forward.__code__
|
||||||
|
|||||||
@ -299,6 +299,26 @@ class CompilationConfig:
|
|||||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
|
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
use_inductor_graph_partition: bool = False
|
||||||
|
"""Use inductor graph partition to split the graph at cudagraph_unsafe ops.
|
||||||
|
This partition happens at inductor codegen time after all passes and fusions
|
||||||
|
are finished. It generates a single `call` function which wraps
|
||||||
|
cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops
|
||||||
|
outside the partition functions. For a graph with N cudagraph-unsafe ops
|
||||||
|
(e.g., Attention), there would be N+1 partitions. To mark an op as
|
||||||
|
cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when
|
||||||
|
register the custom op.
|
||||||
|
|
||||||
|
This config supports both full cudagraph and piecewise cudagraph without
|
||||||
|
compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper
|
||||||
|
to each partition. For N+1 partitions, there would be N+1
|
||||||
|
CUDAGraph wrapper instances.
|
||||||
|
|
||||||
|
For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the
|
||||||
|
inductor `call` function in the model runner. The top-level full cudagraph
|
||||||
|
capture ignores all partitioning.
|
||||||
|
"""
|
||||||
|
|
||||||
pass_config: PassConfig = field(default_factory=PassConfig)
|
pass_config: PassConfig = field(default_factory=PassConfig)
|
||||||
"""Custom inductor passes, see PassConfig for more details"""
|
"""Custom inductor passes, see PassConfig for more details"""
|
||||||
|
|
||||||
@ -461,6 +481,12 @@ class CompilationConfig:
|
|||||||
"since full_cuda_graph is deprecated.")
|
"since full_cuda_graph is deprecated.")
|
||||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||||
|
|
||||||
|
if (self.use_inductor_graph_partition
|
||||||
|
and not is_torch_equal_or_newer("2.9.0.dev")):
|
||||||
|
raise ValueError("use_inductor_graph_partition is only "
|
||||||
|
"supported with torch>=2.9.0.dev. Set "
|
||||||
|
"use_inductor_graph_partition=False instead.")
|
||||||
|
|
||||||
def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
|
def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
|
||||||
if self.level == CompilationLevel.NO_COMPILATION:
|
if self.level == CompilationLevel.NO_COMPILATION:
|
||||||
raise ValueError("No compilation level is set.")
|
raise ValueError("No compilation level is set.")
|
||||||
@ -540,19 +566,36 @@ class CompilationConfig:
|
|||||||
"set_splitting_ops_for_v1 should only be called when "
|
"set_splitting_ops_for_v1 should only be called when "
|
||||||
"level is CompilationLevel.PIECEWISE")
|
"level is CompilationLevel.PIECEWISE")
|
||||||
|
|
||||||
|
use_inductor_graph_partition_msg = (
|
||||||
|
"When use_inductor_graph_partition=True, splitting_ops "
|
||||||
|
"are ignored and set to an empty list. Instead, "
|
||||||
|
"\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is "
|
||||||
|
"used to annotate custom ops for graph partition.")
|
||||||
|
|
||||||
if self.splitting_ops is None:
|
if self.splitting_ops is None:
|
||||||
# NOTE: When using full cudagraph, instead of setting an empty
|
if self.use_inductor_graph_partition:
|
||||||
# list and capture the full cudagraph inside the flattened fx
|
# When using inductor graph partition, we set splitting_ops
|
||||||
# graph, we keep the piecewise fx graph structure but capture the
|
# to be empty and rely on torch._C.Tag.cudagraph_unsafe to
|
||||||
# full cudagraph outside the fx graph. This reduces some cpu
|
# annotate custom ops as splitting ops.
|
||||||
# overhead when the runtime batch_size is not cudagraph captured.
|
logger.warning_once(use_inductor_graph_partition_msg)
|
||||||
# see https://github.com/vllm-project/vllm/pull/20059 for details.
|
self.splitting_ops = []
|
||||||
# make a copy to avoid mutating the class-level list via reference.
|
else:
|
||||||
self.splitting_ops = list(self._attention_ops)
|
# NOTE: When using full cudagraph, instead of setting an empty
|
||||||
|
# list and capture the full cudagraph inside the flattened fx
|
||||||
|
# graph, we keep the piecewise fx graph structure but capture
|
||||||
|
# the full cudagraph outside the fx graph. This reduces some
|
||||||
|
# cpu overhead when the runtime batch_size is not cudagraph
|
||||||
|
# captured. see https://github.com/vllm-project/vllm/pull/20059
|
||||||
|
# for details. make a copy to avoid mutating the class-level
|
||||||
|
# list via reference.
|
||||||
|
self.splitting_ops = list(self._attention_ops)
|
||||||
elif len(self.splitting_ops) == 0:
|
elif len(self.splitting_ops) == 0:
|
||||||
logger.warning_once("Using piecewise compilation with empty "
|
logger.warning_once(
|
||||||
"splitting_ops.")
|
"Using piecewise compilation with empty "
|
||||||
if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
|
"splitting_ops and use_inductor_graph_partition"
|
||||||
|
f"={self.use_inductor_graph_partition}.")
|
||||||
|
if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
||||||
|
and not self.use_inductor_graph_partition):
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"When compilation level is piecewise with empty "
|
"When compilation level is piecewise with empty "
|
||||||
"splitting_ops, PIECEWISE cudagraph_mode will be "
|
"splitting_ops, PIECEWISE cudagraph_mode will be "
|
||||||
@ -562,7 +605,26 @@ class CompilationConfig:
|
|||||||
"any problems.")
|
"any problems.")
|
||||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||||
self.splitting_ops = []
|
self.splitting_ops = []
|
||||||
|
elif self.use_inductor_graph_partition:
|
||||||
|
logger.warning_once(use_inductor_graph_partition_msg)
|
||||||
|
self.splitting_ops = []
|
||||||
|
|
||||||
def splitting_ops_contain_attention(self) -> bool:
|
def splitting_ops_contain_attention(self) -> bool:
|
||||||
return self.splitting_ops is not None and all(
|
return self.splitting_ops is not None and all(
|
||||||
op in self.splitting_ops for op in self._attention_ops)
|
op in self.splitting_ops for op in self._attention_ops)
|
||||||
|
|
||||||
|
def is_attention_compiled_piecewise(self) -> bool:
|
||||||
|
use_fx_graph_piecewise_compilation = (
|
||||||
|
self.level == CompilationLevel.PIECEWISE
|
||||||
|
and self.splitting_ops_contain_attention())
|
||||||
|
|
||||||
|
inductor_used = (self.level == CompilationLevel.PIECEWISE
|
||||||
|
and self.use_inductor) or (
|
||||||
|
self.level >= CompilationLevel.DYNAMO_AS_IS
|
||||||
|
and self.backend == "inductor")
|
||||||
|
use_inductor_piecewise_compilation = (
|
||||||
|
inductor_used and self.use_inductor_graph_partition
|
||||||
|
and not self.splitting_ops_contain_attention())
|
||||||
|
|
||||||
|
return use_fx_graph_piecewise_compilation or \
|
||||||
|
use_inductor_piecewise_compilation
|
||||||
|
|||||||
@ -322,8 +322,28 @@ class ModelConfig:
|
|||||||
factors.append(self.override_generation_config)
|
factors.append(self.override_generation_config)
|
||||||
factors.append(self.rope_scaling)
|
factors.append(self.rope_scaling)
|
||||||
factors.append(self.rope_theta)
|
factors.append(self.rope_theta)
|
||||||
|
|
||||||
# hf_config can control how the model looks!
|
# hf_config can control how the model looks!
|
||||||
factors.append(self.hf_config.to_json_string())
|
try:
|
||||||
|
hf_config_json = self.hf_config.to_json_string(use_diff=False)
|
||||||
|
except TypeError:
|
||||||
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
|
from vllm.utils.jsontree import json_map_leaves
|
||||||
|
|
||||||
|
# Handle nested HF configs with unserializable values gracefully
|
||||||
|
hf_config_json = json.dumps(
|
||||||
|
json_map_leaves(
|
||||||
|
lambda v: v.to_dict()
|
||||||
|
if isinstance(v, PretrainedConfig) else str(v),
|
||||||
|
self.hf_config.to_dict(),
|
||||||
|
),
|
||||||
|
indent=2,
|
||||||
|
sort_keys=True,
|
||||||
|
) + "\n"
|
||||||
|
|
||||||
|
factors.append(hf_config_json)
|
||||||
|
|
||||||
str_factors = str(factors)
|
str_factors = str(factors)
|
||||||
assert_hashable(str_factors)
|
assert_hashable(str_factors)
|
||||||
return hashlib.sha256(str(factors).encode()).hexdigest()
|
return hashlib.sha256(str(factors).encode()).hexdigest()
|
||||||
@ -964,6 +984,9 @@ class ModelConfig:
|
|||||||
"modelopt",
|
"modelopt",
|
||||||
"modelopt_fp4",
|
"modelopt_fp4",
|
||||||
"petit_nvfp4",
|
"petit_nvfp4",
|
||||||
|
# Ensure heavy backends are probed last to avoid unnecessary
|
||||||
|
# imports during override detection (e.g., MXFP4 imports Triton)
|
||||||
|
"mxfp4",
|
||||||
]
|
]
|
||||||
quantization_methods = [
|
quantization_methods = [
|
||||||
q for q in supported_quantization if q not in overrides
|
q for q in supported_quantization if q not in overrides
|
||||||
|
|||||||
@ -569,9 +569,10 @@ class NixlConnectorWorker:
|
|||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
"""Cleanup background threads on destruction."""
|
"""Cleanup background threads on destruction."""
|
||||||
self._handshake_initiation_executor.shutdown(wait=False)
|
if executor := getattr(self, "_handshake_initiation_executor", None):
|
||||||
if self._nixl_handshake_listener_t:
|
executor.shutdown(wait=False)
|
||||||
self._nixl_handshake_listener_t.join(timeout=0)
|
if listener_t := getattr(self, "_nixl_handshake_listener_t", None):
|
||||||
|
listener_t.join(timeout=0)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _nixl_handshake_listener(metadata: NixlAgentMetadata,
|
def _nixl_handshake_listener(metadata: NixlAgentMetadata,
|
||||||
@ -1379,4 +1380,4 @@ class NixlKVConnectorStats(KVConnectorStats):
|
|||||||
# TODO: reduce stats to a single value, calculate latency/throughput
|
# TODO: reduce stats to a single value, calculate latency/throughput
|
||||||
return {
|
return {
|
||||||
"num_successful_transfers": self.data["num_successful_transfers"]
|
"num_successful_transfers": self.data["num_successful_transfers"]
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from typing import Sequence as GenericSequence
|
|||||||
from typing import Set, Type, Union, cast
|
from typing import Set, Type, Union, cast
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
@ -55,6 +56,7 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
|
|||||||
from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
|
from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
|
||||||
from vllm.version import __version__ as VLLM_VERSION
|
from vllm.version import __version__ as VLLM_VERSION
|
||||||
from vllm.worker.model_runner_base import InputProcessingError
|
from vllm.worker.model_runner_base import InputProcessingError
|
||||||
|
from vllm.worker.worker_base import WorkerBase
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
_LOCAL_LOGGING_INTERVAL_SEC = 5
|
_LOCAL_LOGGING_INTERVAL_SEC = 5
|
||||||
@ -1817,13 +1819,16 @@ class LLMEngine:
|
|||||||
return sampling_params
|
return sampling_params
|
||||||
|
|
||||||
def collective_rpc(self,
|
def collective_rpc(self,
|
||||||
method: Union[str, Callable[..., _R]],
|
method: Union[str, Callable[[WorkerBase], _R]],
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
args: tuple = (),
|
args: tuple = (),
|
||||||
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
|
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
|
||||||
return self.model_executor.collective_rpc(method, timeout, args,
|
return self.model_executor.collective_rpc(method, timeout, args,
|
||||||
kwargs)
|
kwargs)
|
||||||
|
|
||||||
|
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||||
|
return self.collective_rpc("apply_model", args=(func, ))
|
||||||
|
|
||||||
|
|
||||||
if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
|
if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
|
||||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||||
|
|||||||
@ -1450,9 +1450,11 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
|
|||||||
and isinstance(message["tool_calls"], list)
|
and isinstance(message["tool_calls"], list)
|
||||||
):
|
):
|
||||||
for item in message["tool_calls"]:
|
for item in message["tool_calls"]:
|
||||||
item["function"]["arguments"] = json.loads(
|
# if arguments is None or empty string, set to {}
|
||||||
item["function"]["arguments"]
|
if content := item["function"].get("arguments"):
|
||||||
)
|
item["function"]["arguments"] = json.loads(content)
|
||||||
|
else:
|
||||||
|
item["function"]["arguments"] = {}
|
||||||
|
|
||||||
|
|
||||||
def parse_chat_messages(
|
def parse_chat_messages(
|
||||||
|
|||||||
@ -522,9 +522,14 @@ class LLM:
|
|||||||
"""
|
"""
|
||||||
Run a function directly on the model inside each worker,
|
Run a function directly on the model inside each worker,
|
||||||
returning the result for each of them.
|
returning the result for each of them.
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
To reduce the overhead of data transfer, avoid returning large
|
||||||
|
arrays or tensors from this method. If you must return them,
|
||||||
|
make sure you move them to CPU first to avoid taking up additional
|
||||||
|
VRAM!
|
||||||
"""
|
"""
|
||||||
executor = self.llm_engine.model_executor
|
return self.llm_engine.apply_model(func)
|
||||||
return executor.apply_model(func)
|
|
||||||
|
|
||||||
def _get_beam_search_lora_requests(
|
def _get_beam_search_lora_requests(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -328,6 +328,13 @@ class ResponsesRequest(OpenAIBaseModel):
|
|||||||
"access by 3rd parties, and long enough to be "
|
"access by 3rd parties, and long enough to be "
|
||||||
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
|
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
|
||||||
"to 256 bit). Not supported by vLLM engine V0."))
|
"to 256 bit). Not supported by vLLM engine V0."))
|
||||||
|
|
||||||
|
enable_response_messages: bool = Field(
|
||||||
|
default=False,
|
||||||
|
description=(
|
||||||
|
"Dictates whether or not to return messages as part of the "
|
||||||
|
"response object. Currently only supported for non-streaming "
|
||||||
|
"non-background and gpt-oss only. "))
|
||||||
# --8<-- [end:responses-extra-params]
|
# --8<-- [end:responses-extra-params]
|
||||||
|
|
||||||
_DEFAULT_SAMPLING_PARAMS = {
|
_DEFAULT_SAMPLING_PARAMS = {
|
||||||
@ -1831,6 +1838,11 @@ class ResponsesResponse(OpenAIBaseModel):
|
|||||||
model: str
|
model: str
|
||||||
object: Literal["response"] = "response"
|
object: Literal["response"] = "response"
|
||||||
output: list[ResponseOutputItem]
|
output: list[ResponseOutputItem]
|
||||||
|
# These are populated when enable_response_messages is set to True
|
||||||
|
# TODO: Currently an issue where content of harmony messages
|
||||||
|
# is not available when these are serialized. Metadata is available
|
||||||
|
input_messages: Optional[list[ChatCompletionMessageParam]] = None
|
||||||
|
output_messages: Optional[list[ChatCompletionMessageParam]] = None
|
||||||
parallel_tool_calls: bool
|
parallel_tool_calls: bool
|
||||||
temperature: float
|
temperature: float
|
||||||
tool_choice: ToolChoice
|
tool_choice: ToolChoice
|
||||||
@ -1860,6 +1872,8 @@ class ResponsesResponse(OpenAIBaseModel):
|
|||||||
output: list[ResponseOutputItem],
|
output: list[ResponseOutputItem],
|
||||||
status: ResponseStatus,
|
status: ResponseStatus,
|
||||||
usage: Optional[ResponseUsage] = None,
|
usage: Optional[ResponseUsage] = None,
|
||||||
|
input_messages: Optional[list[ChatCompletionMessageParam]] = None,
|
||||||
|
output_messages: Optional[list[ChatCompletionMessageParam]] = None,
|
||||||
) -> "ResponsesResponse":
|
) -> "ResponsesResponse":
|
||||||
|
|
||||||
incomplete_details: Optional[IncompleteDetails] = None
|
incomplete_details: Optional[IncompleteDetails] = None
|
||||||
@ -1868,7 +1882,6 @@ class ResponsesResponse(OpenAIBaseModel):
|
|||||||
# TODO: implement the other reason for incomplete_details,
|
# TODO: implement the other reason for incomplete_details,
|
||||||
# which is content_filter
|
# which is content_filter
|
||||||
# incomplete_details = IncompleteDetails(reason='content_filter')
|
# incomplete_details = IncompleteDetails(reason='content_filter')
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
id=request.request_id,
|
id=request.request_id,
|
||||||
created_at=created_time,
|
created_at=created_time,
|
||||||
@ -1877,6 +1890,8 @@ class ResponsesResponse(OpenAIBaseModel):
|
|||||||
metadata=request.metadata,
|
metadata=request.metadata,
|
||||||
model=model_name,
|
model=model_name,
|
||||||
output=output,
|
output=output,
|
||||||
|
input_messages=input_messages,
|
||||||
|
output_messages=output_messages,
|
||||||
parallel_tool_calls=request.parallel_tool_calls,
|
parallel_tool_calls=request.parallel_tool_calls,
|
||||||
temperature=sampling_params.temperature,
|
temperature=sampling_params.temperature,
|
||||||
tool_choice=request.tool_choice,
|
tool_choice=request.tool_choice,
|
||||||
|
|||||||
@ -475,9 +475,14 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
# "completed" is implemented as the "catch-all" for now.
|
# "completed" is implemented as the "catch-all" for now.
|
||||||
status: ResponseStatus = "completed"
|
status: ResponseStatus = "completed"
|
||||||
|
|
||||||
|
input_messages = None
|
||||||
|
output_messages = None
|
||||||
if self.use_harmony:
|
if self.use_harmony:
|
||||||
assert isinstance(context, HarmonyContext)
|
assert isinstance(context, HarmonyContext)
|
||||||
output = self._make_response_output_items_with_harmony(context)
|
output = self._make_response_output_items_with_harmony(context)
|
||||||
|
if request.enable_response_messages:
|
||||||
|
input_messages = context.messages[:context.num_init_messages]
|
||||||
|
output_messages = context.messages[context.num_init_messages:]
|
||||||
num_tool_output_tokens = context.num_tool_output_tokens
|
num_tool_output_tokens = context.num_tool_output_tokens
|
||||||
if len(output) > 0:
|
if len(output) > 0:
|
||||||
if context.finish_reason == "length":
|
if context.finish_reason == "length":
|
||||||
@ -496,6 +501,12 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
output = self._make_response_output_items(request, final_output,
|
output = self._make_response_output_items(request, final_output,
|
||||||
tokenizer)
|
tokenizer)
|
||||||
|
|
||||||
|
# TODO: context for non-gptoss models doesn't use messages
|
||||||
|
# so we can't get them out yet
|
||||||
|
if request.enable_response_messages:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"enable_response_messages is currently"
|
||||||
|
" only supported for gpt-oss")
|
||||||
# Calculate usage.
|
# Calculate usage.
|
||||||
assert final_res.prompt_token_ids is not None
|
assert final_res.prompt_token_ids is not None
|
||||||
num_tool_output_tokens = 0
|
num_tool_output_tokens = 0
|
||||||
@ -519,6 +530,8 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
response = ResponsesResponse.from_request(
|
response = ResponsesResponse.from_request(
|
||||||
request,
|
request,
|
||||||
sampling_params,
|
sampling_params,
|
||||||
|
input_messages=input_messages,
|
||||||
|
output_messages=output_messages,
|
||||||
model_name=model_name,
|
model_name=model_name,
|
||||||
created_time=created_time,
|
created_time=created_time,
|
||||||
output=output,
|
output=output,
|
||||||
|
|||||||
@ -98,6 +98,15 @@ class Hermes2ProToolParser(ToolParser):
|
|||||||
else:
|
else:
|
||||||
return delta_text
|
return delta_text
|
||||||
|
|
||||||
|
def adjust_request(
|
||||||
|
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
|
||||||
|
if request.tools and request.tool_choice != 'none':
|
||||||
|
# do not skip special tokens because the tool_call tokens are
|
||||||
|
# marked "special" in some models. Since they are skipped
|
||||||
|
# prior to the call to the tool parser, it breaks tool calling.
|
||||||
|
request.skip_special_tokens = False
|
||||||
|
return request
|
||||||
|
|
||||||
def extract_tool_calls(
|
def extract_tool_calls(
|
||||||
self,
|
self,
|
||||||
model_output: str,
|
model_output: str,
|
||||||
|
|||||||
@ -32,6 +32,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
|
VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
|
||||||
VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
|
VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
|
||||||
VLLM_NO_USAGE_STATS: bool = False
|
VLLM_NO_USAGE_STATS: bool = False
|
||||||
|
VLLM_DISABLE_FLASHINFER_PREFILL: bool = False
|
||||||
VLLM_DO_NOT_TRACK: bool = False
|
VLLM_DO_NOT_TRACK: bool = False
|
||||||
VLLM_USAGE_SOURCE: str = ""
|
VLLM_USAGE_SOURCE: str = ""
|
||||||
VLLM_CONFIGURE_LOGGING: int = 1
|
VLLM_CONFIGURE_LOGGING: int = 1
|
||||||
@ -433,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_FLASH_ATTN_VERSION":
|
"VLLM_FLASH_ATTN_VERSION":
|
||||||
lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),
|
lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),
|
||||||
|
|
||||||
# Internal flag to enable Dynamo fullgraph capture
|
|
||||||
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
|
|
||||||
lambda: bool(
|
|
||||||
os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
|
|
||||||
|
|
||||||
# Feature flag to enable/disable Inductor standalone compile.
|
# Feature flag to enable/disable Inductor standalone compile.
|
||||||
# In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
|
# In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
|
||||||
# enabled by default.
|
# enabled by default.
|
||||||
@ -479,6 +475,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
|
lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
|
||||||
"VLLM_NO_USAGE_STATS":
|
"VLLM_NO_USAGE_STATS":
|
||||||
lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
|
lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
|
||||||
|
"VLLM_DISABLE_FLASHINFER_PREFILL":
|
||||||
|
lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1",
|
||||||
"VLLM_DO_NOT_TRACK":
|
"VLLM_DO_NOT_TRACK":
|
||||||
lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
|
lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
|
||||||
"DO_NOT_TRACK", None) or "0") == "1",
|
"DO_NOT_TRACK", None) or "0") == "1",
|
||||||
|
|||||||
@ -5,11 +5,10 @@ import asyncio
|
|||||||
import time
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
|
from typing import Any, Awaitable, Callable, List, Optional, Set, Union
|
||||||
Union)
|
|
||||||
|
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar, deprecated
|
||||||
|
|
||||||
import vllm.platforms
|
import vllm.platforms
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
@ -63,10 +62,10 @@ class ExecutorBase(ABC):
|
|||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def collective_rpc(self,
|
def collective_rpc(self,
|
||||||
method: Union[str, Callable[..., _R]],
|
method: Union[str, Callable[[WorkerBase], _R]],
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
args: Tuple = (),
|
args: tuple = (),
|
||||||
kwargs: Optional[Dict[str, Any]] = None) -> List[_R]:
|
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
|
||||||
"""
|
"""
|
||||||
Execute an RPC call on all workers.
|
Execute an RPC call on all workers.
|
||||||
|
|
||||||
@ -91,7 +90,7 @@ class ExecutorBase(ABC):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
def determine_num_available_blocks(self) -> tuple[int, int]:
|
||||||
"""Determine the number of available blocks for the GPU KV cache and
|
"""Determine the number of available blocks for the GPU KV cache and
|
||||||
swappable CPU KV cache.
|
swappable CPU KV cache.
|
||||||
|
|
||||||
@ -99,9 +98,10 @@ class ExecutorBase(ABC):
|
|||||||
ExecutorBase may require modification of the result, e.g. to ensure the
|
ExecutorBase may require modification of the result, e.g. to ensure the
|
||||||
selected cache sizes are compatible with all workers.
|
selected cache sizes are compatible with all workers.
|
||||||
|
|
||||||
Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where
|
||||||
are blocks that are "active" on the device and can be appended to.
|
`num_gpu_blocks` are blocks that are "active" on the device and can be
|
||||||
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
|
appended to.
|
||||||
|
`num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be
|
||||||
appended to.
|
appended to.
|
||||||
"""
|
"""
|
||||||
results = self.collective_rpc("determine_num_available_blocks")
|
results = self.collective_rpc("determine_num_available_blocks")
|
||||||
@ -127,16 +127,15 @@ class ExecutorBase(ABC):
|
|||||||
self.collective_rpc("initialize_cache",
|
self.collective_rpc("initialize_cache",
|
||||||
args=(num_gpu_blocks, num_cpu_blocks))
|
args=(num_gpu_blocks, num_cpu_blocks))
|
||||||
|
|
||||||
|
@deprecated("`llm_engine.model_executor.apply_model` will no longer work "
|
||||||
|
"in V1 Engine. Please replace with `llm_engine.apply_model` "
|
||||||
|
"and set `VLLM_ALLOW_INSECURE_SERIALIZATION=1`.")
|
||||||
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||||
"""
|
"""
|
||||||
Run a function directly on the model inside each worker,
|
Run a function directly on the model inside each worker,
|
||||||
returning the result for each of them.
|
returning the result for each of them.
|
||||||
"""
|
"""
|
||||||
|
return self.collective_rpc("apply_model", args=(func, ))
|
||||||
def rpc_func(worker: WorkerBase) -> _R:
|
|
||||||
return func(worker.get_model())
|
|
||||||
|
|
||||||
return self.collective_rpc(rpc_func)
|
|
||||||
|
|
||||||
@cached_property # Avoid unnecessary RPC calls
|
@cached_property # Avoid unnecessary RPC calls
|
||||||
def supported_tasks(self) -> tuple[SupportedTask, ...]:
|
def supported_tasks(self) -> tuple[SupportedTask, ...]:
|
||||||
@ -235,9 +234,6 @@ class ExecutorBase(ABC):
|
|||||||
"""Shutdown the executor."""
|
"""Shutdown the executor."""
|
||||||
self.collective_rpc("shutdown")
|
self.collective_rpc("shutdown")
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
self.shutdown()
|
|
||||||
|
|
||||||
async def execute_model_async(
|
async def execute_model_async(
|
||||||
self,
|
self,
|
||||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||||
@ -311,8 +307,8 @@ class DistributedExecutorBase(ExecutorBase):
|
|||||||
def collective_rpc(self,
|
def collective_rpc(self,
|
||||||
method: Union[str, Callable],
|
method: Union[str, Callable],
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
args: Tuple = (),
|
args: tuple = (),
|
||||||
kwargs: Optional[Dict] = None) -> List[Any]:
|
kwargs: Optional[dict[str, Any]] = None) -> list[Any]:
|
||||||
return self._run_workers(method, *args, **(kwargs or {}))
|
return self._run_workers(method, *args, **(kwargs or {}))
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
|||||||
@ -20,10 +20,10 @@ if has_triton_kernels():
|
|||||||
from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation,
|
from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation,
|
||||||
matmul_ogs)
|
matmul_ogs)
|
||||||
from triton_kernels.routing import routing
|
from triton_kernels.routing import routing
|
||||||
except ModuleNotFoundError:
|
except (ModuleNotFoundError, AttributeError) as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
"Failed to import Triton kernels. Please make sure your triton "
|
"Failed to import Triton kernels. Please make sure your triton "
|
||||||
"version is compatible.")
|
"version is compatible. Error: %s", e)
|
||||||
|
|
||||||
|
|
||||||
def triton_kernel_moe_forward(
|
def triton_kernel_moe_forward(
|
||||||
|
|||||||
@ -241,7 +241,7 @@ class AutoRoundConfig(QuantizationConfig):
|
|||||||
|
|
||||||
if isinstance(layer, FusedMoE):
|
if isinstance(layer, FusedMoE):
|
||||||
if use_marlin:
|
if use_marlin:
|
||||||
return AWQMoEMethod(quant_args_marlin, layer.moe)
|
return AWQMoEMethod(quant_args_marlin, layer.moe_config)
|
||||||
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
||||||
MoeWNA16Config)
|
MoeWNA16Config)
|
||||||
|
|
||||||
@ -327,7 +327,7 @@ class AutoRoundConfig(QuantizationConfig):
|
|||||||
|
|
||||||
if isinstance(layer, FusedMoE):
|
if isinstance(layer, FusedMoE):
|
||||||
if use_marlin:
|
if use_marlin:
|
||||||
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
|
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config)
|
||||||
else:
|
else:
|
||||||
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
||||||
MoeWNA16Config)
|
MoeWNA16Config)
|
||||||
|
|||||||
@ -160,6 +160,7 @@ class ModelOptFp8Config(QuantizationConfig):
|
|||||||
def is_layer_excluded(self, prefix: str) -> bool:
|
def is_layer_excluded(self, prefix: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if a layer should be excluded from quantization.
|
Check if a layer should be excluded from quantization.
|
||||||
|
Handles both exact matching (for fused layers) and substring matching.
|
||||||
|
|
||||||
This method handles both regular models and multimodal models that use
|
This method handles both regular models and multimodal models that use
|
||||||
the language_model prefix. For multimodal models, it checks if the
|
the language_model prefix. For multimodal models, it checks if the
|
||||||
@ -168,11 +169,18 @@ class ModelOptFp8Config(QuantizationConfig):
|
|||||||
if self.exclude_modules is None:
|
if self.exclude_modules is None:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Check if any excluded module matches the prefix
|
# First check exact matching with fused layer support
|
||||||
|
if is_layer_skipped(prefix, self.exclude_modules,
|
||||||
|
self.packed_modules_mapping):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Then check substring matching for patterns not caught by exact match
|
||||||
for module in self.exclude_modules:
|
for module in self.exclude_modules:
|
||||||
if (module in prefix
|
# Skip exact matches already handled above
|
||||||
or (prefix.startswith("language_model.")
|
if (module != prefix and
|
||||||
and module in prefix.removeprefix("language_model."))):
|
(module in prefix or
|
||||||
|
(prefix.startswith("language_model.")
|
||||||
|
and module in prefix.removeprefix("language_model.")))):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -180,9 +188,10 @@ class ModelOptFp8Config(QuantizationConfig):
|
|||||||
prefix: str) -> Optional["QuantizeMethodBase"]:
|
prefix: str) -> Optional["QuantizeMethodBase"]:
|
||||||
from vllm.attention.layer import Attention # Avoid circular import
|
from vllm.attention.layer import Attention # Avoid circular import
|
||||||
if isinstance(layer, LinearBase):
|
if isinstance(layer, LinearBase):
|
||||||
if (is_layer_skipped(prefix, self.exclude_modules,
|
if self.is_layer_excluded(prefix):
|
||||||
self.packed_modules_mapping)
|
return UnquantizedLinearMethod()
|
||||||
or self.is_layer_excluded(prefix)):
|
# Check if this is a vision model layer that should not be quantized
|
||||||
|
if ("vision_tower" in prefix or "vision_model" in prefix):
|
||||||
return UnquantizedLinearMethod()
|
return UnquantizedLinearMethod()
|
||||||
return ModelOptFp8LinearMethod(self)
|
return ModelOptFp8LinearMethod(self)
|
||||||
elif isinstance(layer, Attention):
|
elif isinstance(layer, Attention):
|
||||||
@ -778,22 +787,34 @@ class ModelOptNvFp4Config(QuantizationConfig):
|
|||||||
return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo,
|
return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo,
|
||||||
exclude_modules, group_size)
|
exclude_modules, group_size)
|
||||||
|
|
||||||
def is_layer_excluded(self, prefix: str,
|
def is_layer_excluded(self, prefix: str) -> bool:
|
||||||
exclude_modules: list[str]) -> bool:
|
"""
|
||||||
|
Check if a layer should be excluded from quantization.
|
||||||
|
Handles both exact matching (for fused layers) and pattern matching.
|
||||||
|
"""
|
||||||
|
# First check exact matching with fused layer support
|
||||||
|
if is_layer_skipped(prefix, self.exclude_modules,
|
||||||
|
self.packed_modules_mapping):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check regex pattern matching for patterns not caught by exact match
|
||||||
import regex as re
|
import regex as re
|
||||||
for pattern in exclude_modules:
|
for pattern in self.exclude_modules:
|
||||||
regex_str = pattern.replace('.', r'\.').replace('*', r'.*')
|
# Skip patterns that would be caught by exact matching
|
||||||
if re.fullmatch(regex_str, prefix):
|
if '*' in pattern or '.' in pattern:
|
||||||
return True
|
regex_str = pattern.replace('.', r'\.').replace('*', r'.*')
|
||||||
|
if re.fullmatch(regex_str, prefix):
|
||||||
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def get_quant_method(self, layer: torch.nn.Module,
|
def get_quant_method(self, layer: torch.nn.Module,
|
||||||
prefix: str) -> Optional["QuantizeMethodBase"]:
|
prefix: str) -> Optional["QuantizeMethodBase"]:
|
||||||
from vllm.attention.layer import Attention # Avoid circular import
|
from vllm.attention.layer import Attention # Avoid circular import
|
||||||
if isinstance(layer, LinearBase):
|
if isinstance(layer, LinearBase):
|
||||||
if (is_layer_skipped(prefix, self.exclude_modules,
|
if self.is_layer_excluded(prefix):
|
||||||
self.packed_modules_mapping)
|
return UnquantizedLinearMethod()
|
||||||
or self.is_layer_excluded(prefix, self.exclude_modules)):
|
# Check if this is a vision model layer that should not be quantized
|
||||||
|
if ("vision_tower" in prefix or "vision_model" in prefix):
|
||||||
return UnquantizedLinearMethod()
|
return UnquantizedLinearMethod()
|
||||||
return ModelOptNvFp4LinearMethod(self)
|
return ModelOptNvFp4LinearMethod(self)
|
||||||
elif isinstance(layer, Attention):
|
elif isinstance(layer, Attention):
|
||||||
|
|||||||
@ -6,8 +6,6 @@ from typing import Optional
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.custom_op import CustomOp
|
from vllm.model_executor.custom_op import CustomOp
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils.flashinfer import has_flashinfer
|
|
||||||
|
|
||||||
from .common import apply_rotary_emb_torch
|
from .common import apply_rotary_emb_torch
|
||||||
|
|
||||||
@ -32,13 +30,15 @@ class RotaryEmbedding(CustomOp):
|
|||||||
self.base = base
|
self.base = base
|
||||||
self.is_neox_style = is_neox_style
|
self.is_neox_style = is_neox_style
|
||||||
self.dtype = dtype
|
self.dtype = dtype
|
||||||
|
# TODO(mgoin): disabled for now due to failures
|
||||||
# Flashinfer only supports head_size=64, 128, 256, 512.
|
# Flashinfer only supports head_size=64, 128, 256, 512.
|
||||||
# https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202
|
# https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202
|
||||||
self.use_flashinfer = (self.enabled()
|
# self.use_flashinfer = (self.enabled()
|
||||||
and dtype in (torch.float16, torch.bfloat16)
|
# and dtype in (torch.float16, torch.bfloat16)
|
||||||
and current_platform.is_cuda()
|
# and current_platform.is_cuda()
|
||||||
and has_flashinfer()
|
# and has_flashinfer()
|
||||||
and self.head_size in [64, 128, 256, 512])
|
# and self.head_size in [64, 128, 256, 512])
|
||||||
|
self.use_flashinfer = False
|
||||||
|
|
||||||
cache = self._compute_cos_sin_cache()
|
cache = self._compute_cos_sin_cache()
|
||||||
if not self.use_flashinfer:
|
if not self.use_flashinfer:
|
||||||
|
|||||||
@ -165,7 +165,11 @@ def device_loading_context(module: torch.nn.Module,
|
|||||||
# New parameters or parameters already on target device are untouched
|
# New parameters or parameters already on target device are untouched
|
||||||
|
|
||||||
|
|
||||||
def get_model_architecture(
|
_MODEL_ARCH_BY_HASH = dict[str, tuple[type[nn.Module], str]]()
|
||||||
|
"""Caches the outputs of `_get_model_architecture`."""
|
||||||
|
|
||||||
|
|
||||||
|
def _get_model_architecture(
|
||||||
model_config: ModelConfig) -> tuple[type[nn.Module], str]:
|
model_config: ModelConfig) -> tuple[type[nn.Module], str]:
|
||||||
architectures = getattr(model_config.hf_config, "architectures", [])
|
architectures = getattr(model_config.hf_config, "architectures", [])
|
||||||
|
|
||||||
@ -209,6 +213,17 @@ def get_model_architecture(
|
|||||||
return model_cls, arch
|
return model_cls, arch
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_architecture(
|
||||||
|
model_config: ModelConfig) -> tuple[type[nn.Module], str]:
|
||||||
|
key = model_config.compute_hash()
|
||||||
|
if key in _MODEL_ARCH_BY_HASH:
|
||||||
|
return _MODEL_ARCH_BY_HASH[key]
|
||||||
|
|
||||||
|
model_arch = _get_model_architecture(model_config)
|
||||||
|
_MODEL_ARCH_BY_HASH[key] = model_arch
|
||||||
|
return model_arch
|
||||||
|
|
||||||
|
|
||||||
def get_model_cls(model_config: ModelConfig) -> type[nn.Module]:
|
def get_model_cls(model_config: ModelConfig) -> type[nn.Module]:
|
||||||
return get_model_architecture(model_config)[0]
|
return get_model_architecture(model_config)[0]
|
||||||
|
|
||||||
|
|||||||
@ -446,6 +446,22 @@ class Gemma3Model(nn.Module):
|
|||||||
weight_loader(param, loaded_weight)
|
weight_loader(param, loaded_weight)
|
||||||
loaded_params.add(scale_name)
|
loaded_params.add(scale_name)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Check if this is a scale parameter that needs remapping first
|
||||||
|
if name.endswith(
|
||||||
|
(".k_scale", ".v_scale", ".q_scale", ".prob_scale")):
|
||||||
|
# Try to remap the scale name first
|
||||||
|
remapped_name = maybe_remap_kv_scale_name(name, params_dict)
|
||||||
|
if remapped_name is not None and remapped_name in params_dict:
|
||||||
|
# Successfully remapped, use the remapped name
|
||||||
|
param = params_dict[remapped_name]
|
||||||
|
weight_loader = getattr(param, "weight_loader",
|
||||||
|
default_weight_loader)
|
||||||
|
weight_loader(param, loaded_weight)
|
||||||
|
loaded_params.add(remapped_name)
|
||||||
|
continue
|
||||||
|
# If remapping failed, continue with normal processing
|
||||||
|
|
||||||
for (param_name, shard_name, shard_id) in stacked_params_mapping:
|
for (param_name, shard_name, shard_id) in stacked_params_mapping:
|
||||||
if shard_name not in name:
|
if shard_name not in name:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -148,9 +148,11 @@ class Qwen3NextSparseMoeBlock(nn.Module):
|
|||||||
|
|
||||||
def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
|
def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
|
||||||
# GPTQ configs do not have a list of ignored modules, however AutoGPTQ
|
# GPTQ configs do not have a list of ignored modules, however AutoGPTQ
|
||||||
# seems to avoid gate quantization.
|
# seems to avoid gate quantization while AutoRound does.
|
||||||
# See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4
|
if isinstance(
|
||||||
if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
|
quant_config,
|
||||||
|
(GPTQConfig,
|
||||||
|
GPTQMarlinConfig)) and not quant_config.autoround_version:
|
||||||
return None
|
return None
|
||||||
return quant_config
|
return quant_config
|
||||||
|
|
||||||
|
|||||||
@ -122,9 +122,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
|
|||||||
|
|
||||||
def load_fused_expert_weights(self, name: str, params_dict: dict,
|
def load_fused_expert_weights(self, name: str, params_dict: dict,
|
||||||
loaded_weight: torch.Tensor, shard_id: str,
|
loaded_weight: torch.Tensor, shard_id: str,
|
||||||
num_experts: int):
|
num_experts: int) -> bool:
|
||||||
param = params_dict[name]
|
param = params_dict[name]
|
||||||
weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
|
weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
|
||||||
|
loaded_local_expert = False
|
||||||
for expert_id in range(num_experts):
|
for expert_id in range(num_experts):
|
||||||
curr_expert_weight = loaded_weight[expert_id]
|
curr_expert_weight = loaded_weight[expert_id]
|
||||||
success = weight_loader(param,
|
success = weight_loader(param,
|
||||||
@ -133,9 +134,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
|
|||||||
shard_id,
|
shard_id,
|
||||||
expert_id,
|
expert_id,
|
||||||
return_success=True)
|
return_success=True)
|
||||||
if not success:
|
if success:
|
||||||
return False
|
loaded_local_expert = True
|
||||||
return True
|
|
||||||
|
return loaded_local_expert
|
||||||
|
|
||||||
def load_weights(self, weights: Iterable[tuple[str,
|
def load_weights(self, weights: Iterable[tuple[str,
|
||||||
torch.Tensor]]) -> set[str]:
|
torch.Tensor]]) -> set[str]:
|
||||||
@ -345,4 +347,4 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
|
|||||||
for _ in range(self.deepstack_num_level)
|
for _ in range(self.deepstack_num_level)
|
||||||
] if self.use_deepstack else None
|
] if self.use_deepstack else None
|
||||||
self.visual_dim = config.vision_config.out_hidden_size
|
self.visual_dim = config.vision_config.out_hidden_size
|
||||||
self.multiscale_dim = self.visual_dim * self.deepstack_num_level
|
self.multiscale_dim = self.visual_dim * self.deepstack_num_level
|
||||||
|
|||||||
@ -20,7 +20,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
|||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
VocabParallelEmbedding)
|
VocabParallelEmbedding)
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import (
|
||||||
|
default_weight_loader, maybe_remap_kv_scale_name)
|
||||||
|
|
||||||
from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
|
from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
|
||||||
|
|
||||||
@ -506,6 +507,21 @@ class SiglipVisionModel(nn.Module):
|
|||||||
if layer_idx >= layer_count:
|
if layer_idx >= layer_count:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Check if this is a scale parameter that needs remapping first
|
||||||
|
if name.endswith(
|
||||||
|
(".k_scale", ".v_scale", ".q_scale", ".prob_scale")):
|
||||||
|
# Try to remap the scale name first
|
||||||
|
remapped_name = maybe_remap_kv_scale_name(name, params_dict)
|
||||||
|
if remapped_name is not None and remapped_name in params_dict:
|
||||||
|
# Successfully remapped, use the remapped name
|
||||||
|
param = params_dict[remapped_name]
|
||||||
|
weight_loader = getattr(param, "weight_loader",
|
||||||
|
default_weight_loader)
|
||||||
|
weight_loader(param, loaded_weight)
|
||||||
|
loaded_params.add(remapped_name)
|
||||||
|
continue
|
||||||
|
# If remapping failed, continue with normal processing
|
||||||
|
|
||||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||||
if weight_name not in name:
|
if weight_name not in name:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -702,21 +702,45 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
|||||||
class TransformersModel(TransformersBase):
|
class TransformersModel(TransformersBase):
|
||||||
hf_to_vllm_mapper = WeightsMapper(
|
hf_to_vllm_mapper = WeightsMapper(
|
||||||
orig_to_new_prefix={
|
orig_to_new_prefix={
|
||||||
|
# Handle BERT-like models
|
||||||
|
"bert": "model",
|
||||||
# Add `model.` prefix for base model checkpoints
|
# Add `model.` prefix for base model checkpoints
|
||||||
"": "model.",
|
"": "model.",
|
||||||
# Remove `model.` from places it should not be
|
# Remove `model.` prefix if it was already there
|
||||||
"model.model.": "model.",
|
"model.model.": "model.",
|
||||||
|
# Pooling adapters will be adjacent to `model`
|
||||||
|
"model.pooler": "pooler",
|
||||||
"model.score": "score",
|
"model.score": "score",
|
||||||
|
# Classifier adapter's classifier layer is renamed to score
|
||||||
|
"model.classifier": "score",
|
||||||
|
},
|
||||||
|
orig_to_new_suffix={
|
||||||
|
# Replace legacy suffixes used for norms
|
||||||
|
".gamma": ".weight",
|
||||||
|
".beta": ".bias",
|
||||||
})
|
})
|
||||||
|
|
||||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||||
|
|
||||||
# Some encoder models have the position_ids buffer in the checkpoint
|
# After creating a pooling model, `pooler` will be duplicated.
|
||||||
|
# The one inside `model` comes from the Transformers modelling code.
|
||||||
|
# The one after `model` is an adapter from vLLM.
|
||||||
|
# We want to use the adapter so we nullify the original pooler.
|
||||||
|
if getattr(self.model, "pooler", None) is not None:
|
||||||
|
self.skip_prefixes.append("pooler.")
|
||||||
|
self.model.pooler = torch.nn.Identity()
|
||||||
|
|
||||||
|
# Some encoder models have the position_ids buffer in the checkpoint.
|
||||||
# vLLM will always pass position_ids as an argument, so we skip loading
|
# vLLM will always pass position_ids as an argument, so we skip loading
|
||||||
# the buffer if it exists
|
# the buffer if it exists
|
||||||
self.skip_substrs.append("position_ids")
|
self.skip_substrs.append("position_ids")
|
||||||
|
|
||||||
|
# Some encoder models have the bias of the final classifier layer
|
||||||
|
# in the checkpoint. vLLM does not use this bias, so we skip loading
|
||||||
|
# it if it exists
|
||||||
|
self.skip_substrs.append("score.bias")
|
||||||
|
|
||||||
def create_attention_instances(
|
def create_attention_instances(
|
||||||
self, attn_type: AttentionType = AttentionType.DECODER):
|
self, attn_type: AttentionType = AttentionType.DECODER):
|
||||||
# TODO(hmellor): Better way to detect encoder models
|
# TODO(hmellor): Better way to detect encoder models
|
||||||
|
|||||||
@ -987,8 +987,10 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]:
|
|||||||
if sys.platform.startswith("darwin"):
|
if sys.platform.startswith("darwin"):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
our_pid = os.getpid()
|
||||||
for conn in psutil.net_connections():
|
for conn in psutil.net_connections():
|
||||||
if conn.laddr.port == port:
|
if conn.laddr.port == port and (conn.pid is not None
|
||||||
|
and conn.pid != our_pid):
|
||||||
try:
|
try:
|
||||||
return psutil.Process(conn.pid)
|
return psutil.Process(conn.pid)
|
||||||
except psutil.NoSuchProcess:
|
except psutil.NoSuchProcess:
|
||||||
|
|||||||
@ -585,9 +585,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
kv_data_type=self.kv_cache_dtype,
|
kv_data_type=self.kv_cache_dtype,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device)
|
attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(
|
||||||
|
self.device, non_blocking=True)
|
||||||
attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to(
|
attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to(
|
||||||
self.device)
|
self.device, non_blocking=True)
|
||||||
|
|
||||||
if num_decodes > 0:
|
if num_decodes > 0:
|
||||||
pure_decode = num_prefills == 0
|
pure_decode = num_prefills == 0
|
||||||
|
|||||||
@ -412,7 +412,8 @@ M = TypeVar("M", bound=MLACommonMetadata)
|
|||||||
def use_flashinfer_prefill() -> bool:
|
def use_flashinfer_prefill() -> bool:
|
||||||
# For blackwell default to flashinfer prefill if it's available since
|
# For blackwell default to flashinfer prefill if it's available since
|
||||||
# it is faster than FA2.
|
# it is faster than FA2.
|
||||||
return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL
|
return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available
|
||||||
|
and not envs.VLLM_USE_CUDNN_PREFILL
|
||||||
and current_platform.is_device_capability(100))
|
and current_platform.is_device_capability(100))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""KV-Cache Utilities."""
|
"""KV-Cache Utilities."""
|
||||||
|
|
||||||
|
import copy
|
||||||
import os
|
import os
|
||||||
from collections import defaultdict, deque
|
from collections import defaultdict, deque
|
||||||
from collections.abc import Iterable, Sequence
|
from collections.abc import Iterable, Sequence
|
||||||
@ -15,7 +16,8 @@ from vllm.utils import GiB_bytes, cdiv, sha256_cbor
|
|||||||
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
|
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
|
||||||
FullAttentionSpec, KVCacheConfig,
|
FullAttentionSpec, KVCacheConfig,
|
||||||
KVCacheGroupSpec, KVCacheSpec,
|
KVCacheGroupSpec, KVCacheSpec,
|
||||||
KVCacheTensor, SlidingWindowSpec)
|
KVCacheTensor, SlidingWindowSpec,
|
||||||
|
UniformTypeKVCacheSpecs)
|
||||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||||
from vllm.v1.request import Request
|
from vllm.v1.request import Request
|
||||||
|
|
||||||
@ -750,7 +752,7 @@ def create_kv_cache_group_specs(
|
|||||||
return kv_cache_groups
|
return kv_cache_groups
|
||||||
|
|
||||||
|
|
||||||
def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
|
def is_kv_cache_spec_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
|
||||||
"""
|
"""
|
||||||
Whether all layers in the given KVCacheSpec have the same KV cache spec.
|
Whether all layers in the given KVCacheSpec have the same KV cache spec.
|
||||||
Note that we regard FullAttentionSpec with and without sliding window as
|
Note that we regard FullAttentionSpec with and without sliding window as
|
||||||
@ -793,6 +795,21 @@ def get_max_concurrency_for_kv_cache_config(
|
|||||||
return max_concurrency
|
return max_concurrency
|
||||||
|
|
||||||
|
|
||||||
|
def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int:
|
||||||
|
"""
|
||||||
|
Override the number of kv cache blocks if `num_gpu_blocks_override` is set.
|
||||||
|
"""
|
||||||
|
if vllm_config.cache_config.num_gpu_blocks_override is not None:
|
||||||
|
num_gpu_blocks_override = \
|
||||||
|
vllm_config.cache_config.num_gpu_blocks_override
|
||||||
|
logger.info(
|
||||||
|
"Overriding num_gpu_blocks=%d with "
|
||||||
|
"num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
|
||||||
|
num_blocks = num_gpu_blocks_override
|
||||||
|
|
||||||
|
return num_blocks
|
||||||
|
|
||||||
|
|
||||||
def get_num_blocks(vllm_config: VllmConfig, num_layers: int,
|
def get_num_blocks(vllm_config: VllmConfig, num_layers: int,
|
||||||
available_memory: int, page_size: int) -> int:
|
available_memory: int, page_size: int) -> int:
|
||||||
"""
|
"""
|
||||||
@ -806,13 +823,7 @@ def get_num_blocks(vllm_config: VllmConfig, num_layers: int,
|
|||||||
"""
|
"""
|
||||||
num_blocks = int(available_memory // page_size // num_layers)
|
num_blocks = int(available_memory // page_size // num_layers)
|
||||||
num_blocks = max(num_blocks, 0)
|
num_blocks = max(num_blocks, 0)
|
||||||
if vllm_config.cache_config.num_gpu_blocks_override is not None:
|
num_blocks = may_override_num_blocks(vllm_config, num_blocks)
|
||||||
num_gpu_blocks_override = \
|
|
||||||
vllm_config.cache_config.num_gpu_blocks_override
|
|
||||||
logger.info(
|
|
||||||
"Overriding num_gpu_blocks=%d with "
|
|
||||||
"num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
|
|
||||||
num_blocks = num_gpu_blocks_override
|
|
||||||
return num_blocks
|
return num_blocks
|
||||||
|
|
||||||
|
|
||||||
@ -825,11 +836,11 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
|
|||||||
return page_sizes.pop()
|
return page_sizes.pop()
|
||||||
|
|
||||||
|
|
||||||
def _get_kv_cache_groups_uniform_type(
|
def _get_kv_cache_groups_uniform_spec(
|
||||||
kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
|
kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]:
|
||||||
"""
|
"""
|
||||||
Generates the KV cache configuration for a model with one type of KV cache.
|
Generates the KV cache configuration for a model with the same KV cache
|
||||||
Divide the available memory equally among all layers.
|
spec for all layers.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
kv_cache_specs: The kv cache spec of each attention layer in the model
|
kv_cache_specs: The kv cache spec of each attention layer in the model
|
||||||
@ -842,6 +853,22 @@ def _get_kv_cache_groups_uniform_type(
|
|||||||
[list(kv_cache_specs.keys())])
|
[list(kv_cache_specs.keys())])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_kv_cache_groups_uniform_type(
|
||||||
|
spec: UniformTypeKVCacheSpecs) -> list[KVCacheGroupSpec]:
|
||||||
|
"""
|
||||||
|
Generates the KV cache configuration for a model with one type of KV cache
|
||||||
|
but different hidden sizes. All layers are merged into one group.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
spec: The UniformTypeKVCacheSpecs of the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The generated KVCacheGroupSpecs
|
||||||
|
"""
|
||||||
|
|
||||||
|
return [KVCacheGroupSpec(list(spec.kv_cache_specs.keys()), spec)]
|
||||||
|
|
||||||
|
|
||||||
def is_kv_cache_page_size_uniform(
|
def is_kv_cache_page_size_uniform(
|
||||||
kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
|
kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
|
||||||
"""
|
"""
|
||||||
@ -1000,28 +1027,45 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Determine how model runners should initialize the KV cache tensors.
|
# Determine how model runners should initialize the KV cache tensors.
|
||||||
# We will have group_size memory pools, each is shared by one layer from
|
if len(kv_cache_groups) == 1 and \
|
||||||
# each group. As layers of different groups have different block table,
|
isinstance(kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs):
|
||||||
# they will use different parts of the shared Tensor.
|
# Special case: all layers have the same type of KV cache but with
|
||||||
# The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2),
|
# different hidden size. Allocate different amount of memory for each
|
||||||
# (sw.1, padding) will be: (group_size = 2)
|
# layer based on its hidden size.
|
||||||
# full.0, sw.0, sw.1: share a Tensor with size=available_memory//2
|
num_blocks = available_memory // kv_cache_groups[
|
||||||
# full.1, sw.2: share another Tensor with size=available_memory//2
|
0].kv_cache_spec.page_size_bytes
|
||||||
group_size = max(len(group.layer_names) for group in kv_cache_groups)
|
num_blocks = may_override_num_blocks(vllm_config, num_blocks)
|
||||||
|
per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
|
||||||
|
kv_cache_tensors = [
|
||||||
|
KVCacheTensor(size=per_layer_specs[layer_name].page_size_bytes *
|
||||||
|
num_blocks,
|
||||||
|
shared_by=[layer_name])
|
||||||
|
for layer_name in kv_cache_groups[0].layer_names
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
# General case:
|
||||||
|
# We will have group_size memory pools, each is shared by one layer from
|
||||||
|
# each group. As layers of different groups have different block table,
|
||||||
|
# they will use different parts of the shared Tensor.
|
||||||
|
# The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2),
|
||||||
|
# (sw.1, padding) will be: (group_size = 2)
|
||||||
|
# full.0, sw.0, sw.1: share a Tensor with size=available_memory//2
|
||||||
|
# full.1, sw.2: share another Tensor with size=available_memory//2
|
||||||
|
group_size = max(len(group.layer_names) for group in kv_cache_groups)
|
||||||
|
|
||||||
page_size = get_uniform_page_size(kv_cache_specs)
|
page_size = get_uniform_page_size(kv_cache_specs)
|
||||||
assert group_size > 0, "group_size must be greater than 0"
|
assert group_size > 0, "group_size must be greater than 0"
|
||||||
num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
|
num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
|
||||||
page_size)
|
page_size)
|
||||||
per_memory_pool_size = page_size * num_blocks
|
kv_cache_tensors = []
|
||||||
kv_cache_tensors = []
|
for i in range(group_size):
|
||||||
for i in range(group_size):
|
shared_by = []
|
||||||
shared_by = []
|
for j in range(len(kv_cache_groups)):
|
||||||
for j in range(len(kv_cache_groups)):
|
if i < len(kv_cache_groups[j].layer_names):
|
||||||
if i < len(kv_cache_groups[j].layer_names):
|
shared_by.append(kv_cache_groups[j].layer_names[i])
|
||||||
shared_by.append(kv_cache_groups[j].layer_names[i])
|
kv_cache_tensors.append(
|
||||||
kv_cache_tensors.append(
|
KVCacheTensor(size=page_size * num_blocks,
|
||||||
KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by))
|
shared_by=shared_by))
|
||||||
|
|
||||||
kv_cache_config = KVCacheConfig(
|
kv_cache_config = KVCacheConfig(
|
||||||
num_blocks=num_blocks,
|
num_blocks=num_blocks,
|
||||||
@ -1059,7 +1103,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
|
|||||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if is_kv_cache_type_uniform(kv_cache_spec):
|
if is_kv_cache_spec_uniform(kv_cache_spec):
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
@ -1097,7 +1141,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
|
|||||||
attention_chunk_size=spec.attention_chunk_size,
|
attention_chunk_size=spec.attention_chunk_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not is_kv_cache_type_uniform(kv_cache_spec):
|
if not is_kv_cache_spec_uniform(kv_cache_spec):
|
||||||
raise ValueError("Hybrid KV cache manager is disabled but failed to "
|
raise ValueError("Hybrid KV cache manager is disabled but failed to "
|
||||||
"convert the KV cache specs to one unified type.")
|
"convert the KV cache specs to one unified type.")
|
||||||
|
|
||||||
@ -1122,11 +1166,16 @@ def get_kv_cache_groups(
|
|||||||
# This returns an empty list to allow for the KVCacheManager to handle
|
# This returns an empty list to allow for the KVCacheManager to handle
|
||||||
# attention free models.
|
# attention free models.
|
||||||
return []
|
return []
|
||||||
elif is_kv_cache_type_uniform(kv_cache_spec):
|
elif is_kv_cache_spec_uniform(kv_cache_spec):
|
||||||
# KV cache of all layers are the same, which is true for
|
# KV cache of all layers are the same, which is true for
|
||||||
# most models. Allocate the same amount of memory for
|
# most models. Allocate the same amount of memory for
|
||||||
# each layer.
|
# each layer.
|
||||||
return _get_kv_cache_groups_uniform_type(kv_cache_spec)
|
return _get_kv_cache_groups_uniform_spec(kv_cache_spec)
|
||||||
|
elif uniform_spec := UniformTypeKVCacheSpecs.from_specs(kv_cache_spec):
|
||||||
|
# All layers need the same number of token slots (e.g., all layers are
|
||||||
|
# full attention, or all layers are sliding window attention with the
|
||||||
|
# same window size). Put all layers into one group.
|
||||||
|
return _get_kv_cache_groups_uniform_type(uniform_spec)
|
||||||
elif is_kv_cache_page_size_uniform(kv_cache_spec):
|
elif is_kv_cache_page_size_uniform(kv_cache_spec):
|
||||||
# Model contains multiple attention types, but KV cache of all layers
|
# Model contains multiple attention types, but KV cache of all layers
|
||||||
# have the same physical memory per block per layer. Split the layers
|
# have the same physical memory per block per layer. Split the layers
|
||||||
@ -1137,6 +1186,27 @@ def get_kv_cache_groups(
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
def generate_scheduler_kv_cache_config(
|
||||||
|
kv_cache_configs: list[KVCacheConfig]) -> KVCacheConfig:
|
||||||
|
"""
|
||||||
|
Generate the KV cache configuration for the scheduler.
|
||||||
|
"""
|
||||||
|
assert all([
|
||||||
|
cfg.num_blocks == kv_cache_configs[0].num_blocks
|
||||||
|
for cfg in kv_cache_configs
|
||||||
|
])
|
||||||
|
# All workers have the same kv_cache_config except layer names, so use
|
||||||
|
# an arbitrary one to initialize the scheduler.
|
||||||
|
cfg = copy.deepcopy(kv_cache_configs[0])
|
||||||
|
for group in cfg.kv_cache_groups:
|
||||||
|
if isinstance(group.kv_cache_spec, UniformTypeKVCacheSpecs):
|
||||||
|
# All layers in the UniformTypeKVCacheSpecs have the same type,
|
||||||
|
# so use an arbitrary one to initialize the scheduler.
|
||||||
|
group.kv_cache_spec = next(
|
||||||
|
iter(group.kv_cache_spec.kv_cache_specs.values()))
|
||||||
|
return cfg
|
||||||
|
|
||||||
|
|
||||||
def get_kv_cache_configs(vllm_config: VllmConfig,
|
def get_kv_cache_configs(vllm_config: VllmConfig,
|
||||||
kv_cache_specs: list[dict[str, KVCacheSpec]],
|
kv_cache_specs: list[dict[str, KVCacheSpec]],
|
||||||
available_memory: list[int]) -> list[KVCacheConfig]:
|
available_memory: list[int]) -> list[KVCacheConfig]:
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
|
from vllm.config import CUDAGraphMode, VllmConfig
|
||||||
from vllm.forward_context import BatchDescriptor
|
from vllm.forward_context import BatchDescriptor
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
@ -39,11 +39,15 @@ class CudagraphDispatcher:
|
|||||||
CUDAGraphMode.FULL: set(),
|
CUDAGraphMode.FULL: set(),
|
||||||
}
|
}
|
||||||
|
|
||||||
assert not self.cudagraph_mode.requires_piecewise_compilation() or \
|
not_use_piecewise_compilation = (
|
||||||
(self.compilation_config.level == CompilationLevel.PIECEWISE and
|
not self.cudagraph_mode.requires_piecewise_compilation())
|
||||||
self.compilation_config.splitting_ops_contain_attention()), \
|
|
||||||
|
assert not_use_piecewise_compilation or \
|
||||||
|
self.compilation_config.is_attention_compiled_piecewise(), \
|
||||||
"Compilation level should be CompilationLevel.PIECEWISE when "\
|
"Compilation level should be CompilationLevel.PIECEWISE when "\
|
||||||
"cudagraph_mode piecewise cudagraphs is used, "\
|
"cudagraph_mode piecewise cudagraphs is used, "\
|
||||||
|
"and attention should be in splitting_ops or "\
|
||||||
|
"inductor splitting should be used. " \
|
||||||
f"cudagraph_mode={self.cudagraph_mode}, "\
|
f"cudagraph_mode={self.cudagraph_mode}, "\
|
||||||
f"compilation_level={self.compilation_config.level}, "\
|
f"compilation_level={self.compilation_config.level}, "\
|
||||||
f"splitting_ops={self.compilation_config.splitting_ops}"
|
f"splitting_ops={self.compilation_config.splitting_ops}"
|
||||||
|
|||||||
@ -29,7 +29,9 @@ from vllm.transformers_utils.config import (
|
|||||||
maybe_register_config_serialize_by_value)
|
maybe_register_config_serialize_by_value)
|
||||||
from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket,
|
from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket,
|
||||||
resolve_obj_by_qualname, set_process_title)
|
resolve_obj_by_qualname, set_process_title)
|
||||||
from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs,
|
from vllm.v1.core.kv_cache_utils import (BlockHash,
|
||||||
|
generate_scheduler_kv_cache_config,
|
||||||
|
get_kv_cache_configs,
|
||||||
get_request_block_hasher,
|
get_request_block_hasher,
|
||||||
init_none_hash)
|
init_none_hash)
|
||||||
from vllm.v1.core.sched.interface import SchedulerInterface
|
from vllm.v1.core.sched.interface import SchedulerInterface
|
||||||
@ -196,16 +198,10 @@ class EngineCore:
|
|||||||
|
|
||||||
kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
|
kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
|
||||||
available_gpu_memory)
|
available_gpu_memory)
|
||||||
|
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
|
||||||
# All workers have the same kv_cache_config except layer names, so use
|
kv_cache_configs)
|
||||||
# an arbitrary one to initialize the scheduler.
|
num_gpu_blocks = scheduler_kv_cache_config.num_blocks
|
||||||
assert all([
|
|
||||||
cfg.num_blocks == kv_cache_configs[0].num_blocks
|
|
||||||
for cfg in kv_cache_configs
|
|
||||||
])
|
|
||||||
num_gpu_blocks = kv_cache_configs[0].num_blocks
|
|
||||||
num_cpu_blocks = 0
|
num_cpu_blocks = 0
|
||||||
scheduler_kv_cache_config = kv_cache_configs[0]
|
|
||||||
|
|
||||||
# Initialize kv cache and warmup the execution
|
# Initialize kv cache and warmup the execution
|
||||||
self.model_executor.initialize_from_config(kv_cache_configs)
|
self.model_executor.initialize_from_config(kv_cache_configs)
|
||||||
|
|||||||
@ -5,6 +5,7 @@ from collections.abc import Mapping
|
|||||||
from copy import copy
|
from copy import copy
|
||||||
from typing import Any, Callable, Optional, Union
|
from typing import Any, Callable, Optional, Union
|
||||||
|
|
||||||
|
import torch.nn as nn
|
||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
@ -33,6 +34,7 @@ from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase,
|
|||||||
StatLoggerFactory)
|
StatLoggerFactory)
|
||||||
from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
|
from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
|
||||||
from vllm.v1.metrics.stats import IterationStats
|
from vllm.v1.metrics.stats import IterationStats
|
||||||
|
from vllm.v1.worker.worker_base import WorkerBase
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -319,12 +321,15 @@ class LLMEngine:
|
|||||||
return self.engine_core.pin_lora(lora_id)
|
return self.engine_core.pin_lora(lora_id)
|
||||||
|
|
||||||
def collective_rpc(self,
|
def collective_rpc(self,
|
||||||
method: Union[str, Callable[..., _R]],
|
method: Union[str, Callable[[WorkerBase], _R]],
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
args: tuple = (),
|
args: tuple = (),
|
||||||
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
|
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
|
||||||
return self.engine_core.collective_rpc(method, timeout, args, kwargs)
|
return self.engine_core.collective_rpc(method, timeout, args, kwargs)
|
||||||
|
|
||||||
|
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
|
||||||
|
return self.collective_rpc("apply_model", args=(func, ))
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
if dp_group := getattr(self, "dp_group", None):
|
if dp_group := getattr(self, "dp_group", None):
|
||||||
stateless_destroy_torch_distributed_process_group(dp_group)
|
stateless_destroy_torch_distributed_process_group(dp_group)
|
||||||
|
|||||||
@ -235,6 +235,76 @@ class CrossAttentionSpec(AttentionSpec):
|
|||||||
return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes
|
return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class UniformTypeKVCacheSpecs(KVCacheSpec):
|
||||||
|
"""
|
||||||
|
A KV cache spec for multiple layers with the same type of attention. Here,
|
||||||
|
same type means they always need the same number of token slots. For example,
|
||||||
|
sliding window attentions with different window sizes are not the same type
|
||||||
|
and should not be merged into one UniformTypeKVCacheSpecs.
|
||||||
|
"""
|
||||||
|
kv_cache_specs: dict[str, KVCacheSpec]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def page_size_bytes(self) -> int:
|
||||||
|
return sum(spec.page_size_bytes
|
||||||
|
for spec in self.kv_cache_specs.values())
|
||||||
|
|
||||||
|
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
|
||||||
|
max_num_pages = max(
|
||||||
|
cdiv(spec.max_memory_usage_bytes(vllm_config),
|
||||||
|
spec.page_size_bytes)
|
||||||
|
for spec in self.kv_cache_specs.values())
|
||||||
|
return max_num_pages * self.page_size_bytes
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def is_uniform_type(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> bool:
|
||||||
|
"""
|
||||||
|
Whether all layers have the same type of KV cache spec.
|
||||||
|
"""
|
||||||
|
block_sizes = set(spec.block_size for spec in kv_cache_specs.values())
|
||||||
|
if len(block_sizes) > 1:
|
||||||
|
# Different block sizes, not uniform.
|
||||||
|
return False
|
||||||
|
one_spec = next(iter(kv_cache_specs.values()))
|
||||||
|
if isinstance(one_spec, (FullAttentionSpec, CrossAttentionSpec)):
|
||||||
|
return all(
|
||||||
|
isinstance(spec, type(one_spec))
|
||||||
|
for spec in kv_cache_specs.values())
|
||||||
|
elif isinstance(one_spec, SlidingWindowSpec):
|
||||||
|
return all(
|
||||||
|
isinstance(spec, SlidingWindowSpec)
|
||||||
|
and spec.sliding_window == one_spec.sliding_window
|
||||||
|
for spec in kv_cache_specs.values())
|
||||||
|
elif isinstance(one_spec, ChunkedLocalAttentionSpec):
|
||||||
|
return all(
|
||||||
|
isinstance(spec, ChunkedLocalAttentionSpec)
|
||||||
|
and spec.attention_chunk_size == one_spec.attention_chunk_size
|
||||||
|
for spec in kv_cache_specs.values())
|
||||||
|
elif isinstance(one_spec, MambaSpec):
|
||||||
|
return all(
|
||||||
|
isinstance(spec, MambaSpec) and spec.num_speculative_blocks ==
|
||||||
|
one_spec.num_speculative_blocks
|
||||||
|
for spec in kv_cache_specs.values())
|
||||||
|
else:
|
||||||
|
# NOTE(Chen): Please add new branches for new KV cache spec types.
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"Unsupported KV cache spec type: {type(one_spec)}")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_specs(cls, kv_cache_specs: dict[str,
|
||||||
|
KVCacheSpec]) -> Optional[Self]:
|
||||||
|
"""
|
||||||
|
Return a UniformTypeKVCacheSpecs object if all layers have the same type
|
||||||
|
of KV cache spec. Return None if not.
|
||||||
|
"""
|
||||||
|
if cls.is_uniform_type(kv_cache_specs):
|
||||||
|
block_size = next(iter(kv_cache_specs.values())).block_size
|
||||||
|
return cls(block_size=block_size, kv_cache_specs=kv_cache_specs)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class KVCacheTensor:
|
class KVCacheTensor:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import pickle
|
|||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from inspect import isclass
|
from inspect import isclass
|
||||||
from types import FunctionType
|
from types import FunctionType
|
||||||
from typing import Any, Optional, Union
|
from typing import Any, Callable, Optional, Union
|
||||||
|
|
||||||
import cloudpickle
|
import cloudpickle
|
||||||
import msgspec
|
import msgspec
|
||||||
@ -59,6 +59,42 @@ def _typestr(val: Any) -> Optional[tuple[str, str]]:
|
|||||||
return t.__module__, t.__qualname__
|
return t.__module__, t.__qualname__
|
||||||
|
|
||||||
|
|
||||||
|
def _encode_type_info_recursive(obj: Any) -> Any:
|
||||||
|
"""Recursively encode type information for nested structures of
|
||||||
|
lists/dicts."""
|
||||||
|
if obj is None:
|
||||||
|
return None
|
||||||
|
if type(obj) is list:
|
||||||
|
return [_encode_type_info_recursive(item) for item in obj]
|
||||||
|
if type(obj) is dict:
|
||||||
|
return {k: _encode_type_info_recursive(v) for k, v in obj.items()}
|
||||||
|
return _typestr(obj)
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_type_info_recursive(
|
||||||
|
type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any],
|
||||||
|
Any]) -> Any:
|
||||||
|
"""Recursively decode type information for nested structures of
|
||||||
|
lists/dicts."""
|
||||||
|
if type_info is None:
|
||||||
|
return data
|
||||||
|
if isinstance(type_info, dict):
|
||||||
|
assert isinstance(data, dict)
|
||||||
|
return {
|
||||||
|
k: _decode_type_info_recursive(type_info[k], data[k], convert_fn)
|
||||||
|
for k in type_info
|
||||||
|
}
|
||||||
|
if isinstance(type_info, list) and (
|
||||||
|
# Exclude serialized tensors/numpy arrays.
|
||||||
|
len(type_info) != 2 or not isinstance(type_info[0], str)):
|
||||||
|
assert isinstance(data, list)
|
||||||
|
return [
|
||||||
|
_decode_type_info_recursive(ti, d, convert_fn)
|
||||||
|
for ti, d in zip(type_info, data)
|
||||||
|
]
|
||||||
|
return convert_fn(type_info, data)
|
||||||
|
|
||||||
|
|
||||||
class MsgpackEncoder:
|
class MsgpackEncoder:
|
||||||
"""Encoder with custom torch tensor and numpy array serialization.
|
"""Encoder with custom torch tensor and numpy array serialization.
|
||||||
|
|
||||||
@ -129,12 +165,10 @@ class MsgpackEncoder:
|
|||||||
result = obj.result
|
result = obj.result
|
||||||
if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
|
if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
|
||||||
return None, result
|
return None, result
|
||||||
# Since utility results are not strongly typed, we also encode
|
# Since utility results are not strongly typed, we recursively
|
||||||
# the type (or a list of types in the case it's a list) to
|
# encode type information for nested structures of lists/dicts
|
||||||
# help with correct msgspec deserialization.
|
# to help with correct msgspec deserialization.
|
||||||
return _typestr(result) if type(result) is not list else [
|
return _encode_type_info_recursive(result), result
|
||||||
_typestr(v) for v in result
|
|
||||||
], result
|
|
||||||
|
|
||||||
if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
|
if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
|
||||||
raise TypeError(f"Object of type {type(obj)} is not serializable"
|
raise TypeError(f"Object of type {type(obj)} is not serializable"
|
||||||
@ -288,15 +322,9 @@ class MsgpackDecoder:
|
|||||||
if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
|
if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
|
||||||
raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must "
|
raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must "
|
||||||
"be set to use custom utility result types")
|
"be set to use custom utility result types")
|
||||||
assert isinstance(result_type, list)
|
# Use recursive decoding to handle nested structures
|
||||||
if len(result_type) == 2 and isinstance(result_type[0], str):
|
result = _decode_type_info_recursive(result_type, result,
|
||||||
result = self._convert_result(result_type, result)
|
self._convert_result)
|
||||||
else:
|
|
||||||
assert isinstance(result, list)
|
|
||||||
result = [
|
|
||||||
self._convert_result(rt, r)
|
|
||||||
for rt, r in zip(result_type, result)
|
|
||||||
]
|
|
||||||
return UtilityResult(result)
|
return UtilityResult(result)
|
||||||
|
|
||||||
def _convert_result(self, result_type: Sequence[str], result: Any) -> Any:
|
def _convert_result(self, result_type: Sequence[str], result: Any) -> Any:
|
||||||
|
|||||||
@ -8,7 +8,7 @@ from collections import defaultdict
|
|||||||
from collections.abc import Iterator
|
from collections.abc import Iterator
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@ -74,7 +74,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec,
|
|||||||
EncoderOnlyAttentionSpec,
|
EncoderOnlyAttentionSpec,
|
||||||
FullAttentionSpec, KVCacheConfig,
|
FullAttentionSpec, KVCacheConfig,
|
||||||
KVCacheGroupSpec, KVCacheSpec,
|
KVCacheGroupSpec, KVCacheSpec,
|
||||||
MambaSpec, SlidingWindowSpec)
|
MambaSpec, SlidingWindowSpec,
|
||||||
|
UniformTypeKVCacheSpecs)
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
|
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
|
||||||
DraftTokenIds, LogprobsLists, LogprobsTensors,
|
DraftTokenIds, LogprobsLists, LogprobsTensors,
|
||||||
@ -1187,7 +1188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
common_prefix_len = self._compute_cascade_attn_prefix_len(
|
common_prefix_len = self._compute_cascade_attn_prefix_len(
|
||||||
num_scheduled_tokens,
|
num_scheduled_tokens,
|
||||||
num_common_prefix_blocks,
|
num_common_prefix_blocks,
|
||||||
kv_cache_group_spec.kv_cache_spec,
|
attn_group.kv_cache_spec,
|
||||||
builder,
|
builder,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1903,7 +1904,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
**self._init_model_kwargs(num_scheduled_tokens),
|
**self._init_model_kwargs(num_scheduled_tokens),
|
||||||
**self._extract_mm_kwargs(scheduler_output),
|
**self._extract_mm_kwargs(scheduler_output),
|
||||||
}
|
}
|
||||||
elif (self.enable_prompt_embeds and get_pp_group().is_first_rank):
|
elif self.enable_prompt_embeds and get_pp_group().is_first_rank:
|
||||||
# Get the input embeddings for the tokens that are not input embeds,
|
# Get the input embeddings for the tokens that are not input embeds,
|
||||||
# then put them into the appropriate positions.
|
# then put them into the appropriate positions.
|
||||||
# TODO(qthequartermasterman): Since even when prompt embeds are
|
# TODO(qthequartermasterman): Since even when prompt embeds are
|
||||||
@ -2125,6 +2126,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
invalid_req_indices,
|
invalid_req_indices,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def synchronize_input_prep(self):
|
||||||
|
if self.prepare_inputs_event is None:
|
||||||
|
yield
|
||||||
|
return
|
||||||
|
|
||||||
|
# Ensure prior step has finished with reused CPU tensors.
|
||||||
|
# This is required in the async scheduling case because
|
||||||
|
# the CPU->GPU transfer happens async.
|
||||||
|
self.prepare_inputs_event.synchronize()
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
self.prepare_inputs_event.record()
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def execute_model(
|
def execute_model(
|
||||||
self,
|
self,
|
||||||
@ -2132,33 +2148,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
|
) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
|
||||||
with record_function_or_nullcontext("Preprocess"):
|
with record_function_or_nullcontext("Preprocess"):
|
||||||
self._update_states(scheduler_output)
|
with self.synchronize_input_prep():
|
||||||
if not scheduler_output.total_num_scheduled_tokens:
|
# Update persistent batch states.
|
||||||
if not has_kv_transfer_group():
|
self._update_states(scheduler_output)
|
||||||
# Return empty ModelRunnerOutput if there's no work to do.
|
|
||||||
return EMPTY_MODEL_RUNNER_OUTPUT
|
if not scheduler_output.total_num_scheduled_tokens:
|
||||||
return self.kv_connector_no_forward(scheduler_output,
|
if not has_kv_transfer_group():
|
||||||
self.vllm_config)
|
# Return empty ModelRunnerOutput if no work to do.
|
||||||
if self.cache_config.kv_sharing_fast_prefill:
|
return EMPTY_MODEL_RUNNER_OUTPUT
|
||||||
assert not self.input_batch.num_prompt_logprobs, (
|
return self.kv_connector_no_forward(
|
||||||
"--kv-sharing-fast-prefill produces incorrect logprobs for "
|
scheduler_output, self.vllm_config)
|
||||||
"prompt tokens, tokens, please disable it when the requests"
|
if self.cache_config.kv_sharing_fast_prefill:
|
||||||
" need prompt logprobs")
|
assert not self.input_batch.num_prompt_logprobs, (
|
||||||
|
"--kv-sharing-fast-prefill produces incorrect "
|
||||||
|
"logprobs for prompt tokens, tokens, please disable "
|
||||||
|
"it when the requests need prompt logprobs")
|
||||||
|
|
||||||
if self.prepare_inputs_event is not None:
|
|
||||||
# Ensure prior step has finished with reused CPU tensors.
|
|
||||||
self.prepare_inputs_event.synchronize()
|
|
||||||
try:
|
|
||||||
# Prepare the decoder inputs.
|
# Prepare the decoder inputs.
|
||||||
(attn_metadata, logits_indices, spec_decode_metadata,
|
(attn_metadata, logits_indices, spec_decode_metadata,
|
||||||
num_scheduled_tokens_np, spec_decode_common_attn_metadata,
|
num_scheduled_tokens_np, spec_decode_common_attn_metadata,
|
||||||
max_query_len, ubatch_slices, num_tokens_after_padding
|
max_query_len, ubatch_slices, num_tokens_after_padding
|
||||||
) = self._prepare_inputs(scheduler_output)
|
) = self._prepare_inputs(scheduler_output)
|
||||||
|
|
||||||
finally:
|
|
||||||
if self.prepare_inputs_event is not None:
|
|
||||||
self.prepare_inputs_event.record()
|
|
||||||
|
|
||||||
(
|
(
|
||||||
num_scheduled_tokens,
|
num_scheduled_tokens,
|
||||||
num_input_tokens,
|
num_input_tokens,
|
||||||
@ -2592,9 +2603,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
backend = self.vllm_config.compilation_config.init_backend(
|
backend = self.vllm_config.compilation_config.init_backend(
|
||||||
self.vllm_config)
|
self.vllm_config)
|
||||||
compilation_counter.dynamo_as_is_count += 1
|
compilation_counter.dynamo_as_is_count += 1
|
||||||
self.model.compile(
|
self.model.compile(fullgraph=True, backend=backend)
|
||||||
fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
|
|
||||||
backend=backend)
|
|
||||||
return
|
return
|
||||||
# for other compilation levels, cudagraph behavior is controlled by
|
# for other compilation levels, cudagraph behavior is controlled by
|
||||||
# CudagraphWrapper and CudagraphDispatcher of vllm.
|
# CudagraphWrapper and CudagraphDispatcher of vllm.
|
||||||
@ -3445,12 +3454,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
assert len(self.attn_groups) == 0, \
|
assert len(self.attn_groups) == 0, \
|
||||||
"Attention backends are already initialized"
|
"Attention backends are already initialized"
|
||||||
|
|
||||||
def get_attn_backends_for_layers(
|
class AttentionGroupKey(NamedTuple):
|
||||||
layer_names: list[str]
|
attn_backend: type[AttentionBackend]
|
||||||
) -> dict[type[AttentionBackend], list[str]]:
|
kv_cache_spec: KVCacheSpec
|
||||||
layers = get_layers_from_vllm_config(self.vllm_config,
|
|
||||||
AttentionLayerBase,
|
def get_attn_backends_for_group(
|
||||||
layer_names)
|
kv_cache_group_spec: KVCacheGroupSpec,
|
||||||
|
) -> dict[AttentionGroupKey, list[str]]:
|
||||||
|
layers = get_layers_from_vllm_config(
|
||||||
|
self.vllm_config, AttentionLayerBase,
|
||||||
|
kv_cache_group_spec.layer_names)
|
||||||
attn_backends = {}
|
attn_backends = {}
|
||||||
attn_backend_layers = defaultdict(list)
|
attn_backend_layers = defaultdict(list)
|
||||||
# Dedupe based on full class name; this is a bit safer than
|
# Dedupe based on full class name; this is a bit safer than
|
||||||
@ -3458,7 +3471,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
# attention backend subclasses (e.g. ChunkedLocalAttention) unless
|
# attention backend subclasses (e.g. ChunkedLocalAttention) unless
|
||||||
# they are cached correctly, there will be different objects per
|
# they are cached correctly, there will be different objects per
|
||||||
# layer.
|
# layer.
|
||||||
for layer_name in layer_names:
|
for layer_name in kv_cache_group_spec.layer_names:
|
||||||
attn_backend = layers[layer_name].get_attn_backend()
|
attn_backend = layers[layer_name].get_attn_backend()
|
||||||
|
|
||||||
if layer_name in self.kv_sharing_fast_prefill_eligible_layers:
|
if layer_name in self.kv_sharing_fast_prefill_eligible_layers:
|
||||||
@ -3467,8 +3480,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
attn_backend,
|
attn_backend,
|
||||||
)
|
)
|
||||||
|
|
||||||
key = attn_backend.full_cls_name()
|
full_cls_name = attn_backend.full_cls_name()
|
||||||
attn_backends[key] = attn_backend
|
layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec
|
||||||
|
if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs):
|
||||||
|
layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[
|
||||||
|
layer_name]
|
||||||
|
key = (full_cls_name, layer_kv_cache_spec)
|
||||||
|
attn_backends[key] = AttentionGroupKey(attn_backend,
|
||||||
|
layer_kv_cache_spec)
|
||||||
attn_backend_layers[key].append(layer_name)
|
attn_backend_layers[key].append(layer_name)
|
||||||
return {
|
return {
|
||||||
attn_backends[k]: v
|
attn_backends[k]: v
|
||||||
@ -3476,11 +3495,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def create_attn_groups(
|
def create_attn_groups(
|
||||||
attn_backends_map: dict[AttentionBackend, list[str]],
|
attn_backends_map: dict[AttentionGroupKey, list[str]],
|
||||||
kv_cache_spec: KVCacheSpec,
|
|
||||||
) -> list[AttentionGroup]:
|
) -> list[AttentionGroup]:
|
||||||
attn_groups: list[AttentionGroup] = []
|
attn_groups: list[AttentionGroup] = []
|
||||||
for attn_backend, layer_names in attn_backends_map.items():
|
for (attn_backend,
|
||||||
|
kv_cache_spec), layer_names in attn_backends_map.items():
|
||||||
attn_metadata_builders = []
|
attn_metadata_builders = []
|
||||||
attn_metadata_builders.append(attn_backend.get_builder_cls()(
|
attn_metadata_builders.append(attn_backend.get_builder_cls()(
|
||||||
kv_cache_spec,
|
kv_cache_spec,
|
||||||
@ -3498,16 +3517,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
))
|
))
|
||||||
attn_group = AttentionGroup(attn_backend,
|
attn_group = AttentionGroup(attn_backend,
|
||||||
attn_metadata_builders,
|
attn_metadata_builders,
|
||||||
layer_names)
|
layer_names, kv_cache_spec)
|
||||||
attn_groups.append(attn_group)
|
attn_groups.append(attn_group)
|
||||||
return attn_groups
|
return attn_groups
|
||||||
|
|
||||||
for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
|
for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
|
||||||
kv_cache_spec = kv_cache_group_spec.kv_cache_spec
|
attn_backends = get_attn_backends_for_group(kv_cache_group_spec)
|
||||||
attn_backends = get_attn_backends_for_layers(
|
self.attn_groups.append(create_attn_groups(attn_backends))
|
||||||
kv_cache_group_spec.layer_names)
|
|
||||||
self.attn_groups.append(
|
|
||||||
create_attn_groups(attn_backends, kv_cache_spec))
|
|
||||||
|
|
||||||
# Calculate reorder batch threshold (if needed)
|
# Calculate reorder batch threshold (if needed)
|
||||||
self.calculate_reorder_batch_threshold()
|
self.calculate_reorder_batch_threshold()
|
||||||
@ -3672,14 +3688,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
def _attn_group_iterator(self) -> Iterator[AttentionGroup]:
|
def _attn_group_iterator(self) -> Iterator[AttentionGroup]:
|
||||||
return itertools.chain.from_iterable(self.attn_groups)
|
return itertools.chain.from_iterable(self.attn_groups)
|
||||||
|
|
||||||
def _kv_cache_spec_attn_group_iterator(
|
def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]:
|
||||||
self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]:
|
|
||||||
if not self.kv_cache_config.kv_cache_groups:
|
if not self.kv_cache_config.kv_cache_groups:
|
||||||
return
|
return
|
||||||
for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups):
|
for attn_groups in self.attn_groups:
|
||||||
for attn_group in attn_groups:
|
yield from attn_groups
|
||||||
yield self.kv_cache_config.kv_cache_groups[
|
|
||||||
kv_cache_spec_id].kv_cache_spec, attn_group
|
|
||||||
|
|
||||||
def _reshape_kv_cache_tensors(
|
def _reshape_kv_cache_tensors(
|
||||||
self,
|
self,
|
||||||
@ -3699,7 +3712,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
"""
|
"""
|
||||||
kv_caches: dict[str, torch.Tensor] = {}
|
kv_caches: dict[str, torch.Tensor] = {}
|
||||||
has_attn, has_mamba = False, False
|
has_attn, has_mamba = False, False
|
||||||
for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator():
|
for group in self._kv_cache_spec_attn_group_iterator():
|
||||||
|
kv_cache_spec = group.kv_cache_spec
|
||||||
attn_backend = group.backend
|
attn_backend = group.backend
|
||||||
for layer_name in group.layer_names:
|
for layer_name in group.layer_names:
|
||||||
if layer_name in self.runner_only_attn_layers:
|
if layer_name in self.runner_only_attn_layers:
|
||||||
@ -3779,7 +3793,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
kv_caches: The KV cache buffer of each layer.
|
kv_caches: The KV cache buffer of each layer.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator():
|
for group in self._kv_cache_spec_attn_group_iterator():
|
||||||
|
kv_cache_spec = group.kv_cache_spec
|
||||||
for layer_name in group.layer_names:
|
for layer_name in group.layer_names:
|
||||||
kv_cache = kv_caches[layer_name]
|
kv_cache = kv_caches[layer_name]
|
||||||
if (isinstance(kv_cache_spec, AttentionSpec)
|
if (isinstance(kv_cache_spec, AttentionSpec)
|
||||||
|
|||||||
@ -686,8 +686,10 @@ class Worker(WorkerBase):
|
|||||||
self.model_runner.save_tensorized_model(
|
self.model_runner.save_tensorized_model(
|
||||||
tensorizer_config=tensorizer_config, )
|
tensorizer_config=tensorizer_config, )
|
||||||
|
|
||||||
# def shutdown(self) -> None:
|
def shutdown(self) -> None:
|
||||||
# self.model_runner.ensure_kv_transfer_shutdown()
|
# if runner := getattr(self, "model_runner", None):
|
||||||
|
# runner.ensure_kv_transfer_shutdown()
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def init_worker_distributed_environment(
|
def init_worker_distributed_environment(
|
||||||
|
|||||||
@ -15,7 +15,7 @@ from vllm.multimodal.registry import MultiModalRegistry
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
|
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
|
||||||
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
|
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
|
||||||
from vllm.v1.kv_cache_interface import KVCacheGroupSpec
|
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.attention.layer import Attention
|
from vllm.attention.layer import Attention
|
||||||
@ -132,6 +132,7 @@ class AttentionGroup:
|
|||||||
backend: type[AttentionBackend]
|
backend: type[AttentionBackend]
|
||||||
metadata_builders: list[AttentionMetadataBuilder]
|
metadata_builders: list[AttentionMetadataBuilder]
|
||||||
layer_names: list[str]
|
layer_names: list[str]
|
||||||
|
kv_cache_spec: KVCacheSpec
|
||||||
|
|
||||||
def get_metadata_builder(self,
|
def get_metadata_builder(self,
|
||||||
ubatch_id: Optional[int] = None
|
ubatch_id: Optional[int] = None
|
||||||
|
|||||||
@ -18,7 +18,6 @@ import torch.distributed
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
import vllm.envs as envs
|
|
||||||
from vllm.attention import AttentionMetadata, get_attn_backend
|
from vllm.attention import AttentionMetadata, get_attn_backend
|
||||||
from vllm.attention.backends.abstract import AttentionState
|
from vllm.attention.backends.abstract import AttentionState
|
||||||
from vllm.attention.backends.utils import CommonAttentionState
|
from vllm.attention.backends.utils import CommonAttentionState
|
||||||
@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
|
|||||||
backend = self.vllm_config.compilation_config.init_backend(
|
backend = self.vllm_config.compilation_config.init_backend(
|
||||||
self.vllm_config)
|
self.vllm_config)
|
||||||
compilation_counter.dynamo_as_is_count += 1
|
compilation_counter.dynamo_as_is_count += 1
|
||||||
self.model = torch.compile(
|
self.model = torch.compile(self.model,
|
||||||
self.model,
|
fullgraph=True,
|
||||||
fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
|
backend=backend)
|
||||||
backend=backend)
|
|
||||||
|
|
||||||
def get_model(self) -> nn.Module:
|
def get_model(self) -> nn.Module:
|
||||||
return self.model
|
return self.model
|
||||||
|
|||||||
@ -5,7 +5,8 @@ import dataclasses
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
|
from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, Type,
|
||||||
|
TypeVar, Union)
|
||||||
|
|
||||||
import cloudpickle
|
import cloudpickle
|
||||||
import torch
|
import torch
|
||||||
@ -28,6 +29,8 @@ from vllm.worker.model_runner_base import (BroadcastableModelInput,
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
_R = TypeVar("_R")
|
||||||
|
|
||||||
|
|
||||||
@warn_for_unimplemented_methods
|
@warn_for_unimplemented_methods
|
||||||
class WorkerBase:
|
class WorkerBase:
|
||||||
@ -70,6 +73,10 @@ class WorkerBase:
|
|||||||
def get_model(self) -> nn.Module:
|
def get_model(self) -> nn.Module:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R:
|
||||||
|
"""Apply a function on the model inside this worker."""
|
||||||
|
return fn(self.get_model())
|
||||||
|
|
||||||
def load_model(self) -> None:
|
def load_model(self) -> None:
|
||||||
"""Load model onto target device."""
|
"""Load model onto target device."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user