mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-31 07:57:02 +08:00
Merge branch 'main' into Add_support_for_openpangu_promoe_v2
Signed-off-by: yt0428 <51468697+yt0428@users.noreply.github.com>
This commit is contained in:
commit
28169a6fce
@ -903,11 +903,12 @@ steps:
|
||||
- label: Transformers Nightly Models Test
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
soft_fail: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
|
||||
2
.github/workflows/cleanup_pr_body.yml
vendored
2
.github/workflows/cleanup_pr_body.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
|
||||
2
.github/workflows/macos-smoke-test.yml
vendored
2
.github/workflows/macos-smoke-test.yml
vendored
@ -12,7 +12,7 @@ jobs:
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
|
||||
2
.github/workflows/pre-commit.yml
vendored
2
.github/workflows/pre-commit.yml
vendored
@ -16,7 +16,7 @@ jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
|
||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${FP4_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
|
||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include "cutlass_extensions/common.hpp"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
|
||||
}
|
||||
|
||||
template <typename OutType>
|
||||
void run_fp4_blockwise_scaled_group_mm(
|
||||
void run_fp4_blockwise_scaled_group_mm_sm100(
|
||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
|
||||
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
|
||||
|
||||
auto can_implement_status = gemm_op.can_implement(args);
|
||||
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
|
||||
"Failed to implement GEMM");
|
||||
"Failed to implement GEMM: status=", (int)can_implement_status);
|
||||
|
||||
// Run the GEMM
|
||||
auto status = gemm_op.initialize(args, workspace.data_ptr());
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess,
|
||||
"Failed to initialize GEMM: status=", (int)status,
|
||||
" workspace_size=", workspace_size, " num_experts=", num_experts,
|
||||
" M=", M, " N=", N, " K=", K);
|
||||
|
||||
status = gemm_op.run(args, workspace.data_ptr(), stream);
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
|
||||
}
|
||||
|
||||
// Runs the NVFP4 block-scaled grouped GEMM (one problem per expert) using a
// CUTLASS kernel built for cutlass::arch::Sm120.
//
// The output element type is fixed to bfloat16 (see the NOTE below), so
// `output` must be a bfloat16 tensor; any other dtype is rejected up front.
//
// Parameters:
//   output          Destination tensor; per-expert output pointers are derived
//                   from it by run_get_group_gemm_starts.
//   a, b            FP4 operands. Their byte strides are doubled below,
//                   presumably because two FP4 values are packed per byte --
//                   TODO confirm against the packing used by the callers.
//   a_blockscale,
//   b_blockscales   Block scale factors for `a` and `b`.
//   alphas          Per-expert alpha values consumed by the epilogue.
//   problem_sizes   Device buffer reinterpreted as an array of
//                   (int32, int32, int32) problem shapes, one per expert.
//   expert_offsets  Per-expert offsets; its first dimension defines the number
//                   of experts (groups).
//   sf_offsets      Per-expert scale-factor offsets.
//   M, N, K         Forwarded to run_get_group_gemm_starts and reported in
//                   error messages.
//
// Raises (via TORCH_CHECK*): if `output` is not bfloat16, or if CUTLASS
// reports that the GEMM cannot be implemented, initialized or run.
void run_fp4_blockwise_scaled_group_mm_sm120(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
    int N, int K) {
  // ElementD is hard-coded to bfloat16 below; writing through bf16 pointers
  // into a tensor of another dtype would silently corrupt it.
  TORCH_CHECK_NOT_IMPLEMENTED(
      output.scalar_type() == torch::kBFloat16,
      "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
      output.scalar_type());

  using ProblemShape =
      cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
  using ElementType = cutlass::float_e2m1_t;
  using ElementSFType = cutlass::float_ue4m3_t;
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;

  // NOTE: For SM120 it seems templating the output type is not supported and
  // we need to hardcode the output type to bfloat16
  using ElementC = cutlass::bfloat16_t;
  using ElementD = ElementC;
  using ElementAccumulator = float;

  // Layout definitions
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;
  using LayoutD = LayoutC;

  // Alignment constraints
  static constexpr int AlignmentA = 32;
  static constexpr int AlignmentB = 32;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Architecture definitions
  using ArchTag = cutlass::arch::Sm120;
  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;

  using ClusterShape = Shape<_1, _1, _1>;
  using MmaTileShape = Shape<_128, _128, _128>;

  using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
      ElementD, ElementAccumulator, ElementC, ElementAccumulator>;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, MmaTileShape, ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
          LayoutD*, AlignmentD,
          cutlass::epilogue::collective::EpilogueScheduleAuto,
          FusionOperation>::CollectiveOp;

  // The mainloop stage count is chosen automatically after carving out the
  // shared storage required by the epilogue.
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
          LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;

  using GemmKernel =
      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
                                           CollectiveEpilogue>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
  using StrideD = typename Gemm::GemmKernel::InternalStrideD;

  using LayoutSFA =
      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
  using LayoutSFB =
      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
  using ScaleConfig =
      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;

  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
  int num_experts = static_cast<int>(expert_offsets.size(0));
  auto options_int =
      torch::TensorOptions().dtype(torch::kInt64).device(a.device());

  // Per-expert pointer tables, stored as int64 on the same device as `a` and
  // filled in by run_get_group_gemm_starts.
  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
  // Backing storage later reinterpreted as LayoutSFA / LayoutSFB objects.
  // NOTE(review): assumes each layout object fits in 5 int64 values -- confirm.
  torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
  torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
  // Every expert shares the same strides, so the tables are constant-filled.
  torch::Tensor c_strides1 =
      torch::full({num_experts}, output.stride(0), options_int);
  torch::Tensor a_strides1 =
      torch::full({num_experts}, a.stride(0) * 2, options_int);
  torch::Tensor b_strides1 =
      torch::full({num_experts}, b.stride(1) * 2, options_int);

  run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
      a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
      layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
      expert_offsets, sf_offsets, problem_sizes, M, N, K);

  // Create an instance of the GEMM
  Gemm gemm_op;

  // View the problem-size buffer as an array of per-expert problem shapes.
  UnderlyingProblemShape* problem_sizes_as_shapes =
      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());

  // Set the Scheduler info
  cutlass::KernelHardwareInfo hw_info;
  using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
  typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
  scheduler.raster_order = RasterOrderOptions::AlongM;
  hw_info.device_id = a.get_device();
  // Cache the SM count per device so the device query runs only once.
  // NOTE(review): the cache is not synchronized; this assumes callers do not
  // invoke this function concurrently from multiple threads -- confirm.
  static std::unordered_map<int, int> cached_sm_counts;
  auto sm_count_it = cached_sm_counts.find(hw_info.device_id);
  if (sm_count_it == cached_sm_counts.end()) {
    sm_count_it =
        cached_sm_counts
            .emplace(
                hw_info.device_id,
                cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
                    hw_info.device_id))
            .first;
  }
  hw_info.sm_count = sm_count_it->second;

  // Mainloop Arguments
  typename GemmKernel::MainloopArguments mainloop_args{
      static_cast<const ElementType**>(a_ptrs.data_ptr()),
      static_cast<StrideA*>(a_strides1.data_ptr()),
      static_cast<const ElementType**>(b_ptrs.data_ptr()),
      static_cast<StrideB*>(b_strides1.data_ptr()),
      static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
      reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
      static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
      reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};

  // Epilogue Arguments
  typename GemmKernel::EpilogueArguments epilogue_args{
      {},  // epilogue.thread
      nullptr,
      static_cast<StrideC*>(c_strides1.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(c_strides1.data_ptr())};
  // Alpha is read per group from alpha_ptrs; beta is zero.
  auto& fusion_args = epilogue_args.thread;
  fusion_args.alpha_ptr_array =
      reinterpret_cast<float**>(alpha_ptrs.data_ptr());
  fusion_args.dAlpha = {_0{}, _0{}, 1};
  fusion_args.beta = 0.0f;

  // Gemm Arguments
  typename GemmKernel::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGrouped,
      {num_experts, problem_sizes_as_shapes, nullptr},
      mainloop_args,
      epilogue_args,
      hw_info,
      scheduler};

  size_t workspace_size = Gemm::get_workspace_size(args);
  auto const workspace_options =
      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
  auto workspace = torch::empty(workspace_size, workspace_options);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());

  auto can_implement_status = gemm_op.can_implement(args);
  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
              "Failed to implement GEMM: status=",
              static_cast<int>(can_implement_status));

  // Run the GEMM
  auto status = gemm_op.initialize(args, workspace.data_ptr());
  TORCH_CHECK(status == cutlass::Status::kSuccess,
              "Failed to initialize GEMM: status=", static_cast<int>(status),
              " workspace_size=", workspace_size, " num_experts=", num_experts,
              " M=", M, " N=", N, " K=", K);

  status = gemm_op.run(args, workspace.data_ptr(), stream);
  // Report the status code here too, matching the two checks above.
  TORCH_CHECK(status == cutlass::Status::kSuccess,
              "Failed to run GEMM: status=", static_cast<int>(status));
}
|
||||
|
||||
template <typename OutType>
|
||||
void run_fp4_blockwise_scaled_group_mm(
|
||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
|
||||
int N, int K) {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
|
||||
if (version_num >= 120 && version_num < 130) {
|
||||
run_fp4_blockwise_scaled_group_mm_sm120(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
if (version_num >= 100 && version_num < 120) {
|
||||
run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
|
||||
version_num, ". Required capability: 100 or 120");
|
||||
}
|
||||
|
||||
// Tensor dtypes used by the input validation of the NVFP4 grouped GEMM entry
// point. They are only defined when at least one NVFP4 kernel family (SM100 or
// SM120) is compiled in.
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
// Storage dtype checked for the FP4 operands; the name suggests two E2M1
// values packed per byte -- TODO confirm.
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
// Presumably the dtype of the block scale factors -- verify against its uses.
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
#endif
|
||||
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
|
||||
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
|
||||
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
|
||||
// Input validation
|
||||
CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
|
||||
CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
|
||||
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
} else {
|
||||
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
|
||||
int32_t version_num = get_sm_version_num();
|
||||
if (version_num >= 120 && version_num < 130) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
|
||||
output.scalar_type());
|
||||
}
|
||||
#endif
|
||||
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"No compiled cutlass_fp4_group_mm kernel, vLLM must "
|
||||
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA "
|
||||
"12.8 or above.");
|
||||
"be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
|
||||
"and CUDA 12.8 or above.");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
|
||||
constexpr auto INT = at::ScalarType::Int;
|
||||
constexpr auto UINT8 = at::ScalarType::Byte;
|
||||
|
||||
void scaled_fp4_experts_quant_sm100a(
|
||||
void scaled_fp4_experts_quant_sm1xxa(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
|
||||
@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
torch::Tensor const& input_sf);
|
||||
#endif
|
||||
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
void scaled_fp4_experts_quant_sm100a(
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
void scaled_fp4_experts_quant_sm1xxa(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
torch::Tensor const& output_scale_offset_by_experts) {
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
return scaled_fp4_experts_quant_sm100a(
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
return scaled_fp4_experts_quant_sm1xxa(
|
||||
output, output_scale, input, input_global_scale, input_offset_by_experts,
|
||||
output_scale_offset_by_experts);
|
||||
#endif
|
||||
|
||||
@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
|
||||
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
|
||||
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
|
||||
#if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
|
||||
void get_cutlass_moe_mm_data_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
|
||||
// This function currently gets compiled only if we have a valid cutlass moe
|
||||
// mm to run it for.
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
|
||||
problem_sizes2, input_permutation,
|
||||
output_permutation, num_experts, n, k,
|
||||
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
|
||||
false,
|
||||
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
|
||||
"CUDA device capability: ",
|
||||
version_num, ". Required capability: 90 or 100");
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes(
|
||||
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
|
||||
problem_sizes2, num_experts, n, k,
|
||||
blockscale_offsets);
|
||||
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
|
||||
false,
|
||||
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
|
||||
"kernel for CUDA device capability: ",
|
||||
version_num, ". Required capability: 90 or 100");
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
// This function currently gets compiled only if we have a valid cutlass moe
|
||||
// mm to run it for.
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
|
||||
problem_sizes2, expert_num_tokens,
|
||||
num_local_experts, padded_m, n, k);
|
||||
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
false,
|
||||
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
|
||||
"for CUDA device capability: ",
|
||||
version_num, ". Required capability: 90 or 100");
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||
|
||||
@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
|
||||
|
||||
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
|
||||
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
|
||||
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
|
||||
- [`CompressedTensorsW4A4Nvfp4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoeMethod]
|
||||
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
|
||||
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
|
||||
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]
|
||||
|
||||
@ -133,7 +133,7 @@ def main(args):
|
||||
tensor_parallel_size=args.tp,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
enforce_eager=args.enforce_eager,
|
||||
gpu_memory_utilization=0.8,
|
||||
gpu_memory_utilization=0.9,
|
||||
speculative_config=speculative_config,
|
||||
disable_log_stats=False,
|
||||
max_model_len=args.max_model_len,
|
||||
|
||||
@ -55,7 +55,7 @@ class SillyModel(nn.Module):
|
||||
def _run_simple_model(
|
||||
splitting_ops,
|
||||
use_inductor_graph_partition,
|
||||
use_inductor,
|
||||
backend,
|
||||
expected_num_piecewise_graphs_seen,
|
||||
expected_num_piecewise_capturable_graphs_seen,
|
||||
expected_num_backend_compilations,
|
||||
@ -64,7 +64,7 @@ def _run_simple_model(
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor=use_inductor,
|
||||
backend=backend,
|
||||
splitting_ops=splitting_ops,
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
cudagraph_copy_inputs=True,
|
||||
@ -124,14 +124,14 @@ def _run_simple_model(
|
||||
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
@pytest.mark.parametrize("backend", ["inductor", "eager"])
|
||||
@torch.inference_mode()
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_simple_piecewise_compile(use_inductor):
|
||||
def test_simple_piecewise_compile(backend):
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly::attention"],
|
||||
use_inductor_graph_partition=False,
|
||||
use_inductor=use_inductor,
|
||||
backend=backend,
|
||||
# 2 * num_layers + 1
|
||||
expected_num_piecewise_graphs_seen=5,
|
||||
# 1 + num_layers
|
||||
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly::attention"],
|
||||
use_inductor_graph_partition=True,
|
||||
use_inductor=True,
|
||||
backend="inductor",
|
||||
# Since not splitting at fx graph level
|
||||
expected_num_piecewise_graphs_seen=1,
|
||||
# Since not splitting at fx graph level
|
||||
|
||||
@ -249,14 +249,13 @@ def test_compilation_config():
|
||||
args = parser.parse_args(
|
||||
[
|
||||
"-O",
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'"use_inductor": false}',
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
|
||||
]
|
||||
)
|
||||
assert (
|
||||
args.compilation_config.mode == 3
|
||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||
and not args.compilation_config.use_inductor
|
||||
and args.compilation_config.backend == "eager"
|
||||
)
|
||||
|
||||
# set to string form of a dict
|
||||
@ -264,13 +263,13 @@ def test_compilation_config():
|
||||
[
|
||||
"--compilation-config="
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'"use_inductor": true}',
|
||||
'"backend": "inductor"}',
|
||||
]
|
||||
)
|
||||
assert (
|
||||
args.compilation_config.mode == 3
|
||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||
and args.compilation_config.use_inductor
|
||||
and args.compilation_config.backend == "inductor"
|
||||
)
|
||||
|
||||
|
||||
@ -278,8 +277,9 @@ def test_prefix_cache_default():
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
args = parser.parse_args([])
|
||||
|
||||
# should be None by default (depends on model).
|
||||
engine_args = EngineArgs.from_cli_args(args=args)
|
||||
assert engine_args.enable_prefix_caching, "prefix caching should default to on."
|
||||
assert engine_args.enable_prefix_caching is None
|
||||
|
||||
# with flag to turn it on.
|
||||
args = parser.parse_args(["--enable-prefix-caching"])
|
||||
|
||||
@ -2,6 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content,
|
||||
ResponseReasoningItem,
|
||||
@ -76,6 +79,18 @@ class TestResponsesUtils:
|
||||
== 'Hmm, the user has just started with a simple "Hello,"'
|
||||
)
|
||||
|
||||
tool_call_output = ResponseFunctionToolCallOutputItem(
|
||||
id="temp_id",
|
||||
type="function_call_output",
|
||||
call_id="temp",
|
||||
output="1234",
|
||||
status="completed",
|
||||
)
|
||||
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
|
||||
assert formatted_item["role"] == "tool"
|
||||
assert formatted_item["content"] == "1234"
|
||||
assert formatted_item["tool_call_id"] == "temp"
|
||||
|
||||
item = ResponseReasoningItem(
|
||||
id="lol",
|
||||
summary=[],
|
||||
|
||||
240
tests/models/test_gguf_download.py
Normal file
240
tests/models/test_gguf_download.py
Normal file
@ -0,0 +1,240 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.config.load import LoadConfig
|
||||
from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader
|
||||
from vllm.model_executor.model_loader.weight_utils import download_gguf
|
||||
|
||||
|
||||
class TestGGUFDownload:
    """Tests for ``download_gguf`` with the HF download and ``glob`` mocked out."""

    # Dotted path of the download helper that every test replaces with a mock.
    _HF_DOWNLOAD = (
        "vllm.model_executor.model_loader.weight_utils.download_weights_from_hf"
    )
    # Folder the mocked download helper claims to have downloaded into.
    _CACHE_DIR = "/tmp/mock_cache"

    @patch(_HF_DOWNLOAD)
    def test_download_gguf_single_file(self, mock_download):
        """A single matching file is returned as a file path, not the folder."""
        cache_dir = self._CACHE_DIR
        mock_download.return_value = cache_dir
        expected_path = f"{cache_dir}/model-IQ1_S.gguf"

        def fake_glob(pattern, **kwargs):
            # Only patterns that mention the requested quant type match.
            if "IQ1_S" in pattern:
                return [expected_path]
            return []

        with patch("glob.glob", side_effect=fake_glob):
            result = download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")

        # The download helper must receive the quant-specific allow patterns.
        mock_download.assert_called_once_with(
            model_name_or_path="unsloth/Qwen3-0.6B-GGUF",
            cache_dir=None,
            allow_patterns=[
                "*-IQ1_S.gguf",
                "*-IQ1_S-*.gguf",
                "*/*-IQ1_S.gguf",
                "*/*-IQ1_S-*.gguf",
            ],
            revision=None,
            ignore_patterns=None,
        )

        assert result == expected_path

    @patch(_HF_DOWNLOAD)
    def test_download_gguf_sharded_files(self, mock_download):
        """With several shards, the first one in sorted order is returned."""
        cache_dir = self._CACHE_DIR
        mock_download.return_value = cache_dir
        shard_paths = [
            f"{cache_dir}/model-Q2_K-00001-of-00002.gguf",
            f"{cache_dir}/model-Q2_K-00002-of-00002.gguf",
        ]

        def fake_glob(pattern, **kwargs):
            # Hand out a fresh list on every call, as the shards "exist".
            if "Q2_K" in pattern:
                return list(shard_paths)
            return []

        with patch("glob.glob", side_effect=fake_glob):
            result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")

        assert result == shard_paths[0]

    @patch(_HF_DOWNLOAD)
    def test_download_gguf_subdir(self, mock_download):
        """A GGUF file located in a subdirectory is found and returned."""
        cache_dir = self._CACHE_DIR
        mock_download.return_value = cache_dir
        nested_path = f"{cache_dir}/Q2_K/model-Q2_K.gguf"

        def fake_glob(pattern, **kwargs):
            # Match quant-specific patterns as well as the recursive catch-all.
            matches = "Q2_K" in pattern or "**/*.gguf" in pattern
            return [nested_path] if matches else []

        with patch("glob.glob", side_effect=fake_glob):
            result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")

        assert result == nested_path

    @patch(_HF_DOWNLOAD)
    @patch("glob.glob", return_value=[])
    def test_download_gguf_no_files_found(self, mock_glob, mock_download):
        """A ``ValueError`` is raised when no GGUF file matches after download."""
        mock_download.return_value = self._CACHE_DIR

        expected_error = pytest.raises(
            ValueError, match="Downloaded GGUF files not found"
        )
        with expected_error:
            download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")
|
||||
|
||||
|
||||
class TestGGUFModelLoader:
    """Check how GGUFModelLoader._prepare_weights resolves model references."""

    @staticmethod
    def _new_loader():
        """Build a loader whose LoadConfig selects the ``gguf`` load format."""
        return GGUFModelLoader(LoadConfig(load_format="gguf"))

    @staticmethod
    def _stub_model_config(model_ref):
        """Return a MagicMock stand-in that carries only the ``model`` value."""
        stub = MagicMock()
        stub.model = model_ref
        return stub

    @staticmethod
    def _install_qwen3_hf_config(mock_get_config):
        """Make the patched ``get_config`` return a minimal Qwen3-style config."""

        class MockTextConfig:
            max_position_embeddings = 4096
            sliding_window = None
            model_type = "qwen3"
            num_attention_heads = 32

        hf_config = MagicMock()
        hf_config.architectures = ["Qwen3ForCausalLM"]
        hf_config.get_text_config.return_value = MockTextConfig()
        hf_config.dtype = "bfloat16"
        mock_get_config.return_value = hf_config

    @patch("os.path.isfile", return_value=True)
    def test_prepare_weights_local_file(self, mock_isfile):
        """A path reported as an existing file is returned unchanged."""
        local_path = "/path/to/model.gguf"
        loader = self._new_loader()

        resolved = loader._prepare_weights(self._stub_model_config(local_path))

        assert resolved == local_path
        mock_isfile.assert_called_once_with(local_path)

    @patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download")
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_https_url(self, mock_isfile, mock_hf_download):
        """An HTTPS URL is passed to hf_hub_download through ``url=``."""
        remote_url = "https://huggingface.co/model.gguf"
        downloaded_path = "/downloaded/model.gguf"
        mock_hf_download.return_value = downloaded_path
        loader = self._new_loader()

        resolved = loader._prepare_weights(self._stub_model_config(remote_url))

        assert resolved == downloaded_path
        mock_hf_download.assert_called_once_with(url=remote_url)

    @patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download")
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_repo_filename(self, mock_isfile, mock_hf_download):
        """A ``repo_id/filename.gguf`` reference is split for hf_hub_download."""
        downloaded_path = "/downloaded/model.gguf"
        mock_hf_download.return_value = downloaded_path
        loader = self._new_loader()

        resolved = loader._prepare_weights(
            self._stub_model_config("unsloth/Qwen3-0.6B-GGUF/model.gguf")
        )

        assert resolved == downloaded_path
        mock_hf_download.assert_called_once_with(
            repo_id="unsloth/Qwen3-0.6B-GGUF", filename="model.gguf"
        )

    @patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
    @patch("vllm.transformers_utils.config.file_or_path_exists", return_value=True)
    @patch("vllm.config.model.get_config")
    @patch("vllm.config.model.is_gguf", return_value=True)
    @patch("vllm.model_executor.model_loader.gguf_loader.download_gguf")
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_repo_quant_type(
        self,
        mock_isfile,
        mock_download_gguf,
        mock_is_gguf,
        mock_get_config,
        mock_file_exists,
        mock_get_image_config,
    ):
        """A ``repo_id:quant_type`` reference is resolved via download_gguf."""
        self._install_qwen3_hf_config(mock_get_config)
        downloaded_path = "/downloaded/model-IQ1_S.gguf"
        mock_download_gguf.return_value = downloaded_path
        loader = self._new_loader()

        model_config = ModelConfig(
            model="unsloth/Qwen3-0.6B-GGUF:IQ1_S", tokenizer="Qwen/Qwen3-0.6B"
        )
        resolved = loader._prepare_weights(model_config)

        # The loader hands back whatever path the mocked download produced.
        assert resolved == downloaded_path
        mock_download_gguf.assert_called_once_with(
            "unsloth/Qwen3-0.6B-GGUF",
            "IQ1_S",
            cache_dir=None,
            revision=None,
            ignore_patterns=["original/**/*"],
        )

    @patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
    @patch("vllm.config.model.get_config")
    @patch("vllm.config.model.is_gguf", return_value=False)
    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_invalid_format(
        self,
        mock_isfile,
        mock_check_gguf,
        mock_is_gguf,
        mock_get_config,
        mock_get_image_config,
    ):
        """An unrecognised model reference makes _prepare_weights raise."""
        self._install_qwen3_hf_config(mock_get_config)
        loader = self._new_loader()

        # Build the ModelConfig from a valid repo id so construction-time
        # validation passes, then overwrite ``model`` with the bad reference
        # that _prepare_weights is expected to reject.
        model_config = ModelConfig(model="unsloth/Qwen3-0.6B")
        model_config.model = "invalid-format"

        with pytest.raises(ValueError, match="Unrecognised GGUF reference"):
            loader._prepare_weights(model_config)
|
||||
@ -1,11 +1,17 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.transformers_utils.utils import (
|
||||
is_cloud_storage,
|
||||
is_gcs,
|
||||
is_gguf,
|
||||
is_remote_gguf,
|
||||
is_s3,
|
||||
split_remote_gguf,
|
||||
)
|
||||
|
||||
|
||||
@ -28,3 +34,143 @@ def test_is_cloud_storage():
|
||||
assert is_cloud_storage("s3://model-path/path-to-model")
|
||||
assert not is_cloud_storage("/unix/local/path")
|
||||
assert not is_cloud_storage("nfs://nfs-fqdn.local")
|
||||
|
||||
|
||||
class TestIsRemoteGGUF:
    """Cover the is_remote_gguf predicate for ``repo_id:quant_type`` references."""

    def test_is_remote_gguf_with_colon_and_slash(self):
        """References with both a slash and a colon depend on the quant type."""
        accepted = (
            "unsloth/Qwen3-0.6B-GGUF:IQ1_S",
            "user/repo:Q2_K",
            "repo/model:Q4_K",
            "repo/model:Q8_0",
        )
        for reference in accepted:
            assert is_remote_gguf(reference)

        # An unknown quant type after the colon must be rejected.
        rejected = (
            "repo/model:quant",
            "repo/model:INVALID",
            "repo/model:invalid_type",
        )
        for reference in rejected:
            assert not is_remote_gguf(reference)

    def test_is_remote_gguf_without_colon(self):
        """A plain repo id with no ``:quant_type`` suffix is not remote GGUF."""
        for reference in ("repo/model", "unsloth/Qwen3-0.6B-GGUF"):
            assert not is_remote_gguf(reference)

    def test_is_remote_gguf_without_slash(self):
        """Without a slash the reference is rejected, whatever the suffix."""
        assert not is_remote_gguf("model.gguf")
        # A valid quant type alone is not enough when there is no slash.
        for reference in ("model:IQ1_S", "model:quant"):
            assert not is_remote_gguf(reference)

    def test_is_remote_gguf_local_path(self):
        """Local file paths are not remote GGUF references."""
        for local_path in ("/path/to/model.gguf", "./model.gguf"):
            assert not is_remote_gguf(local_path)

    def test_is_remote_gguf_with_path_object(self):
        """pathlib.Path inputs are accepted and judged like strings."""
        remote_ref = Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
        plain_repo = Path("repo/model")
        assert is_remote_gguf(remote_ref)
        assert not is_remote_gguf(plain_repo)

    def test_is_remote_gguf_with_http_https(self):
        """HTTP/HTTPS URLs are rejected even with a valid quant type suffix."""
        urls = (
            "http://example.com/repo/model:IQ1_S",
            "https://huggingface.co/repo/model:Q2_K",
            "http://repo/model:Q4_K",
            "https://repo/model:Q8_0",
        )
        for url in urls:
            assert not is_remote_gguf(url)

    def test_is_remote_gguf_with_cloud_storage(self):
        """s3:// and gs:// paths are rejected even with a valid quant type."""
        cloud_paths = (
            "s3://bucket/repo/model:IQ1_S",
            "gs://bucket/repo/model:Q2_K",
            "s3://repo/model:Q4_K",
            "gs://repo/model:Q8_0",
        )
        for cloud_path in cloud_paths:
            assert not is_remote_gguf(cloud_path)
|
||||
|
||||
|
||||
class TestSplitRemoteGGUF:
    """Cover split_remote_gguf, which separates repo id from quant type."""

    def test_split_remote_gguf_valid(self):
        """Valid ``repo_id:quant_type`` strings split into their two parts."""
        cases = (
            ("unsloth/Qwen3-0.6B-GGUF:IQ1_S", "unsloth/Qwen3-0.6B-GGUF", "IQ1_S"),
            ("repo/model:Q2_K", "repo/model", "Q2_K"),
        )
        for reference, expected_repo, expected_quant in cases:
            repo_id, quant_type = split_remote_gguf(reference)
            assert repo_id == expected_repo
            assert quant_type == expected_quant

    def test_split_remote_gguf_with_path_object(self):
        """A pathlib.Path reference is split the same way as a string."""
        reference = Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
        parts = split_remote_gguf(reference)
        repo_id, quant_type = parts
        assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
        assert quant_type == "IQ1_S"

    def test_split_remote_gguf_invalid(self):
        """References that is_remote_gguf rejects raise ValueError."""
        invalid_references = (
            "repo/model",  # no colon
            "repo/model:INVALID_TYPE",  # unknown quant type
            "http://repo/model:IQ1_S",  # HTTP URL
            "s3://bucket/repo/model:Q2_K",  # cloud storage path
        )
        for reference in invalid_references:
            with pytest.raises(ValueError, match="Wrong GGUF model"):
                split_remote_gguf(reference)
|
||||
|
||||
|
||||
class TestIsGGUF:
    """Cover is_gguf for local files, remote references and edge cases."""

    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=True)
    def test_is_gguf_with_local_file(self, mock_check_gguf):
        """Paths are GGUF when the (patched) file check reports True."""
        for local_path in ("/path/to/model.gguf", "./model.gguf"):
            assert is_gguf(local_path)

    def test_is_gguf_with_remote_gguf(self):
        """``repo_id:quant_type`` references count only with a valid quant type."""
        for reference in (
            "unsloth/Qwen3-0.6B-GGUF:IQ1_S",
            "repo/model:Q2_K",
            "repo/model:Q4_K",
        ):
            assert is_gguf(reference)

        # Unknown quant types must not be treated as GGUF.
        for reference in ("repo/model:quant", "repo/model:INVALID"):
            assert not is_gguf(reference)

    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
    def test_is_gguf_false(self, mock_check_gguf):
        """Ordinary model names are not GGUF when the file check is False."""
        for reference in ("unsloth/Qwen3-0.6B", "repo/model", "model"):
            assert not is_gguf(reference)

    def test_is_gguf_edge_cases(self):
        """Degenerate and non-Hub inputs are all rejected."""
        rejected = (
            "",  # empty string
            "model:IQ1_S",  # colon but no slash, even with a valid quant type
            "repo/model",  # slash but no colon
            "http://repo/model:IQ1_S",  # HTTP URL
            "https://repo/model:Q2_K",  # HTTPS URL
            "s3://bucket/repo/model:IQ1_S",  # cloud storage
            "gs://bucket/repo/model:Q2_K",  # cloud storage
        )
        for reference in rejected:
            assert not is_gguf(reference)
|
||||
|
||||
@ -166,7 +166,7 @@ def test_dict_args(parser):
|
||||
"--hf-overrides.key2.key4",
|
||||
"val3",
|
||||
# Test compile config and compilation mode
|
||||
"-O.use_inductor=true",
|
||||
"-O.use_inductor_graph_partition=true",
|
||||
"-O.backend",
|
||||
"custom",
|
||||
"-O1",
|
||||
@ -219,7 +219,7 @@ def test_dict_args(parser):
|
||||
}
|
||||
assert parsed_args.compilation_config == {
|
||||
"mode": 1,
|
||||
"use_inductor": True,
|
||||
"use_inductor_graph_partition": True,
|
||||
"backend": "custom",
|
||||
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
|
||||
}
|
||||
|
||||
@ -1248,7 +1248,9 @@ def test_allocate_with_lookahead():
|
||||
)
|
||||
|
||||
# Test case 1: Requires additional lookahead tokens
|
||||
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
|
||||
kv_cache_manager = KVCacheManager(
|
||||
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
|
||||
)
|
||||
blocks = kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens=3,
|
||||
@ -1257,7 +1259,9 @@ def test_allocate_with_lookahead():
|
||||
assert len(blocks.get_block_ids()[0]) == 2 # ceil(5/4)=2 blocks
|
||||
|
||||
# Test case 2: With precomputed blocks
|
||||
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
|
||||
kv_cache_manager = KVCacheManager(
|
||||
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
|
||||
)
|
||||
# required_blocks = ceil((3 + 2) /4) = 2
|
||||
blocks = kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
@ -1268,7 +1272,9 @@ def test_allocate_with_lookahead():
|
||||
|
||||
# Test case 3: With precomputed blocks
|
||||
# required_blocks = ceil((3 + 4) / 4) = 2
|
||||
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
|
||||
kv_cache_manager = KVCacheManager(
|
||||
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
|
||||
)
|
||||
blocks = kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens=3,
|
||||
@ -1495,7 +1501,8 @@ def test_get_kv_cache_config_one_worker():
|
||||
),
|
||||
],
|
||||
)
|
||||
# different hidden size
|
||||
|
||||
# different hidden size but same type, use UniformTypeKVCacheSpecs
|
||||
kv_cache_specs_hybrid = {
|
||||
"layer_1": new_kv_cache_spec(head_size=128),
|
||||
"layer_2": new_kv_cache_spec(head_size=64),
|
||||
@ -1519,6 +1526,40 @@ def test_get_kv_cache_config_one_worker():
|
||||
],
|
||||
)
|
||||
|
||||
# Different hidden size and different type, align by different block size
|
||||
kv_cache_specs_hybrid = {
|
||||
"layer_1": new_kv_cache_spec(head_size=64),
|
||||
"layer_2": new_sliding_window_spec(head_size=32),
|
||||
}
|
||||
kv_cache_config_hybrid = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 32]
|
||||
)[0]
|
||||
assert kv_cache_config_hybrid == KVCacheConfig(
|
||||
num_blocks=32,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32, shared_by=["layer_1", "layer_2"]
|
||||
),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer_1"], new_kv_cache_spec(head_size=64)),
|
||||
KVCacheGroupSpec(
|
||||
["layer_2"], new_sliding_window_spec(head_size=32, block_size=32)
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
# different hidden size that cannot be aligned by using different block size
|
||||
kv_cache_specs_hybrid = {
|
||||
"layer_1": new_kv_cache_spec(head_size=64),
|
||||
"layer_2": new_sliding_window_spec(head_size=96),
|
||||
}
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 2 * 32]
|
||||
)[0]
|
||||
|
||||
# Test num_gpu_blocks_override
|
||||
vllm_config.cache_config.num_gpu_blocks_override = 16
|
||||
kv_cache_config_override_blocks = get_kv_cache_configs(
|
||||
|
||||
@ -134,6 +134,7 @@ def test_prefill(hash_fn):
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# Complete 3 blocks (48 tokens)
|
||||
@ -256,6 +257,7 @@ def test_prefill_hybrid_model():
|
||||
make_kv_cache_config_hybrid_model(block_size, 21),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
hash_fn = sha256
|
||||
@ -416,6 +418,7 @@ def test_prefill_plp():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
# the default hash function is sha256
|
||||
hash_fn = sha256
|
||||
@ -523,6 +526,7 @@ def test_decode():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# Complete 3 blocks (48 tokens)
|
||||
@ -585,6 +589,7 @@ def test_evict():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
last_token_id = 5 * 16 + 7
|
||||
@ -643,6 +648,7 @@ def test_hash_block_correct_reuse():
|
||||
make_kv_cache_config(16, 2),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# Allocate 1 block and cache it.
|
||||
@ -683,6 +689,7 @@ def test_computed_blocks_not_evicted():
|
||||
make_kv_cache_config(block_size, 3),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# Allocate a block and cache it.
|
||||
@ -741,6 +748,7 @@ def test_basic_prefix_caching_disabled():
|
||||
make_kv_cache_config(block_size, 5),
|
||||
max_model_len=8192,
|
||||
enable_caching=False,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
req1 = make_request(
|
||||
@ -790,6 +798,7 @@ def test_cache_blocks(hash_fn):
|
||||
block_pool = BlockPool(
|
||||
num_gpu_blocks=5,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
# Req:
|
||||
# Block 0: [0, 1, 2, 3]
|
||||
@ -833,7 +842,9 @@ def test_cache_blocks_multi_group():
|
||||
This tests that blocks are cached correctly for different kv cache groups.
|
||||
"""
|
||||
block_size = 4
|
||||
block_pool = BlockPool(num_gpu_blocks=10, enable_caching=True)
|
||||
block_pool = BlockPool(
|
||||
num_gpu_blocks=10, enable_caching=True, hash_block_size=block_size
|
||||
)
|
||||
|
||||
# Req:
|
||||
# Block 0/4: [0, 1, 2, 3]
|
||||
@ -921,6 +932,7 @@ def test_mm_prefix_caching():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# Common prompt tokens (T is text tokens and P is image placeholder tokens)
|
||||
@ -1020,6 +1032,7 @@ def test_cache_key_salting():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# 3 complete blocks and an incomplete block with 11 tokens.
|
||||
@ -1101,6 +1114,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
# Complete 3 blocks (48 tokens)
|
||||
# | Common-0 | Common-1 | Common-2 | ... |
|
||||
@ -1173,6 +1187,7 @@ def test_reset_prefix_cache():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
full_block_token_ids = [i for i in range(3) for _ in range(16)]
|
||||
@ -1213,6 +1228,7 @@ def test_prefix_cache_stats_disabled():
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
log_stats=False, # Disable logging stats
|
||||
)
|
||||
assert manager.prefix_cache_stats is None
|
||||
@ -1232,7 +1248,7 @@ def test_prefix_cache_stats_disabled():
|
||||
|
||||
|
||||
def test_maybe_evict_cached_block():
|
||||
pool = BlockPool(num_gpu_blocks=4, enable_caching=True)
|
||||
pool = BlockPool(num_gpu_blocks=4, enable_caching=True, hash_block_size=16)
|
||||
block_hash0 = make_block_hash_with_group_id(BlockHash(b"10"), 1000)
|
||||
block_hash1 = make_block_hash_with_group_id(BlockHash(b"20"), 2000)
|
||||
block_hash2 = make_block_hash_with_group_id(BlockHash(b"30"), 3000)
|
||||
@ -1293,6 +1309,7 @@ def test_kv_cache_events(blocks_to_cache: int):
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
enable_kv_cache_events=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
num_tokens = block_size * blocks_to_cache
|
||||
@ -1351,6 +1368,7 @@ def test_kv_cache_events_with_lora(blocks_to_cache: int):
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
enable_kv_cache_events=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# Test with LoRA request
|
||||
@ -1405,6 +1423,7 @@ def test_eagle_enabled_removes_last_block():
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
use_eagle=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# Request with 3 full blocks (48 tokens)
|
||||
@ -1437,6 +1456,7 @@ def test_eagle_with_partial_blocks():
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
use_eagle=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
# 2 full blocks + 5 tokens (non-divisible length)
|
||||
token_ids = [0] * (2 * block_size + 5)
|
||||
@ -1476,6 +1496,7 @@ def test_eagle_with_sliding_window():
|
||||
max_model_len=8192,
|
||||
enable_caching=True,
|
||||
use_eagle=True,
|
||||
hash_block_size=block_size,
|
||||
)
|
||||
|
||||
# 2 full blocks + 5 tokens (non-divisible length)
|
||||
@ -1522,6 +1543,76 @@ def test_eagle_with_sliding_window():
|
||||
assert num_tokens == 0
|
||||
|
||||
|
||||
def test_different_block_size():
    """Prefix-cache lookups across two KV cache groups with unequal block sizes."""
    block_size = 16

    # The two groups are set up to share a page size: the full-attention
    # group holds 32 float16 tokens per block, the sliding-window group
    # 16 float32 tokens per block.
    full_attention_group = KVCacheGroupSpec(
        ["layer1"],
        FullAttentionSpec(block_size * 2, 1, 1, torch.float16),
    )
    sliding_window_group = KVCacheGroupSpec(
        ["layer2"],
        SlidingWindowSpec(
            block_size,
            1,
            1,
            torch.float32,
            sliding_window=2 * block_size,
        ),
    )
    kv_cache_config = KVCacheConfig(
        num_blocks=100,
        kv_cache_tensors=[],
        kv_cache_groups=[full_attention_group, sliding_window_group],
    )
    manager = KVCacheManager(
        kv_cache_config=kv_cache_config,
        max_model_len=8192,
        enable_caching=True,
        hash_block_size=block_size,
    )

    # Ten runs of 16 tokens; every token in a run carries that run's index.
    common_token_ids = []
    for run_index in range(10):
        common_token_ids.extend([run_index] * block_size)

    # First request: nothing is cached yet, so there are no hits in any group.
    req0 = make_request("0", common_token_ids, block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks.blocks[0]
    assert not computed_blocks.blocks[1]
    assert num_computed_tokens == 0
    already_computed = len(computed_blocks.blocks[0]) * 16
    blocks = manager.allocate_slots(
        req0, 7 * block_size, already_computed, computed_blocks
    )
    assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11])

    # Requests sharing a prefix with req0 hit 3 blocks in group 0 and
    # 6 blocks in group 1, i.e. 6 * 16 tokens.
    req1_tokens = common_token_ids[: 7 * block_size + 1]
    req1 = make_request("1", req1_tokens, block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(computed_blocks.blocks[0]) == 3
    assert len(computed_blocks.blocks[1]) == 6
    assert num_computed_tokens == 6 * 16

    req2_tokens = common_token_ids[: 6 * block_size + 1]
    req2 = make_request("2", req2_tokens, block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(computed_blocks.blocks[0]) == 3
    assert len(computed_blocks.blocks[1]) == 6
    assert num_computed_tokens == 6 * 16

    # Evict blocks so the sliding-window cache hit length becomes 5 * 16.
    # The result should still be 4 * 16, because the full-attention cache hit
    # length must be a multiple of 32.
    cached_lookup = manager.block_pool.cached_block_hash_to_block
    for hash_index, fallback in ((6, 11), (5, 10)):
        cached_lookup.pop(
            make_block_hash_with_group_id(req1.block_hashes[hash_index], 1),
            fallback,
        )
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(computed_blocks.blocks[0]) == 2
    assert len(computed_blocks.blocks[1]) == 4
    assert num_computed_tokens == 4 * 16
|
||||
|
||||
|
||||
def test_block_lookup_cache_single_block_per_key():
|
||||
cache = BlockHashToBlockMap()
|
||||
key0 = BlockHashWithGroupId(b"hash0")
|
||||
|
||||
@ -41,7 +41,9 @@ def test_chunked_local_attention_possible_cached_prefix():
|
||||
attention_chunk_size=4,
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
|
||||
block_pool = BlockPool(
|
||||
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
|
||||
)
|
||||
manager = get_chunked_local_attention_manager(
|
||||
chunked_local_attention_spec, block_pool
|
||||
)
|
||||
@ -70,6 +72,7 @@ def test_chunked_local_attention_possible_cached_prefix():
|
||||
block_pool=block_pool,
|
||||
kv_cache_spec=chunked_local_attention_spec,
|
||||
use_eagle=False,
|
||||
alignment_tokens=block_size,
|
||||
)[0]
|
||||
assert len(computed_blocks) == expect_length
|
||||
|
||||
@ -111,7 +114,9 @@ def test_sliding_window_possible_cached_prefix():
|
||||
sliding_window=4,
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
|
||||
block_pool = BlockPool(
|
||||
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
|
||||
)
|
||||
manager = get_sliding_window_manager(sliding_window_spec, block_pool)
|
||||
|
||||
def run_one_case(block_is_cached, expect_length):
|
||||
@ -138,6 +143,7 @@ def test_sliding_window_possible_cached_prefix():
|
||||
block_pool=block_pool,
|
||||
kv_cache_spec=sliding_window_spec,
|
||||
use_eagle=False,
|
||||
alignment_tokens=block_size,
|
||||
)[0]
|
||||
assert len(computed_blocks) == expect_length
|
||||
|
||||
@ -178,7 +184,7 @@ def test_chunked_local_attention_remove_skipped_blocks():
|
||||
attention_chunk_size=4,
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
|
||||
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
|
||||
|
||||
manager = get_chunked_local_attention_manager(attention_spec, block_pool)
|
||||
|
||||
@ -239,7 +245,7 @@ def test_sliding_window_remove_skipped_blocks():
|
||||
sliding_window=4,
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
|
||||
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
|
||||
|
||||
manager = get_sliding_window_manager(sliding_window_spec, block_pool)
|
||||
|
||||
@ -316,7 +322,9 @@ def test_get_num_blocks_to_allocate():
|
||||
sliding_window=4, # Placeholder value, not related to test result
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
|
||||
block_pool = BlockPool(
|
||||
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
|
||||
)
|
||||
manager = get_sliding_window_manager(sliding_window_spec, block_pool)
|
||||
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
|
||||
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
|
||||
@ -341,7 +349,9 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
|
||||
attention_chunk_size=4, # Placeholder value, not related to test result
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
|
||||
block_pool = BlockPool(
|
||||
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
|
||||
)
|
||||
manager = get_chunked_local_attention_manager(attention_spec, block_pool)
|
||||
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
|
||||
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
|
||||
|
||||
@ -87,6 +87,11 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
|
||||
# Set small draft model len to force doesn't-fit-in-drafter case.
|
||||
spec_config_short = spec_config | {"max_model_len": 50}
|
||||
|
||||
test_sampling_params = [
|
||||
dict(),
|
||||
dict(logprobs=2),
|
||||
]
|
||||
|
||||
# test_preemption, executor, async_scheduling,
|
||||
# spec_config, test_prefill_chunking
|
||||
test_configs = [
|
||||
@ -103,7 +108,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
|
||||
(True, "uni", True, spec_config_short, True),
|
||||
]
|
||||
|
||||
run_tests(monkeypatch, MTP_MODEL, test_configs, [{}])
|
||||
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)
|
||||
|
||||
|
||||
@dynamo_config.patch(cache_size_limit=16)
|
||||
|
||||
@ -11,6 +11,7 @@ import pprint
|
||||
import time
|
||||
from collections.abc import Callable, Sequence
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
@ -429,7 +430,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
|
||||
self.vllm_backend.compiler_manager.compile(
|
||||
submod,
|
||||
args,
|
||||
self.compilation_config.inductor_compile_config,
|
||||
self.vllm_backend.inductor_config,
|
||||
self.compilation_config,
|
||||
graph_index=index,
|
||||
num_graphs=len(self.compile_submod_names),
|
||||
@ -531,6 +532,9 @@ class VllmBackend:
|
||||
sym_tensor_indices: list[int]
|
||||
input_buffers: list[torch.Tensor]
|
||||
compiler_manager: CompilerManager
|
||||
# Copy of CompilationConfig.inductor_compile_config +
|
||||
# an entry for PostGradPassManager
|
||||
inductor_config: dict[str, Any]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -561,25 +565,30 @@ class VllmBackend:
|
||||
self.compilation_config
|
||||
)
|
||||
|
||||
# Deepcopy the inductor config to detach the post-grad custom pass
|
||||
# from CompilationConfig.
|
||||
# We want to avoid PostGradPassManager in CompilationConfig because
|
||||
# in future we need PostGradPassManager.uuid() to be executed
|
||||
# only at compile time.
|
||||
self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config)
|
||||
# `torch.compile` is JIT compiled, so we don't need to
|
||||
# do anything here
|
||||
|
||||
def configure_post_pass(self):
|
||||
config = self.compilation_config
|
||||
self.pass_manager.configure(self.vllm_config)
|
||||
|
||||
# Post-grad custom passes are run using the post_grad_custom_post_pass
|
||||
# hook. If a pass for that hook exists, add it to the pass manager.
|
||||
inductor_config = config.inductor_compile_config
|
||||
if self.pass_key in inductor_config:
|
||||
if isinstance(inductor_config[self.pass_key], PostGradPassManager):
|
||||
# PassManager already added to config, make sure it's correct
|
||||
assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid()
|
||||
if self.pass_key in self.inductor_config:
|
||||
if isinstance(self.inductor_config[self.pass_key], PostGradPassManager):
|
||||
raise ValueError(
|
||||
"PostGradPassManager can not be kept in CompilationConfig."
|
||||
)
|
||||
else:
|
||||
# Config should automatically wrap all inductor passes
|
||||
assert isinstance(inductor_config[self.pass_key], InductorPass)
|
||||
self.pass_manager.add(inductor_config[self.pass_key])
|
||||
inductor_config[self.pass_key] = self.pass_manager
|
||||
assert isinstance(self.inductor_config[self.pass_key], InductorPass)
|
||||
self.pass_manager.add(self.inductor_config[self.pass_key])
|
||||
self.inductor_config[self.pass_key] = self.pass_manager
|
||||
|
||||
def __call__(
|
||||
self, graph: fx.GraphModule, example_inputs
|
||||
@ -638,9 +647,7 @@ class VllmBackend:
|
||||
self.compilation_config.local_cache_dir = local_cache_dir
|
||||
|
||||
# Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
|
||||
disable_cache = not is_compile_cache_enabled(
|
||||
self.compilation_config.inductor_compile_config
|
||||
)
|
||||
disable_cache = not is_compile_cache_enabled(self.inductor_config)
|
||||
|
||||
if disable_cache:
|
||||
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
import inspect
|
||||
import os
|
||||
import pickle
|
||||
@ -14,6 +13,7 @@ import vllm.envs as envs
|
||||
from vllm.config import VllmConfig, get_current_vllm_config
|
||||
from vllm.config.utils import hash_factors
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
try:
|
||||
from torch._dynamo.aot_compile import SerializableCallable
|
||||
@ -160,7 +160,7 @@ def _compute_code_hash_with_content(file_contents: dict[str, str]) -> str:
|
||||
# e.g. exec(). We can't actually check these.
|
||||
continue
|
||||
hash_content.append(content)
|
||||
return hashlib.md5(
|
||||
return safe_hash(
|
||||
"\n".join(hash_content).encode(), usedforsecurity=False
|
||||
).hexdigest()
|
||||
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import contextlib
|
||||
import copy
|
||||
import hashlib
|
||||
import os
|
||||
from collections.abc import Callable
|
||||
from contextlib import ExitStack
|
||||
@ -16,6 +15,7 @@ import torch.fx as fx
|
||||
import vllm.envs as envs
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.utils.hashing import safe_hash
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
|
||||
@ -197,9 +197,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
|
||||
|
||||
def compute_hash(self, vllm_config: VllmConfig) -> str:
|
||||
factors = get_inductor_factors()
|
||||
hash_str = hashlib.md5(
|
||||
str(factors).encode(), usedforsecurity=False
|
||||
).hexdigest()[:10]
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
|
||||
:10
|
||||
]
|
||||
return hash_str
|
||||
|
||||
def initialize_cache(
|
||||
@ -286,9 +286,9 @@ class InductorAdaptor(CompilerInterface):
|
||||
|
||||
def compute_hash(self, vllm_config: VllmConfig) -> str:
|
||||
factors = get_inductor_factors()
|
||||
hash_str = hashlib.md5(
|
||||
str(factors).encode(), usedforsecurity=False
|
||||
).hexdigest()[:10]
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
|
||||
:10
|
||||
]
|
||||
return hash_str
|
||||
|
||||
def initialize_cache(
|
||||
|
||||
@ -107,7 +107,7 @@ class PiecewiseBackend:
|
||||
entry.runnable = self.vllm_backend.compiler_manager.compile(
|
||||
self.graph,
|
||||
args,
|
||||
self.compilation_config.inductor_compile_config,
|
||||
self.vllm_backend.inductor_config,
|
||||
self.compilation_config,
|
||||
graph_index=self.piecewise_compile_index,
|
||||
num_graphs=self.total_piecewise_compiles,
|
||||
|
||||
@ -144,7 +144,7 @@ class CacheConfig:
|
||||
|
||||
kv_offloading_backend: KVOffloadingBackend | None = None
|
||||
"""The backend to use for KV cache offloading. Supported backends include
|
||||
'native' (vLLM native CPU offloading), 'lmcache' This option must be used
|
||||
'native' (vLLM native CPU offloading), 'lmcache' This option must be used
|
||||
together with kv_offloading_size."""
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
@ -167,8 +167,6 @@ class CacheConfig:
|
||||
"num_gpu_blocks_override",
|
||||
"enable_prefix_caching",
|
||||
"prefix_caching_hash_algo",
|
||||
# `cpu_offload_gb` does not use `torch.compile` yet.
|
||||
"cpu_offload_gb",
|
||||
"cpu_kvcache_space_bytes",
|
||||
"mamba_page_size_padded",
|
||||
# Post-init/derived counters
|
||||
|
||||
@ -264,7 +264,6 @@ class CompilationConfig:
|
||||
- [`cudagraph_copy_inputs`]
|
||||
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
|
||||
- Inductor compilation:
|
||||
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
|
||||
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
|
||||
- [`inductor_compile_config`]
|
||||
[vllm.config.CompilationConfig.inductor_compile_config]
|
||||
@ -348,7 +347,7 @@ class CompilationConfig:
|
||||
- 'none,+op1,+op2' to enable only op1 and op2
|
||||
|
||||
By default, all custom ops are enabled when running without Inductor and
|
||||
disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
|
||||
disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
|
||||
Inductor generates (fused) Triton kernels for disabled custom ops."""
|
||||
splitting_ops: list[str] | None = None
|
||||
"""A list of ops to exclude from cudagraphs, used in piecewise compilation.
|
||||
@ -374,24 +373,6 @@ class CompilationConfig:
|
||||
Disabled by default until more models are supported/tested to work."""
|
||||
|
||||
# Inductor capture
|
||||
use_inductor: bool | None = None
|
||||
"""
|
||||
Whether to use inductor compilation.
|
||||
|
||||
This flag is deprecated and will be removed in the next release 0.12.0.
|
||||
Please use the 'backend' option instead.
|
||||
|
||||
- False: inductor compilation is not used. graph runs in eager
|
||||
(custom_ops enabled by default).
|
||||
- True: inductor compilation is used (custom_ops disabled by default).
|
||||
One graph for symbolic shape and one graph per size in compile_sizes
|
||||
are compiled using configurations in inductor_compile_config.
|
||||
|
||||
This setting is ignored if mode<VLLM_COMPILE.
|
||||
|
||||
For future compatibility:
|
||||
If use_inductor is True, backend="inductor" otherwise backend="eager".
|
||||
"""
|
||||
compile_sizes: list[int | str] | None = None
|
||||
"""Sizes to compile for inductor. In addition
|
||||
to integers, it also supports "cudagraph_capture_sizes" to
|
||||
@ -759,14 +740,6 @@ class CompilationConfig:
|
||||
f"Invalid backend for piecewise compilation: {self.backend}"
|
||||
)
|
||||
|
||||
if self.use_inductor is not None:
|
||||
logger.warning_once(
|
||||
"The 'use_inductor' flag is deprecated and will be "
|
||||
"removed in the next release (v0.12.0). "
|
||||
"Please use the 'backend' option instead.",
|
||||
)
|
||||
self.backend = "inductor" if self.use_inductor else "eager"
|
||||
|
||||
if self.backend == "":
|
||||
self.backend = current_platform.get_compile_backend()
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal
|
||||
|
||||
@ -10,6 +9,7 @@ from pydantic import ConfigDict, SkipValidation
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
|
||||
|
||||
@ -45,7 +45,7 @@ class DeviceConfig:
|
||||
# the device/platform information will be summarized
|
||||
# by torch/vllm automatically.
|
||||
factors: list[Any] = []
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
def __post_init__(self):
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
import uuid
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, get_args
|
||||
@ -9,6 +8,7 @@ from typing import Any, Literal, get_args
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
KVProducer = Literal["kv_producer", "kv_both"]
|
||||
KVConsumer = Literal["kv_consumer", "kv_both"]
|
||||
@ -79,7 +79,7 @@ class KVTransferConfig:
|
||||
# no factors to consider.
|
||||
# this config will not affect the computation graph.
|
||||
factors: list[Any] = []
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.model_executor.model_loader import LoadFormats
|
||||
@ -104,7 +104,7 @@ class LoadConfig:
|
||||
# no factors to consider.
|
||||
# this config will not affect the computation graph.
|
||||
factors: list[Any] = []
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@field_validator("load_format", mode="after")
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
import torch
|
||||
@ -11,6 +10,7 @@ from typing_extensions import Self
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
@ -74,7 +74,7 @@ class LoRAConfig:
|
||||
factors.append(self.fully_sharded_loras)
|
||||
factors.append(self.lora_dtype)
|
||||
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@model_validator(mode="after")
|
||||
|
||||
@ -39,7 +39,12 @@ from vllm.transformers_utils.gguf_utils import (
|
||||
maybe_patch_hf_config_from_gguf,
|
||||
)
|
||||
from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
|
||||
from vllm.transformers_utils.utils import check_gguf_file, maybe_model_redirect
|
||||
from vllm.transformers_utils.utils import (
|
||||
is_gguf,
|
||||
is_remote_gguf,
|
||||
maybe_model_redirect,
|
||||
split_remote_gguf,
|
||||
)
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.utils.torch_utils import common_broadcastable_dtype
|
||||
|
||||
@ -294,9 +299,6 @@ class ModelConfig:
|
||||
pooler_config: PoolerConfig | None = None
|
||||
"""Pooler config which controls the behaviour of output pooling in pooling
|
||||
models."""
|
||||
override_pooler_config: dict | PoolerConfig | None = None
|
||||
"""[DEPRECATED] Use `pooler_config` instead. This field will be removed in
|
||||
v0.12.0 or v1.0.0, whichever is sooner."""
|
||||
|
||||
# Multimodal config and init vars
|
||||
multimodal_config: MultiModalConfig | None = None
|
||||
@ -343,7 +345,6 @@ class ModelConfig:
|
||||
"logprobs_mode",
|
||||
"disable_cascade_attn",
|
||||
"skip_tokenizer_init",
|
||||
"enable_prompt_embeds",
|
||||
"served_model_name",
|
||||
"config_format",
|
||||
"hf_token",
|
||||
@ -354,7 +355,6 @@ class ModelConfig:
|
||||
"logits_processors",
|
||||
"io_processor_plugin",
|
||||
"pooler_config",
|
||||
"override_pooler_config",
|
||||
"multimodal_config",
|
||||
"limit_mm_per_prompt",
|
||||
"media_io_kwargs",
|
||||
@ -440,7 +440,8 @@ class ModelConfig:
|
||||
self.model = maybe_model_redirect(self.model)
|
||||
# The tokenizer is consistent with the model by default.
|
||||
if self.tokenizer is None:
|
||||
if check_gguf_file(self.model):
|
||||
# Check if this is a GGUF model (either local file or remote GGUF)
|
||||
if is_gguf(self.model):
|
||||
raise ValueError(
|
||||
"Using a tokenizer is mandatory when loading a GGUF model. "
|
||||
"Please specify the tokenizer path or name using the "
|
||||
@ -642,18 +643,6 @@ class ModelConfig:
|
||||
|
||||
# Init pooler config if needed
|
||||
if self.runner_type == "pooling":
|
||||
if self.override_pooler_config is not None:
|
||||
logger.warning_once(
|
||||
"`override_pooler_config` is deprecated and will be "
|
||||
"removed in v0.12.0 or v1.0.0, whichever is sooner. "
|
||||
"Please use `pooler_config` instead."
|
||||
)
|
||||
|
||||
if isinstance(self.override_pooler_config, dict):
|
||||
self.pooler_config = PoolerConfig(**self.override_pooler_config)
|
||||
else:
|
||||
self.pooler_config = self.override_pooler_config
|
||||
|
||||
if self.pooler_config is None:
|
||||
self.pooler_config = PoolerConfig()
|
||||
|
||||
@ -832,7 +821,10 @@ class ModelConfig:
|
||||
self.tokenizer = object_storage_tokenizer.dir
|
||||
|
||||
def _get_encoder_config(self):
|
||||
return get_sentence_transformer_tokenizer_config(self.model, self.revision)
|
||||
model = self.model
|
||||
if is_remote_gguf(model):
|
||||
model, _ = split_remote_gguf(model)
|
||||
return get_sentence_transformer_tokenizer_config(model, self.revision)
|
||||
|
||||
def _verify_tokenizer_mode(self) -> None:
|
||||
tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from collections.abc import Mapping
|
||||
from typing import TYPE_CHECKING, Any, Literal, TypeAlias
|
||||
|
||||
@ -9,6 +8,7 @@ from pydantic import ConfigDict, Field, field_validator, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
@ -216,7 +216,7 @@ class MultiModalConfig:
|
||||
if self.mm_encoder_attn_backend is not None
|
||||
else None
|
||||
]
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
def get_limit_per_prompt(self, modality: str) -> int:
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from functools import cached_property
|
||||
from typing import Any, Literal, cast
|
||||
|
||||
@ -11,6 +10,7 @@ from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm import version
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
DetailedTraceModules = Literal["model", "worker", "all"]
|
||||
|
||||
@ -78,7 +78,7 @@ class ObservabilityConfig:
|
||||
# no factors to consider.
|
||||
# this config will not affect the computation graph.
|
||||
factors: list[Any] = []
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@field_validator("show_hidden_metrics_for_version")
|
||||
|
||||
@ -593,9 +593,10 @@ class ParallelConfig:
|
||||
"max_parallel_loading_workers is currently "
|
||||
"not supported and will be ignored."
|
||||
)
|
||||
if self.distributed_executor_backend != "mp" and self.nnodes > 1:
|
||||
if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
|
||||
raise ValueError(
|
||||
"nnodes > 1 can only be set when distributed exectuor backend is mp."
|
||||
"nnodes > 1 can only be set when distributed executor "
|
||||
"backend is mp or uni."
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import Any
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -102,7 +102,7 @@ class PoolerConfig:
|
||||
# no factors to consider.
|
||||
# this config will not affect the computation graph.
|
||||
factors: list[Any] = []
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from collections.abc import Callable
|
||||
from dataclasses import InitVar
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
|
||||
@ -12,6 +11,7 @@ from typing_extensions import Self, deprecated
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
from vllm.utils.import_utils import resolve_obj_by_qualname
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -178,7 +178,7 @@ class SchedulerConfig:
|
||||
# no factors to consider.
|
||||
# this config will not affect the computation graph.
|
||||
factors: list[Any] = []
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@field_validator("scheduler_cls", "async_scheduling", mode="wrap")
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import ast
|
||||
import hashlib
|
||||
from typing import TYPE_CHECKING, Any, Literal, get_args
|
||||
|
||||
from pydantic import Field, SkipValidation, model_validator
|
||||
@ -13,6 +12,7 @@ from vllm.config.model import ModelConfig
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
from vllm.utils.import_utils import LazyLoader, has_arctic_inference
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -162,7 +162,7 @@ class SpeculativeConfig:
|
||||
# Eagle3 affects the computation graph because it returns intermediate
|
||||
# hidden states in addition to the final hidden state.
|
||||
factors.append(self.method == "eagle3")
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import model_validator
|
||||
@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
StructuredOutputsBackend = Literal[
|
||||
"auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer"
|
||||
@ -58,7 +58,7 @@ class StructuredOutputsConfig:
|
||||
# no factors to consider.
|
||||
# this config will not affect the computation graph.
|
||||
factors: list[Any] = []
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@model_validator(mode="after")
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
|
||||
import copy
|
||||
import getpass
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
@ -25,6 +24,7 @@ from vllm.config.speculative import EagleModelTypes
|
||||
from vllm.logger import enable_trace_function_call, init_logger
|
||||
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
from .cache import CacheConfig
|
||||
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
|
||||
@ -193,7 +193,7 @@ class VllmConfig:
|
||||
vllm_factors.append("None")
|
||||
if self.additional_config:
|
||||
if isinstance(additional_config := self.additional_config, dict):
|
||||
additional_config_hash = hashlib.md5(
|
||||
additional_config_hash = safe_hash(
|
||||
json.dumps(additional_config, sort_keys=True).encode(),
|
||||
usedforsecurity=False,
|
||||
).hexdigest()
|
||||
@ -204,9 +204,9 @@ class VllmConfig:
|
||||
vllm_factors.append("None")
|
||||
factors.append(vllm_factors)
|
||||
|
||||
hash_str = hashlib.md5(
|
||||
str(factors).encode(), usedforsecurity=False
|
||||
).hexdigest()[:10]
|
||||
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
|
||||
:10
|
||||
]
|
||||
return hash_str
|
||||
|
||||
def pad_for_cudagraph(self, batch_size: int) -> int:
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import hashlib
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
@ -15,6 +14,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorRole,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
from vllm.v1.attention.backends.mla.common import MLACommonMetadata
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
@ -423,7 +423,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
|
||||
if mm_hashes:
|
||||
mm_str = "-".join(mm_hashes)
|
||||
token_bytes += mm_str.encode("utf-8")
|
||||
input_ids_hash = hashlib.md5(token_bytes, usedforsecurity=False).hexdigest()
|
||||
input_ids_hash = safe_hash(token_bytes, usedforsecurity=False).hexdigest()
|
||||
|
||||
foldername = os.path.join(self._storage_path, input_ids_hash)
|
||||
if create_folder:
|
||||
|
||||
@ -51,6 +51,7 @@ from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.import_utils import resolve_obj_by_qualname
|
||||
from vllm.utils.network_utils import get_distributed_init_method
|
||||
from vllm.utils.system_utils import suppress_stdout
|
||||
from vllm.utils.torch_utils import (
|
||||
direct_register_custom_op,
|
||||
supports_custom_op,
|
||||
@ -329,7 +330,8 @@ class GroupCoordinator:
|
||||
)
|
||||
# a group with `gloo` backend, to allow direct coordination between
|
||||
# processes through the CPU.
|
||||
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
|
||||
with suppress_stdout():
|
||||
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
|
||||
if self.rank in ranks:
|
||||
self.ranks = ranks
|
||||
self.world_size = len(ranks)
|
||||
|
||||
@ -30,6 +30,7 @@ from torch.distributed.rendezvous import rendezvous
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.network_utils import get_tcp_uri
|
||||
from vllm.utils.system_utils import suppress_stdout
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -427,33 +428,34 @@ def init_gloo_process_group(
|
||||
Stateless init ProcessGroup with gloo backend compatible with
|
||||
different torch versions.
|
||||
"""
|
||||
if is_torch_equal_or_newer("2.6"):
|
||||
pg = ProcessGroup(
|
||||
prefix_store,
|
||||
group_rank,
|
||||
group_size,
|
||||
)
|
||||
else:
|
||||
options = ProcessGroup.Options(backend="gloo")
|
||||
pg = ProcessGroup(
|
||||
prefix_store,
|
||||
group_rank,
|
||||
group_size,
|
||||
options,
|
||||
)
|
||||
from torch.distributed.distributed_c10d import ProcessGroupGloo
|
||||
with suppress_stdout():
|
||||
if is_torch_equal_or_newer("2.6"):
|
||||
pg = ProcessGroup(
|
||||
prefix_store,
|
||||
group_rank,
|
||||
group_size,
|
||||
)
|
||||
else:
|
||||
options = ProcessGroup.Options(backend="gloo")
|
||||
pg = ProcessGroup(
|
||||
prefix_store,
|
||||
group_rank,
|
||||
group_size,
|
||||
options,
|
||||
)
|
||||
from torch.distributed.distributed_c10d import ProcessGroupGloo
|
||||
|
||||
backend_class = ProcessGroupGloo(
|
||||
prefix_store, group_rank, group_size, timeout=timeout
|
||||
)
|
||||
backend_type = ProcessGroup.BackendType.GLOO
|
||||
device = torch.device("cpu")
|
||||
if is_torch_equal_or_newer("2.6"):
|
||||
# _set_default_backend is supported in torch >= 2.6
|
||||
pg._set_default_backend(backend_type)
|
||||
backend_class._set_sequence_number_for_group()
|
||||
backend_class = ProcessGroupGloo(
|
||||
prefix_store, group_rank, group_size, timeout=timeout
|
||||
)
|
||||
backend_type = ProcessGroup.BackendType.GLOO
|
||||
device = torch.device("cpu")
|
||||
if is_torch_equal_or_newer("2.6"):
|
||||
# _set_default_backend is supported in torch >= 2.6
|
||||
pg._set_default_backend(backend_type)
|
||||
backend_class._set_sequence_number_for_group()
|
||||
|
||||
pg._register_backend(device, backend_type, backend_class)
|
||||
pg._register_backend(device, backend_type, backend_class)
|
||||
return pg
|
||||
|
||||
|
||||
|
||||
@ -29,7 +29,7 @@ import regex as re
|
||||
import torch
|
||||
from pydantic import TypeAdapter, ValidationError
|
||||
from pydantic.fields import FieldInfo
|
||||
from typing_extensions import TypeIs, deprecated
|
||||
from typing_extensions import TypeIs
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
@ -86,7 +86,7 @@ from vllm.transformers_utils.config import (
|
||||
is_interleaved,
|
||||
maybe_override_with_speculators,
|
||||
)
|
||||
from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage
|
||||
from vllm.transformers_utils.utils import is_cloud_storage, is_gguf
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.network_utils import get_ip
|
||||
@ -520,9 +520,6 @@ class EngineArgs:
|
||||
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
|
||||
|
||||
pooler_config: PoolerConfig | None = ModelConfig.pooler_config
|
||||
override_pooler_config: dict | PoolerConfig | None = (
|
||||
ModelConfig.override_pooler_config
|
||||
)
|
||||
compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
|
||||
worker_cls: str = ParallelConfig.worker_cls
|
||||
worker_extension_cls: str = ParallelConfig.worker_extension_cls
|
||||
@ -659,11 +656,6 @@ class EngineArgs:
|
||||
)
|
||||
model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
|
||||
model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
|
||||
model_group.add_argument(
|
||||
"--override-pooler-config",
|
||||
**model_kwargs["override_pooler_config"],
|
||||
deprecated=True,
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]
|
||||
)
|
||||
@ -880,7 +872,11 @@ class EngineArgs:
|
||||
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
|
||||
)
|
||||
cache_group.add_argument(
|
||||
"--enable-prefix-caching", **cache_kwargs["enable_prefix_caching"]
|
||||
"--enable-prefix-caching",
|
||||
**{
|
||||
**cache_kwargs["enable_prefix_caching"],
|
||||
"default": None,
|
||||
},
|
||||
)
|
||||
cache_group.add_argument(
|
||||
"--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
|
||||
@ -1144,8 +1140,8 @@ class EngineArgs:
|
||||
return engine_args
|
||||
|
||||
def create_model_config(self) -> ModelConfig:
|
||||
# gguf file needs a specific model loader and doesn't use hf_repo
|
||||
if check_gguf_file(self.model):
|
||||
# gguf file needs a specific model loader
|
||||
if is_gguf(self.model):
|
||||
self.quantization = self.load_format = "gguf"
|
||||
|
||||
# NOTE(woosuk): In V1, we use separate processes for workers (unless
|
||||
@ -1239,7 +1235,6 @@ class EngineArgs:
|
||||
mm_encoder_tp_mode=self.mm_encoder_tp_mode,
|
||||
mm_encoder_attn_backend=self.mm_encoder_attn_backend,
|
||||
pooler_config=self.pooler_config,
|
||||
override_pooler_config=self.override_pooler_config,
|
||||
logits_processor_pattern=self.logits_processor_pattern,
|
||||
generation_config=self.generation_config,
|
||||
override_generation_config=self.override_generation_config,
|
||||
@ -1812,9 +1807,11 @@ class EngineArgs:
|
||||
if model_config.runner_type != "pooling":
|
||||
default_chunked_prefill = True
|
||||
|
||||
# Disable prefix caching default for hybrid models
|
||||
# since the feature is still experimental.
|
||||
default_prefix_caching = not model_config.is_hybrid
|
||||
# Disable prefix caching default for hybrid models and mamba-only
|
||||
# models since the feature is still experimental.
|
||||
default_prefix_caching = not (
|
||||
model_config.is_hybrid or model_config.is_attention_free
|
||||
)
|
||||
else:
|
||||
assert model_config.pooler_config is not None
|
||||
|
||||
@ -2041,24 +2038,6 @@ class AsyncEngineArgs(EngineArgs):
|
||||
|
||||
enable_log_requests: bool = False
|
||||
|
||||
@property
|
||||
@deprecated(
|
||||
"`disable_log_requests` is deprecated and has been replaced with "
|
||||
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
|
||||
"`enable_log_requests` instead."
|
||||
)
|
||||
def disable_log_requests(self) -> bool:
|
||||
return not self.enable_log_requests
|
||||
|
||||
@disable_log_requests.setter
|
||||
@deprecated(
|
||||
"`disable_log_requests` is deprecated and has been replaced with "
|
||||
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
|
||||
"`enable_log_requests` instead."
|
||||
)
|
||||
def disable_log_requests(self, value: bool):
|
||||
self.enable_log_requests = not value
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(
|
||||
parser: FlexibleArgumentParser, async_args_only: bool = False
|
||||
|
||||
@ -174,9 +174,6 @@ class LLM:
|
||||
For example, for Phi-3-Vision: `{"num_crops": 4}`.
|
||||
pooler_config: Initialize non-default pooling config for the pooling
|
||||
model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
|
||||
override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This
|
||||
argument is deprecated and will be removed in v0.12.0 or v1.0.0,
|
||||
whichever is sooner.
|
||||
compilation_config: Either an integer or a dictionary. If it is an
|
||||
integer, it is used as the mode of compilation optimization. If it
|
||||
is a dictionary, it can specify the full compilation configuration.
|
||||
@ -214,7 +211,6 @@ class LLM:
|
||||
hf_overrides: HfOverrides | None = None,
|
||||
mm_processor_kwargs: dict[str, Any] | None = None,
|
||||
pooler_config: PoolerConfig | None = None,
|
||||
override_pooler_config: PoolerConfig | None = None,
|
||||
structured_outputs_config: dict[str, Any]
|
||||
| StructuredOutputsConfig
|
||||
| None = None,
|
||||
@ -330,7 +326,6 @@ class LLM:
|
||||
hf_overrides=hf_overrides,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
pooler_config=pooler_config,
|
||||
override_pooler_config=override_pooler_config,
|
||||
structured_outputs_config=structured_outputs_instance,
|
||||
compilation_config=compilation_config_instance,
|
||||
logits_processors=logits_processors,
|
||||
|
||||
@ -29,7 +29,6 @@ from openai.types.responses import (
|
||||
ResponseOutputItemAddedEvent,
|
||||
ResponseOutputItemDoneEvent,
|
||||
ResponsePrompt,
|
||||
ResponseReasoningItem,
|
||||
ResponseReasoningTextDeltaEvent,
|
||||
ResponseReasoningTextDoneEvent,
|
||||
ResponseStatus,
|
||||
@ -304,9 +303,7 @@ def get_logits_processors(
|
||||
return None
|
||||
|
||||
|
||||
ResponseInputOutputItem: TypeAlias = (
|
||||
ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall
|
||||
)
|
||||
ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem
|
||||
|
||||
|
||||
class ResponsesRequest(OpenAIBaseModel):
|
||||
|
||||
@ -10,6 +10,9 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
|
||||
Function as FunctionCallTool,
|
||||
)
|
||||
from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
from openai.types.responses.response_output_message import ResponseOutputMessage
|
||||
from openai.types.responses.response_reasoning_item import ResponseReasoningItem
|
||||
from openai.types.responses.tool import Tool
|
||||
@ -94,6 +97,12 @@ def construct_chat_message_with_tool_call(
|
||||
"role": "assistant",
|
||||
"reasoning": reasoning_content,
|
||||
}
|
||||
elif isinstance(item, ResponseFunctionToolCallOutputItem):
|
||||
return ChatCompletionToolMessageParam(
|
||||
role="tool",
|
||||
content=item.output,
|
||||
tool_call_id=item.call_id,
|
||||
)
|
||||
elif item.get("type") == "function_call_output":
|
||||
# Append the function call output as a tool message.
|
||||
return ChatCompletionToolMessageParam(
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
@ -12,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.config import (
|
||||
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
|
||||
TopKWeightAndReduceNoOP,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
|
||||
from vllm.triton_utils import tl, triton
|
||||
from vllm.utils.import_utils import has_triton_kernels
|
||||
|
||||
@ -88,14 +90,17 @@ def triton_kernel_moe_forward(
|
||||
gating_output, topk, sm_first=not renormalize
|
||||
)
|
||||
|
||||
output = torch.empty_like(hidden_states)
|
||||
|
||||
return triton_kernel_fused_experts(
|
||||
None,
|
||||
output,
|
||||
hidden_states,
|
||||
w1,
|
||||
w2,
|
||||
routing_data,
|
||||
gather_idx,
|
||||
scatter_idx,
|
||||
topk=topk,
|
||||
activation=activation,
|
||||
quant_config=quant_config,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
@ -113,6 +118,7 @@ def triton_kernel_fused_experts(
|
||||
routing_data, # RoutingData
|
||||
gather_indx, # GatherIndx
|
||||
scatter_indx, # ScatterIndx
|
||||
topk: int,
|
||||
activation: str = "silu",
|
||||
quant_config: FusedMoEQuantConfig | None = None,
|
||||
swiglu_alpha: float = 1.702,
|
||||
@ -120,6 +126,7 @@ def triton_kernel_fused_experts(
|
||||
apply_router_weight_on_input: bool = False,
|
||||
global_num_experts: int = -1,
|
||||
expert_map: torch.Tensor | None = None,
|
||||
intermediate_cache: torch.Tensor | None = None,
|
||||
a1q_scale: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
if quant_config is None:
|
||||
@ -131,14 +138,30 @@ def triton_kernel_fused_experts(
|
||||
assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
|
||||
|
||||
# Shape check, only check non-mxfp4
|
||||
assert hidden_states.ndim == 2
|
||||
assert hidden_states.shape[-1] == w1.shape[-2]
|
||||
assert w2.shape[-1] == w1.shape[1]
|
||||
|
||||
batch_dim = 1
|
||||
M, K = hidden_states.shape[-2:]
|
||||
E, _, N = w1.shape
|
||||
|
||||
if global_num_experts == -1:
|
||||
global_num_experts = E
|
||||
|
||||
if intermediate_cache is None:
|
||||
intermediate_cache = torch.empty(
|
||||
(batch_dim, M * topk, N // 2),
|
||||
device=hidden_states.device,
|
||||
dtype=hidden_states.dtype,
|
||||
)
|
||||
|
||||
# Add batch_dim to output buffer because matmul_ogs expects 3D output
|
||||
intermediate_cache = _resize_cache(
|
||||
intermediate_cache, (batch_dim, M * topk, N // 2)
|
||||
)
|
||||
output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))
|
||||
|
||||
act = FusedActivation(
|
||||
FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
|
||||
(swiglu_alpha, swiglu_limit),
|
||||
@ -146,7 +169,7 @@ def triton_kernel_fused_experts(
|
||||
)
|
||||
gammas = routing_data.gate_scal if routing_data else None
|
||||
|
||||
intermediate_cache1 = matmul_ogs(
|
||||
matmul_ogs(
|
||||
hidden_states,
|
||||
w1,
|
||||
quant_config.w1_bias,
|
||||
@ -155,10 +178,11 @@ def triton_kernel_fused_experts(
|
||||
precision_config=quant_config.w1_precision,
|
||||
gammas=gammas if apply_router_weight_on_input else None,
|
||||
fused_activation=act,
|
||||
y=intermediate_cache,
|
||||
)
|
||||
|
||||
intermediate_cache3 = matmul_ogs(
|
||||
intermediate_cache1,
|
||||
matmul_ogs(
|
||||
intermediate_cache.view(M * topk, N // 2),
|
||||
w2,
|
||||
quant_config.w2_bias,
|
||||
routing_data,
|
||||
@ -167,7 +191,8 @@ def triton_kernel_fused_experts(
|
||||
gammas=None if apply_router_weight_on_input else gammas,
|
||||
y=output_tensor,
|
||||
)
|
||||
return intermediate_cache3
|
||||
output_tensor = output_tensor.view(M, K)
|
||||
return output_tensor
|
||||
|
||||
|
||||
def make_routing_data(
|
||||
@ -221,6 +246,42 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
def supports_expert_map(self) -> bool:
|
||||
return True
|
||||
|
||||
def moe_problem_size(
|
||||
self,
|
||||
a1: torch.Tensor,
|
||||
w1: torch.Tensor,
|
||||
w2: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
) -> tuple[int, int, int, int, int]:
|
||||
"""
|
||||
Extract the MoE problem size from the given tensor arguments:
|
||||
- a: The hidden states, input to the MoE layer.
|
||||
- w1: The first set of expert weights.
|
||||
- w2: The second set of expert weights.
|
||||
- topk_ids: The topk ids.
|
||||
Note: extracting the problem shape from the weight and activation
|
||||
tensors is not obvious. It needs to be done this way specifically
|
||||
due to subtle issues with particular kernels, e.g. the int4 kernels
|
||||
divide the trailing dimension by two, so it's not "correct" to
|
||||
extract N or K from the trailing dimension of w1 or w2. Similarly,
|
||||
some kernels transpose the weights, so this needs to be kept in mind.
|
||||
Note: This implementation covers most cases. However, if experts
|
||||
require a specialized implementation, like MarlinExperts, they are free
|
||||
to override this function.
|
||||
"""
|
||||
assert w1.dim() == 3 and w2.dim() == 3
|
||||
E, _, N = w1.size()
|
||||
K = a1.size(-1)
|
||||
|
||||
assert a1.dim() == 2
|
||||
assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
|
||||
M = a1.size(0)
|
||||
|
||||
assert topk_ids.dim() == 2
|
||||
topk = topk_ids.size(1)
|
||||
|
||||
return E, M, N, K, topk
|
||||
|
||||
def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
|
||||
# Weight application and reduction happens in the fused_experts kernel.
|
||||
return TopKWeightAndReduceNoOP()
|
||||
@ -263,8 +324,8 @@ class OAITritonExperts(BaseOAITritonExperts):
|
||||
expert_tokens_meta: mk.ExpertTokensMetadata | None,
|
||||
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
|
||||
# workspace are allocated inside the kernel
|
||||
workspace1 = (M, K)
|
||||
workspace2 = (0, 0)
|
||||
workspace1 = (0, 0)
|
||||
workspace2 = (M * topk, N // 2)
|
||||
output = (M, K)
|
||||
return (workspace1, workspace2, output)
|
||||
|
||||
@ -297,20 +358,21 @@ class OAITritonExperts(BaseOAITritonExperts):
|
||||
topk_ids, topk_weights, local_num_experts
|
||||
)
|
||||
|
||||
experts_output = triton_kernel_fused_experts(
|
||||
None,
|
||||
topk = topk_ids.size(1)
|
||||
triton_kernel_fused_experts(
|
||||
output,
|
||||
hidden_states,
|
||||
w1,
|
||||
w2,
|
||||
routing_data,
|
||||
gather_indx,
|
||||
scatter_indx,
|
||||
topk=topk,
|
||||
activation=activation,
|
||||
quant_config=self.quant_config,
|
||||
apply_router_weight_on_input=False,
|
||||
global_num_experts=local_num_experts,
|
||||
expert_map=None, # applied already
|
||||
intermediate_cache=workspace2,
|
||||
a1q_scale=a1q_scale,
|
||||
)
|
||||
|
||||
output.copy_(experts_output, non_blocking=True)
|
||||
|
||||
@ -103,7 +103,7 @@ __all__ = [
|
||||
"CompressedTensorsW8A8Int8MoEMethod",
|
||||
"CompressedTensorsWNA16MarlinMoEMethod",
|
||||
"CompressedTensorsWNA16MoEMethod",
|
||||
"CompressedTensorsW4A4MoeMethod",
|
||||
"CompressedTensorsW4A4Nvfp4MoeMethod",
|
||||
"CompressedTensorsW4A8Int8MoEMethod",
|
||||
]
|
||||
|
||||
@ -171,7 +171,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
|
||||
quant_config, layer.moe_config
|
||||
)
|
||||
elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
|
||||
return CompressedTensorsW4A4MoeMethod(layer.moe_config)
|
||||
return CompressedTensorsW4A4Nvfp4MoeMethod(layer.moe_config)
|
||||
elif (
|
||||
quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
|
||||
or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
|
||||
@ -188,7 +188,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
|
||||
|
||||
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
class CompressedTensorsW4A4Nvfp4MoeMethod(CompressedTensorsMoEMethod):
|
||||
def __init__(self, moe: FusedMoEConfig):
|
||||
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
|
||||
detect_nvfp4_moe_support,
|
||||
@ -205,8 +205,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
self.flashinfer_moe_backend = get_flashinfer_moe_backend()
|
||||
logger.info_once(
|
||||
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
|
||||
" for CompressedTensorsW4A4MoeMethod."
|
||||
" for CompressedTensorsW4A4Nvfp4MoeMethod."
|
||||
)
|
||||
elif self.use_marlin:
|
||||
logger.info_once("Using Marlin for CompressedTensorsW4A4Nvfp4MoeMethod.")
|
||||
else:
|
||||
logger.info_once("Using Cutlass for CompressedTensorsW4A4Nvfp4MoeMethod.")
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
@ -612,7 +616,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
assert expert_map is None, (
|
||||
"Expert Parallelism / expert_map "
|
||||
"is currently not supported for "
|
||||
"CompressedTensorsW4A4MoeMethod."
|
||||
"CompressedTensorsW4A4Nvfp4MoeMethod."
|
||||
)
|
||||
assert self.moe_quant_config is not None
|
||||
|
||||
|
||||
@ -1132,6 +1132,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
|
||||
" for ModelOptNvFp4FusedMoE."
|
||||
)
|
||||
elif self.use_marlin:
|
||||
logger.info_once("Using Marlin for ModelOptNvFp4FusedMoE.")
|
||||
else:
|
||||
logger.info_once("Using Cutlass for ModelOptNvFp4FusedMoE.")
|
||||
|
||||
def maybe_make_prepare_finalize(
|
||||
self,
|
||||
|
||||
@ -196,9 +196,10 @@ class Mxfp4Config(QuantizationConfig):
|
||||
# TODO: Add support for MXFP4 Linear Method.
|
||||
# MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation
|
||||
# if you are interested in enabling MXFP4 here.
|
||||
logger.warning_once(
|
||||
logger.debug_once(
|
||||
"MXFP4 linear layer is not implemented - falling back to "
|
||||
"UnquantizedLinearMethod."
|
||||
"UnquantizedLinearMethod.",
|
||||
scope="local",
|
||||
)
|
||||
return UnquantizedLinearMethod()
|
||||
elif isinstance(layer, FusedMoE):
|
||||
@ -208,9 +209,10 @@ class Mxfp4Config(QuantizationConfig):
|
||||
return Mxfp4MoEMethod(layer.moe_config)
|
||||
elif isinstance(layer, Attention):
|
||||
# TODO: Add support for MXFP4 Attention.
|
||||
logger.warning_once(
|
||||
logger.debug_once(
|
||||
"MXFP4 attention layer is not implemented. "
|
||||
"Skipping quantization for this layer."
|
||||
"Skipping quantization for this layer.",
|
||||
scope="local",
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ from vllm.model_executor.model_loader.utils import (
|
||||
process_weights_after_loading,
|
||||
)
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
download_gguf,
|
||||
get_gguf_extra_tensor_names,
|
||||
get_gguf_weight_type_map,
|
||||
gguf_quant_weights_iterator,
|
||||
@ -43,7 +44,8 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
f"load format {load_config.load_format}"
|
||||
)
|
||||
|
||||
def _prepare_weights(self, model_name_or_path: str):
|
||||
def _prepare_weights(self, model_config: ModelConfig):
|
||||
model_name_or_path = model_config.model
|
||||
if os.path.isfile(model_name_or_path):
|
||||
return model_name_or_path
|
||||
# for raw HTTPS link
|
||||
@ -55,12 +57,23 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"):
|
||||
repo_id, filename = model_name_or_path.rsplit("/", 1)
|
||||
return hf_hub_download(repo_id=repo_id, filename=filename)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unrecognised GGUF reference: {model_name_or_path} "
|
||||
"(expected local file, raw URL, or <repo_id>/<filename>.gguf)"
|
||||
# repo_id:quant_type
|
||||
elif "/" in model_name_or_path and ":" in model_name_or_path:
|
||||
repo_id, quant_type = model_name_or_path.rsplit(":", 1)
|
||||
return download_gguf(
|
||||
repo_id,
|
||||
quant_type,
|
||||
cache_dir=self.load_config.download_dir,
|
||||
revision=model_config.revision,
|
||||
ignore_patterns=self.load_config.ignore_patterns,
|
||||
)
|
||||
|
||||
raise ValueError(
|
||||
f"Unrecognised GGUF reference: {model_name_or_path} "
|
||||
"(expected local file, raw URL, <repo_id>/<filename>.gguf, "
|
||||
"or <repo_id>:<quant_type>)"
|
||||
)
|
||||
|
||||
def _get_gguf_weights_map(self, model_config: ModelConfig):
|
||||
"""
|
||||
GGUF uses this naming convention for their tensors from HF checkpoint:
|
||||
@ -244,7 +257,7 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
gguf_to_hf_name_map: dict[str, str],
|
||||
) -> dict[str, str]:
|
||||
weight_type_map = get_gguf_weight_type_map(
|
||||
model_config.model, gguf_to_hf_name_map
|
||||
model_name_or_path, gguf_to_hf_name_map
|
||||
)
|
||||
is_multimodal = hasattr(model_config.hf_config, "vision_config")
|
||||
if is_multimodal:
|
||||
@ -290,10 +303,10 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
yield from gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)
|
||||
|
||||
def download_model(self, model_config: ModelConfig) -> None:
|
||||
self._prepare_weights(model_config.model)
|
||||
self._prepare_weights(model_config)
|
||||
|
||||
def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
|
||||
local_model_path = self._prepare_weights(model_config.model)
|
||||
local_model_path = self._prepare_weights(model_config)
|
||||
gguf_weights_map = self._get_gguf_weights_map(model_config)
|
||||
model.load_weights(
|
||||
self._get_weights_iterator(model_config, local_model_path, gguf_weights_map)
|
||||
@ -303,7 +316,7 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
self, vllm_config: VllmConfig, model_config: ModelConfig
|
||||
) -> nn.Module:
|
||||
device_config = vllm_config.device_config
|
||||
local_model_path = self._prepare_weights(model_config.model)
|
||||
local_model_path = self._prepare_weights(model_config)
|
||||
gguf_weights_map = self._get_gguf_weights_map(model_config)
|
||||
# we can only know if tie word embeddings after mapping weights
|
||||
if "lm_head.weight" in get_gguf_extra_tensor_names(
|
||||
|
||||
@ -369,6 +369,52 @@ def get_sparse_attention_config(
|
||||
return config
|
||||
|
||||
|
||||
def download_gguf(
|
||||
repo_id: str,
|
||||
quant_type: str,
|
||||
cache_dir: str | None = None,
|
||||
revision: str | None = None,
|
||||
ignore_patterns: str | list[str] | None = None,
|
||||
) -> str:
|
||||
# Use patterns that snapshot_download can handle directly
|
||||
# Patterns to match:
|
||||
# - *-{quant_type}.gguf (root)
|
||||
# - *-{quant_type}-*.gguf (root sharded)
|
||||
# - */*-{quant_type}.gguf (subdir)
|
||||
# - */*-{quant_type}-*.gguf (subdir sharded)
|
||||
allow_patterns = [
|
||||
f"*-{quant_type}.gguf",
|
||||
f"*-{quant_type}-*.gguf",
|
||||
f"*/*-{quant_type}.gguf",
|
||||
f"*/*-{quant_type}-*.gguf",
|
||||
]
|
||||
|
||||
# Use download_weights_from_hf which handles caching and downloading
|
||||
folder = download_weights_from_hf(
|
||||
model_name_or_path=repo_id,
|
||||
cache_dir=cache_dir,
|
||||
allow_patterns=allow_patterns,
|
||||
revision=revision,
|
||||
ignore_patterns=ignore_patterns,
|
||||
)
|
||||
|
||||
# Find the downloaded file(s) in the folder
|
||||
local_files = []
|
||||
for pattern in allow_patterns:
|
||||
# Convert pattern to glob pattern for local filesystem
|
||||
glob_pattern = os.path.join(folder, pattern)
|
||||
local_files.extend(glob.glob(glob_pattern))
|
||||
|
||||
if not local_files:
|
||||
raise ValueError(
|
||||
f"Downloaded GGUF files not found in {folder} for quant_type {quant_type}"
|
||||
)
|
||||
|
||||
# Sort to ensure consistent ordering (prefer non-sharded files)
|
||||
local_files.sort(key=lambda x: (x.count("-"), x))
|
||||
return local_files[0]
|
||||
|
||||
|
||||
def download_weights_from_hf(
|
||||
model_name_or_path: str,
|
||||
cache_dir: str | None,
|
||||
|
||||
@ -233,7 +233,7 @@ class BaiChuanDecoderLayer(nn.Module):
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=config.num_attention_heads,
|
||||
position_embedding=position_embedding,
|
||||
rope_parameters=config.rope_parameters,
|
||||
rope_parameters=getattr(config, "rope_parameters", None),
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
|
||||
@ -289,9 +289,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
|
||||
model_config = vllm_config.model_config
|
||||
cache_config = vllm_config.cache_config
|
||||
|
||||
if cache_config.mamba_block_size is None:
|
||||
cache_config.mamba_block_size = model_config.max_model_len
|
||||
|
||||
if cache_config.enable_prefix_caching:
|
||||
if model_config.supports_mamba_prefix_caching:
|
||||
logger.info(
|
||||
@ -299,6 +296,11 @@ class MambaModelConfig(VerifyAndUpdateConfig):
|
||||
"Its support for Mamba layers is experimental. "
|
||||
"Please report any issues you may observe."
|
||||
)
|
||||
# By default, mamba block size will be set to max_model_len (see
|
||||
# below). When enabling prefix caching, we align mamba block size
|
||||
# to the block size as the basic granularity for prefix caching.
|
||||
if cache_config.mamba_block_size is None:
|
||||
cache_config.mamba_block_size = cache_config.block_size
|
||||
else:
|
||||
logger.info(
|
||||
"Hybrid or mamba-based model detected without "
|
||||
@ -306,6 +308,9 @@ class MambaModelConfig(VerifyAndUpdateConfig):
|
||||
)
|
||||
cache_config.enable_prefix_caching = False
|
||||
|
||||
if cache_config.mamba_block_size is None:
|
||||
cache_config.mamba_block_size = model_config.max_model_len
|
||||
|
||||
# TODO(tdoublep): remove once cascade attention is supported
|
||||
logger.info(
|
||||
"Disabling cascade attention since it is not supported for hybrid models."
|
||||
|
||||
@ -100,7 +100,7 @@ class GPTJAttention(nn.Module):
|
||||
self.head_size,
|
||||
rotary_dim=config.rotary_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
rope_parameters=getattr(config, "rope_parameters", None),
|
||||
is_neox_style=False,
|
||||
)
|
||||
self.attn = Attention(
|
||||
|
||||
@ -239,7 +239,7 @@ class Grok1DecoderLayer(nn.Module):
|
||||
num_heads=config.num_attention_heads,
|
||||
max_position=config.max_position_embeddings,
|
||||
num_kv_heads=config.num_key_value_heads,
|
||||
rope_parameters=config.rope_parameters,
|
||||
rope_parameters=getattr(config, "rope_parameters", None),
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
|
||||
@ -262,7 +262,7 @@ class LlamaAttention(nn.Module):
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
rope_parameters=getattr(config, "rope_parameters", None),
|
||||
is_neox_style=is_neox_style,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
|
||||
@ -5,7 +5,6 @@ Whenever you add an architecture to this page, please also update
|
||||
`tests/models/registry.py` with example HuggingFace models for it.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import importlib
|
||||
import json
|
||||
import os
|
||||
@ -32,6 +31,7 @@ from vllm.config import (
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logging_utils import logtime
|
||||
from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
from .interfaces import (
|
||||
has_inner_state,
|
||||
@ -655,7 +655,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
|
||||
|
||||
if model_path.exists():
|
||||
with open(model_path, "rb") as f:
|
||||
module_hash = hashlib.md5(f.read(), usedforsecurity=False).hexdigest()
|
||||
module_hash = safe_hash(f.read(), usedforsecurity=False).hexdigest()
|
||||
|
||||
mi = self._load_modelinfo_from_cache(module_hash)
|
||||
if mi is not None:
|
||||
|
||||
@ -407,9 +407,6 @@ class CudaPlatformBase(Platform):
|
||||
|
||||
# We have found some valid backends. Select the one with the
|
||||
# highest priority.
|
||||
logger.info(
|
||||
"Valid backends: %s", [b[0].name for b in valid_backends_priorities]
|
||||
)
|
||||
sorted_indices = sorted(
|
||||
range(len(valid_backends_priorities)),
|
||||
key=lambda i: valid_backends_priorities[i][1],
|
||||
@ -417,8 +414,9 @@ class CudaPlatformBase(Platform):
|
||||
selected_index = sorted_indices[0]
|
||||
selected_backend = valid_backends_priorities[selected_index][0]
|
||||
logger.info(
|
||||
"Using %s backend.",
|
||||
"Using %s attention backend out of potential backends: %s",
|
||||
selected_backend.name,
|
||||
[b[0].name for b in valid_backends_priorities],
|
||||
)
|
||||
|
||||
return selected_backend.get_path()
|
||||
|
||||
@ -42,7 +42,10 @@ from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config_parser_base import ConfigParserBase
|
||||
from vllm.transformers_utils.utils import (
|
||||
check_gguf_file,
|
||||
is_gguf,
|
||||
is_remote_gguf,
|
||||
parse_safetensors_file_metadata,
|
||||
split_remote_gguf,
|
||||
)
|
||||
|
||||
if envs.VLLM_USE_MODELSCOPE:
|
||||
@ -453,51 +456,55 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
|
||||
|
||||
def patch_rope_parameters(config: PretrainedConfig) -> None:
|
||||
"""Provide backwards compatibility for RoPE."""
|
||||
# Retrieve rope_parameters differently based on Transformers version
|
||||
# Patch rope_parameters differently based on Transformers version
|
||||
if Version(version("transformers")) >= Version("5.0.0.dev0"):
|
||||
from transformers.modeling_rope_utils import RopeParameters
|
||||
|
||||
rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr(
|
||||
config, "rope_parameters", None
|
||||
from transformers.modeling_rope_utils import (
|
||||
rope_config_validation,
|
||||
standardize_rope_params,
|
||||
)
|
||||
elif hasattr(config, "rope_parameters"):
|
||||
# We are in Transformers v4 and rope_parameters
|
||||
# has already been patched for this config
|
||||
return
|
||||
|
||||
# When Transformers v5 is installed, legacy rope_theta may be present
|
||||
# when using custom code models written for Transformers v4
|
||||
if (rope_theta := getattr(config, "rope_theta", None)) is not None:
|
||||
standardize_rope_params(config, rope_theta=rope_theta)
|
||||
rope_config_validation(config)
|
||||
# Delete rope_theta to avoid confusion in downstream code
|
||||
del config.rope_theta
|
||||
else:
|
||||
# Convert Transformers v4 rope_theta and rope_scaling into rope_parameters
|
||||
rope_theta: float | None = getattr(config, "rope_theta", None)
|
||||
rope_scaling: dict | None = getattr(config, "rope_scaling", None)
|
||||
rope_parameters = rope_scaling
|
||||
# Move rope_theta into rope_parameters
|
||||
if rope_theta is not None:
|
||||
rope_parameters = rope_parameters or {"rope_type": "default"}
|
||||
rope_parameters["rope_theta"] = rope_theta
|
||||
# Add original_max_position_embeddings if present
|
||||
if rope_parameters and (
|
||||
ompe := getattr(config, "original_max_position_embeddings", None)
|
||||
):
|
||||
rope_parameters["original_max_position_embeddings"] = ompe
|
||||
# Write back to config
|
||||
config.rope_parameters = rope_parameters
|
||||
# When Transformers v4 is installed, legacy rope_scaling may be present
|
||||
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
|
||||
config.rope_parameters = rope_scaling
|
||||
# When Transformers v4 is installed, legacy rope_theta may be present
|
||||
if (rope_theta := getattr(config, "rope_theta", None)) is not None:
|
||||
if not hasattr(config, "rope_parameters"):
|
||||
config.rope_parameters = {"rope_type": "default"}
|
||||
config.rope_parameters["rope_theta"] = rope_theta
|
||||
|
||||
# No RoPE parameters to patch
|
||||
if rope_parameters is None:
|
||||
if not hasattr(config, "rope_parameters"):
|
||||
return
|
||||
|
||||
# Add original_max_position_embeddings if present
|
||||
if ompe := getattr(config, "original_max_position_embeddings", None):
|
||||
config.rope_parameters["original_max_position_embeddings"] = ompe
|
||||
|
||||
# Handle nested rope_parameters in interleaved sliding attention models
|
||||
if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
|
||||
for rope_parameters_layer_type in rope_parameters.values():
|
||||
if set(config.rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
|
||||
for rope_parameters_layer_type in config.rope_parameters.values():
|
||||
patch_rope_parameters_dict(rope_parameters_layer_type)
|
||||
else:
|
||||
patch_rope_parameters_dict(rope_parameters)
|
||||
patch_rope_parameters_dict(config.rope_parameters)
|
||||
|
||||
|
||||
def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
|
||||
if "rope_type" in rope_parameters and "type" in rope_parameters:
|
||||
rope_type = rope_parameters["rope_type"]
|
||||
rope_type_legacy = rope_parameters["type"]
|
||||
if rope_type != rope_type_legacy:
|
||||
if (rope_type_legacy == "su" and rope_type == "longrope") or (
|
||||
rope_type_legacy == "mrope" and rope_type == "default"
|
||||
):
|
||||
pass # No action needed
|
||||
elif rope_type != rope_type_legacy:
|
||||
raise ValueError(
|
||||
f"Found conflicts between 'rope_type={rope_type}' (modern "
|
||||
f"field) and 'type={rope_type_legacy}' (legacy field). "
|
||||
@ -629,10 +636,12 @@ def maybe_override_with_speculators(
|
||||
Returns:
|
||||
Tuple of (resolved_model, resolved_tokenizer, speculative_config)
|
||||
"""
|
||||
is_gguf = check_gguf_file(model)
|
||||
if is_gguf:
|
||||
if check_gguf_file(model):
|
||||
kwargs["gguf_file"] = Path(model).name
|
||||
gguf_model_repo = Path(model).parent
|
||||
elif is_remote_gguf(model):
|
||||
repo_id, _ = split_remote_gguf(model)
|
||||
gguf_model_repo = Path(repo_id)
|
||||
else:
|
||||
gguf_model_repo = None
|
||||
kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
|
||||
@ -678,10 +687,18 @@ def get_config(
|
||||
) -> PretrainedConfig:
|
||||
# Separate model folder from file path for GGUF models
|
||||
|
||||
is_gguf = check_gguf_file(model)
|
||||
if is_gguf:
|
||||
kwargs["gguf_file"] = Path(model).name
|
||||
model = Path(model).parent
|
||||
_is_gguf = is_gguf(model)
|
||||
_is_remote_gguf = is_remote_gguf(model)
|
||||
if _is_gguf:
|
||||
if check_gguf_file(model):
|
||||
# Local GGUF file
|
||||
kwargs["gguf_file"] = Path(model).name
|
||||
model = Path(model).parent
|
||||
elif _is_remote_gguf:
|
||||
# Remote GGUF - extract repo_id from repo_id:quant_type format
|
||||
# The actual GGUF file will be downloaded later by GGUFModelLoader
|
||||
# Keep model as repo_id:quant_type for download, but use repo_id for config
|
||||
model, _ = split_remote_gguf(model)
|
||||
|
||||
if config_format == "auto":
|
||||
try:
|
||||
@ -689,10 +706,25 @@ def get_config(
|
||||
# Transformers implementation.
|
||||
if file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision):
|
||||
config_format = "mistral"
|
||||
elif is_gguf or file_or_path_exists(
|
||||
elif (_is_gguf and not _is_remote_gguf) or file_or_path_exists(
|
||||
model, HF_CONFIG_NAME, revision=revision
|
||||
):
|
||||
config_format = "hf"
|
||||
# Remote GGUF models must have config.json in repo,
|
||||
# otherwise the config can't be parsed correctly.
|
||||
# FIXME(Isotr0py): Support remote GGUF repos without config.json
|
||||
elif _is_remote_gguf and not file_or_path_exists(
|
||||
model, HF_CONFIG_NAME, revision=revision
|
||||
):
|
||||
err_msg = (
|
||||
"Could not find config.json for remote GGUF model repo. "
|
||||
"To load remote GGUF model through `<repo_id>:<quant_type>`, "
|
||||
"ensure your model has config.json (HF format) file. "
|
||||
"Otherwise please specify --hf-config-path <original_repo> "
|
||||
"in engine args to fetch config from unquantized hf model."
|
||||
)
|
||||
logger.error(err_msg)
|
||||
raise ValueError(err_msg)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Could not detect config format for no config file found. "
|
||||
@ -713,9 +745,6 @@ def get_config(
|
||||
"'config.json'.\n"
|
||||
" - For Mistral models: ensure the presence of a "
|
||||
"'params.json'.\n"
|
||||
"3. For GGUF: pass the local path of the GGUF checkpoint.\n"
|
||||
" Loading GGUF from a remote repo directly is not yet "
|
||||
"supported.\n"
|
||||
).format(model=model)
|
||||
|
||||
raise ValueError(error_message) from e
|
||||
@ -729,7 +758,7 @@ def get_config(
|
||||
**kwargs,
|
||||
)
|
||||
# Special architecture mapping check for GGUF models
|
||||
if is_gguf:
|
||||
if _is_gguf:
|
||||
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
|
||||
raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
|
||||
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
|
||||
@ -889,6 +918,8 @@ def get_pooling_config(model: str, revision: str | None = "main") -> dict | None
|
||||
A dictionary containing the pooling type and whether
|
||||
normalization is used, or None if no pooling configuration is found.
|
||||
"""
|
||||
if is_remote_gguf(model):
|
||||
model, _ = split_remote_gguf(model)
|
||||
|
||||
modules_file_name = "modules.json"
|
||||
|
||||
@ -1108,6 +1139,8 @@ def get_hf_image_processor_config(
|
||||
# Separate model folder from file path for GGUF models
|
||||
if check_gguf_file(model):
|
||||
model = Path(model).parent
|
||||
elif is_remote_gguf(model):
|
||||
model, _ = split_remote_gguf(model)
|
||||
return get_image_processor_config(
|
||||
model, token=hf_token, revision=revision, **kwargs
|
||||
)
|
||||
|
||||
@ -18,7 +18,7 @@ from transformers.processing_utils import ProcessorMixin
|
||||
from transformers.video_processing_utils import BaseVideoProcessor
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
from vllm.transformers_utils.utils import check_gguf_file, convert_model_repo_to_path
|
||||
from vllm.transformers_utils.utils import convert_model_repo_to_path, is_gguf
|
||||
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -236,8 +236,8 @@ def cached_processor_from_config(
|
||||
processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
|
||||
**kwargs: Any,
|
||||
) -> _P:
|
||||
if check_gguf_file(model_config.model):
|
||||
assert not check_gguf_file(model_config.tokenizer), (
|
||||
if is_gguf(model_config.model):
|
||||
assert not is_gguf(model_config.tokenizer), (
|
||||
"For multimodal GGUF models, the original tokenizer "
|
||||
"should be used to correctly load processor."
|
||||
)
|
||||
@ -350,8 +350,8 @@ def cached_image_processor_from_config(
|
||||
model_config: "ModelConfig",
|
||||
**kwargs: Any,
|
||||
):
|
||||
if check_gguf_file(model_config.model):
|
||||
assert not check_gguf_file(model_config.tokenizer), (
|
||||
if is_gguf(model_config.model):
|
||||
assert not is_gguf(model_config.tokenizer), (
|
||||
"For multimodal GGUF models, the original tokenizer "
|
||||
"should be used to correctly load image processor."
|
||||
)
|
||||
|
||||
@ -20,7 +20,12 @@ from vllm.transformers_utils.config import (
|
||||
list_filtered_repo_files,
|
||||
)
|
||||
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.transformers_utils.utils import (
|
||||
check_gguf_file,
|
||||
is_gguf,
|
||||
is_remote_gguf,
|
||||
split_remote_gguf,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
@ -180,10 +185,12 @@ def get_tokenizer(
|
||||
kwargs["truncation_side"] = "left"
|
||||
|
||||
# Separate model folder from file path for GGUF models
|
||||
is_gguf = check_gguf_file(tokenizer_name)
|
||||
if is_gguf:
|
||||
kwargs["gguf_file"] = Path(tokenizer_name).name
|
||||
tokenizer_name = Path(tokenizer_name).parent
|
||||
if is_gguf(tokenizer_name):
|
||||
if check_gguf_file(tokenizer_name):
|
||||
kwargs["gguf_file"] = Path(tokenizer_name).name
|
||||
tokenizer_name = Path(tokenizer_name).parent
|
||||
elif is_remote_gguf(tokenizer_name):
|
||||
tokenizer_name, _ = split_remote_gguf(tokenizer_name)
|
||||
|
||||
# if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
|
||||
# first to use official Mistral tokenizer if possible.
|
||||
|
||||
@ -9,6 +9,8 @@ from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from gguf import GGMLQuantizationType
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -46,6 +48,57 @@ def check_gguf_file(model: str | PathLike) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
@cache
def is_remote_gguf(model: str | Path) -> bool:
    """Tell whether ``model`` is a remote GGUF reference.

    A reference qualifies only when all of the following hold:

    - it is not a cloud-storage location (per ``is_cloud_storage``),
    - it is not an ``http://`` or ``https://`` URL,
    - it contains both ``/`` and ``:``,
    - the text after the last ``:`` is a valid GGUF quant type.

    Args:
        model: Model name or path to inspect; converted to ``str`` first.

    Returns:
        True if ``model`` looks like ``repo_id:quant_type``, False otherwise.
    """
    candidate = str(model)

    if is_cloud_storage(candidate):
        return False
    if candidate.startswith(("http://", "https://")):
        return False
    if "/" not in candidate or ":" not in candidate:
        return False

    # Only the suffix after the final ":" is treated as the quant type.
    quant_type = candidate.rsplit(":", 1)[1]
    return is_valid_gguf_quant_type(quant_type)
|
||||
|
||||
|
||||
def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
    """Check if the quant type is a valid GGUF quant type.

    Args:
        gguf_quant_type: Candidate quantization name, e.g. the part after the
            last ``:`` of a ``repo_id:quant_type`` model reference.

    Returns:
        True only if ``gguf_quant_type`` is the name of a
        ``GGMLQuantizationType`` enum member, False otherwise.
    """
    # Look the name up in the enum's member table instead of using getattr():
    # getattr() also resolves non-member class attributes (``__doc__``,
    # ``mro``, inherited int attributes, ...), which would wrongly accept
    # strings that are not quantization types.
    return gguf_quant_type in GGMLQuantizationType.__members__
|
||||
|
||||
|
||||
def split_remote_gguf(model: str | Path) -> tuple[str, str]:
|
||||
"""Split the model into repo_id and quant type."""
|
||||
model = str(model)
|
||||
if is_remote_gguf(model):
|
||||
parts = model.rsplit(":", 1)
|
||||
return (parts[0], parts[1])
|
||||
raise ValueError(
|
||||
"Wrong GGUF model or invalid GGUF quant type: %s.\n"
|
||||
"- It should be in repo_id:quant_type format.\n"
|
||||
"- Valid GGMLQuantizationType values: %s",
|
||||
model,
|
||||
GGMLQuantizationType._member_names_,
|
||||
)
|
||||
|
||||
|
||||
def is_gguf(model: str | Path) -> bool:
    """Report whether ``model`` refers to a GGUF model.

    Args:
        model: Model name, filesystem path, or Path object to inspect.

    Returns:
        True when ``model`` is a local GGUF file or a remote GGUF reference
        in ``repo_id:quant_type`` form, False otherwise.
    """
    model_ref = str(model)

    # A local GGUF file wins; the remote ``repo_id:quant_type`` check is only
    # consulted when the local check fails.
    return True if check_gguf_file(model_ref) else is_remote_gguf(model_ref)
|
||||
|
||||
|
||||
def modelscope_list_repo_files(
|
||||
repo_id: str,
|
||||
revision: str | None = None,
|
||||
|
||||
@ -73,14 +73,6 @@ class FlexibleArgumentParser(ArgumentParser):
|
||||
# Enable the deprecated kwarg for Python 3.12 and below
|
||||
|
||||
def parse_known_args(self, args=None, namespace=None):
|
||||
if args is not None and "--disable-log-requests" in args:
|
||||
# Special case warning because the warning below won't trigger
|
||||
# if –-disable-log-requests because its value is default.
|
||||
logger.warning_once(
|
||||
"argument '--disable-log-requests' is deprecated and "
|
||||
"replaced with '--enable-log-requests'. This will be "
|
||||
"removed in v0.12.0."
|
||||
)
|
||||
namespace, args = super().parse_known_args(args, namespace)
|
||||
for action in FlexibleArgumentParser._deprecated:
|
||||
if (
|
||||
|
||||
@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import pickle
|
||||
from _hashlib import HASH, UnsupportedDigestmodError
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
@ -61,3 +62,20 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
|
||||
return sha256_cbor
|
||||
|
||||
raise ValueError(f"Unsupported hash function: {hash_fn_name}")
|
||||
|
||||
|
||||
def safe_hash(data: bytes, usedforsecurity: bool = True) -> HASH:
|
||||
"""Hash for configs, defaulting to md5 but falling back to sha256
|
||||
in FIPS constrained environments.
|
||||
|
||||
Args:
|
||||
data: bytes
|
||||
usedforsecurity: Whether the hash is used for security purposes
|
||||
|
||||
Returns:
|
||||
Hash object
|
||||
"""
|
||||
try:
|
||||
return hashlib.md5(data, usedforsecurity=usedforsecurity)
|
||||
except (UnsupportedDigestmodError, ValueError):
|
||||
return hashlib.sha256(data)
|
||||
|
||||
@ -56,6 +56,39 @@ def set_env_var(key: str, value: str) -> Iterator[None]:
|
||||
os.environ[key] = old
|
||||
|
||||
|
||||
@contextlib.contextmanager
def suppress_stdout():
    """
    Suppress stdout from C libraries at the file descriptor level.

    Only suppresses stdout, not stderr, to preserve error messages.
    Suppression is disabled when VLLM_LOGGING_LEVEL is set to DEBUG.

    The original stdout descriptor is always restored and both temporary
    descriptors are always closed, even if opening the null device, flushing
    stdout, or the wrapped body raises.

    Example:
        with suppress_stdout():
            # C library calls that would normally print to stdout
            torch.distributed.new_group(ranks, backend="gloo")
    """
    # Don't suppress if logging level is DEBUG
    if envs.VLLM_LOGGING_LEVEL == "DEBUG":
        yield
        return

    stdout_fd = sys.stdout.fileno()
    # Keep a duplicate of the real stdout so it can be restored afterwards.
    saved_stdout_fd = os.dup(stdout_fd)
    try:
        devnull_fd = os.open(os.devnull, os.O_WRONLY)
        try:
            # Flush Python-level buffers before the descriptor is swapped so
            # pending output still reaches the real stdout.
            sys.stdout.flush()
            os.dup2(devnull_fd, stdout_fd)
            yield
        finally:
            try:
                sys.stdout.flush()
            finally:
                # Restore stdout even when the flush above fails.
                os.dup2(saved_stdout_fd, stdout_fd)
                os.close(devnull_fd)
    finally:
        os.close(saved_stdout_fd)
|
||||
|
||||
|
||||
# File path utilities
|
||||
|
||||
|
||||
|
||||
@ -13,6 +13,8 @@ from vllm.distributed.kv_events import (
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.core.kv_cache_utils import (
|
||||
BlockHash,
|
||||
BlockHashList,
|
||||
BlockHashListWithBlockSize,
|
||||
BlockHashWithGroupId,
|
||||
ExternalBlockHash,
|
||||
FreeKVCacheBlockQueue,
|
||||
@ -133,6 +135,10 @@ class BlockPool:
|
||||
Args:
|
||||
num_gpu_blocks: The number of blocks in the pool.
|
||||
enable_caching: Whether to enable prefix caching.
|
||||
hash_block_size: The block size of which the block hashes are computed.
|
||||
The actual block size usually equals hash_block_size, but in cases
|
||||
where different KV cache groups have different block sizes, the
|
||||
actual block size can be a multiple of hash_block_size.
|
||||
enable_kv_cache_events: Whether to enable kv cache events.
|
||||
"""
|
||||
|
||||
@ -140,11 +146,13 @@ class BlockPool:
|
||||
self,
|
||||
num_gpu_blocks: int,
|
||||
enable_caching: bool,
|
||||
hash_block_size: int,
|
||||
enable_kv_cache_events: bool = False,
|
||||
):
|
||||
assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
|
||||
self.num_gpu_blocks = num_gpu_blocks
|
||||
self.enable_caching = enable_caching
|
||||
self.hash_block_size = hash_block_size
|
||||
# All kv-cache blocks.
|
||||
self.blocks: list[KVCacheBlock] = [
|
||||
KVCacheBlock(idx) for idx in range(num_gpu_blocks)
|
||||
@ -223,8 +231,20 @@ class BlockPool:
|
||||
return
|
||||
new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
|
||||
assert len(request.block_hashes) >= num_full_blocks
|
||||
new_block_hashes = request.block_hashes[num_cached_blocks:]
|
||||
if block_size == self.hash_block_size:
|
||||
# Common case.
|
||||
block_hashes: BlockHashList = request.block_hashes
|
||||
else:
|
||||
# block_size is a multiple of hash_block_size. This happens when
|
||||
# different KV cache groups have different block sizes.
|
||||
assert block_size % self.hash_block_size == 0
|
||||
# Recalculate block_hashes at the granularity of block_size, using
|
||||
# the original block_hashes (at the granularity of hash_block_size).
|
||||
block_hashes = BlockHashListWithBlockSize(
|
||||
request.block_hashes, self.hash_block_size, block_size
|
||||
)
|
||||
|
||||
new_block_hashes = block_hashes[num_cached_blocks:]
|
||||
new_hashes: list[ExternalBlockHash] | None = (
|
||||
[] if self.enable_kv_cache_events else None
|
||||
)
|
||||
|
||||
@ -2,15 +2,25 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Sequence
|
||||
from math import lcm
|
||||
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
|
||||
from vllm.v1.core.kv_cache_utils import (
|
||||
BlockHash,
|
||||
BlockHashList,
|
||||
BlockHashListWithBlockSize,
|
||||
KVCacheBlock,
|
||||
)
|
||||
from vllm.v1.core.single_type_kv_cache_manager import (
|
||||
CrossAttentionManager,
|
||||
FullAttentionManager,
|
||||
get_manager_for_kv_cache_spec,
|
||||
)
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheSpec
|
||||
from vllm.v1.kv_cache_interface import (
|
||||
FullAttentionSpec,
|
||||
KVCacheConfig,
|
||||
KVCacheSpec,
|
||||
)
|
||||
from vllm.v1.request import Request
|
||||
|
||||
|
||||
@ -28,13 +38,17 @@ class KVCacheCoordinator(ABC):
|
||||
enable_kv_cache_events: bool,
|
||||
dcp_world_size: int,
|
||||
pcp_world_size: int,
|
||||
hash_block_size: int,
|
||||
):
|
||||
self.kv_cache_config = kv_cache_config
|
||||
self.max_model_len = max_model_len
|
||||
self.enable_caching = enable_caching
|
||||
|
||||
self.block_pool = BlockPool(
|
||||
kv_cache_config.num_blocks, enable_caching, enable_kv_cache_events
|
||||
kv_cache_config.num_blocks,
|
||||
enable_caching,
|
||||
hash_block_size,
|
||||
enable_kv_cache_events,
|
||||
)
|
||||
|
||||
# Needs special handling for find_longest_cache_hit if eagle is enabled
|
||||
@ -213,6 +227,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
|
||||
enable_kv_cache_events: bool,
|
||||
dcp_world_size: int,
|
||||
pcp_world_size: int,
|
||||
hash_block_size: int,
|
||||
):
|
||||
super().__init__(
|
||||
kv_cache_config,
|
||||
@ -222,6 +237,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
|
||||
enable_kv_cache_events,
|
||||
dcp_world_size=dcp_world_size,
|
||||
pcp_world_size=pcp_world_size,
|
||||
hash_block_size=hash_block_size,
|
||||
)
|
||||
self.num_single_type_manager = len(self.single_type_managers)
|
||||
|
||||
@ -255,6 +271,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
|
||||
enable_kv_cache_events: bool,
|
||||
dcp_world_size: int,
|
||||
pcp_world_size: int,
|
||||
hash_block_size: int,
|
||||
):
|
||||
super().__init__(
|
||||
kv_cache_config,
|
||||
@ -264,6 +281,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
|
||||
enable_kv_cache_events,
|
||||
dcp_world_size=dcp_world_size,
|
||||
pcp_world_size=pcp_world_size,
|
||||
hash_block_size=hash_block_size,
|
||||
)
|
||||
self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec
|
||||
self.block_size = self.kv_cache_spec.block_size
|
||||
@ -273,6 +291,11 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
|
||||
self.block_size *= dcp_world_size
|
||||
if pcp_world_size > 1:
|
||||
self.block_size *= pcp_world_size
|
||||
# For models using only Mamba, block_size is set to max_model_len when
|
||||
# prefix caching is disabled, and hash_block_size validation is skipped.
|
||||
assert not enable_caching or (hash_block_size == self.block_size), (
|
||||
"UnitaryKVCacheCoordinator assumes hash_block_size == block_size"
|
||||
)
|
||||
assert len(self.kv_cache_config.kv_cache_groups) == 1, (
|
||||
"UnitaryKVCacheCoordinator assumes only one kv cache group"
|
||||
)
|
||||
@ -289,6 +312,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
|
||||
block_pool=self.block_pool,
|
||||
kv_cache_spec=self.kv_cache_spec,
|
||||
use_eagle=self.use_eagle,
|
||||
alignment_tokens=self.block_size,
|
||||
dcp_world_size=self.dcp_world_size,
|
||||
pcp_world_size=self.pcp_world_size,
|
||||
)
|
||||
@ -313,6 +337,7 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
|
||||
enable_kv_cache_events: bool,
|
||||
dcp_world_size: int,
|
||||
pcp_world_size: int,
|
||||
hash_block_size: int,
|
||||
):
|
||||
super().__init__(
|
||||
kv_cache_config,
|
||||
@ -322,7 +347,17 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
|
||||
enable_kv_cache_events,
|
||||
dcp_world_size=dcp_world_size,
|
||||
pcp_world_size=pcp_world_size,
|
||||
hash_block_size=hash_block_size,
|
||||
)
|
||||
# hash_block_size: the block size used to compute block hashes.
|
||||
# The actual block size usually equals hash_block_size, but in cases where
|
||||
# different KV cache groups have different block sizes, the actual block size
|
||||
# can be a multiple of hash_block_size.
|
||||
self.hash_block_size = hash_block_size
|
||||
assert all(
|
||||
g.kv_cache_spec.block_size % hash_block_size == 0
|
||||
for g in kv_cache_config.kv_cache_groups
|
||||
), "block_size must be divisible by hash_block_size"
|
||||
assert dcp_world_size == 1, "DCP not support hybrid attn now."
|
||||
assert pcp_world_size == 1, "PCP not support hybrid attn now."
|
||||
self.verify_and_split_kv_cache_groups()
|
||||
@ -373,14 +408,12 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
|
||||
self.other_spec = other_spec
|
||||
self.full_attention_block_size = self.full_attention_spec.block_size
|
||||
self.other_block_size = self.other_spec.block_size
|
||||
|
||||
if self.enable_caching:
|
||||
# this requirement is only needed for the prefix caching logic
|
||||
divisible = self.other_block_size % self.full_attention_block_size
|
||||
assert divisible == 0, (
|
||||
"KVCacheCoordinator assumes the block_size of full "
|
||||
"attention layers is divisible by other layers now."
|
||||
)
|
||||
# The LCM of the block sizes of full attention and other attention.
|
||||
# The cache hit length must be a multiple of the LCM of the block sizes
|
||||
# to make sure the cache hit length is a multiple of the block size of
|
||||
# each attention type. Requiring this because we don't support partial
|
||||
# block cache hit yet.
|
||||
self.lcm_block_size = lcm(self.full_attention_block_size, self.other_block_size)
|
||||
|
||||
if max(self.full_attention_group_ids) < min(self.other_group_ids):
|
||||
self.full_attn_first = True
|
||||
@ -414,25 +447,48 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
|
||||
- The number of tokens of the longest cache hit.
|
||||
"""
|
||||
# First, find the longest cache hit for full attention.
|
||||
if self.full_attention_spec.block_size == self.hash_block_size:
|
||||
# Common case.
|
||||
full_attention_block_hashes: BlockHashList = block_hashes
|
||||
else:
|
||||
# block_size is a multiple of hash_block_size. This happens when different
|
||||
# KV cache groups have different block sizes. In this case, we need to
|
||||
# recalculate block_hashes at the granularity of block_size, using the
|
||||
# original block_hashes (at the granularity of hash_block_size).
|
||||
full_attention_block_hashes = BlockHashListWithBlockSize(
|
||||
block_hashes, self.hash_block_size, self.full_attention_spec.block_size
|
||||
)
|
||||
hit_blocks_full_attn = self.full_attention_manager_cls.find_longest_cache_hit(
|
||||
block_hashes=block_hashes,
|
||||
block_hashes=full_attention_block_hashes,
|
||||
max_length=max_cache_hit_length,
|
||||
kv_cache_group_ids=self.full_attention_group_ids,
|
||||
block_pool=self.block_pool,
|
||||
kv_cache_spec=self.full_attention_spec,
|
||||
use_eagle=self.use_eagle,
|
||||
alignment_tokens=self.lcm_block_size,
|
||||
)
|
||||
hit_length = len(hit_blocks_full_attn[0]) * self.full_attention_block_size
|
||||
|
||||
# Next, find the cache hit for the other attention WITHIN
|
||||
# the cache hit of full attention.
|
||||
if self.other_spec.block_size == self.hash_block_size:
|
||||
# Common case.
|
||||
other_block_hashes: BlockHashList = block_hashes
|
||||
else:
|
||||
# Similar to the full attention case, here we need to recalculate
|
||||
# block_hashes at the granularity of block_size, using the original
|
||||
# block_hashes (at the granularity of hash_block_size).
|
||||
other_block_hashes = BlockHashListWithBlockSize(
|
||||
block_hashes, self.hash_block_size, self.other_spec.block_size
|
||||
)
|
||||
hit_blocks_other_attn = self.other_attention_cls.find_longest_cache_hit(
|
||||
block_hashes=block_hashes,
|
||||
block_hashes=other_block_hashes,
|
||||
max_length=hit_length,
|
||||
kv_cache_group_ids=self.other_group_ids,
|
||||
block_pool=self.block_pool,
|
||||
kv_cache_spec=self.other_spec,
|
||||
use_eagle=self.use_eagle,
|
||||
alignment_tokens=self.lcm_block_size,
|
||||
)
|
||||
hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
|
||||
|
||||
@ -466,6 +522,7 @@ def get_kv_cache_coordinator(
|
||||
enable_kv_cache_events: bool,
|
||||
dcp_world_size: int,
|
||||
pcp_world_size: int,
|
||||
hash_block_size: int,
|
||||
) -> KVCacheCoordinator:
|
||||
if not enable_caching:
|
||||
return KVCacheCoordinatorNoPrefixCache(
|
||||
@ -473,8 +530,9 @@ def get_kv_cache_coordinator(
|
||||
max_model_len,
|
||||
use_eagle,
|
||||
enable_kv_cache_events,
|
||||
dcp_world_size=dcp_world_size,
|
||||
pcp_world_size=pcp_world_size,
|
||||
dcp_world_size,
|
||||
pcp_world_size,
|
||||
hash_block_size,
|
||||
)
|
||||
if len(kv_cache_config.kv_cache_groups) == 1:
|
||||
return UnitaryKVCacheCoordinator(
|
||||
@ -483,8 +541,9 @@ def get_kv_cache_coordinator(
|
||||
use_eagle,
|
||||
enable_caching,
|
||||
enable_kv_cache_events,
|
||||
dcp_world_size=dcp_world_size,
|
||||
pcp_world_size=pcp_world_size,
|
||||
dcp_world_size,
|
||||
pcp_world_size,
|
||||
hash_block_size,
|
||||
)
|
||||
return HybridKVCacheCoordinator(
|
||||
kv_cache_config,
|
||||
@ -492,6 +551,7 @@ def get_kv_cache_coordinator(
|
||||
use_eagle,
|
||||
enable_caching,
|
||||
enable_kv_cache_events,
|
||||
dcp_world_size=dcp_world_size,
|
||||
pcp_world_size=pcp_world_size,
|
||||
dcp_world_size,
|
||||
pcp_world_size,
|
||||
hash_block_size,
|
||||
)
|
||||
|
||||
@ -95,6 +95,7 @@ class KVCacheManager:
|
||||
self,
|
||||
kv_cache_config: KVCacheConfig,
|
||||
max_model_len: int,
|
||||
hash_block_size: int,
|
||||
enable_caching: bool = True,
|
||||
use_eagle: bool = False,
|
||||
log_stats: bool = False,
|
||||
@ -107,28 +108,11 @@ class KVCacheManager:
|
||||
self.enable_caching = enable_caching
|
||||
self.use_eagle = use_eagle
|
||||
self.log_stats = log_stats
|
||||
# FIXME: make prefix cache stats conditional on log_stats
|
||||
# FIXME: make prefix cache stats conditional on log_stats. We still need
|
||||
# this comment because when the log stats is enabled there are still
|
||||
# potential configs we could expose in the future.
|
||||
self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
|
||||
|
||||
self.block_size: int | None = None
|
||||
if self.enable_caching:
|
||||
assert (
|
||||
len(
|
||||
set(
|
||||
g.kv_cache_spec.block_size
|
||||
for g in kv_cache_config.kv_cache_groups
|
||||
)
|
||||
)
|
||||
== 1
|
||||
), "Only one block size is supported for now"
|
||||
self.block_size = kv_cache_config.kv_cache_groups[
|
||||
0
|
||||
].kv_cache_spec.block_size
|
||||
|
||||
if dcp_world_size * pcp_world_size > 1:
|
||||
assert len(kv_cache_config.kv_cache_groups) == 1
|
||||
self.block_size *= dcp_world_size * pcp_world_size
|
||||
|
||||
self.coordinator = get_kv_cache_coordinator(
|
||||
kv_cache_config=kv_cache_config,
|
||||
max_model_len=self.max_model_len,
|
||||
@ -137,6 +121,7 @@ class KVCacheManager:
|
||||
enable_kv_cache_events=enable_kv_cache_events,
|
||||
dcp_world_size=dcp_world_size,
|
||||
pcp_world_size=pcp_world_size,
|
||||
hash_block_size=hash_block_size,
|
||||
)
|
||||
self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
|
||||
self.block_pool = self.coordinator.block_pool
|
||||
|
||||
@ -5,9 +5,9 @@
|
||||
import copy
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Iterable, Sequence
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, NewType, TypeAlias
|
||||
from collections.abc import Callable, Iterable, Iterator, Sequence
|
||||
from dataclasses import dataclass, replace
|
||||
from typing import Any, NewType, TypeAlias, overload
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import VllmConfig
|
||||
@ -825,11 +825,11 @@ def get_num_blocks(
|
||||
return num_blocks
|
||||
|
||||
|
||||
def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
|
||||
def get_uniform_page_size(kv_cache_specs: Iterable[KVCacheSpec]) -> int:
|
||||
"""
|
||||
Get the page size of the KV cache.
|
||||
"""
|
||||
page_sizes = set(layer.page_size_bytes for layer in kv_cache_spec.values())
|
||||
page_sizes = {layer.page_size_bytes for layer in kv_cache_specs}
|
||||
assert len(page_sizes) == 1
|
||||
return page_sizes.pop()
|
||||
|
||||
@ -882,6 +882,46 @@ def is_kv_cache_page_size_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool
|
||||
return len(page_sizes) == 1
|
||||
|
||||
|
||||
def unify_kv_cache_spec_page_size(
    kv_cache_spec: dict[str, KVCacheSpec],
) -> dict[str, KVCacheSpec]:
    """
    Unify the page size of the given KVCacheSpec. If the page size of all layers
    are the same, return the original KVCacheSpec. If not same, unify the page
    size by increasing the block size of layers with smaller page size. Raise
    NotImplementedError if failed to unify the page size.

    Args:
        kv_cache_spec: The KVCacheSpec of each attention layer in the model

    Returns:
        The updated KVCacheSpec with the same page_size_bytes. The input dict
        itself is returned when no change is needed; otherwise a new dict is
        built and the input specs are left unmodified.

    Raises:
        NotImplementedError: If the largest page size is not an integer
            multiple of some layer's page size.
    """
    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
    if len(page_sizes) <= 1:
        # All layers have the same page size, no need to unify.
        return kv_cache_spec

    max_page_size = max(page_sizes)
    new_kv_cache_spec = {}
    for layer_name, layer_spec in kv_cache_spec.items():
        layer_page_size = layer_spec.page_size_bytes
        if layer_page_size == max_page_size:
            new_kv_cache_spec[layer_name] = layer_spec
            continue

        if max_page_size % layer_page_size != 0:
            raise NotImplementedError(
                f"The maximum page size ({max_page_size} bytes) is not "
                f"divisible by the page size of layer {layer_name!r} "
                f"({layer_page_size} bytes). Cannot unify by adjusting "
                "block_size."
            )

        # Scale the block size by the page-size ratio so this layer's page
        # grows to the maximum page size.
        ratio = max_page_size // layer_page_size
        new_spec = replace(layer_spec, block_size=layer_spec.block_size * ratio)
        assert new_spec.page_size_bytes == max_page_size
        new_kv_cache_spec[layer_name] = new_spec
    return new_kv_cache_spec
|
||||
|
||||
|
||||
def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
|
||||
# kv_cache_spec is an empty dict for attention free models
|
||||
return not kv_cache_spec
|
||||
@ -1010,7 +1050,6 @@ def _get_kv_cache_groups_uniform_page_size(
|
||||
def get_kv_cache_config_from_groups(
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_groups: list[KVCacheGroupSpec],
|
||||
kv_cache_specs: dict[str, KVCacheSpec],
|
||||
available_memory: int,
|
||||
) -> KVCacheConfig:
|
||||
"""
|
||||
@ -1020,7 +1059,6 @@ def get_kv_cache_config_from_groups(
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_groups: The KV cache groups
|
||||
kv_cache_specs: The KV cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes
|
||||
Returns:
|
||||
The generated KVCacheConfig
|
||||
@ -1064,7 +1102,9 @@ def get_kv_cache_config_from_groups(
|
||||
# full.1, sw.2: share another Tensor with size=available_memory//2
|
||||
group_size = max(len(group.layer_names) for group in kv_cache_groups)
|
||||
|
||||
page_size = get_uniform_page_size(kv_cache_specs)
|
||||
page_size = get_uniform_page_size(
|
||||
[group.kv_cache_spec for group in kv_cache_groups]
|
||||
)
|
||||
assert group_size > 0, "group_size must be greater than 0"
|
||||
num_blocks = get_num_blocks(
|
||||
vllm_config, group_size, available_memory, page_size
|
||||
@ -1166,7 +1206,8 @@ def get_kv_cache_groups(
|
||||
# This returns an empty list to allow for the KVCacheManager to handle
|
||||
# attention free models.
|
||||
return []
|
||||
elif is_kv_cache_spec_uniform(kv_cache_spec):
|
||||
|
||||
if is_kv_cache_spec_uniform(kv_cache_spec):
|
||||
# KV cache of all layers are the same, which is true for
|
||||
# most models. Allocate the same amount of memory for
|
||||
# each layer.
|
||||
@ -1176,14 +1217,16 @@ def get_kv_cache_groups(
|
||||
# full attention, or all layers are sliding window attention with the
|
||||
# same window size). Put all layers into one group.
|
||||
return _get_kv_cache_groups_uniform_type(uniform_spec)
|
||||
elif is_kv_cache_page_size_uniform(kv_cache_spec):
|
||||
# Model contains multiple attention types, but KV cache of all layers
|
||||
# have the same physical memory per block per layer. Split the layers
|
||||
# into groups with the same number of layers, and thus same total page
|
||||
# size.
|
||||
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
|
||||
|
||||
raise NotImplementedError
|
||||
# As KVCacheManager can only allocate memory of one size, we need to unify
|
||||
# the page size of the layers. For cases cannot be unified, this function
|
||||
# will raise an error.
|
||||
kv_cache_spec = unify_kv_cache_spec_page_size(kv_cache_spec)
|
||||
# Model contains multiple attention types, but KV cache of all layers
|
||||
# have the same physical memory per block per layer. Split the layers
|
||||
# into groups with the same number of layers, and thus same total page
|
||||
# size.
|
||||
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
|
||||
|
||||
|
||||
def generate_scheduler_kv_cache_config(
|
||||
@ -1327,10 +1370,7 @@ def get_kv_cache_configs(
|
||||
) == len(kv_cache_spec_one_worker), "Some layers are not assigned to any group."
|
||||
kv_cache_configs.append(
|
||||
get_kv_cache_config_from_groups(
|
||||
vllm_config,
|
||||
kv_cache_groups_one_worker,
|
||||
kv_cache_spec_one_worker,
|
||||
available_memory_one_worker,
|
||||
vllm_config, kv_cache_groups_one_worker, available_memory_one_worker
|
||||
)
|
||||
)
|
||||
|
||||
@ -1353,3 +1393,79 @@ def get_kv_cache_configs(
|
||||
_report_kv_cache_config(vllm_config, kv_cache_config)
|
||||
|
||||
return kv_cache_configs
|
||||
|
||||
|
||||
class BlockHashListWithBlockSize:
    """
    Convert block-hash granularity from `hash_block_size` to `target_block_size`.
    Used when KV cache groups have different block sizes: `hash_block_size`
    is the size used to compute the original `block_hashes`; `target_block_size`
    is the group's actual block size.

    Currently, only scaling up by an integer factor is supported (i.e.,
    `target_block_size` is a multiple of `hash_block_size`). Conversion is
    performed lazily on access for efficiency, by concatenating consecutive
    hashes at `hash_block_size` to form each hash at `target_block_size`.

    The wrapped list is referenced, not copied, so hashes appended to it later
    are visible through this view. Trailing hashes that do not fill a whole
    target block are not exposed. Integer indices follow list semantics:
    negative indices count from the end of the converted sequence and
    out-of-range indices raise IndexError.

    Example (`hash_block_size` = 16, `target_block_size` = 32):
    concatenating two 16-size hashes yields one 32-size hash:

    Block hashes with block_size 16:
    | Token Range | 0-15 | 16-31 | 32-47 | 48-63 |
    |-------------|------|-------|-------|-------|
    | Hash        | A    | B     | C     | D     |

    Block hashes with block_size 32:
    | Token Range | 0-31 | 32-63 |
    |-------------|------|-------|
    | Hash        | AB   | CD    |

    Args:
        block_hashes: Block hashes to convert, computed at `hash_block_size`.
        hash_block_size: Block size at which `block_hashes` were computed.
        target_block_size: Desired block size; must be a multiple of `hash_block_size`.
    """

    def __init__(
        self,
        block_hashes: list[BlockHash],
        hash_block_size: int,
        target_block_size: int,
    ):
        self.block_hashes = block_hashes
        assert target_block_size % hash_block_size == 0
        # Number of consecutive original hashes merged into one target hash.
        self.scale_factor = target_block_size // hash_block_size

    def __len__(self) -> int:
        # Only complete groups of `scale_factor` hashes form a target block.
        return len(self.block_hashes) // self.scale_factor

    @overload
    def __getitem__(self, idx: int) -> BlockHash: ...

    @overload
    def __getitem__(self, idx: slice) -> list[BlockHash]: ...

    def __getitem__(self, idx):
        if isinstance(idx, int):
            return self._get_value_at(idx)

        if isinstance(idx, slice):
            # slice.indices() clamps to the converted length, so every index
            # produced below is valid and non-negative.
            start, stop, step = idx.indices(len(self))
            return [self._get_value_at(i) for i in range(start, stop, step)]

        raise TypeError(f"Invalid index type: {type(idx)!r}")

    def __iter__(self) -> Iterator[BlockHash]:
        for i in range(len(self)):
            yield self._get_value_at(i)

    def _get_value_at(self, idx: int) -> BlockHash:
        """Return the merged hash of the `idx`-th target-size block.

        Raises:
            IndexError: If `idx` is outside the converted sequence.
        """
        num_blocks = len(self)
        # Resolve negative indices against the converted length; resolving
        # them against the raw hash list would pick up a trailing partial
        # group and return a misaligned hash.
        if idx < 0:
            idx += num_blocks
        if not 0 <= idx < num_blocks:
            raise IndexError("block hash index out of range")

        base = idx * self.scale_factor
        end = base + self.scale_factor
        return BlockHash(b"".join(self.block_hashes[base:end]))
|
||||
|
||||
|
||||
# A sequence of block hashes: either a plain list, or a
# `BlockHashListWithBlockSize` instance.
BlockHashList = list[BlockHash] | BlockHashListWithBlockSize
|
||||
|
||||
@ -186,6 +186,7 @@ class Scheduler(SchedulerInterface):
|
||||
enable_kv_cache_events=self.enable_kv_cache_events,
|
||||
dcp_world_size=self.dcp_world_size,
|
||||
pcp_world_size=self.pcp_world_size,
|
||||
hash_block_size=self.block_size,
|
||||
)
|
||||
sink_len = getattr(vllm_config.model_config.hf_config, "param_sink_number", 0)
|
||||
if sink_len > 0:
|
||||
@ -1093,8 +1094,6 @@ class Scheduler(SchedulerInterface):
|
||||
and request.sampling_params.logprobs is not None
|
||||
and logprobs
|
||||
):
|
||||
# NOTE: once we support N tokens per step (spec decode),
|
||||
# the outer lists can be of length > 1.
|
||||
new_logprobs = logprobs.slice(req_index, req_index + 1)
|
||||
|
||||
if new_token_ids and self.structured_output_manager.should_advance(request):
|
||||
|
||||
@ -7,7 +7,7 @@ from collections.abc import Sequence
|
||||
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
|
||||
from vllm.v1.core.kv_cache_utils import BlockHashList, KVCacheBlock
|
||||
from vllm.v1.kv_cache_interface import (
|
||||
ChunkedLocalAttentionSpec,
|
||||
CrossAttentionSpec,
|
||||
@ -208,12 +208,13 @@ class SingleTypeKVCacheManager(ABC):
|
||||
@abstractmethod
|
||||
def find_longest_cache_hit(
|
||||
cls,
|
||||
block_hashes: list[BlockHash],
|
||||
block_hashes: BlockHashList,
|
||||
max_length: int,
|
||||
kv_cache_group_ids: list[int],
|
||||
block_pool: BlockPool,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
use_eagle: bool,
|
||||
alignment_tokens: int,
|
||||
dcp_world_size: int = 1,
|
||||
pcp_world_size: int = 1,
|
||||
) -> tuple[list[KVCacheBlock], ...]:
|
||||
@ -233,6 +234,11 @@ class SingleTypeKVCacheManager(ABC):
|
||||
block_pool: The block pool.
|
||||
kv_cache_spec: The kv cache spec.
|
||||
use_eagle: Whether to use eagle.
|
||||
alignment_tokens: The returned cache hit length (in tokens) should
|
||||
be a multiple of this value (in tokens). By default, it should
|
||||
be set to the block_size.
|
||||
dcp_world_size: The world size of decode context parallelism.
|
||||
pcp_world_size: The world size of prefill context parallelism.
|
||||
|
||||
Returns:
|
||||
A list of cached blocks with skipped blocks replaced by null block
|
||||
@ -300,18 +306,19 @@ class FullAttentionManager(SingleTypeKVCacheManager):
|
||||
@classmethod
|
||||
def find_longest_cache_hit(
|
||||
cls,
|
||||
block_hashes: list[BlockHash],
|
||||
block_hashes: BlockHashList,
|
||||
max_length: int,
|
||||
kv_cache_group_ids: list[int],
|
||||
block_pool: BlockPool,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
use_eagle: bool,
|
||||
alignment_tokens: int,
|
||||
dcp_world_size: int = 1,
|
||||
pcp_world_size: int = 1,
|
||||
) -> tuple[list[KVCacheBlock], ...]:
|
||||
assert isinstance(
|
||||
kv_cache_spec,
|
||||
(FullAttentionSpec, FullDiffkvAttentionSpec, ChunkedLocalAttentionSpec),
|
||||
FullAttentionSpec | ChunkedLocalAttentionSpec | FullDiffkvAttentionSpec
|
||||
), (
|
||||
"FullAttentionManager can only be used for full attention "
|
||||
"and chunked local attention groups"
|
||||
@ -335,6 +342,13 @@ class FullAttentionManager(SingleTypeKVCacheManager):
|
||||
else:
|
||||
break
|
||||
if use_eagle and computed_blocks[0]:
|
||||
# Need to drop the last matched block if eagle is enabled.
|
||||
for computed in computed_blocks:
|
||||
computed.pop()
|
||||
while (
|
||||
block_size != alignment_tokens # Faster for common case.
|
||||
and len(computed_blocks[0]) * block_size % alignment_tokens != 0
|
||||
):
|
||||
for computed in computed_blocks:
|
||||
computed.pop()
|
||||
return computed_blocks
|
||||
@ -361,12 +375,13 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
|
||||
@classmethod
|
||||
def find_longest_cache_hit(
|
||||
cls,
|
||||
block_hashes: list[BlockHash],
|
||||
block_hashes: BlockHashList,
|
||||
max_length: int,
|
||||
kv_cache_group_ids: list[int],
|
||||
block_pool: BlockPool,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
use_eagle: bool,
|
||||
alignment_tokens: int,
|
||||
dcp_world_size: int = 1,
|
||||
pcp_world_size: int = 1,
|
||||
) -> tuple[list[KVCacheBlock], ...]:
|
||||
@ -398,6 +413,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
|
||||
[block_pool.null_block] * max_num_blocks
|
||||
for _ in range(len(kv_cache_group_ids))
|
||||
)
|
||||
block_size = kv_cache_spec.block_size
|
||||
num_contiguous_blocks = 0
|
||||
match_found = False
|
||||
# Search from right to left and early stop when a match is found.
|
||||
@ -405,6 +421,15 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
|
||||
if cached_block := block_pool.get_cached_block(
|
||||
block_hashes[i], kv_cache_group_ids
|
||||
):
|
||||
# Skip prefix matching check if the block is not aligned with
|
||||
# `alignment_tokens`.
|
||||
if (
|
||||
num_contiguous_blocks == 0
|
||||
and block_size != alignment_tokens # Faster for common case.
|
||||
and (i + 1) * block_size % alignment_tokens != 0
|
||||
):
|
||||
continue
|
||||
# Add the cached block to the computed blocks.
|
||||
for computed, cached in zip(computed_blocks, cached_block):
|
||||
computed[i] = cached
|
||||
num_contiguous_blocks += 1
|
||||
@ -423,7 +448,16 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
|
||||
# `num_contiguous_blocks < sliding_window_contiguous_blocks`.
|
||||
for computed in computed_blocks:
|
||||
del computed[num_contiguous_blocks:]
|
||||
while (
|
||||
block_size != alignment_tokens # Faster for common case.
|
||||
and len(computed_blocks[0]) * block_size % alignment_tokens != 0
|
||||
):
|
||||
for computed in computed_blocks:
|
||||
computed.pop()
|
||||
if use_eagle and computed_blocks[0]:
|
||||
assert kv_cache_spec.block_size == alignment_tokens, (
|
||||
"aligned_length is not compatible with eagle now"
|
||||
)
|
||||
for computed in computed_blocks:
|
||||
computed.pop()
|
||||
return computed_blocks
|
||||
@ -477,12 +511,13 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
|
||||
@classmethod
|
||||
def find_longest_cache_hit(
|
||||
cls,
|
||||
block_hashes: list[BlockHash],
|
||||
block_hashes: BlockHashList,
|
||||
max_length: int,
|
||||
kv_cache_group_ids: list[int],
|
||||
block_pool: BlockPool,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
use_eagle: bool,
|
||||
alignment_tokens: int,
|
||||
dcp_world_size: int = 1,
|
||||
pcp_world_size: int = 1,
|
||||
) -> tuple[list[KVCacheBlock], ...]:
|
||||
@ -513,6 +548,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
|
||||
block_pool: The block pool.
|
||||
kv_cache_spec: The kv cache spec.
|
||||
use_eagle: Whether to use eagle.
|
||||
dcp_world_size: The world size of decode context parallelism.
|
||||
pcp_world_size: The world size of prefill context parallelism.
|
||||
alignment_tokens: The returned cache hit length (in tokens) should
|
||||
be a multiple of this value (in tokens).
|
||||
|
||||
Returns:
|
||||
A list of cached blocks
|
||||
@ -526,6 +565,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
|
||||
)
|
||||
assert dcp_world_size == 1, "DCP not support chunked local attn now."
|
||||
assert pcp_world_size == 1, "PCP not support chunked local attn now."
|
||||
assert kv_cache_spec.block_size == alignment_tokens, (
|
||||
"KV cache groups with different block sizes are not compatible with "
|
||||
"chunked local attention now"
|
||||
)
|
||||
max_num_blocks = max_length // kv_cache_spec.block_size
|
||||
if max_length > 0:
|
||||
local_attention_start_idx = (
|
||||
@ -614,12 +657,13 @@ class MambaManager(SingleTypeKVCacheManager):
|
||||
@classmethod
|
||||
def find_longest_cache_hit(
|
||||
cls,
|
||||
block_hashes: list[BlockHash],
|
||||
block_hashes: BlockHashList,
|
||||
max_length: int,
|
||||
kv_cache_group_ids: list[int],
|
||||
block_pool: BlockPool,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
use_eagle: bool,
|
||||
alignment_tokens: int,
|
||||
dcp_world_size: int = 1,
|
||||
pcp_world_size: int = 1,
|
||||
) -> tuple[list[KVCacheBlock], ...]:
|
||||
@ -632,12 +676,21 @@ class MambaManager(SingleTypeKVCacheManager):
|
||||
[] for _ in range(len(kv_cache_group_ids))
|
||||
)
|
||||
|
||||
max_num_blocks = max_length // kv_cache_spec.block_size
|
||||
block_size = kv_cache_spec.block_size
|
||||
max_num_blocks = max_length // block_size
|
||||
# Search from right to left and early stop when a match is found.
|
||||
for i in range(max_num_blocks - 1, -1, -1):
|
||||
if cached_block := block_pool.get_cached_block(
|
||||
block_hashes[i], kv_cache_group_ids
|
||||
):
|
||||
# When Mamba prefix caching is enabled, `block_size` is aligned
|
||||
# across full attention layers and Mamba layers to ensure the
|
||||
# prefix hit length is aligned to a block boundary.
|
||||
if (
|
||||
block_size != alignment_tokens # Faster for common case.
|
||||
and (i + 1) * block_size % alignment_tokens != 0
|
||||
):
|
||||
continue
|
||||
for computed, cached in zip(computed_blocks, cached_block):
|
||||
# the hit length logic later assumes:
|
||||
# hit_length = len(hit_blocks_other_attn[0])
|
||||
@ -710,12 +763,13 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
|
||||
@classmethod
|
||||
def find_longest_cache_hit(
|
||||
cls,
|
||||
block_hashes: list[BlockHash],
|
||||
block_hashes: BlockHashList,
|
||||
max_length: int,
|
||||
kv_cache_group_ids: list[int],
|
||||
block_pool: BlockPool,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
use_eagle: bool,
|
||||
alignment_tokens: int,
|
||||
dcp_world_size: int = 1,
|
||||
pcp_world_size: int = 1,
|
||||
) -> tuple[list[KVCacheBlock], ...]:
|
||||
|
||||
@ -31,7 +31,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.async_utils import cancel_task_threadsafe
|
||||
from vllm.utils.collection_utils import as_list
|
||||
from vllm.utils.func_utils import deprecate_kwargs
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.core_client import EngineCoreClient
|
||||
@ -195,12 +194,6 @@ class AsyncLLM(EngineClient):
|
||||
self.profiler = None
|
||||
|
||||
@classmethod
|
||||
@deprecate_kwargs(
|
||||
"disable_log_requests",
|
||||
additional_message=(
|
||||
"This argument will have no effect. Use `enable_log_requests` instead."
|
||||
),
|
||||
)
|
||||
def from_vllm_config(
|
||||
cls,
|
||||
vllm_config: VllmConfig,
|
||||
@ -213,7 +206,6 @@ class AsyncLLM(EngineClient):
|
||||
client_addresses: dict[str, str] | None = None,
|
||||
client_count: int = 1,
|
||||
client_index: int = 0,
|
||||
disable_log_requests: bool = True, # Deprecated, will be removed
|
||||
) -> "AsyncLLM":
|
||||
# Create the LLMEngine.
|
||||
return cls(
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import replace
|
||||
|
||||
import torch
|
||||
@ -204,7 +205,9 @@ class RejectionSampler(nn.Module):
|
||||
def parse_output(
|
||||
output_token_ids: torch.Tensor,
|
||||
vocab_size: int,
|
||||
) -> list[list[int]]:
|
||||
discard_req_indices: Sequence[int] = (),
|
||||
return_cu_num_tokens: bool = False,
|
||||
) -> tuple[list[list[int]], list[int] | None]:
|
||||
"""Parse the output of the rejection sampler.
|
||||
Args:
|
||||
output_token_ids: The sampled token IDs in shape
|
||||
@ -212,6 +215,8 @@ class RejectionSampler(nn.Module):
|
||||
replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
|
||||
and will be filtered out in this function.
|
||||
vocab_size: The size of the vocabulary.
|
||||
discard_req_indices: Optional row indices to discard tokens in.
|
||||
return_cu_num_tokens: Whether to also return cumulative token counts.
|
||||
Returns:
|
||||
A list of lists of token IDs.
|
||||
"""
|
||||
@ -220,10 +225,15 @@ class RejectionSampler(nn.Module):
|
||||
valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & (
|
||||
output_token_ids_np < vocab_size
|
||||
)
|
||||
cu_num_tokens = None
|
||||
if return_cu_num_tokens:
|
||||
cu_num_tokens = [0] + valid_mask.sum(axis=1).cumsum().tolist()
|
||||
if len(discard_req_indices) > 0:
|
||||
valid_mask[discard_req_indices] = False
|
||||
outputs = [
|
||||
row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np)
|
||||
]
|
||||
return outputs
|
||||
return outputs, cu_num_tokens
|
||||
|
||||
def apply_logits_processors(
|
||||
self,
|
||||
|
||||
@ -1055,11 +1055,11 @@ class EagleProposer:
|
||||
elif (
|
||||
isinstance(target_embed_tokens.weight, torch.Tensor)
|
||||
and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
|
||||
and torch.allclose(
|
||||
# TODO: Offload to CPU for comparison to avoid extra GPU memory
|
||||
# usage in CI testing environments with limited GPU memory
|
||||
and torch.equal(
|
||||
target_embed_tokens.weight.cpu(),
|
||||
self.model.model.embed_tokens.weight.cpu(),
|
||||
rtol=1e-5,
|
||||
atol=1e-7,
|
||||
)
|
||||
):
|
||||
share_embeddings = True
|
||||
@ -1105,8 +1105,11 @@ class EagleProposer:
|
||||
hasattr(target_language_model, "lm_head")
|
||||
and isinstance(target_language_model.lm_head.weight, torch.Tensor)
|
||||
and isinstance(self.model.lm_head.weight, torch.Tensor)
|
||||
# TODO: Offload to CPU for comparison to avoid extra GPU memory
|
||||
# usage in CI testing environments with limited GPU memory
|
||||
and torch.equal(
|
||||
target_language_model.lm_head.weight, self.model.lm_head.weight
|
||||
target_language_model.lm_head.weight.cpu(),
|
||||
self.model.lm_head.weight.cpu(),
|
||||
)
|
||||
):
|
||||
share_lm_head = True
|
||||
|
||||
@ -186,7 +186,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
|
||||
self,
|
||||
model_runner_output: ModelRunnerOutput,
|
||||
sampled_token_ids: torch.Tensor,
|
||||
logprobs_tensors: torch.Tensor | None,
|
||||
logprobs_tensors: LogprobsTensors | None,
|
||||
invalid_req_indices: list[int],
|
||||
async_output_copy_stream: torch.cuda.Stream,
|
||||
vocab_size: int,
|
||||
@ -222,28 +222,29 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
|
||||
|
||||
This function blocks until the copy is finished.
|
||||
"""
|
||||
max_gen_len = self.sampled_token_ids_cpu.shape[-1]
|
||||
self.async_copy_ready_event.synchronize()
|
||||
|
||||
# Release the device tensors once the copy has completed.
|
||||
del self._logprobs_tensors
|
||||
del self._sampled_token_ids
|
||||
max_gen_len = self.sampled_token_ids_cpu.shape[-1]
|
||||
if max_gen_len == 1:
|
||||
valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist()
|
||||
for i in self._invalid_req_indices:
|
||||
valid_sampled_token_ids[i].clear()
|
||||
cu_num_tokens = None
|
||||
else:
|
||||
valid_sampled_token_ids = RejectionSampler.parse_output(
|
||||
valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
|
||||
self.sampled_token_ids_cpu,
|
||||
self.vocab_size,
|
||||
self._invalid_req_indices,
|
||||
return_cu_num_tokens=self._logprobs_tensors_cpu is not None,
|
||||
)
|
||||
for i in self._invalid_req_indices:
|
||||
valid_sampled_token_ids[i].clear()
|
||||
|
||||
output = self._model_runner_output
|
||||
output.sampled_token_ids = valid_sampled_token_ids
|
||||
if self._logprobs_tensors_cpu:
|
||||
# NOTE(nick): this will need to be updated to use cu_num_accepted_tokens
|
||||
# for async sched + spec decode + logprobs compatibility.
|
||||
output.logprobs = self._logprobs_tensors_cpu.tolists()
|
||||
output.logprobs = self._logprobs_tensors_cpu.tolists(cu_num_tokens)
|
||||
return output
|
||||
|
||||
|
||||
@ -2629,28 +2630,24 @@ class GPUModelRunner(
|
||||
sampled_token_ids = sampler_output.sampled_token_ids
|
||||
logprobs_tensors = sampler_output.logprobs_tensors
|
||||
invalid_req_indices = []
|
||||
cu_num_new_tokens: list[int] | None = None
|
||||
cu_num_tokens: list[int] | None = None
|
||||
if not self.use_async_scheduling:
|
||||
# Get the valid generated tokens.
|
||||
max_gen_len = sampled_token_ids.shape[-1]
|
||||
if max_gen_len == 1:
|
||||
# No spec decode tokens.
|
||||
valid_sampled_token_ids = self._to_list(sampled_token_ids)
|
||||
# Mask out the sampled tokens that should not be sampled.
|
||||
for i in discard_sampled_tokens_req_indices:
|
||||
valid_sampled_token_ids[int(i)].clear()
|
||||
else:
|
||||
# Includes spec decode tokens.
|
||||
valid_sampled_token_ids = self.rejection_sampler.parse_output(
|
||||
valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
|
||||
sampled_token_ids,
|
||||
self.input_batch.vocab_size,
|
||||
discard_sampled_tokens_req_indices,
|
||||
return_cu_num_tokens=logprobs_tensors is not None,
|
||||
)
|
||||
if logprobs_tensors:
|
||||
# Needed for extracting logprobs when spec decoding.
|
||||
# This must be done prior to discarding sampled tokens.
|
||||
cu_num_new_tokens = [0]
|
||||
for toks in valid_sampled_token_ids:
|
||||
cu_num_new_tokens.append(cu_num_new_tokens[-1] + len(toks))
|
||||
# Mask out the sampled tokens that should not be sampled.
|
||||
for i in discard_sampled_tokens_req_indices:
|
||||
valid_sampled_token_ids[int(i)].clear()
|
||||
else:
|
||||
valid_sampled_token_ids = []
|
||||
invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
|
||||
@ -2704,7 +2701,7 @@ class GPUModelRunner(
|
||||
req_state.output_token_ids.extend(sampled_ids)
|
||||
|
||||
logprobs_lists = (
|
||||
logprobs_tensors.tolists(cu_num_new_tokens)
|
||||
logprobs_tensors.tolists(cu_num_tokens)
|
||||
if not self.use_async_scheduling and logprobs_tensors is not None
|
||||
else None
|
||||
)
|
||||
@ -2824,7 +2821,7 @@ class GPUModelRunner(
|
||||
# returns True. before returning early here we call
|
||||
# dummy run to ensure coordinate_batch_across_dp
|
||||
# is called into to avoid out of sync issues.
|
||||
self._dummy_run(1)
|
||||
self._dummy_run(self._get_num_input_tokens(1))
|
||||
if not has_kv_transfer_group():
|
||||
# Return empty ModelRunnerOutput if no work to do.
|
||||
return EMPTY_MODEL_RUNNER_OUTPUT
|
||||
@ -3495,6 +3492,10 @@ class GPUModelRunner(
|
||||
scope="local",
|
||||
)
|
||||
prepare_communication_buffer_for_model(self.model)
|
||||
if (drafter := getattr(self, "drafter", None)) and (
|
||||
drafter_model := getattr(drafter, "model", None)
|
||||
):
|
||||
prepare_communication_buffer_for_model(drafter_model)
|
||||
mm_config = self.model_config.multimodal_config
|
||||
self.is_multimodal_pruning_enabled = (
|
||||
supports_multimodal_pruning(self.get_model())
|
||||
@ -4277,14 +4278,18 @@ class GPUModelRunner(
|
||||
# NOTE: This happens when encoder cache needs to store
|
||||
# the embeddings that encoder outputs are scattered onto.
|
||||
# In this case we create dummy embeddings of size
|
||||
# (encode_budget, hidden_size) and scatter encoder
|
||||
# output into it.
|
||||
# (max_tokens_for_modality, hidden_size) and scatter
|
||||
# encoder output into it.
|
||||
encoder_output_shape = dummy_encoder_outputs[0].shape
|
||||
if encoder_output_shape[0] < encoder_budget:
|
||||
max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[
|
||||
dummy_modality
|
||||
]
|
||||
if encoder_output_shape[0] < max_mm_tokens_per_item:
|
||||
encoder_hidden_size = encoder_output_shape[-1]
|
||||
expanded_outputs = []
|
||||
for output in dummy_encoder_outputs:
|
||||
expanded = output.new_zeros(
|
||||
(encoder_budget, encoder_output_shape[-1])
|
||||
(max_mm_tokens_per_item, encoder_hidden_size)
|
||||
)
|
||||
num_tokens = output.shape[0]
|
||||
expanded[:num_tokens].copy_(output)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user