mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-06 02:09:08 +08:00
Apply fixes for CUDA 13 (#24599)
Signed-off-by: Aidyn-A <aidyn.b.aitzhan@gmail.com>
This commit is contained in:
parent
9fccd04e30
commit
bfe9380161
@ -175,6 +175,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Set CUDA include flags for CXX compiler.
|
||||||
|
#
|
||||||
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
|
||||||
|
if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
|
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
|
||||||
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
|
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
|
||||||
|
|||||||
17
csrc/cub_helpers.h
Normal file
17
csrc/cub_helpers.h
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#include <cub/cub.cuh>
|
||||||
|
#if CUB_VERSION >= 200800
|
||||||
|
#include <cuda/std/functional>
|
||||||
|
using CubAddOp = cuda::std::plus<>;
|
||||||
|
using CubMaxOp = cuda::maximum<>;
|
||||||
|
#else // if CUB_VERSION < 200800
|
||||||
|
using CubAddOp = cub::Sum;
|
||||||
|
using CubMaxOp = cub::Max;
|
||||||
|
#endif // CUB_VERSION
|
||||||
|
#else
|
||||||
|
#include <hipcub/hipcub.hpp>
|
||||||
|
using CubAddOp = cub::Sum;
|
||||||
|
using CubMaxOp = cub::Max;
|
||||||
|
#endif // USE_ROCM
|
||||||
@ -1,15 +1,10 @@
|
|||||||
#include "type_convert.cuh"
|
#include "type_convert.cuh"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
#include "cub_helpers.h"
|
||||||
|
|
||||||
#include <torch/cuda.h>
|
#include <torch/cuda.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
#include <cub/cub.cuh>
|
|
||||||
#else
|
|
||||||
#include <hipcub/hipcub.hpp>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// TODO(woosuk): Further optimize this kernel.
|
// TODO(woosuk): Further optimize this kernel.
|
||||||
@ -30,7 +25,7 @@ __global__ void rms_norm_kernel(
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
@ -85,7 +80,7 @@ fused_add_rms_norm_kernel(
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
@ -126,7 +121,7 @@ fused_add_rms_norm_kernel(
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
|
|||||||
@ -8,16 +8,11 @@
|
|||||||
#include "type_convert.cuh"
|
#include "type_convert.cuh"
|
||||||
#include "quantization/fp8/common.cuh"
|
#include "quantization/fp8/common.cuh"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
#include "cub_helpers.h"
|
||||||
|
|
||||||
#include <torch/cuda.h>
|
#include <torch/cuda.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
#include <cub/cub.cuh>
|
|
||||||
#else
|
|
||||||
#include <hipcub/hipcub.hpp>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// TODO(woosuk): Further optimize this kernel.
|
// TODO(woosuk): Further optimize this kernel.
|
||||||
@ -39,7 +34,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
@ -100,7 +95,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
@ -149,7 +144,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
|
|||||||
@ -20,17 +20,7 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include "../cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
|
#include "../cub_helpers.h"
|
||||||
#ifndef USE_ROCM
|
|
||||||
#include <cub/util_type.cuh>
|
|
||||||
#include <cub/cub.cuh>
|
|
||||||
#include <cuda/std/functional>
|
|
||||||
using AddOp = cuda::std::plus<float>;
|
|
||||||
#else
|
|
||||||
#include <hipcub/util_type.hpp>
|
|
||||||
#include <hipcub/hipcub.hpp>
|
|
||||||
using AddOp = cub::Sum;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
@ -79,7 +69,7 @@ __launch_bounds__(TPB) __global__
|
|||||||
threadData = max(static_cast<float>(input[idx]), threadData);
|
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||||
}
|
}
|
||||||
|
|
||||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
|
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
|
||||||
if (threadIdx.x == 0)
|
if (threadIdx.x == 0)
|
||||||
{
|
{
|
||||||
float_max = maxElem;
|
float_max = maxElem;
|
||||||
@ -94,7 +84,7 @@ __launch_bounds__(TPB) __global__
|
|||||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
|
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
|
||||||
|
|
||||||
if (threadIdx.x == 0)
|
if (threadIdx.x == 0)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -7,17 +7,10 @@
|
|||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
|
#include "../../cub_helpers.h"
|
||||||
#include "../../dispatch_utils.h"
|
#include "../../dispatch_utils.h"
|
||||||
#include "../vectorization_utils.cuh"
|
#include "../vectorization_utils.cuh"
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
#include <cub/cub.cuh>
|
|
||||||
#include <cub/util_type.cuh>
|
|
||||||
#else
|
|
||||||
#include <hipcub/hipcub.hpp>
|
|
||||||
#include <hipcub/util_type.hpp>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static inline __device__ int8_t float_to_int8_rn(float x) {
|
static inline __device__ int8_t float_to_int8_rn(float x) {
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
static constexpr auto i8_min =
|
static constexpr auto i8_min =
|
||||||
@ -173,7 +166,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
|
|||||||
});
|
});
|
||||||
using BlockReduce = cub::BlockReduce<float, 256>;
|
using BlockReduce = cub::BlockReduce<float, 256>;
|
||||||
__shared__ typename BlockReduce::TempStorage tmp;
|
__shared__ typename BlockReduce::TempStorage tmp;
|
||||||
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
|
float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);
|
||||||
__shared__ float absmax;
|
__shared__ float absmax;
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
absmax = block_max;
|
absmax = block_max;
|
||||||
|
|||||||
@ -1,15 +1,10 @@
|
|||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
#include "../../cub_helpers.h"
|
||||||
#include "../vectorization_utils.cuh"
|
#include "../vectorization_utils.cuh"
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include <ATen/cuda/Exceptions.h>
|
#include <ATen/cuda/Exceptions.h>
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
#include <cub/cub.cuh>
|
|
||||||
#else
|
|
||||||
#include <hipcub/hipcub.hpp>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
template <typename scalar_t, typename fp8_type>
|
template <typename scalar_t, typename fp8_type>
|
||||||
@ -116,7 +111,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
|
|||||||
using BlockReduce = cub::BlockReduce<float, 256>;
|
using BlockReduce = cub::BlockReduce<float, 256>;
|
||||||
__shared__ typename BlockReduce::TempStorage tmp;
|
__shared__ typename BlockReduce::TempStorage tmp;
|
||||||
const float block_max =
|
const float block_max =
|
||||||
BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
|
BlockReduce(tmp).Reduce(absmax_val, CubMaxOp{}, blockDim.x);
|
||||||
|
|
||||||
__shared__ float token_scale;
|
__shared__ float token_scale;
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
|
|||||||
@ -8,11 +8,7 @@
|
|||||||
#include "quantization/utils.cuh"
|
#include "quantization/utils.cuh"
|
||||||
#include "quant_conversions.cuh"
|
#include "quant_conversions.cuh"
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#include "../../cub_helpers.h"
|
||||||
#include <cub/cub.cuh>
|
|
||||||
#else
|
|
||||||
#include <hipcub/hipcub.hpp>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
@ -36,7 +32,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
|
ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
__shared__ float s_rms;
|
__shared__ float s_rms;
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
@ -73,7 +69,7 @@ __device__ void compute_dynamic_per_token_scales(
|
|||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
block_absmax_val_maybe =
|
block_absmax_val_maybe =
|
||||||
BlockReduce(reduceStore)
|
BlockReduce(reduceStore)
|
||||||
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
|
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||||
|
|
||||||
__shared__ float s_token_scale;
|
__shared__ float s_token_scale;
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
@ -169,7 +165,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
|||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
|
ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
|
||||||
|
|
||||||
__shared__ float s_rms;
|
__shared__ float s_rms;
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
@ -240,7 +236,7 @@ __device__ void compute_dynamic_per_token_scales(
|
|||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
block_absmax_val_maybe =
|
block_absmax_val_maybe =
|
||||||
BlockReduce(reduceStore)
|
BlockReduce(reduceStore)
|
||||||
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
|
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||||
|
|
||||||
__shared__ float s_token_scale;
|
__shared__ float s_token_scale;
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user