mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-12 15:16:14 +08:00
[Kernel][Build/CI] Bump CUTLASS to 3.8 and add initializers for cutlass epilogues (#13797)
This commit is contained in:
parent
e1fe7591f2
commit
094b7d9496
@ -266,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
cutlass
|
cutlass
|
||||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||||
# Please keep this in sync with CUTLASS_REVISION line above.
|
# Please keep this in sync with CUTLASS_REVISION line above.
|
||||||
GIT_TAG v3.7.0
|
GIT_TAG v3.8.0
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
|
|
||||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||||
@ -321,7 +321,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
|
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
|
||||||
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
||||||
@ -401,7 +401,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# FP4 Archs and flags
|
# FP4 Archs and flags
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
|
||||||
)
|
)
|
||||||
@ -612,7 +612,7 @@ endif()
|
|||||||
|
|
||||||
if(VLLM_FLASH_ATTN_SRC_DIR)
|
if(VLLM_FLASH_ATTN_SRC_DIR)
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn SOURCE_DIR
|
vllm-flash-attn SOURCE_DIR
|
||||||
${VLLM_FLASH_ATTN_SRC_DIR}
|
${VLLM_FLASH_ATTN_SRC_DIR}
|
||||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
)
|
)
|
||||||
|
|||||||
@ -122,8 +122,8 @@ struct ScaledEpilogue
|
|||||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||||
|
|
||||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||||
return ArgumentType{a_args, evt0_args};
|
return ArgumentType{a_args, evt0_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -167,8 +167,8 @@ struct ScaledEpilogueBias
|
|||||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||||
|
|
||||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||||
return ArgumentType{a_args, evt0_args, bias_args};
|
return ArgumentType{a_args, evt0_args, bias_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -230,9 +230,10 @@ struct ScaledEpilogueBiasAzp
|
|||||||
auto azp_adj_args =
|
auto azp_adj_args =
|
||||||
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
|
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
|
||||||
|
|
||||||
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
|
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
|
||||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
|
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
b_args, evt_azp_args, {}};
|
||||||
|
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -309,11 +310,12 @@ struct ScaledEpilogueBiasAzpToken
|
|||||||
auto azp_adj_args =
|
auto azp_adj_args =
|
||||||
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
|
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
|
||||||
|
|
||||||
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
|
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
|
||||||
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
|
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
|
||||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
|
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
b_args, evt_acc_args, {}};
|
||||||
|
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}; // namespace vllm::c2x
|
}; // namespace vllm::c2x
|
||||||
|
|||||||
@ -146,8 +146,8 @@ struct ScaledEpilogue
|
|||||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||||
|
|
||||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||||
return ArgumentType{a_args, evt0_args};
|
return ArgumentType{a_args, evt0_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -193,8 +193,8 @@ struct ScaledEpilogueBias
|
|||||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||||
|
|
||||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||||
return ArgumentType{a_args, evt0_args, bias_args};
|
return ArgumentType{a_args, evt0_args, bias_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -236,8 +236,8 @@ struct ScaledEpilogueColumnBias
|
|||||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||||
|
|
||||||
typename EVTCompute0::Arguments evt0_args{b_args};
|
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
|
||||||
return ArgumentType{a_args, evt0_args, bias_args};
|
return ArgumentType{a_args, evt0_args, bias_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -297,9 +297,10 @@ struct ScaledEpilogueBiasAzp
|
|||||||
auto azp_adj_args =
|
auto azp_adj_args =
|
||||||
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
|
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
|
||||||
|
|
||||||
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
|
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
|
||||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
|
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
b_args, evt_azp_args, {}};
|
||||||
|
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -374,10 +375,11 @@ struct ScaledEpilogueBiasAzpToken
|
|||||||
auto azp_adj_args =
|
auto azp_adj_args =
|
||||||
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
|
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
|
||||||
|
|
||||||
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
|
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
|
||||||
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
|
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
|
||||||
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
|
typename EVTComputeScaleB::Arguments evt_scale_b_args{
|
||||||
return ArgumentType{a_args, evt_scale_b_args, bias_args};
|
b_args, evt_acc_args, {}};
|
||||||
|
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user