[Kernel][Build/CI] Bump CUTLASS to 3.8 and add initializers for cutlass epilogues (#13797)

This commit is contained in:
Henry Tsang 2025-02-25 18:52:03 -08:00 committed by GitHub
parent e1fe7591f2
commit 094b7d9496
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 33 additions and 29 deletions

View File

@ -266,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above. # Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.7.0 GIT_TAG v3.8.0
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -321,7 +321,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now). # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
@ -401,7 +401,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# FP4 Archs and flags # FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
) )
@ -612,7 +612,7 @@ endif()
if(VLLM_FLASH_ATTN_SRC_DIR) if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn SOURCE_DIR vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR} ${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
) )

View File

@ -122,8 +122,8 @@ struct ScaledEpilogue
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales); auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales); auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
typename EVTCompute0::Arguments evt0_args{b_args}; typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args}; return ArgumentType{a_args, evt0_args, {}};
} }
}; };
@ -167,8 +167,8 @@ struct ScaledEpilogueBias
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales); auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias); auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
typename EVTCompute0::Arguments evt0_args{b_args}; typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, bias_args}; return ArgumentType{a_args, evt0_args, bias_args, {}};
} }
}; };
@ -230,9 +230,10 @@ struct ScaledEpilogueBiasAzp
auto azp_adj_args = auto azp_adj_args =
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj); SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; typename EVTComputeScaleB::Arguments evt_scale_b_args{
return ArgumentType{a_args, evt_scale_b_args, bias_args}; b_args, evt_azp_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
} }
}; };
@ -309,11 +310,12 @@ struct ScaledEpilogueBiasAzpToken
auto azp_adj_args = auto azp_adj_args =
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj); SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; typename EVTComputeScaleB::Arguments evt_scale_b_args{
return ArgumentType{a_args, evt_scale_b_args, bias_args}; b_args, evt_acc_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
} }
}; };
}; // namespace vllm::c2x }; // namespace vllm::c2x

View File

@ -146,8 +146,8 @@ struct ScaledEpilogue
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales); auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales); auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
typename EVTCompute0::Arguments evt0_args{b_args}; typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args}; return ArgumentType{a_args, evt0_args, {}};
} }
}; };
@ -193,8 +193,8 @@ struct ScaledEpilogueBias
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales); auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias); auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
typename EVTCompute0::Arguments evt0_args{b_args}; typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, bias_args}; return ArgumentType{a_args, evt0_args, bias_args, {}};
} }
}; };
@ -236,8 +236,8 @@ struct ScaledEpilogueColumnBias
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales); auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias); auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
typename EVTCompute0::Arguments evt0_args{b_args}; typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, bias_args}; return ArgumentType{a_args, evt0_args, bias_args, {}};
} }
}; };
@ -297,9 +297,10 @@ struct ScaledEpilogueBiasAzp
auto azp_adj_args = auto azp_adj_args =
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj); SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; typename EVTComputeScaleB::Arguments evt_scale_b_args{
return ArgumentType{a_args, evt_scale_b_args, bias_args}; b_args, evt_azp_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
} }
}; };
@ -374,10 +375,11 @@ struct ScaledEpilogueBiasAzpToken
auto azp_adj_args = auto azp_adj_args =
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj); SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; typename EVTComputeScaleB::Arguments evt_scale_b_args{
return ArgumentType{a_args, evt_scale_b_args, bias_args}; b_args, evt_acc_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
} }
}; };