#pragma once

#include "quantization/vectorization.cuh"
#include "quantization/utils.cuh"

#include <cmath>

#ifndef USE_ROCM
  #include "nvidia/quant_utils.cuh"
#else
  #include "amd/quant_utils.cuh"
#endif

// Determines whether the preferred FP8 type for the current platform is the
// OCP format. Note that for CUDA this just returns true,
// but on ROCm it will check device props.
static bool is_fp8_ocp() {
#ifndef USE_ROCM
  return true;
#else
  auto dprops = at::cuda::getCurrentDeviceProperties();
  std::string device_arch = dprops->gcnArchName;
  size_t substring = device_arch.find("gfx94");
  return substring == std::string::npos;
#endif
}

namespace vllm {

// Atomically updates *addr to max(*addr, value) for floats by reinterpreting
// the bit patterns as integers: non-negative floats order correctly as signed
// ints (atomicMax), while negative floats order in reverse as unsigned ints
// (hence atomicMin).
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
  float old;
  old = (value >= 0)
            ? __int_as_float(atomicMax((int*)addr, __float_as_int(value)))
            : __uint_as_float(
                  atomicMin((unsigned int*)addr, __float_as_uint(value)));

  return old;
}

// Scales `val` by `scale` (multiplying when the scale is pre-inverted,
// dividing otherwise), clamps to the representable range of fp8_type, and
// converts the result to fp8.
template <bool is_scale_inverted, typename fp8_type>
__device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
                                                          float const scale) {
  float x = 0.0f;
  if constexpr (is_scale_inverted) {
    x = val * scale;
  } else {
    x = val / scale;
  }

  float r = fmaxf(-quant_type_max_v<fp8_type>,
                  fminf(x, quant_type_max_v<fp8_type>));
#ifndef USE_ROCM
  // Use hardware cvt instruction for fp8 on nvidia
  // Currently only supports fp8_type = c10::Float8_e4m3fn
  return fp8::vec_conversion<fp8_type, float>(r);
#else
  // Use hardware cvt instruction for fp8 on rocm
  return fp8::cvt_c10<fp8_type>(r);
#endif
}

}  // namespace vllm
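
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the upstream header): the two
// kernels below show the typical pattern these helpers support for dynamic
// per-tensor quantization: (1) reduce the tensor's absolute maximum into a
// single float with atomicMaxFloat, (2) derive a scale from it and convert
// each element with scaled_fp8_conversion. The `example` namespace and the
// kernel names are hypothetical and do not correspond to existing vLLM
// symbols.
// ---------------------------------------------------------------------------
namespace vllm {
namespace example {

// Step 1: fold each thread's local |x| maximum into *global_absmax, which is
// assumed to be zero-initialized before launch. A production kernel would
// first reduce within the block (e.g. via shared memory) to cut down on
// atomics; this sketch issues one atomicMaxFloat per thread for brevity.
__global__ void absmax_example_kernel(const float* __restrict__ input,
                                      float* __restrict__ global_absmax,
                                      int64_t num_elems) {
  float local_max = 0.0f;
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    local_max = fmaxf(local_max, fabsf(input[i]));
  }
  atomicMaxFloat(global_absmax, local_max);
}

// Step 2: quantize with a scale computed beforehand (for example as
// absmax / quant_type_max_v<fp8_type>). is_scale_inverted = false means
// scaled_fp8_conversion divides each element by `scale`.
template <typename fp8_type>
__global__ void quantize_example_kernel(fp8_type* __restrict__ out,
                                        const float* __restrict__ input,
                                        const float* __restrict__ scale,
                                        int64_t num_elems) {
  const float s = *scale;
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    out[i] = scaled_fp8_conversion<false, fp8_type>(input[i], s);
  }
}

}  // namespace example
}  // namespace vllm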