diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f76b24c025ff..87681d7eb960 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,29 +12,29 @@ repos: - id: yapf args: [--in-place, --verbose] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.3 + rev: v0.11.7 hooks: - id: ruff args: [--output-format, github, --fix] - repo: https://github.com/codespell-project/codespell - rev: v2.4.0 + rev: v2.4.1 hooks: - id: codespell additional_dependencies: ['tomli'] args: ['--toml', 'pyproject.toml'] - repo: https://github.com/PyCQA/isort - rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0 + rev: 6.0.1 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.7 + rev: v20.1.3 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] - repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.27 + rev: v0.9.29 hooks: - id: pymarkdown args: [fix] @@ -43,7 +43,7 @@ repos: hooks: - id: actionlint - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.6.2 + rev: 0.6.17 hooks: - id: pip-compile args: [requirements/test.in, -o, requirements/test.txt] diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index 47ecf109d0f5..a217401b3d7c 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h index 205b308fe511..3705216cada5 100644 --- a/csrc/moe/marlin_moe_wna16/marlin_template.h +++ b/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -209,8 +209,8 @@ __device__ inline typename ScalarType::FragB dequant( const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -233,9 +233,9 @@ dequant(int q, // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t ADD = 0xC308C308; diff --git a/csrc/moe/moe_wna16_utils.h b/csrc/moe/moe_wna16_utils.h index 4396b80240ef..8ef03f0e6052 100644 --- a/csrc/moe/moe_wna16_utils.h +++ b/csrc/moe/moe_wna16_utils.h @@ -108,11 +108,11 @@ __device__ inline void dequant(int q, half2* res) { const int MUL = 0x2c002c00; const int ADD = 0xd400d400; - int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); q >>= 8; - int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); res[0] = __hsub2(*reinterpret_cast(&lo0), *reinterpret_cast(&SUB)); @@ -149,13 +149,13 @@ __device__ inline void dequant(int q, nv_bfloat162* res) { static constexpr uint32_t MASK = 0x000f000f; static constexpr uint32_t EX = 0x43004300; - int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t ADD = 0xC300C300; diff --git a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu index ec0bf2c3cb4b..ea3bb4299046 100644 --- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu +++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu @@ -347,7 +347,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { hmma16816_f32( C_frag[m_idx][n_idx], A_frag[reg_buf_idx][m_idx], - reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); + reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); } } } diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 83bbd1e6816a..a974c881eb83 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index ba0a2410c037..ea96326ed7e6 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index cd1830764cce..c96d68d9b29a 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 49eee4128ee7..b26505f771c8 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408;