diff --git a/tests/kernels/quantization/test_awq.py b/tests/kernels/quantization/test_awq.py
index efb62ca3799a9..3bf59dea30972 100644
--- a/tests/kernels/quantization/test_awq.py
+++ b/tests/kernels/quantization/test_awq.py
@@ -41,9 +41,9 @@ def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
     qweight = torch.randint(
         -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
     )
-    scales = torch.randint(
+    scales = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
+    qzeros = torch.randint(
         -2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32
     )
-    qzeros = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
     split_k_iters = 8
-    opcheck(torch.ops._C.awq_gemm, (input, qweight, qzeros, scales, split_k_iters))
+    opcheck(torch.ops._C.awq_gemm, (input, qweight, scales, qzeros, split_k_iters))
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 52a58a082683d..2319655008c50 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -498,15 +498,15 @@ def awq_dequantize(
 def awq_gemm(
     input: torch.Tensor,
     qweight: torch.Tensor,
-    qzeros: torch.Tensor,
     scales: torch.Tensor,
+    qzeros: torch.Tensor,
     split_k_iters: int,
 ) -> torch.Tensor:
     if envs.VLLM_USE_TRITON_AWQ:
         from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton

-        return awq_gemm_triton(input, qweight, qzeros, scales, split_k_iters)
-    return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
+        return awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters)
+    return torch.ops._C.awq_gemm(input, qweight, scales, qzeros, split_k_iters)


 # gptq
@@ -632,8 +632,8 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
     def _awq_gemm_fake(
         input: torch.Tensor,
         qweight: torch.Tensor,
-        qzeros: torch.Tensor,
         scales: torch.Tensor,
+        qzeros: torch.Tensor,
         split_k_iters: torch.SymInt,
     ) -> torch.Tensor:
         num_in_feats = input.size(0)
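
For context, a minimal caller-side sketch of the corrected argument order (`scales` before `qzeros`), mirroring the tensor shapes used in `test_awq_gemm_opcheck`. The `input` batch size and the placeholder tensor contents are illustrative assumptions, and a CUDA device plus the compiled `_C` extension are assumed to be available.

```python
import torch

from vllm import _custom_ops as ops

# Shapes mirror tests/kernels/quantization/test_awq.py: qweight and qzeros hold
# int32-packed 4-bit values (8 per int32), scales are fp16 per-group factors.
# The input batch size of 2 is an illustrative assumption.
input = torch.rand((2, 8192), device="cuda", dtype=torch.float16)
qweight = torch.randint(
    -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
)
scales = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
qzeros = torch.randint(
    -2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32
)
split_k_iters = 8

# After this change, the Python wrapper passes scales before qzeros, so the
# call below matches the _C.awq_gemm kernel's parameter order.
out = ops.awq_gemm(input, qweight, scales, qzeros, split_k_iters)
print(out.shape)  # (2, 2048): one output column per unpacked weight column
```

The output values are meaningless with random placeholder data; the sketch only demonstrates the argument order that the opcheck test and both `awq_gemm` call sites now share.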