huggingface · danieldk · Jul 29, 2024
diff --git a/server/marlin/marlin_kernels/__init__.pyi b/server/marlin/marlin_kernels/__init__.pyi
@@ -1,5 +1,11 @@
 import torch
 
+def awq_marlin_repack(
+    b_q_weight: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
+    """Repack AWQ parameters (``b_q_weight``) for GPTQ-Marlin and return the repacked tensor."""
+    ...
+
 def gptq_marlin_gemm(
     a: torch.Tensor,
     b_q_weight: torch.Tensor,
@@ -12,6 +18,8 @@ def gptq_marlin_gemm(
     size_n: int,
     size_k: int,
     is_k_full: bool,
+    has_zp: bool,
+    use_fp32_reduce: bool,
 ) -> torch.Tensor:
     """
     Matrix multiplication using Marlin kernels. This is an extension of

diff --git a/server/marlin/marlin_kernels/ext.hh b/server/marlin/marlin_kernels/ext.hh
@@ -14,7 +14,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                                torch::Tensor &g_idx, torch::Tensor &perm,
                                torch::Tensor &workspace, int64_t num_bits,
                                int64_t size_m, int64_t size_n, int64_t size_k,
-                               bool is_k_full, bool has_zp);
+                               bool is_k_full, bool has_zp,
+                               bool use_fp32_reduce);
 
 torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                                   torch::Tensor &b_meta,