This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Rs/marlin downstream v0.3.2 (#43)
Co-authored-by: Andrew Feldman <afeldman@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com>
Co-authored-by: alexm <alexm@neuralmagic.com>
4 people authored Feb 22, 2024
1 parent acb8615 commit 4b44479
Showing 15 changed files with 1,563 additions and 17 deletions.
9 changes: 9 additions & 0 deletions csrc/ops.h
@@ -80,6 +80,15 @@ torch::Tensor awq_dequantize(
   int split_k_iters,
   int thx,
   int thy);
+
+torch::Tensor marlin_gemm(
+  torch::Tensor &a,
+  torch::Tensor &b_q_weight,
+  torch::Tensor &b_scales,
+  torch::Tensor &workspace,
+  int64_t size_m,
+  int64_t size_n,
+  int64_t size_k);
 #endif

 void squeezellm_gemm(
1 change: 1 addition & 0 deletions csrc/pybind.cpp
@@ -52,6 +52,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 #ifndef USE_ROCM
   ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
   ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
+  ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
 #endif
   ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
   ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
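Once the extension is built, the marlin_gemm binding registered above is callable from Python. The sketch below is illustrative only: it assumes the extension is importable as vllm._C (as in upstream vLLM around this release) and that the packed weights, scales, and workspace buffer were prepared by the Marlin GPTQ repacking utilities; only the call signature itself comes from this diff.

    # Illustrative sketch; module path and tensor preparation are assumptions,
    # only the marlin_gemm signature is taken from this commit.
    import torch
    from vllm._C import ops  # assumed extension module name

    def marlin_matmul(a: torch.Tensor,
                      b_q_weight: torch.Tensor,
                      b_scales: torch.Tensor,
                      workspace: torch.Tensor) -> torch.Tensor:
        """Thin wrapper over the newly bound kernel.

        a: fp16 activations of shape (size_m, size_k).
        b_q_weight, b_scales: quantized weights and per-group scales in the
        packed layout the Marlin kernel expects (produced outside this sketch).
        workspace: scratch buffer used by the kernel.
        Returns an fp16 tensor of shape (size_m, size_n).
        """
        size_m, size_k = a.shape
        size_n = b_scales.shape[1]  # scales carry the output dimension
        return ops.marlin_gemm(a, b_q_weight, b_scales, workspace,
                               size_m, size_n, size_k)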