Add marlin implementation for fp8 decompress

vllm-project · mgoin · Jul 3, 2024 · Jun 21, 2024 · Jun 21, 2024 · Jun 24, 2024
commit 7c5117565c6af82f86c73260d63aad4bb1d6000b
@@ -178,6 +178,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+    "csrc/quantization/fp8/fp8_marlin.cu"
     "csrc/custom_all_reduce.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"

diff --git a/csrc/ops.h b/csrc/ops.h
@@ -92,6 +92,12 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);
 
+torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                              torch::Tensor& b_scales, torch::Tensor& g_idx,
+                              torch::Tensor& perm, torch::Tensor& workspace,
+                              int64_t num_bits, int64_t size_m, int64_t size_n,
+                              int64_t size_k, bool is_k_full);
+
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
 
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,