[Core] Support loading GGUF model (#5191)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
vllm-project · Aug 5, 2024 · 360bd67 · 360bd67
1 parent ef527be
commit 360bd67
Show file tree

Hide file tree

Showing 29 changed files with 4,970 additions and 21 deletions.
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
@@ -30,6 +30,11 @@ jobs:
       run: |
         EXCLUDES=(
             'csrc/moe/topk_softmax_kernels.cu'
+            'csrc/quantization/gguf/ggml-common.h'
+            'csrc/quantization/gguf/dequantize.cuh'
+            'csrc/quantization/gguf/vecdotq.cuh'
+            'csrc/quantization/gguf/mmq.cuh'
+            'csrc/quantization/gguf/mmvq.cuh'
         )
         find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
             | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -208,6 +208,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
+    "csrc/quantization/gguf/gguf_kernel.cu"
     "csrc/quantization/fp8/fp8_marlin.cu"
     "csrc/custom_all_reduce.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"

diff --git a/csrc/ops.h b/csrc/ops.h
@@ -107,6 +107,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
                                 int64_t size_n, int64_t num_bits);
 
+torch::Tensor ggml_dequantize(torch::Tensor W, int8_t type, int64_t m,
+                              int64_t n);
+
+torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, int8_t type,
+                                  int64_t row);
+
+torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int8_t type,
+                              int64_t row);
+
 torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,