Sync with whisper.cpp 🚀
github-actions[bot] committed Jan 4, 2024
1 parent def360f commit 76a0996
Showing 18 changed files with 822 additions and 384 deletions.
10 changes: 4 additions & 6 deletions src/whisper/Package.swift
@@ -13,9 +13,13 @@ let package = Package(
products: [
.library(name: "whisper", targets: ["whisper"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
],
targets: [
.target(
name: "whisper",
dependencies: ["ggml"],
path: ".",
exclude: [
"bindings",
@@ -32,14 +36,8 @@ let package = Package(
"Makefile"
],
sources: [
"ggml.c",
"whisper.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m"
],
resources: [.process("ggml-metal.metal")],
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
2 changes: 1 addition & 1 deletion src/whisper/bindings/javascript/package.json
@@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.5.2",
"version": "1.5.3",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {
2 changes: 1 addition & 1 deletion src/whisper/bindings/ruby/ext/ggml-backend-impl.h
@@ -70,7 +70,7 @@ extern "C" {
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

// compute graph without a plan
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
4 changes: 2 additions & 2 deletions src/whisper/bindings/ruby/ext/ggml-backend.c
@@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
backend->iface.graph_plan_compute(backend, plan);
}

void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
backend->iface.graph_compute(backend, cgraph);
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
2 changes: 1 addition & 1 deletion src/whisper/bindings/ruby/ext/ggml-backend.h
@@ -52,7 +52,7 @@ extern "C" {

GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
17 changes: 9 additions & 8 deletions src/whisper/extra/sync-ggml-am.sh
@@ -27,7 +27,7 @@ echo "Syncing ggml changes since commit $lc"
cd $SRC_GGML

git log --oneline $lc..HEAD
git log --oneline $lc..HEAD | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits
git log --oneline $lc..HEAD --reverse | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits

if [ ! -s $SRC_WHISPER/ggml-commits ]; then
rm -v $SRC_WHISPER/ggml-commits
@@ -48,11 +48,14 @@ while read c; do
src/ggml*.m \
src/ggml*.metal \
src/ggml*.cu \
tests/test-opt.cpp \
tests/test-grad0.cpp \
tests/test-quantize-fns.cpp \
tests/test-quantize-perf.cpp \
tests/test-backend-ops.cpp \
examples/common.h \
examples/common.cpp \
examples/common-ggml.h \
examples/common-ggml.cpp \
examples/whisper/whisper.h \
examples/whisper/whisper.cpp \
examples/whisper/main.cpp \
examples/whisper/quantize.cpp \
>> $SRC_WHISPER/ggml-src.patch
done < $SRC_WHISPER/ggml-commits

@@ -87,7 +90,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
# src/ggml-impl.h -> ggml-impl.h
# src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m
# src/ggml-metal.metal -> ggml-metal.metal
# src/ggml-mpi.h -> ggml-mpi.h
# src/ggml-mpi.c -> ggml-mpi.c
# src/ggml-opencl.cpp -> ggml-opencl.cpp
@@ -118,7 +120,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
2 changes: 1 addition & 1 deletion src/whisper/extra/sync-ggml.last
@@ -1 +1 @@
965137f49917768959679a9e860dc414e170fd55
3fd01e00e40583ccd4b393a7c6502d6a4455a1d5
2 changes: 1 addition & 1 deletion src/whisper/ggml-backend-impl.h
@@ -90,7 +90,7 @@ extern "C" {
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

// compute graph without a plan
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
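The ggml-backend-impl.h change above means a backend's graph_compute hook now reports success or failure instead of returning void. As a rough sketch only — my_backend_context and my_backend_compute_node are invented names for illustration, not part of ggml — an out-of-tree backend built against the new interface would look something like this:

static bool my_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    // backend->context holds the backend's private state, as in the CPU backend below
    struct my_backend_context * ctx = (struct my_backend_context *) backend->context;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        // hypothetical per-node dispatch; returning false signals failure
        if (!my_backend_compute_node(ctx, cgraph->nodes[i])) {
            return false; // ggml_backend_graph_compute() forwards this to the caller
        }
    }

    return true;
}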
10 changes: 7 additions & 3 deletions src/whisper/ggml-backend.c
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
ggml_backend_synchronize(backend);
}

void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
backend->iface.graph_compute(backend, cgraph);
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
if (!backend->iface.graph_compute(backend, cgraph)) {
return false;
}

// TODO: optional sync
ggml_backend_synchronize(backend);
return true;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
GGML_UNUSED(backend);
}

static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
cplan.work_data = cpu_ctx->work_data;

ggml_graph_compute(cgraph, &cplan);
return true;
}

static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
2 changes: 1 addition & 1 deletion src/whisper/ggml-backend.h
@@ -58,7 +58,7 @@ extern "C" {

GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
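On the caller side, the bool return value added to ggml_backend_graph_compute lets compute failures be detected rather than silently ignored. A minimal sketch, assuming a backend and a built graph already exist (the helper name is invented here, not part of the whisper.cpp API):

#include <stdbool.h>
#include <stdio.h>

#include "ggml-backend.h"

// Hedged helper: run a graph on the given backend and report failure.
static bool compute_graph_checked(ggml_backend_t backend, struct ggml_cgraph * graph) {
    if (!ggml_backend_graph_compute(backend, graph)) {
        fprintf(stderr, "graph computation failed on this backend\n");
        return false; // caller may fall back to another backend or abort
    }
    return true;
}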
53 changes: 33 additions & 20 deletions src/whisper/ggml-cuda.cu
@@ -119,7 +119,9 @@
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)

#define GGML_CUDA_MAX_NODES 8192

@@ -133,7 +135,6 @@

// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
// probably other such cases, and not sure what happens on AMD hardware
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif
@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
// pool with virtual memory
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
@@ -8661,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
}
}

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
bool use_mul_mat_q = ggml_is_quantized(src0->type);
#ifdef CUDA_USE_TENSOR_CORES
const bool use_tensor_cores = true;
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES

#else
const bool use_tensor_cores = false;
#endif

const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
#ifdef CUDA_USE_TENSOR_CORES
// when tensor cores are available, use them for large batch size
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

// debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@@ -8675,13 +8690,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) {
@@ -8701,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
}
} else {
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);

// when tensor cores are available, use them for large batch size
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
use_mul_mat_q = false;
}

if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
} else {
@@ -9903,7 +9910,7 @@ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_ba
UNUSED(plan);
}

static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

ggml_cuda_set_main_device(cuda_ctx->device);
@@ -9960,6 +9967,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
}

UNUSED(backend);

return true;
}

static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
@@ -10032,14 +10041,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
}
return false;
} break;
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_REPEAT:
case GGML_OP_DUP:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
Expand All @@ -10056,7 +10070,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_CONCAT:
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
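The ggml-cuda.cu hunks above replace the single use_tensor_cores flag with two per-vendor heuristics: fp16_performance_good gates the batched cuBLAS path, and use_mul_mat_q is switched off on RDNA3 (HIP) or for large batches on Volta-class hardware with tensor cores. A standalone sketch of that decision, using the constants shown in the diff — MMQ_MAX_BATCH_SIZE is not visible in these hunks, so the value 32 is assumed purely for illustration:

#include <stdbool.h>

#define MIN_CC_DP4A        610                   // minimum compute capability for __dp4a
#define CC_VOLTA           700
#define CC_OFFSET_AMD      1000000
#define CC_RDNA1           (CC_OFFSET_AMD + 1010)
#define CC_RDNA3           (CC_OFFSET_AMD + 1100)
#define MMQ_MAX_BATCH_SIZE 32                    // assumed value, for illustration only

// Sketch of the new dispatch heuristic written as a standalone predicate,
// instead of the in-place #ifdef blocks used by ggml-cuda.cu.
static bool sketch_use_mul_mat_q(int min_cc, bool src0_is_quantized, long long src1_batch,
                                 bool hip_amd, bool tensor_cores, bool * fp16_performance_good) {
    if (hip_amd) {
        *fp16_performance_good = min_cc >= CC_RDNA1;       // RDNA1 and newer
        bool use = src0_is_quantized;
        if (tensor_cores) {
            use = use && min_cc < CC_RDNA3;                // RDNA3 goes to the BLAS path
        }
        return use;
    }

    *fp16_performance_good = min_cc >= CC_VOLTA;           // fast fp16 from Volta on
    bool use = min_cc >= MIN_CC_DP4A && src0_is_quantized; // MMQ needs __dp4a
    if (tensor_cores) {
        // with tensor cores available, large batches go to cuBLAS instead of MMQ
        use = use && !(*fp16_performance_good && src1_batch > MMQ_MAX_BATCH_SIZE);
    }
    return use;
}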
2 changes: 1 addition & 1 deletion src/whisper/ggml-metal.h
@@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);

// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

//
// backend API